Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

svm_train.py 5.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
  1. import pickle
  2. from datetime import datetime
  3. import pandas as pd
  4. import numpy as np
  5. from sklearn.metrics import accuracy_score, f1_score
  6. from sklearn.feature_extraction.text import TfidfVectorizer
  7. from sklearn.model_selection import KFold, GridSearchCV
  8. from sklearn.svm import SVC
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.ensemble import BaggingClassifier
  11. from sklearn.multiclass import OneVsRestClassifier
  12. from sklearn.multioutput import MultiOutputRegressor
  13. import dagshub
  14. def load_code_blocks(DATASET_PATH, CODE_COLUMN):
  15. df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep=',')#, quoting=csv.QUOTE_NONE, error_bad_lines=False)#, sep=','
  16. df.dropna(axis=0, inplace=True)
  17. code_blocks = df[CODE_COLUMN]
  18. # test_size = 0.1
  19. # test_rows = round(df.shape[0]*test_size)
  20. # train_rows = df.shape[0] - test_rows
  21. # train_code_blocks = df[CODE_COLUMN][0:test_rows]
  22. # test_code_blocks = df[CODE_COLUMN][train_rows:]
  23. return df, code_blocks
  24. def tfidf_fit_transform(code_blocks, params, TFIDF_DIR):
  25. vectorizer = TfidfVectorizer(**params)
  26. tfidf = vectorizer.fit(code_blocks)
  27. pickle.dump(tfidf, open(TFIDF_DIR, "wb"))
  28. print('TF-IDF model has been saved')
  29. code_blocks_tfidf = tfidf.transform(code_blocks)
  30. return code_blocks_tfidf
  31. def SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params):
  32. code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
  33. X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAGS_TO_PREDICT], test_size=0.3)
  34. # grid = {"C": [100]}
  35. # cv = KFold(n_splits=2, shuffle=True, random_state=241)
  36. model = SVC(kernel="linear", random_state=241)
  37. # gs = GridSearchCV(model, grid, scoring="accuracy", cv=cv, verbose=1, n_jobs=-1)
  38. # gs.fit(X_train[:25000], y_train.ravel()[:25000])
  39. # C = gs.best_params_.get('C')
  40. # model = SVC(**SVM_params)
  41. print("Train SVM params:", model.get_params())
  42. n_estimators = 10
  43. clf = BaggingClassifier(model, max_samples=1.0 / n_estimators, n_estimators=n_estimators)
  44. # clf = model
  45. print("starting training..")
  46. clf.fit(X_train, y_train)
  47. print("saving the model")
  48. pickle.dump(clf, open(MODEL_DIR, 'wb'))
  49. print("predicting on the test..")
  50. y_pred = clf.predict(X_test)
  51. accuracy = accuracy_score(y_test, y_pred)
  52. f1 = f1_score(y_test, y_pred, average='weighted')
  53. # confus_matrix = confusion_matrix(model, X_test, y_test)
  54. metrics = {'test_accuracy': accuracy
  55. , 'test_f1_score': f1}
  56. print(metrics)
  57. return metrics
  58. def SVM_multioutput_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params):
  59. code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
  60. X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAGS_TO_PREDICT], test_size=0.3)
  61. # grid = {"C": [100]}
  62. # cv = KFold(n_splits=2, shuffle=True, random_state=241)
  63. model = SVC(kernel="linear", random_state=241)
  64. # gs = GridSearchCV(model, grid, scoring="accuracy", cv=cv, verbose=1, n_jobs=-1)
  65. # gs.fit(X_train[:25000], y_train.ravel()[:25000])
  66. # C = gs.best_params_.get('C')
  67. # model = SVC(**SVM_params)
  68. print("Train SVM params:", model.get_params())
  69. n_estimators = 10
  70. clf = MultiOutputRegressor(BaggingClassifier(model, max_samples=1.0 / n_estimators, n_estimators=n_estimators))
  71. # clf = model
  72. print("starting training..")
  73. clf.fit(X_train, y_train)
  74. print("saving the model")
  75. pickle.dump(clf, open(MODEL_DIR, 'wb'))
  76. print("predicting on the test..")
  77. y_pred = clf.predict(X_test)
  78. accuracy = accuracy_score(y_test, y_pred)
  79. f1 = f1_score(y_test, y_pred, average='weighted')
  80. # confus_matrix = confusion_matrix(model, X_test, y_test)
  81. metrics = {'test_accuracy': accuracy
  82. , 'test_f1_score': f1}
  83. print(metrics)
  84. return metrics
  85. if __name__ == '__main__':
  86. GRAPH_VERSION = 5
  87. DATASET_PATH = './data/code_blocks_regex_graph_v{}.csv'.format(GRAPH_VERSION)
  88. MODEL_DIR = './models/svm_regex_graph_v{}.sav'.format(GRAPH_VERSION)
  89. TFIDF_DIR = './models/tfidf_svm_graph_v{}.pickle'.format(GRAPH_VERSION)
  90. CODE_COLUMN = 'code_block'
  91. TAGS_TO_PREDICT = ['import', 'data_import', 'data_export', 'preprocessing',
  92. 'visualization', 'model', 'deep_learning_model', 'train', 'predict']
  93. SCRIPT_DIR = __file__
  94. df, code_blocks = load_code_blocks(DATASET_PATH, CODE_COLUMN)
  95. nrows = df.shape[0]
  96. print("loaded")
  97. tfidf_params = {'min_df': 5
  98. , 'max_df': 0.3
  99. , 'smooth_idf': True}
  100. SVM_params = {'C':100
  101. , 'kernel':"linear"
  102. , 'random_state':241}
  103. data_meta = {'DATASET_PATH': DATASET_PATH
  104. ,'nrows': nrows
  105. ,'label': TAGS_TO_PREDICT
  106. ,'model': MODEL_DIR
  107. ,'script_dir': SCRIPT_DIR}
  108. with dagshub.dagshub_logger() as logger:
  109. print("evaluating..")
  110. metrics = SVM_multioutput_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params)
  111. print("saving the results..")
  112. logger.log_hyperparams(data_meta)
  113. logger.log_hyperparams(tfidf_params)
  114. logger.log_hyperparams(SVM_params)
  115. logger.log_metrics(metrics)
  116. print("finished")
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...