levin
/
source_code_classification
mirror of https://github.com/whatevernevermindbro/source_code_classification


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
            import pickle
from datetime import datetime

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
import dagshub

def load_code_blocks(DATASET_PATH, CODE_COLUMN):
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep=',')#, quoting=csv.QUOTE_NONE, error_bad_lines=False)#, sep=','
    df.dropna(axis=0, inplace=True)
    code_blocks = df[CODE_COLUMN]
    # test_size = 0.1
    # test_rows = round(df.shape[0]*test_size)
    # train_rows = df.shape[0] - test_rows
    # train_code_blocks = df[CODE_COLUMN][0:test_rows]
    # test_code_blocks = df[CODE_COLUMN][train_rows:]
    return df, code_blocks

def tfidf_fit_transform(code_blocks, params, TFIDF_DIR):
    vectorizer = TfidfVectorizer(**params)
    tfidf = vectorizer.fit(code_blocks)
    pickle.dump(tfidf, open(TFIDF_DIR, "wb"))
    print('TF-IDF model has been saved')
    code_blocks_tfidf = tfidf.transform(code_blocks)
    return code_blocks_tfidf

def SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params):
    code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
    X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAGS_TO_PREDICT], test_size=0.3)
    # grid = {"C": [100]}
    # cv = KFold(n_splits=2, shuffle=True, random_state=241)
    model = SVC(kernel="linear", random_state=241)
    # gs = GridSearchCV(model, grid, scoring="accuracy", cv=cv, verbose=1, n_jobs=-1)
    # gs.fit(X_train[:25000], y_train.ravel()[:25000])
    # C = gs.best_params_.get('C')
    # model = SVC(**SVM_params)
    print("Train SVM params:", model.get_params())
    n_estimators = 10
    clf = BaggingClassifier(model, max_samples=1.0 / n_estimators, n_estimators=n_estimators)
    # clf = model
    print("starting training..")
    clf.fit(X_train, y_train)
    print("saving the model")
    pickle.dump(clf, open(MODEL_DIR, 'wb'))
    print("predicting on the test..")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # confus_matrix = confusion_matrix(model, X_test, y_test)
    metrics = {'test_accuracy': accuracy
            , 'test_f1_score': f1}
    print(metrics)
    return metrics

def SVM_multioutput_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params):
    code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
    X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAGS_TO_PREDICT], test_size=0.3)
    # grid = {"C": [100]}
    # cv = KFold(n_splits=2, shuffle=True, random_state=241)
    model = SVC(kernel="linear", random_state=241)
    # gs = GridSearchCV(model, grid, scoring="accuracy", cv=cv, verbose=1, n_jobs=-1)
    # gs.fit(X_train[:25000], y_train.ravel()[:25000])
    # C = gs.best_params_.get('C')
    # model = SVC(**SVM_params)
    print("Train SVM params:", model.get_params())
    n_estimators = 10
    clf = MultiOutputRegressor(BaggingClassifier(model, max_samples=1.0 / n_estimators, n_estimators=n_estimators))
    # clf = model
    print("starting training..")
    clf.fit(X_train, y_train)
    print("saving the model")
    pickle.dump(clf, open(MODEL_DIR, 'wb'))
    print("predicting on the test..")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # confus_matrix = confusion_matrix(model, X_test, y_test)
    metrics = {'test_accuracy': accuracy
            , 'test_f1_score': f1}
    print(metrics)
    return metrics

if __name__ == '__main__':
    GRAPH_VERSION = 5
    DATASET_PATH = './data/code_blocks_regex_graph_v{}.csv'.format(GRAPH_VERSION)
    MODEL_DIR = './models/svm_regex_graph_v{}.sav'.format(GRAPH_VERSION)
    TFIDF_DIR = './models/tfidf_svm_graph_v{}.pickle'.format(GRAPH_VERSION)
    CODE_COLUMN = 'code_block'
    TAGS_TO_PREDICT = ['import', 'data_import', 'data_export', 'preprocessing',
                        'visualization', 'model', 'deep_learning_model', 'train', 'predict']
    SCRIPT_DIR = __file__
    
    df, code_blocks = load_code_blocks(DATASET_PATH, CODE_COLUMN)
    nrows = df.shape[0]
    print("loaded")
    tfidf_params = {'min_df': 5
            , 'max_df': 0.3
            , 'smooth_idf': True}
    SVM_params = {'C':100
            , 'kernel':"linear"
            , 'random_state':241}
    data_meta = {'DATASET_PATH': DATASET_PATH
                ,'nrows': nrows
                ,'label': TAGS_TO_PREDICT
                ,'model': MODEL_DIR
                ,'script_dir': SCRIPT_DIR}

    with dagshub.dagshub_logger() as logger:
        print("evaluating..")
        metrics = SVM_multioutput_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params)
        print("saving the results..")
        logger.log_hyperparams(data_meta)
        logger.log_hyperparams(tfidf_params)
        logger.log_hyperparams(SVM_params)
        logger.log_metrics(metrics)
    print("finished")