1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
import argparse

import dagshub
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
# Constants shared by the split and train steps.
CLASS_LABEL = 'MachineLearning'  # name of the binary target column
train_df_path = 'data/train.csv.zip'  # where split() writes / train() reads the train split
test_df_path = 'data/test.csv.zip'   # where split() writes / train() reads the test split
def feature_engineering(raw_df):
    """Derive model-ready columns from the raw questions dataframe.

    Parses the creation date into epoch seconds, adds title/body length
    features and a combined ``Text`` column, and removes identifier and
    correlated columns. The input dataframe is not modified.
    """
    out = raw_df.copy()
    out['CreationDate'] = pd.to_datetime(out['CreationDate'])
    # Nanoseconds since epoch -> whole seconds.
    out['CreationDate_Epoch'] = out['CreationDate'].astype('int64') // 10 ** 9
    # Identifiers, raw tags, and the correlated FavoriteCount are not features.
    out = out.drop(columns=['Id', 'Tags', 'FavoriteCount'])
    out['Title_Len'] = out['Title'].str.len()
    out['Body_Len'] = out['Body'].str.len()
    out['Text'] = out['Title'].fillna('') + ' ' + out['Body'].fillna('')
    return out
def fit_tfidf(train_df, test_df):
    """Fit a 25k-feature TF-IDF vectorizer on the train text.

    Returns the transformed train matrix, the transformed test matrix,
    and the fitted vectorizer (needed later for inference).
    """
    vectorizer = TfidfVectorizer(max_features=25000)
    # fit_transform == fit followed by transform on the same data.
    train_matrix = vectorizer.fit_transform(train_df['Text'])
    test_matrix = vectorizer.transform(test_df['Text'])
    return train_matrix, test_matrix, vectorizer
def fit_model(train_X, train_y, random_state=42):
    """Train the tag classifier on TF-IDF features.

    Uses a depth-limited random forest with balanced class weights,
    since the positive class ('machine-learning' questions) is rare.
    """
    classifier = RandomForestClassifier(
        random_state=random_state,
        max_depth=50,
        class_weight='balanced',
    )
    classifier.fit(train_X, train_y)
    return classifier
def eval_model(clf, X, y):
    """Score *clf* on (X, y) and return a dict of binary-classification metrics.

    Ranking metrics (roc_auc, average_precision) use the positive-class
    probability; the thresholded metrics use hard predictions.
    """
    proba = clf.predict_proba(X)[:, 1]
    preds = clf.predict(X)
    scores = {
        'roc_auc': roc_auc_score(y, proba),
        'average_precision': average_precision_score(y, proba),
    }
    threshold_metrics = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for name, metric_fn in threshold_metrics:
        scores[name] = metric_fn(y, preds)
    return scores
def split(random_state=42):
    """Split the raw questions CSV into stratified train/test CSVs.

    Builds the binary target column from the tags, performs a stratified
    train/test split, and writes the two splits to ``train_df_path`` and
    ``test_df_path``.
    """
    print('Loading data...')
    df = pd.read_csv('data/CrossValidated-Questions.csv')
    # Target: question is tagged 'machine-learning'; rows with no tags -> False.
    df[CLASS_LABEL] = df['Tags'].str.contains('machine-learning').fillna(False)
    train_df, test_df = train_test_split(df, random_state=random_state, stratify=df[CLASS_LABEL])
    print('Saving split data...')
    # index=False: without it pandas writes the row index, which reappears
    # as a spurious 'Unnamed: 0' column when train() reloads these CSVs.
    train_df.to_csv(train_df_path, index=False)
    test_df.to_csv(test_df_path, index=False)
# Prepare a dictionary of either hyperparams or metrics for logging.
def prepare_log(d, prefix=''):
    """Namespace and sanitize a params/metrics dict for the logger.

    Keys are prefixed with ``<prefix>__`` when a prefix is given. Values
    that are not plain scalars (None, str, int, float, bool) are
    stringified, since the logger can't handle complex objects.
    """
    namespace = f'{prefix}__' if prefix else ''

    def _sanitize(value):
        if value is None or type(value) in (str, int, float, bool):
            return value
        return str(value)

    return {f'{namespace}{key}': _sanitize(val) for key, val in d.items()}
def train():
    """Run the training step end to end, logging to DAGsHub.

    Loads the split CSVs, engineers features, fits the TF-IDF vectorizer
    and the random-forest model, saves both artifacts under ``outputs/``,
    and logs hyperparameters plus train/test metrics via the DAGsHub
    logger.
    """
    print('Loading data...')
    train_df = pd.read_csv(train_df_path)
    test_df = pd.read_csv(test_df_path)

    print('Engineering features...')
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # All hyperparam/metric logging happens within one logger session.
    with dagshub.dagshub_logger() as logger:
        print('Fitting TFIDF...')
        train_tfidf, test_tfidf, tfidf = fit_tfidf(train_df, test_df)

        print('Saving TFIDF object...')
        joblib.dump(tfidf, 'outputs/tfidf.joblib')
        logger.log_hyperparams(prepare_log(tfidf.get_params(), 'tfidf'))

        print('Training model...')
        train_y = train_df[CLASS_LABEL]
        model = fit_model(train_tfidf, train_y)

        print('Saving trained model...')
        joblib.dump(model, 'outputs/model.joblib')
        logger.log_hyperparams(model_class=type(model).__name__)
        logger.log_hyperparams(prepare_log(model.get_params(), 'model'))

        print('Evaluating model...')
        train_metrics = eval_model(model, train_tfidf, train_y)
        print('Train metrics:')
        print(train_metrics)
        logger.log_metrics(prepare_log(train_metrics, 'train'))

        test_metrics = eval_model(model, test_tfidf, test_df[CLASS_LABEL])
        print('Test metrics:')
        print(test_metrics)
        logger.log_metrics(prepare_log(test_metrics, 'test'))
if __name__ == '__main__':
    # CLI: exactly one required subcommand, each mapped to its step function.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title='Split or Train step:', dest='step')
    subparsers.required = True
    for step_name, step_fn in (('split', split), ('train', train)):
        subparsers.add_parser(step_name).set_defaults(func=step_fn)
    parser.parse_args().func()
|