Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

train.py 3.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Load Pkgs
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Load Utils
import dagshub
import joblib
import os
from rich.console import Console
# Shared console instance used by the script section below for styled status output.
console = Console()
  15. # Load Dataset
  16. def load_data(data):
  17. df = pd.read_csv(data)
  18. return df
  19. # Process Data
  20. def split_data(df,label_col='label',test_size=0.3,output_path='data/processed/'):
  21. """Split Dataset into Features and Labels"""
  22. # Features & Labels
  23. Xfeatures = df.drop(label_col,axis=1)
  24. # Select last column of dataframe as a dataframe object
  25. # last_column = df.iloc[: , -1:]
  26. ylabels = df[label_col]
  27. # Split Dataset
  28. x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=test_size,random_state=7)
  29. print("Generating Dataset for {}".format('training'))
  30. x_train.to_csv(os.path.join(output_path,"x_train.csv"))
  31. y_train.to_csv(os.path.join(output_path,"y_train.csv"))
  32. print("Generating Dataset for {}".format('testing'))
  33. x_test.to_csv(os.path.join(output_path,"x_test.csv"))
  34. y_test.to_csv(os.path.join(output_path,"y_test.csv"))
  35. print("Generating Metadata about dataset")
  36. col_xfeatures = pd.DataFrame(list(Xfeatures.columns))
  37. col_xfeatures.to_csv("features.csv",header=False,index=False)
  38. col_ylabels = pd.DataFrame({'target_labels':df['y'].unique()})
  39. col_ylabels.to_csv("target.csv",header=False,index=False)
  40. return x_train,x_test,y_train,y_test
  41. def build_pipeline(Estimator,X,y,Transformer):
  42. """Build a Pipeline using an Estimator and A Transformer
  43. >>> build_pipeline(LogisticRegression(),X,y,StandardScaler())
  44. """
  45. ml_pipe = Pipeline(steps=[('scaler',Transformer),('clf',Estimator)])
  46. # Fit To Train
  47. ml_pipe.fit(X,y)
  48. return ml_pipe
  49. def build_model(Estimator,X,y):
  50. """Build a Model using an Estimator and Train on Dataset"""
  51. model = Estimator()
  52. # Fit To Train
  53. model.fit(X,y)
  54. return model
  55. def evaluate_model(ml_pipe,x_test,y_test):
  56. accuracy = ml_pipe.score(x_test,y_test)
  57. y_pred = ml_pipe.predict(x_test)
  58. f1score = f1_score(y_test, y_pred, average='weighted')
  59. precision = precision_score(y_test,y_pred)
  60. return {'model_name':ml_pipe.named_steps['clf'],'accuracy':accuracy,'f1_score':f1score,'precision':precision}
  61. # Usage
  62. df = load_data("data/bank-additional-full_encoded.csv")
  63. console.print("Preprocessing Data",style='bold cyan')
  64. x_train,x_test,y_train,y_test = split_data(df,label_col='y')
  65. # Build Models
  66. pipe_lr = build_pipeline(DecisionTreeClassifier(),x_train,y_train,StandardScaler())
  67. console.print("Evaluating Model",style='bold cyan')
  68. # Evaluate Model
  69. evaluate_model(pipe_lr,x_test,y_test)
  70. # Log Results with Dagshub
  71. with dagshub.dagshub_logger() as logger:
  72. print('Saving Model/Pipeline...')
  73. joblib.dump(pipe_lr, 'models/pipe_dt.pkl')
  74. logger.log_hyperparams(model_class=type(pipe_lr.named_steps['clf']).__name__)
  75. logger.log_hyperparams({'pipe_lr': pipe_lr.get_params()})
  76. print("Working on Metrics...")
  77. # Log Metrics: This creates a metrics.csv file
  78. train_metrics = evaluate_model(pipe_lr,x_test,y_test)
  79. print('Train metrics:')
  80. print(train_metrics)
  81. logger.log_metrics({f'train__{k}': v for k,v in train_metrics.items()})
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...