Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

models.py 5.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
'''
Defines the Model class and methods.

Usage:
    Instantiate a model:
        baseline = Model('baseline')
    Get scores for a dataframe:
        scores = baseline.score(df)
'''
  9. import os
  10. import numpy as np
  11. import pandas as pd
  12. # from pandas.testing import assert_frame_equal
  13. from catboost import CatBoostClassifier, CatBoostRegressor
  14. from joblib import load
  15. import pickle
  16. import j_utils.munging as mg
  17. from lendingclub import config
  18. from lendingclub.modeling import score_utils as scr_util
  19. ppath = config.prj_dir
  20. dpath = config.data_dir
  21. mpath = config.modeling_dir
  22. class Model():
  23. '''
  24. Model class loads appropriate model based on name in constructor
  25. Also handles data preprocessing
  26. '''
  27. def __init__(self, name: str):
  28. self.name = name
  29. self.basempath = os.path.join(ppath, 'models')
  30. self.mpath = mpath
  31. self.proc_arti = None
  32. self.data_is_procced = False
  33. self.proc_df = None
  34. self.m = None
  35. self.df = None
  36. self.m_clf = None
  37. self.m_regr = None
  38. self.load_model()
  39. def load_model(self):
  40. '''
  41. Loads a model based on which model (self.name)
  42. '''
  43. if self.name in ['baseline', 'A', 'B', 'C', 'D', 'E', 'F', 'G']:
  44. with open(os.path.join(mpath, '{0}_model.pkl'.format(self.name)), 'rb') as file:
  45. self.m = pickle.load(file)
  46. elif self.name == 'logistic_regr':
  47. self.m = load(os.path.join(mpath, '{0}_model.pkl'.format(self.name)))
  48. self.proc_arti = load(os.path.join(mpath, '{0}_model_proc_arti.pkl'.format(self.name)))
  49. elif self.name in ['catboost_clf', 'catboost_regr']:
  50. if self.name == 'catboost_clf':
  51. self.m = CatBoostClassifier()
  52. elif self.name == 'catboost_regr':
  53. self.m = CatBoostRegressor()
  54. self.m.load_model(os.path.join(mpath,'{0}_model.cb'.format(self.name)))
  55. self.proc_arti = load(os.path.join(mpath, '{0}_model_proc_arti.pkl'.format(self.name)))
  56. elif self.name == 'catboost_both':
  57. self.m_clf = CatBoostClassifier()
  58. self.m_clf.load_model(os.path.join(mpath,'{0}_model.cb'.format('catboost_clf')))
  59. self.m_regr = CatBoostRegressor()
  60. self.m_regr.load_model(os.path.join(mpath,'{0}_model.cb'.format('catboost_regr')))
  61. # can take either for proc data, its same process
  62. self.proc_arti = load(os.path.join(mpath, '{0}_model_proc_arti.pkl'.format('catboost_regr')))
  63. def proc_data(self):
  64. '''
  65. Process dataframe appropriately for the model type, set self.proc_df
  66. '''
  67. self.proc_df = mg.val_test_proc(self.df.copy(), *self.proc_arti)
  68. self.data_is_procced = True
  69. def score(self, df: pd.DataFrame, return_all=False, random_penalty=False, clf_cutoff = .90, optimized=True):
  70. '''
  71. Given a dataframe (base_loan_info, non imputed or scaled or normalized)
  72. return scores. Imputation, Scaling, and Normalizing will be handled
  73. inside this method to match that done at training
  74. HIGHER SCORES SHOULD BE BETTER (for classification, want prob of
  75. not defaulting)
  76. optimized is only for catboost_both where notebooks 11/12 have been
  77. used to choose how to properly combine scores and what
  78. percentiles of score to invest in
  79. '''
  80. if (self.df is None) or ((self.df is not None) and not self.df.equals(df)):
  81. # if there is no previous df, or if previous df doesn't match
  82. # the df that was just passed to be scored
  83. self.df = df
  84. self.data_is_procced = False
  85. if not self.data_is_procced and any(n in self.name for n in ['regr', 'clf', 'both']):
  86. self.proc_data()
  87. # baselines and grades
  88. if self.name in ['baseline', 'A', 'B', 'C', 'D', 'E', 'F', 'G']:
  89. self.prng = np.random.RandomState(self.m)
  90. scores = self.prng.random(len(self.df))
  91. if self.name in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
  92. mask = np.where(df['grade'] == self.name, 0, 1).astype(bool)
  93. scores[mask] = 0
  94. return scores
  95. elif self.name in ['logistic_regr', 'catboost_clf', 'catboost_regr']:
  96. # self.proc_df = mg.val_test_proc(self.df, *self.proc_arti)
  97. # return probability of not default
  98. if self.name in ['logistic_regr', 'catboost_clf']:
  99. return self.m.predict_proba(self.proc_df)[:, 0]
  100. elif self.name in ['catboost_regr']:
  101. return self.m.predict(self.proc_df)
  102. elif self.name in ['catboost_both']:
  103. # if self.proc_df is None:
  104. # self.proc_df = mg.val_test_proc(self.df, *self.proc_arti)
  105. clf_scores = self.m_clf.predict_proba(self.proc_df)[:, 0]
  106. regr_scores = self.m_regr.predict(self.proc_df)
  107. # linearly combine clf and regr scaled, using clf_wt in scr_utils
  108. clf_wt_scorer = scr_util.combined_score(scr_util.clf_wt)
  109. self.proc_df['catboost_clf'] = clf_scores
  110. self.proc_df['catboost_regr'] = regr_scores
  111. self.proc_df['catboost_regr_scl'] = scr_util.scale_cb_regr_score(self.proc_df)
  112. comb_scores = clf_wt_scorer('catboost_clf', 'catboost_regr_scl', self.proc_df)
  113. if return_all:
  114. return comb_scores, regr_scores, clf_scores
  115. return comb_scores
  116. print('unknown model??')
  117. return None
  118. def load_scored_df():
  119. '''
  120. loads the df with all model scores. If it doesn't exist, creates it
  121. '''
  122. path = os.path.join(config.data_dir, 'scored_eval_loan_info.fth')
  123. if os.path.exists(path):
  124. return pd.read_feather(path)
  125. return pd.read_feather(os.path.join(config.data_dir, 'eval_loan_info.fth'))
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...