Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

lc_utils.py 12 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
  1. from typing import List, Union, Optional, Tuple
  2. import pandas as pd
  3. from j_utils import munging as mg
  def gen_datasets(today: str,
                   valid_start: str,
                   base_loan_info: pd.DataFrame,
                   eval_loan_info: pd.DataFrame,
                   target: Union[str, List[str]],
                   doneness: float = .95,
                   stat_adj: bool = True,
                   oldest: Optional[str] = None,
                   valid_end: Optional[str] = None,
                   verbose: bool = False,
                   impute: bool = False,) -> Tuple:
      """Build time-split train/validation feature and target frames.

      Rows are selected with boolean masks computed on ``eval_loan_info`` and
      then applied to ``base_loan_info`` via ``.loc``, so both frames must
      share the same index.

      Args:
          today: Cut-off date. A loan is eligible for training when its
              ``end_d`` is strictly before this date.
          valid_start: First issue date of the validation set. Must fall in a
              later calendar month than ``today``.
          base_loan_info: Feature frame (X); must contain an ``id`` column.
          eval_loan_info: Frame holding the target column(s) and the columns
              used for filtering (``id``, ``issue_d``, ``end_d``,
              ``loan_status`` and the maturity columns).
          target: Target column name, or list of names, in ``eval_loan_info``.
          doneness: A loan is kept when its maturity-time or maturity-paid
              value is >= this number, or when its status is one of
              paid / charged_off / defaulted.
          stat_adj: Use the ``*_stat_adj`` maturity columns when True,
              the plain ``maturity_time`` / ``maturity_paid`` otherwise.
          oldest: If given, training loans issued before this date are dropped.
          valid_end: If given, validation loans issued after this date are
              dropped.
          verbose: Forwarded to the hyperlearn preprocessing helpers; only
              used when ``impute`` is True.
          impute: When True, missing feature values are filled with
              hyperlearn's SVDImpute (fitted on the training rows).

      Returns:
          ``(train_x, train_y, valid_x, valid_y, train_ids, valid_ids)``.
          The x/y frames are sorted by ``id``; the id Series keep the row
          order of ``base_loan_info``.

      Raises:
          AssertionError: If ``valid_start`` is not in a later month than
              ``today``, or if the x/y frames end up misaligned.
      """
      done_statuses = ['paid', 'charged_off', 'defaulted']
      today = pd.to_datetime(today)
      valid_start = pd.to_datetime(valid_start)
      # Compare against the first day of valid_start's month so that a
      # validation window starting in the same month as `today` is rejected.
      valid_month_start = valid_start - pd.to_timedelta(valid_start.day - 1, unit='d')
      assert (today < valid_month_start), 'valid_start must be greater than today and not the same year and month'

      # Keep only loans that are "done enough" to be evaluated.
      suffix = '_stat_adj' if stat_adj else ''
      done_mask = ((eval_loan_info['maturity_time' + suffix] >= doneness)
                   | (eval_loan_info['maturity_paid' + suffix] >= doneness)
                   | eval_loan_info['loan_status'].isin(done_statuses))

      # Date bounds of the train and validation sets. The optional bounds are
      # normalised to timestamps exactly like `today` and `valid_start`.
      train_mask = (eval_loan_info['end_d'] < today) & done_mask
      if oldest:
          train_mask = train_mask & (eval_loan_info['issue_d'] >= pd.to_datetime(oldest))
      valid_mask = (eval_loan_info['issue_d'] >= valid_start) & done_mask
      if valid_end:
          valid_mask = valid_mask & (eval_loan_info['issue_d'] <= pd.to_datetime(valid_end))

      train_x = base_loan_info.loc[train_mask]
      valid_x = base_loan_info.loc[valid_mask]
      train_ids = train_x['id']
      valid_ids = valid_x['id']
      assert len(train_x) == len(train_ids)
      assert len(valid_x) == len(valid_ids)

      if impute:
          train_x, valid_x = _svd_impute_pair(
              train_x, valid_x, train_ids, valid_ids, verbose=verbose)

      if isinstance(target, str):
          target = [target]
      target_cols = ['id'] + list(target)
      train_y = eval_loan_info.loc[train_mask, target_cols]
      valid_y = eval_loan_info.loc[valid_mask, target_cols]
      assert len(train_x) == len(train_ids) == len(train_y)
      assert len(valid_x) == len(valid_ids) == len(valid_y)

      train_x = train_x.sort_values('id')
      valid_x = valid_x.sort_values('id')
      train_y = train_y.sort_values('id')
      valid_y = valid_y.sort_values('id')
      # Compare positionally: a label-based Series comparison raises a
      # ValueError on differently ordered indexes instead of failing the check.
      assert (train_x['id'].values != train_y['id'].values).sum() == 0
      assert (valid_x['id'].values != valid_y['id'].values).sum() == 0
      return train_x, train_y, valid_x, valid_y, train_ids, valid_ids


  def _svd_impute_pair(train_x, valid_x, train_ids, valid_ids, verbose=False):
      """Fill missing values in both frames with an SVD imputer fitted on train.

      Returns the imputed ``(train_x, valid_x)`` with their original index,
      columns and ``id`` values restored.
      """
      # Imported lazily so hyperlearn is only required when imputing.
      import hyperlearn.hyperlearn.impute.SVDImpute as hpl_imp

      train_copy = train_x.copy()
      valid_copy = valid_x.copy()
      # Preprocess for SVDImpute; the validation frame reuses what the
      # training pass returned.
      train_copy, max_dict, min_dict, cats_dict, _norm_dict = mg.train_hpl_proc(
          train_copy, verbose=verbose)
      valid_copy = mg.val_test_hpl_proc(
          valid_copy, train_copy, max_dict, min_dict, cats_dict, verbose=verbose)

      # Fit on train only, then transform both sets with the same parameters.
      S, VT, mean, std, mins, standardise = hpl_imp.fit(train_copy.values)
      train_svdimp = pd.DataFrame(
          hpl_imp.transform(train_copy.values, S, VT, mean, std, mins, standardise),
          index=train_copy.index, columns=train_copy.columns)
      valid_svdimp = pd.DataFrame(
          hpl_imp.transform(valid_copy.values, S, VT, mean, std, mins, standardise),
          index=valid_copy.index, columns=valid_copy.columns)

      # Imputing changes some ids. Make the ids the originals again.
      train_svdimp['id'] = train_ids
      valid_svdimp['id'] = valid_ids
      return train_svdimp, valid_svdimp
  108. # make a crude test set for now
  def get_split_date(df, date_column, quantile):
      """Return the date at the given quantile of ``df[date_column]``.

      Used to pick the date on which to split a dataframe for time-series
      splitting. Adapted from
      https://stackoverflow.com/questions/31018622/pandas-quantile-function-for-dates
      (the old ``coerce`` argument is now ``errors``).

      Args:
          df: Frame containing ``date_column``.
          date_column: Name of a column holding datetimes or parseable strings.
          quantile: Quantile in [0, 1], passed to ``Series.quantile``.

      Returns:
          The result of ``pd.to_datetime`` on the quantile of the dates'
          integer nanosecond values; ``NaT`` when there are no valid dates.

      Raises:
          Whatever ``pd.to_datetime(..., errors='raise')`` raises for values
          that cannot be parsed.
      """
      # Missing dates carry no position in time; dropping them keeps the
      # integer cast from turning NaT into a sentinel that skews the quantile.
      dates = pd.to_datetime(df[date_column], errors='raise').dropna()
      # Pin nanosecond resolution so the integers mean the same thing that
      # pd.to_datetime assumes when converting the quantile back.
      nanos = pd.Series(dates.values.astype('datetime64[ns]').astype('int64'))
      quantile_date = nanos.quantile(q=quantile)
      return pd.to_datetime(quantile_date)
  def split_out_traintestable_loans(df, eval_df, oldness_thrsh=.9, stat_adj=True):
      """Keep only the loans that can be used for training/testing.

      A loan qualifies when its maturity-time or maturity-paid value is
      >= ``oldness_thrsh``, or when its ``loan_status`` is paid, defaulted
      or charged_off.

      Args:
          df: Loan feature frame with an ``id`` column.
          eval_df: Evaluation frame with ``id``, ``loan_status`` and the
              maturity columns.
          oldness_thrsh: Minimum maturity value for a loan to qualify.
          stat_adj: Use the ``*_stat_adj`` maturity columns when True (the
              default), the plain ``maturity_time`` / ``maturity_paid``
              columns otherwise.

      Returns:
          ``(df, eval_df)`` restricted to the qualifying ids.
      """
      suffix = '_stat_adj' if stat_adj else ''
      done_statuses = ['paid', 'defaulted', 'charged_off']
      qualifies = ((eval_df['maturity_time' + suffix] >= oldness_thrsh)
                   | (eval_df['maturity_paid' + suffix] >= oldness_thrsh)
                   | eval_df['loan_status'].isin(done_statuses))
      old_enough_ids = eval_df.loc[qualifies, 'id'].unique()
      df = df[df['id'].isin(old_enough_ids)]
      eval_df = eval_df[eval_df['id'].isin(old_enough_ids)]
      return df, eval_df
  def add_custom_lc_features(df):
      """Add income-relative ratio features to ``df`` in place.

      Reads ``annual_inc``, ``dti``, ``installment``, ``delinq_amnt``,
      ``tot_cur_bal`` and ``loan_amount``, and writes ``monthly_inc``,
      ``dti_w_loan``, ``delinq_to_monthly_inc``,
      ``tot_cur_bal_to_monthly_inc`` and ``loan_to_inc``.

      Args:
          df: Loan frame containing the input columns listed above.

      Returns:
          The same ``df`` object, mutated, so calls can be chained. Callers
          that ignore the return value are unaffected.
      """
      monthly_inc = df['annual_inc'] / 12
      df['monthly_inc'] = monthly_inc
      # dti scaled by income, plus this loan's installment, relative to income
      df['dti_w_loan'] = (df['dti'] * monthly_inc + df['installment']) / monthly_inc
      df['delinq_to_monthly_inc'] = df['delinq_amnt'] / monthly_inc
      df['tot_cur_bal_to_monthly_inc'] = df['tot_cur_bal'] / monthly_inc
      # NOTE: despite the name, this is divided by *monthly* income.
      df['loan_to_inc'] = df['loan_amount'] / monthly_inc
      return df
  142. # Deprecated: previous implementation of gen_datasets, kept commented out for reference only.
  143. # def gen_datasets(today: str,
  144. # valid_start: str,
  145. # base_loan_info: pd.DataFrame,
  146. # eval_loan_info: pd.DataFrame,
  147. # target: Union[str, List[str]],
  148. # doneness: float = .95,
  149. # stat_adj: bool = True,
  150. # oldest: Optional[str] = None,
  151. # valid_end: Optional[str] = None,
  152. # verbose: bool = False,
  153. # impute: bool = False) -> Tuple:
  154. # '''
  155. # all loans from oldest until today are taken as train. All loans issued after today until valid_end are used for validation. Uses hyperlearn svd_impute to impute missing values. Returns the train and test datasets. target can be single colname or list of colnames.
  156. # Will take all done loans as well (e.g. loan_status is paid, defaulted, charged_off)
  157. # Checks that train x/y are same length and order. Does same for valid
  158. # '''
  159. # # cut loans to required doneness
  160. # if stat_adj:
  161. # eval_loan_info = eval_loan_info[(eval_loan_info['maturity_time_stat_adj'] >= doneness) |
  162. # (eval_loan_info['maturity_paid_stat_adj'] >= doneness) |
  163. # (eval_loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']))]
  164. # else:
  165. # eval_loan_info = eval_loan_info[(eval_loan_info['maturity_time'] >= doneness) |
  166. # (eval_loan_info['maturity_paid'] >= doneness) |
  167. # (eval_loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']))]
  168. # # specify date bounds of train and valid sets
  169. # if oldest:
  170. # train_ids = eval_loan_info[(eval_loan_info['issue_d'] <= today) & (
  171. # eval_loan_info['issue_d'] >= oldest)]['id'].unique()
  172. # else:
  173. # train_ids = eval_loan_info[eval_loan_info['issue_d'] <= today]['id'].unique()
  174. # if valid_end:
  175. # valid_ids = eval_loan_info[(eval_loan_info['issue_d'] >= valid_start) & (
  176. # eval_loan_info['issue_d'] <= valid_end)]['id'].unique()
  177. # else:
  178. # valid_ids = eval_loan_info[(
  179. # eval_loan_info['issue_d'] >= valid_start)]['id'].unique()
  180. # train_x = base_loan_info[base_loan_info['id'].isin(train_ids)]
  181. # valid_x = base_loan_info[base_loan_info['id'].isin(valid_ids)]
  182. # assert len(train_x) == len(train_ids)
  183. # assert len(valid_x) == len(valid_ids)
  184. # if impute:
  185. # import hyperlearn.hyperlearn.impute.SVDImpute as hpl_imp
  186. # # setup for catboost
  187. # # a bit more data processing and nan handling for catboost
  188. # train_copy = train_x.copy()
  189. # valid_copy = valid_x.copy()
  190. # # get ready for hyperlearn svdimpute
  191. # train_copy, max_dict, min_dict, cats_dict, norm_dict = mg.train_hpl_proc(
  192. # train_copy, verbose=verbose)
  193. # valid_copy = mg.val_test_hpl_proc(
  194. # valid_copy, train_copy, max_dict, min_dict, cats_dict, verbose=verbose)
  195. # # fit to train
  196. # S, VT, mean, std, mins, standardise = hpl_imp.fit(train_copy.values)
  197. # # impute on train
  198. # train_svdimp = hpl_imp.transform(
  199. # train_copy.values, S, VT, mean, std, mins, standardise)
  200. # train_svdimp = pd.DataFrame(train_svdimp)
  201. # train_svdimp.index = train_copy.index
  202. # train_svdimp.columns = train_copy.columns
  203. # # impute on test
  204. # valid_svdimp = hpl_imp.transform(
  205. # valid_copy.values, S, VT, mean, std, mins, standardise)
  206. # valid_svdimp = pd.DataFrame(valid_svdimp)
  207. # valid_svdimp.index = valid_copy.index
  208. # valid_svdimp.columns = valid_copy.columns
  209. # # imputing changes some ids. Make the ids the originals again.
  210. # train_svdimp['id'] = train_ids
  211. # valid_svdimp['id'] = valid_ids
  212. # train_x = train_svdimp
  213. # valid_x = valid_svdimp
  214. # if type(target) == str:
  215. # target = [target]
  216. # target = ['id'] + target
  217. # train_y = eval_loan_info[eval_loan_info['id'].isin(train_ids)][target]
  218. # valid_y = eval_loan_info[eval_loan_info['id'].isin(valid_ids)][target]
  219. # assert len(train_x) == len(train_ids) == len(train_y)
  220. # assert len(valid_x) == len(valid_ids) == len(valid_y)
  221. # train_x.sort_values('id', inplace=True)
  222. # valid_x.sort_values('id', inplace=True)
  223. # train_y.sort_values('id', inplace=True)
  224. # valid_y.sort_values('id', inplace=True)
  225. # assert (train_x['id'] != train_y['id']).sum() == 0
  226. # assert (valid_x['id'] != valid_y['id']).sum() == 0
  227. # return train_x, train_y, valid_x, valid_y, train_ids, valid_ids
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...