Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

05_clean_loan_info.py 14 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
  1. # %load ../../lendingclub/csv_preparation/05_clean_loan_info.py
  2. '''
  3. maturity time and maturity paid are floats from 0 to 1 that express how "done"
  4. a loan is either time-wise, or money wise. There are loan-status adjusted versions as well.
  5. I use these because I want to include as much data for my models as possible while recognizing that
  6. there is uncertainty in knowing whether a loan was good or bad if it is ongoing.
  7. For example, if a loan is 120 days late, we know that loan is likely a very bad
  8. investment that our model should not be choosing. Is it possible that the
  9. loan all of a sudden becomes current and you get a massive return due to
  10. accumulated late fees? Yes, but not likely. In any case, I would rather
  11. incorporate that likely-to-be-bad loan into the model now instead of wait
  12. 2-ish months for that loan to truly go through the charged-off process.
  13. maturity_time is how close to original maturity the loan is, regardless of how
  14. much the loan has paid back and/or followed the expected payment schedule.
  15. maturity_paid is how close the loan is to completing all its payments (
  16. total_payments_received/(total_expected_payments at point in time,
  17. with adjustments for lateness))
  18. status adjusted are adjusting the maturity calculations knowing that if the
  19. loan does go the charge-off route, it has x months left or will recover .1
  20. percent of remaining outstanding principal on avg.
  21. Some examples of loans:
  22. 1) A loan is issued last month and almost pays off all the outstanding
  23. principal this month (maybe a borrower found better loan terms elsewhere,
  24. and took out that new loan to almost completely pay down the ) would have
  25. maturity_time near 0 and maturity_paid near 1
  26. 2) A 3 year loan that is 8 months in and is 120 days late has a low
  27. maturity_time and fairly high maturity_paid, as there is an adjustment for
  28. denominator (aside form what was already paid to date by the loan,
  29. only expecting a 10% recovery on remaining outstanding principal)
  30. '''
  31. import os
  32. import pickle
  33. import re
  34. import sys
  35. import numpy as np
  36. import pandas as pd
  37. from pandas.api.types import is_string_dtype
  38. from tqdm import tqdm
  39. import j_utils.munging as mg
  40. sys.path.append('/home/justin/projects/lendingclub/lendingclub/csv_preparation')
  41. import rem_to_be_paid as rtbp
  42. from lendingclub import config
  43. from lendingclub.csv_preparation import clean_loan_info as cli
  44. # load data, turn python Nones into np.nans
  45. dpath = config.data_dir
  46. loan_info = pd.read_feather(os.path.join(dpath, 'raw_loan_info.fth'))
  47. # cut loan info to dev set
  48. with open(os.path.join(config.data_dir, 'dev_ids.pkl'), "rb") as input_file:
  49. dev_ids = pickle.load(input_file)
  50. loan_info = loan_info.query('id in @dev_ids')
  51. loan_info.fillna(value=pd.np.nan, inplace=True)
  52. #turn all date columns into pandas timestamp ________________________________
  53. # date cols
  54. date_cols = [
  55. 'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d',
  56. 'next_pymnt_d', 'sec_app_earliest_cr_line', 'hardship_start_date',
  57. 'hardship_end_date', 'payment_plan_start_date', 'debt_settlement_flag_date',
  58. 'settlement_date',
  59. ]
  60. for col in date_cols:
  61. cli.loan_info_fmt_date(loan_info, col)
  62. # Cleanups ___________________________________________________________________
  63. # installment funded
  64. rename_dict = {'installment': 'installment_currently'}
  65. loan_info.rename(rename_dict, inplace=True, axis=1)
  66. # emp_title
  67. loan_info['emp_title'] = loan_info['emp_title'].str.lower()
  68. # home_ownership
  69. dic_home_ownership = {
  70. 'MORTGAGE': 'MORTGAGE',
  71. 'RENT': 'RENT',
  72. 'OWN': 'OWN',
  73. 'OTHER': 'OTHER',
  74. 'NONE': 'NONE',
  75. 'ANY': 'NONE'
  76. }
  77. loan_info['home_ownership'] = loan_info['home_ownership'].str.upper().replace(
  78. dic_home_ownership)
  79. # verification_status and verification_status_joint
  80. dic_verification_status = {
  81. 'VERIFIED - income': 'platform',
  82. 'VERIFIED - income source': 'source',
  83. 'not verified': 'none',
  84. 'Source Verified': 'source',
  85. 'Not Verified': 'none',
  86. 'Verified': 'platform'
  87. }
  88. loan_info['verification_status'] = loan_info['verification_status'].replace(
  89. dic_verification_status)
  90. loan_info['verification_status_joint'] = loan_info[
  91. 'verification_status_joint'].replace(dic_verification_status)
  92. # status
  93. dic_status = {
  94. 'Current': 'current',
  95. 'Charged Off': 'charged_off',
  96. 'Fully Paid': 'paid',
  97. 'Late (31-120 days)': 'late_120',
  98. 'In Grace Period': 'grace_15',
  99. 'Late (16-30 days)': 'late_30',
  100. 'Default': 'defaulted',
  101. 'Issued': 'current'
  102. }
  103. loan_info['loan_status'] = loan_info['loan_status'].apply(
  104. lambda x: re.sub('Does not meet the credit policy. Status:', '', x))
  105. loan_info['loan_status'] = loan_info['loan_status'].apply(
  106. lambda x: re.sub('Does not meet the credit policy. Status:', '', x))
  107. loan_info['loan_status'] = loan_info['loan_status'].replace(dic_status)
  108. loan_info['hardship_loan_status'] = loan_info['hardship_loan_status'].replace(dic_status)
  109. #title
  110. loan_info['title'] = loan_info['title'].str.lower()
  111. # 12/14/2019 CORRECTIONS TO LOAN INFO TO MATCH DATA
  112. # COMING THROUGH LENDINGCLUB API
  113. # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  114. #application_type, remap to be same as api_loan
  115. dic_app_type = {
  116. 'JOINT APP': 'JOINT',
  117. }
  118. loan_info['application_type'] = loan_info['application_type'].str.upper().replace(dic_app_type)
  119. loan_info['initial_list_status'] = loan_info['initial_list_status'].str.upper()
  120. # # int_rate
  121. # loan_info['int_rate'] = loan_info['int_rate'] / 100
  122. #revol_util
  123. loan_info['revol_util'] = loan_info['revol_util'].apply(
  124. lambda x: float(x.strip('%')) if pd.notnull(x) else np.nan)
  125. # #all_util
  126. # loan_info['all_util'] = loan_info['all_util'] / 100.
  127. # # pct_tl_nvr_dlq
  128. # loan_info['pct_tl_nvr_dlq'] = loan_info['pct_tl_nvr_dlq'] / 100.
  129. # # percent_bc_gt_75
  130. # loan_info['percent_bc_gt_75'] = loan_info['percent_bc_gt_75'] / 100.
  131. # # dti
  132. # loan_info['dti'] = loan_info['dti'] / 100.
  133. # dti_joint
  134. # loan_info['dti_joint'] = loan_info['dti_joint'] / 100.
  135. # il_util
  136. # loan_info['il_util'] = loan_info['il_util'] / 100.
  137. # # bc_util
  138. # loan_info['bc_util'] = loan_info['bc_util'] / 100.
  139. # # sec_app_revol_util
  140. # loan_info['sec_app_revol_util'] = loan_info['sec_app_revol_util'] / 100.
  141. # # settlement_percentage
  142. # loan_info['settlement_percentage'] = loan_info['settlement_percentage'] / 100.
  143. # # check that percents are between 0 and 1, not 0 and 100
  144. # pct_cols = []
  145. # for col in loan_info.columns:
  146. # if any(x in col for x in ['pct', 'percent', 'util', 'dti', 'rate']):
  147. # pct_cols.append(col)
  148. # for col in pct_cols:
  149. # if loan_info[col].mean() > 1:
  150. # print('this col needs to be turned into a decimal form of percent: ', col)
  151. # if loan_info[col].median() > 1:
  152. # print('this col needs to be turned into a decimal form of percent: ', col)
  153. # Adding columns of interest _________________________________________________
  154. # unreceived principal, not overwriting out_prncp
  155. loan_info['unreceived_prncp'] = loan_info['funded_amnt'] - loan_info['total_rec_prncp']
  156. loan_info['unreceived_prncp'] = np.where(loan_info['unreceived_prncp'] <= 0.019,
  157. 0, loan_info['unreceived_prncp'])
  158. loan_info['unreceived_prncp'] = loan_info['unreceived_prncp'].round(2)
  159. # want to calculate what installment originally was
  160. loan_info['installment_at_funded'] = np.pmt(
  161. loan_info['int_rate']/12, loan_info['term'], -loan_info['funded_amnt'])
  162. # have a max_date for reference in making end_d
  163. max_date = loan_info['last_pymnt_d'].max()
  164. # end_d to me means the date we can stop tracking things about the loan. Should be defunct
  165. # make end_d
  166. status_grouped = loan_info.groupby('loan_status')
  167. end_d_series = pd.Series([])
  168. for status, group in status_grouped:
  169. end_d_series = end_d_series.append(
  170. cli.apply_end_d(status, group, max_date), verify_integrity=True)
  171. loan_info['end_d'] = end_d_series
  172. loan_info.loc[loan_info['end_d'] > max_date, 'end_d'] = max_date
  173. # adding line_history in days, months, and years using pandas .dt functions
  174. loan_info['line_history_d'] = (loan_info['issue_d'] - loan_info['earliest_cr_line']).dt.days
  175. loan_info['line_history_m'] = (
  176. loan_info['issue_d'].dt.year - loan_info['earliest_cr_line'].dt.year)*12 + (
  177. loan_info['issue_d'].dt.month - loan_info['earliest_cr_line'].dt.month)
  178. loan_info['line_history_y'] = (
  179. loan_info['issue_d'].dt.year - loan_info['earliest_cr_line'].dt.year) + (
  180. loan_info['issue_d'].dt.month - loan_info['earliest_cr_line'].dt.month)/12
  181. #credit_score
  182. loan_info['fico'] = (
  183. loan_info['fico_range_high'] + loan_info['fico_range_low']) / 2
  184. # maturity_time
  185. loan_info['months_passed'] = ((max_date - loan_info['issue_d']).dt.days * (12 / 365.25)).round()
  186. loan_info['maturity_time'] = loan_info['months_passed'] / loan_info['term']
  187. loan_info['maturity_time'] = np.where(loan_info['maturity_time'] >= 1, 1,
  188. loan_info['maturity_time'])
  189. # make rem_to_be_paid
  190. loan_info['rem_to_be_paid'] = rtbp.apply_rem_to_be_paid(
  191. loan_info['unreceived_prncp'].values, loan_info['installment_currently'].values,
  192. loan_info['int_rate'].values/100)
  193. loan_info['maturity_paid'] = loan_info['total_pymnt'] / (
  194. loan_info['total_pymnt'] + loan_info['rem_to_be_paid'])
  195. # making status adjusted versions of mat_time, mat_paid
  196. # grace = 35%, late_30 = 64%, late_120 = 98%,
  197. # See https://www.lendingclub.com/info/demand-and-credit-profile.action for %s used
  198. # maturity_time_stat_adj =
  199. # maturity_time * prob_not_def + months_passed/months_to_default * prob_def
  200. loan_info['maturity_time_stat_adj'] = np.where(loan_info['loan_status'] == 'grace_15',
  201. loan_info['maturity_time']*(1-.35) + ((loan_info['months_passed']/(loan_info['months_passed'] + 4))*.35),
  202. np.where(loan_info['loan_status'] == 'late_30', loan_info['maturity_time']*(1-.64) + ((loan_info['months_passed']/(loan_info['months_passed'] + 3))*.64),
  203. np.where(loan_info['loan_status'] == 'late_120', loan_info['maturity_time']*(1-.98) + ((loan_info['months_passed']/(loan_info['months_passed'] + 1))*.98), loan_info['maturity_time'])))
  204. loan_info['maturity_time_stat_adj'] = np.minimum(1, loan_info['maturity_time_stat_adj'])
  205. # maturity_paid_stat_adj =
  206. # maturity_paid * prob_not_def + total_paid/total_paid_and_outstanding * prob_def
  207. # .1 is from assuming 10% recovery on defaulted/charged_off loans
  208. loan_info['maturity_paid_stat_adj'] = np.where(loan_info['loan_status'] == 'grace_15', loan_info['maturity_paid']*(1-.35) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.35),
  209. np.where(loan_info['loan_status'] == 'late_30', loan_info['maturity_paid']*(1-.64) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.64),
  210. np.where(loan_info['loan_status'] == 'late_120', loan_info['maturity_paid']*(1-.98) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.98), loan_info['maturity_paid']
  211. )))
  212. loan_info['maturity_paid_stat_adj'] = np.minimum(1, loan_info['maturity_paid_stat_adj'])
  213. # final adjustments to status_adj based on done statuses
  214. loan_info.loc[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']),
  215. 'maturity_paid_stat_adj'] = 1
  216. loan_info.loc[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']),
  217. 'maturity_time_stat_adj'] = 1
  218. # target_loose
  219. loan_info['target_loose'] = np.where(
  220. loan_info['loan_status'].isin(['charged_off', 'defaulted']), 1, 0)
  221. # pull out long string columns
  222. str_cols = loan_info.select_dtypes('object').columns
  223. strip_cols = ['desc', 'emp_title', 'title', 'url']
  224. strings_df = loan_info[strip_cols]
  225. loan_info.drop(columns=strip_cols, inplace=True)
  226. strings_df['id'] = loan_info['id']
  227. # make target strict, anything that was ever late is marked "bad"
  228. bad_statuses = set(['late_120', 'defaulted', 'charged_off', 'late_30'])
  229. pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history.fth'))
  230. target_strict_dict = {}
  231. id_grouped = pmt_hist.groupby('loan_id')
  232. for ids, group in tqdm(id_grouped):
  233. statuses = set(group['status_period_end'])
  234. if len(statuses.intersection(bad_statuses)) > 0:
  235. target_strict_dict[ids] = 1
  236. else:
  237. target_strict_dict[ids] = 0
  238. target_strict = pd.DataFrame.from_dict(target_strict_dict, orient='index').reset_index(drop=False)
  239. target_strict.columns = ['id', 'target_strict']
  240. loan_info.rename({'loan_id': 'id'}, axis=1, inplace=True)
  241. loan_info = pd.merge(loan_info, target_strict, how='outer', on='id')
  242. # add orig_amt_due and roi_simple
  243. loan_info['orig_amt_due'] = loan_info['term'] * loan_info['installment_at_funded']
  244. loan_info['roi_simple'] = loan_info['total_pymnt']/loan_info['funded_amnt']
  245. # More Data Cleanup __________________________________________________________
  246. # home_ownership: none should be other
  247. loan_info['home_ownership'].replace({'none': 'other'}, inplace=True)
  248. # annual_income has 4 nulls. Just fill with 0
  249. loan_info['annual_inc'].replace({np.nan: 0.0}, inplace=True)
  250. # drop the one null zip_code
  251. loan_info = loan_info[loan_info['zip_code'].notnull()]
  252. # drop the loans where earliest_cr_line is null
  253. loan_info = loan_info[loan_info['earliest_cr_line'].notnull()]
  254. # drop null chargeoff_within_12_mths
  255. loan_info = loan_info[loan_info['chargeoff_within_12_mths'].notnull()]
  256. # drop null tax_liens
  257. loan_info = loan_info[loan_info['tax_liens'].notnull()]
  258. # drop loans that have this null
  259. loan_info = loan_info[loan_info['inq_last_6mths'].notnull()]
  260. # Drop columns _______________________________________________________________
  261. # Dropping these since I don't want them and they might confuse me.
  262. # There is no reason why I care about money that went just to investors rather
  263. # than to lending club as well when they top off loans.
  264. loan_info.drop(['funded_amnt_inv', 'out_prncp_inv'], axis=1, inplace=True)
  265. # last cleanups before storing
  266. # if column type is string and has np.nan (a float), turn the nan into "None"
  267. # for the graphing eda notebook in Exploratory Data Analysis
  268. for col in loan_info.columns:
  269. if is_string_dtype(loan_info[col].dtype) & (loan_info[col].isnull().sum() > 0):
  270. loan_info[col] = loan_info[col].fillna('None')
  271. for col in strings_df.columns:
  272. if is_string_dtype(strings_df[col].dtype) & (strings_df[col].isnull().sum() > 0):
  273. strings_df[col] = strings_df[col].fillna('None')
  274. # reduce memory and store
  275. _, strings_df = mg.reduce_memory(strings_df)
  276. strings_df.reset_index(drop=True, inplace=True)
  277. _, loan_info = mg.reduce_memory(loan_info)
  278. loan_info.reset_index(drop=True, inplace=True)
  279. strings_df.to_feather(os.path.join(dpath, 'strings_loan_info.fth'))
  280. loan_info.to_feather(os.path.join(dpath, 'clean_loan_info.fth'))
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...