06_data_and_eval_preparation.py
'''
Renames loan_info columns to match what is received through the API, builds a
scaled pmt_history, and creates various targets for evaluating models.
'''
import os
import pickle

import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
from tqdm import tqdm

# custom imports
import j_utils.munging as mg
import lendingclub.config as config
import lendingclub.investing.investing_utils as investing_utils
import user_creds.account_info as acc_info
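# Note: j_utils, lendingclub, and user_creds appear to be project-local packages
# rather than PyPI installs; config supplies the project/data directories used
# below, account_info holds the LendingClub API credentials, and j_utils.munging
# provides the reduce_memory helper used before saving.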
# set paths
ppath = config.prj_dir
dpath = config.data_dir
# load in dataframes
loan_info = pd.read_feather(os.path.join(dpath, 'clean_loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]
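# Note: pd.read_feather needs pyarrow installed (very old pandas versions used
# the separate feather-format package instead).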
# sort rows by loan_id (and date)
loan_info = loan_info.sort_values('id')
pmt_hist = pmt_hist.sort_values(['loan_id', 'date'])
strings = strings.sort_values('id')
# rename loan_id to id to match what comes through the API
pmt_hist = pmt_hist.rename({'loan_id': 'id'}, axis=1)
# check how fields come in through the API ___________________________________
# constants and setup for various accounts and APIs
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}
# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)
# checking the fields from csv vs API
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)
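# The set differences above are worth a quick interactive look before trusting
# the rename map below; a minimal illustrative check using the variables just built:
# print(sorted(api_flds_not_in_licsv))
# print(sorted(licsv_flds_not_in_api))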
# rename some loan_info fields to match those coming through the API
licsv_to_api_rename_dict = {
    'acc_open_past_24mths': 'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
    # 'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)
loan_info.reset_index(drop=True, inplace=True)
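# Optional sanity check (illustrative, not part of the original pipeline): every
# rename target should now be a field name the API actually returned above.
# assert set(licsv_to_api_rename_dict.values()) <= api_flds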
# split loan_info into dataframes for training on and for evaluating _________
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time',
             'maturity_time_stat_adj', 'maturity_paid_stat_adj',
             'rem_to_be_paid', 'roi_simple', 'target_loose',
             'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']
base_loan_info = loan_info[list(common_flds)]
eval_loan_info = loan_info[eval_flds + ['grade', 'sub_grade', 'term', 'int_rate']]
str_loan_info = strings[strb_flds]
# saved at the bottom of the script
# eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
# str_loan_info.reset_index(drop=True, inplace=True)
# str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))
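# The three-way split above: base_loan_info keeps only the fields available both
# in the historical csv and through the API (so a model trained on it can score
# live listings), eval_loan_info keeps outcome/target fields that would leak the
# answer if used for training, and str_loan_info keeps the free-text fields.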
# make a version of pmt_history where each loan is scaled to be equal size ___
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv',
]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt] * len(group))
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col] / funded_amts
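# Sketch (not executed here): the same per-loan scaling could be done without the
# explicit groupby loop by mapping funded amounts onto rows; this assumes the 'id'
# column and the loan_funded_amts dict defined above.
# funded_col = pmt_hist['id'].map(loan_funded_amts)
# pmt_hist[loan_dollar_cols] = pmt_hist[loan_dollar_cols].div(funded_col, axis=0)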
# make npv_rois (using various discount rates and actual/known cashflows) ____
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]
npv_roi_holder = {}
disc_rates = np.arange(.05, .36, .01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    # first row of 'outs_princp_beg', i.e. the (scaled) funded amount of the loan
    funded = group.iat[0, 0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate / 12, cfs) / funded
    npv_roi_holder[ids] = npv_roi_dict
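# Note: np.npv was deprecated in NumPy 1.18 and removed in NumPy 1.20. On newer
# NumPy, the drop-in replacement (same signature) is numpy_financial.npv from the
# numpy-financial package:
# import numpy_financial as npf
# npv_roi_dict[rate] = npf.npv(rate / 12, cfs) / funded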
npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)
eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')
# Some current loans have no target_strict and were not in pmt_history; fill
# target_strict with 0 and the remaining missing values (the npv_roi columns) with -1.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)
# SAVE this version of loan_info
loan_info.to_feather(os.path.join(dpath, 'clean_loan_info_api_name_matched.fth'))
# SAVE
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
with open(os.path.join(dpath, 'base_loan_info_dtypes.pkl'), 'wb') as f:
    pickle.dump(base_loan_info.dtypes.to_dict(), f)
# SAVE
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath, 'scaled_pmt_hist.fth'))
# SAVE
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
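# Sketch of how a downstream script might reload the artifacts written above
# (illustrative only; assumes the same dpath and the filenames saved here):
# base_loan_info = pd.read_feather(os.path.join(dpath, 'base_loan_info.fth'))
# with open(os.path.join(dpath, 'base_loan_info_dtypes.pkl'), 'rb') as f:
#     base_dtypes = pickle.load(f)
# base_loan_info = base_loan_info.astype(base_dtypes)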