Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

data_and_eval_preparation.py 6.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
  1. import pandas as pd
  2. import numpy as np
  3. import sys
  4. import os
  5. from tqdm import tqdm
  6. import requests
  7. import datetime
  8. import gspread
  9. from google.oauth2 import service_account
  10. from google.auth.transport.requests import AuthorizedSession
  11. # custom imports
  12. sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
  13. import j_utils.munging as mg
  14. import lendingclub.user_creds.account_info as acc_info
  15. import lendingclub.scripts.investing.investing_utils as investing_utils
  16. # set paths
  17. ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', )
  18. dpath = os.path.join(ppath,'data')
  19. # load in dataframes
  20. loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
  21. pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
  22. strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
  23. strings = strings[strings['id'].isin(loan_info['id'])]
  24. # sort rows by loan_id (and date)
  25. loan_info.sort_values('id', inplace=True)
  26. pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
  27. strings.sort_values('id', inplace=True)
  28. # rename loan_id to id to match what comes through API
  29. pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)
  30. # check how fields come in through API _______________________________________
  31. # constants and setup for various accounts and APIs
  32. now = datetime.datetime.now()
  33. token = acc_info.token
  34. inv_acc_id = acc_info.investor_id
  35. portfolio_id = acc_info.portfolio_id
  36. my_gmail_account = acc_info.from_email_throwaway
  37. my_gmail_password = acc_info.password_throwaway+'!@'
  38. my_recipients = acc_info.to_emails_throwaway
  39. invest_ss_key = acc_info.invest_ss_key
  40. investins_ss_key = acc_info.investins_ss_key
  41. header = {
  42. 'Authorization': token,
  43. 'Content-Type': 'application/json',
  44. 'X-LC-LISTING-VERSION': '1.3'
  45. }
  46. acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
  47. str(inv_acc_id) + '/summary'
  48. order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
  49. str(inv_acc_id) + '/orders'
  50. creds = service_account.Credentials.from_service_account_file(os.path.join(ppath, 'user_creds', 'credentials.json'))
  51. scope = ['https://spreadsheets.google.com/feeds']
  52. creds = creds.with_scopes(scope)
  53. gc = gspread.Client(auth=creds)
  54. gc.session = AuthorizedSession(creds)
  55. sheet = gc.open_by_key(invest_ss_key).sheet1
  56. sheetins = gc.open_by_key(investins_ss_key).sheet1
  57. # get the loans and process the dataframe
  58. _, all_loan_count = investing_utils.get_loans_and_ids(
  59. header, exclude_already=False)
  60. api_loans, api_ids = investing_utils.get_loans_and_ids(
  61. header, exclude_already=True)
  62. # checking the fields from csv vs API
  63. api_flds = set(api_loans.columns)
  64. licsv_flds = set(loan_info.columns)
  65. common_flds = api_flds.intersection(licsv_flds)
  66. api_flds_not_in_licsv = api_flds.difference(licsv_flds)
  67. licsv_flds_not_in_api = licsv_flds.difference(api_flds)
  68. # rename some loan_info fields to match those coming through api
  69. licsv_to_api_rename_dict = {
  70. 'acc_open_past_24mths':'acc_open_past_24_mths',
  71. 'zip_code': 'addr_zip',
  72. 'delinq_2yrs': 'delinq_2_yrs',
  73. 'funded_amnt': 'funded_amount',
  74. 'il_util': 'i_l_util',
  75. 'inq_last_6mths': 'inq_last_6_mths',
  76. # 'installment_at_funded': 'installment',
  77. 'verification_status': 'is_inc_v',
  78. 'verification_status_joint': 'is_inc_v_joint',
  79. 'loan_amnt': 'loan_amount',
  80. 'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
  81. 'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
  82. 'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
  83. }
  84. loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)
  85. # save this version of loan info
  86. loan_info.reset_index(drop=True, inplace=True)
  87. loan_info.to_feather(os.path.join(dpath, 'loan_info_api_name_matched.fth'))
  88. # split loan info into dataframes for training off of and evaluating__________
  89. eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
  90. 'target_loose', 'target_strict', 'loan_status', 'id']
  91. strb_flds = ['desc', 'emp_title', 'id']
  92. base_loan_info = loan_info[list(common_flds)]
  93. eval_loan_info = loan_info[eval_flds]
  94. str_loan_info = strings[strb_flds]
  95. # save
  96. base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
  97. eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
  98. str_loan_info.reset_index(drop=True, inplace=True)
  99. str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))
  100. # make a version of pmt_history where each loan is scaled to be equal size____
  101. pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
  102. loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
  103. loan_dollar_cols = [
  104. 'outs_princp_beg',
  105. 'princp_paid',
  106. 'int_paid',
  107. 'fee_paid',
  108. 'amt_due',
  109. 'amt_paid',
  110. 'outs_princp_end',
  111. 'charged_off_amt',
  112. 'monthly_pmt',
  113. 'recovs',
  114. 'recov_fees',
  115. 'all_cash_to_inv', ]
  116. id_grouped = pmt_hist.groupby('id', sort=False)
  117. funded_amts = []
  118. for ids, group in tqdm(id_grouped):
  119. funded_amt = loan_funded_amts[ids]
  120. funded_amts.extend([funded_amt]*len(group))
  121. for col in loan_dollar_cols:
  122. pmt_hist[col] = pmt_hist[col]/funded_amts
  123. # save
  124. pmt_hist.reset_index(drop=True, inplace=True)
  125. _, pmt_hist = mg.reduce_memory(pmt_hist)
  126. pmt_hist.to_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))
  127. # make npv_rois (using various discount rates and actual/known cashflows)_____
  128. interesting_cols_over_time = [
  129. 'outs_princp_beg',
  130. 'all_cash_to_inv',
  131. 'date',
  132. 'fico_last',
  133. 'm_on_books',
  134. 'status_period_end',
  135. 'id',
  136. ]
  137. pmt_hist = pmt_hist[interesting_cols_over_time]
  138. npv_roi_holder = {}
  139. disc_rates = np.arange(.05,.36,.01)
  140. id_grouped = pmt_hist.groupby('id')
  141. for ids, group in tqdm(id_grouped):
  142. npv_roi_dict = {}
  143. funded = group.iat[0,0]
  144. cfs = [-funded] + group['all_cash_to_inv'].tolist()
  145. for rate in disc_rates:
  146. npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
  147. npv_roi_holder[ids] = npv_roi_dict
  148. npv_roi_df = pd.DataFrame(npv_roi_holder).T
  149. npv_roi_df.columns = npv_roi_df.columns.values.round(2)
  150. npv_roi_df.index.name = 'id'
  151. npv_roi_df.reset_index(inplace=True)
  152. eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')
  153. # some current loans I have no target_strict for and were not in pmt history.
  154. # Fill with negatives on npv_roi.
  155. eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
  156. eval_loan_info.fillna(-1, inplace=True)
  157. # save
  158. # feather must have string column names
  159. eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
  160. eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...