Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

lc_utils.py 4.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  1. import pandas as pd
  2. from j_utils import munging as mg
  3. import hyperlearn.hyperlearn.impute.SVDImpute as hpl_imp
  4. def gen_expt_datasets(today, oldest, valid_start, base_loan_info, eval_loan_info, target, valid_end=None, verbose=False):
  5. '''
  6. all loans from oldest until today are taken as train. All loans issued after today until valid_end are used for validation. Uses hyperlearn svd_impute to impute missing values. Returns the train and test datasets. target can be single colname or list of colnames
  7. '''
  8. train_ids = eval_loan_info[(eval_loan_info['issue_d'] <= today) & (eval_loan_info['issue_d'] >= oldest)]['id'].unique()
  9. if valid_end:
  10. valid_ids = eval_loan_info[(eval_loan_info['issue_d'] >= valid_start) & (eval_loan_info['issue_d'] <= valid_end)]['id'].unique()
  11. else:
  12. valid_ids = eval_loan_info[(eval_loan_info['issue_d'] >= valid_start)]['id'].unique()
  13. train = base_loan_info[base_loan_info['id'].isin(train_ids)]
  14. valid = base_loan_info[base_loan_info['id'].isin(valid_ids)]
  15. # setup for catboost
  16. # a bit more data processing and nan handling for catboost
  17. train_copy = train.copy()
  18. valid_copy = valid.copy()
  19. # get ready for hyperlearn svdimpute
  20. train_copy, max_dict, min_dict, cats_dict, norm_dict = mg.train_hpl_proc(train_copy, verbose=verbose)
  21. valid_copy = mg.val_test_hpl_proc(valid_copy, train_copy, max_dict, min_dict, cats_dict, verbose=verbose)
  22. # fit to train
  23. S, VT, mean, std, mins, standardise = hpl_imp.fit(train_copy.values)
  24. # impute on train
  25. train_svdimp = hpl_imp.transform(train_copy.values, S, VT, mean, std, mins, standardise)
  26. train_svdimp = pd.DataFrame(train_svdimp)
  27. train_svdimp.index = train_copy.index
  28. train_svdimp.columns = train_copy.columns
  29. # impute on test
  30. valid_svdimp = hpl_imp.transform(valid_copy.values, S, VT, mean, std, mins, standardise)
  31. valid_svdimp = pd.DataFrame(valid_svdimp)
  32. valid_svdimp.index = valid_copy.index
  33. valid_svdimp.columns = valid_copy.columns
  34. # imputing changes some ids. Make the ids the originals again.
  35. train_svdimp['id'] = train_ids
  36. valid_svdimp['id'] = valid_ids
  37. train_y = eval_loan_info[eval_loan_info['id'].isin(train_ids)][target]
  38. valid_y = eval_loan_info[eval_loan_info['id'].isin(valid_ids)][target]
  39. return train_svdimp, train_y, valid_svdimp, valid_y, train_ids, valid_ids
  40. # make a crude test set for now
  41. def get_split_date(df, date_column, quantile):
  42. """
  43. https://stackoverflow.com/questions/31018622/pandas-quantile-function-for-dates
  44. Get the date on which to split a dataframe for timeseries splitting
  45. Adjusted coerce param to errors since SO is old.
  46. """
  47. # 1. convert date_column to datetime (useful in case it is a string)
  48. # 2. convert into int (for sorting)
  49. # 3. get the quantile
  50. # 4. get the corresponding date
  51. # 5. return, pray that it works
  52. quantile_date = pd.to_datetime(df[date_column], errors = 'raise').astype('int64').quantile(q=quantile)#.astype('datetime64[ns]')
  53. return pd.to_datetime(quantile_date)
  54. def split_out_traintestable_loans(df, eval_df, oldness_thrsh=.9):
  55. '''Can train/test on loans that pass the oldness_thrsh or have status paid/defaulted/charged_off'''
  56. old_enough_ids = eval_df[(eval_df['maturity_time_stat_adj'] >= oldness_thrsh) |
  57. (eval_df['maturity_paid_stat_adj'] >= oldness_thrsh) |
  58. (eval_df['loan_status'].isin(['paid', 'defaulted', 'charged_off']))]['id'].unique()
  59. df = df[df['id'].isin(old_enough_ids)]
  60. eval_df = eval_df[eval_df['id'].isin(old_enough_ids)]
  61. return df, eval_df
  62. def add_custom_lc_features(df):
  63. # added features
  64. df['monthly_inc'] = df['annual_inc'] / 12
  65. df['dti_w_loan'] = (df['dti'] * df['monthly_inc'] +
  66. df['installment']) / df['monthly_inc']
  67. df['delinq_to_monthly_inc'] = df['delinq_amnt'] / \
  68. df['monthly_inc']
  69. df['tot_cur_bal_to_monthly_inc'] = df['tot_cur_bal'] / \
  70. df['monthly_inc']
  71. df['loan_to_inc'] = df['loan_amount'] / \
  72. df['monthly_inc']
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...