Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

07_create_train_test.py 2.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
  1. '''
  2. this makes the train and test sets as well as bootstrapped sets.
  3. trainable loans are loans that are "done" enough
  4. '''
  5. import os
  6. import pickle
  7. import pandas as pd
  8. # testing
  9. from lendingclub import config, utils
  10. from lendingclub.data_and_eval_preparation import create_train_test as ctt
  11. from sklearn.model_selection import train_test_split
  12. dpath = config.data_dir
  13. base_loan_info = pd.read_feather(os.path.join(dpath, 'base_loan_info.fth'))
  14. eval_loan_info = pd.read_feather(os.path.join(dpath, 'eval_loan_info.fth'))
  15. print(base_loan_info.shape, eval_loan_info.shape)
  16. with open(os.path.join(config.data_dir, 'strange_pmt_hist_ids.pkl'), 'rb') as f:
  17. strange_pmt_hist_ids = pickle.load(f)
  18. print('dropping {0} strange loans based on strange_pmt_hist_ids.pkl'.format(len(strange_pmt_hist_ids)))
  19. base_loan_info = base_loan_info.query('id not in @strange_pmt_hist_ids')
  20. eval_loan_info = eval_loan_info.query('id not in @strange_pmt_hist_ids')
  21. print(base_loan_info.shape, eval_loan_info.shape)
  22. #from 2010-1-1 onward, take out min(10%, 2000) loans to set aside as test
  23. doneness = .95
  24. train_testable_eval_loan_info = eval_loan_info.query('maturity_time_stat_adj >= @doneness or maturity_paid_stat_adj >= @doneness')
  25. train_testable_ids = train_testable_eval_loan_info['id']
  26. X_train, X_test, _, _ = train_test_split(train_testable_eval_loan_info, train_testable_eval_loan_info['target_strict'].values, stratify=train_testable_eval_loan_info['grade'], test_size=0.1)
  27. # remove loans before 2010-1-1 from test and add them to train
  28. add_to_train = X_test.query('issue_d < "2010-1-1"')
  29. X_train = pd.concat([X_train, add_to_train])
  30. train_ids = X_train['id'].tolist()
  31. X_test = X_test.query('id not in @train_ids')
  32. test_ids = X_test['id'].tolist()
  33. assert len(set(train_ids).intersection(test_ids)) == 0
  34. train_test_ids_dict = {}
  35. train_test_ids_dict['train_testable'] = train_testable_ids.tolist()
  36. train_test_ids_dict['train'] = train_ids
  37. train_test_ids_dict['test'] = test_ids
  38. # make 10 bootstrap month-by-month test_loan_infos (and maybe test_eval_loan_infos?)
  39. bootstrap_sample_idx = {}
  40. issue_d_g = X_test.groupby('issue_d')
  41. for i in range(10):
  42. to_concat = []
  43. for d, g in issue_d_g:
  44. to_concat.append(g.sample(len(g), replace=True))
  45. df = pd.concat(to_concat)
  46. bootstrap_sample_idx[i] = df.index.tolist()
  47. # save
  48. with open(os.path.join(dpath, 'train_test_ids.pkl'), 'wb') as file:
  49. pickle.dump(train_test_ids_dict, file)
  50. with open(os.path.join(dpath, 'bootstrap_test_idx.pkl'), 'wb') as file:
  51. pickle.dump(bootstrap_sample_idx, file)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...