Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

clean_pmt_history.py 4.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
  1. import numpy as np
  2. import pandas as pd
  3. def find_dupe_dates(group):
  4. '''finds duplicated dates in groupby group'''
  5. return pd.to_datetime(group[group.duplicated('date')]['date'].values)
  6. def merge_dupe_dates(group, column_iloc_map):
  7. '''
  8. Merges the releveant numeric columns in loans that have 2 entries
  9. for same month
  10. '''
  11. df_chunks = []
  12. dupe_dates = find_dupe_dates(group)
  13. df_chunks.append(group[~group['date'].isin(dupe_dates)])
  14. for date in dupe_dates:
  15. problem_rows = group[group['date'] == date]
  16. ori_index = problem_rows.index
  17. keep_row = problem_rows.iloc[-1].to_dict()
  18. keep_row['outs_princp_beg'] = problem_rows.iloc[0,column_iloc_map['outs_princp_beg']]
  19. summed = problem_rows.sum()
  20. keep_row['princp_paid'] = summed['princp_paid']
  21. keep_row['int_paid'] = summed['int_paid']
  22. keep_row['fee_paid'] = summed['fee_paid']
  23. # keep_row['amt_due'] = summed['amt_due']
  24. keep_row['amt_paid'] = summed['amt_paid']
  25. keep_row['charged_off_this_month'] = summed['charged_off_this_month']
  26. keep_row['charged_off_amt'] = summed['charged_off_amt']
  27. keep_row['recovs'] = summed['recovs']
  28. keep_row['recov_fees'] = summed['recov_fees']
  29. keep_row['all_cash_to_inv'] = summed['all_cash_to_inv']
  30. to_append = pd.DataFrame(keep_row, index=[ori_index[-1]])
  31. df_chunks.append(to_append)
  32. return pd.concat(df_chunks)
  33. def insert_missing_dates(group, ids, verbose = False):
  34. '''
  35. redoes date so that there is one entry for each date
  36. '''
  37. # Copy Paste finished below
  38. issue_d = group['issue_d'].min()
  39. first_date = group['date'].min()
  40. last_date = group['date'].max()
  41. expected_months = set(pd.date_range(start=first_date, end=last_date, freq='MS'))
  42. actual_months = set(group['date'])
  43. to_make_months = list(expected_months.symmetric_difference(actual_months))
  44. to_make_months.sort()
  45. if len(to_make_months) > 0:
  46. months_to_copy = []
  47. for month in to_make_months:
  48. months_to_copy.append(
  49. find_closest_previous_record(
  50. ids, issue_d, first_date, actual_months, month))
  51. copied = group[group['date'].isin(months_to_copy)].copy()
  52. copied['amt_paid'] = 0.0
  53. copied['date'] = to_make_months
  54. copied['amt_due'] = np.where(copied['date'] < first_date, 0, copied['amt_due'])
  55. return pd.concat([group, copied])
  56. else:
  57. if verbose:
  58. print('somehow there were no dates to add? id: {0}'.format(ids))
  59. return None
  60. def find_closest_previous_record(ids, issue_d, first_date, actual_months, month):
  61. '''This function finds the closest previous month that is in the group.
  62. It is here to handle cases where a record of one month is missing, but the
  63. record before that missing month is also missing.'''
  64. offset = pd.DateOffset(months=-1)
  65. prev_month = month + offset
  66. if month < issue_d:
  67. print(ids)
  68. return first_date
  69. elif prev_month in actual_months:
  70. return prev_month
  71. return find_closest_previous_record(ids, issue_d, first_date, actual_months, prev_month)
  72. def detect_strange_pmt_hist(group, verbose=False):
  73. '''
  74. for each group in pmt_hist.groupby('LOAN_ID'), check that the
  75. original due_amt is close to what is expected based on term, rate
  76. '''
  77. first_idx = group.index[0]
  78. exp_pmt = np.pmt(group.at[first_idx,'InterestRate']/12., group.at[first_idx,'term'], -group.at[first_idx,'PBAL_BEG_PERIOD'])
  79. rep_pmt = group.at[first_idx,'MONTHLYCONTRACTAMT']
  80. if verbose:
  81. print('expected pmt: {0}, reported pmt: {1}'.format(exp_pmt, rep_pmt))
  82. if abs(exp_pmt - rep_pmt)/(exp_pmt) > .01:
  83. return True
  84. return False
  85. def pmt_hist_fmt_date(df, col):
  86. '''
  87. Specifically reformats pmt_hist dates in the expected way
  88. '''
  89. month_dict = {
  90. 'jan': '1-',
  91. 'feb': '2-',
  92. 'mar': '3-',
  93. 'apr': '4-',
  94. 'may': '5-',
  95. 'jun': '6-',
  96. 'jul': '7-',
  97. 'aug': '8-',
  98. 'sep': '9-',
  99. 'oct': '10-',
  100. 'nov': '11-',
  101. 'dec': '12-'
  102. }
  103. df[col] = pd.to_datetime(
  104. df[col].str[:3].str.lower().replace(month_dict) +
  105. df[col].str[3:],
  106. format='%m-%Y')
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...