jmhsi
/
lendingclub
mirror of https://github.com/jmhsi/lendingclub.git


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
            import pandas as pd
from j_utils import munging as mg
import hyperlearn.hyperlearn.impute.SVDImpute as hpl_imp

def gen_expt_datasets(today, oldest, valid_start, base_loan_info, eval_loan_info, target, valid_end=None, verbose=False):
    '''
    all loans from oldest until today are taken as train. All loans issued after today until valid_end are used for validation. Uses hyperlearn svd_impute to impute missing values. Returns the train and test datasets. target can be single colname or list of colnames
    '''
    train_ids = eval_loan_info[(eval_loan_info['issue_d'] <= today) & (eval_loan_info['issue_d'] >= oldest)]['id'].unique()
    if valid_end:
        valid_ids = eval_loan_info[(eval_loan_info['issue_d'] >= valid_start) & (eval_loan_info['issue_d'] <= valid_end)]['id'].unique()
    else:
        valid_ids = eval_loan_info[(eval_loan_info['issue_d'] >= valid_start)]['id'].unique()
    train = base_loan_info[base_loan_info['id'].isin(train_ids)]
    valid = base_loan_info[base_loan_info['id'].isin(valid_ids)]
    
    # setup for catboost
    # a bit more data processing and nan handling for catboost
    train_copy = train.copy()
    valid_copy = valid.copy()
    
    # get ready for hyperlearn svdimpute
    train_copy, max_dict, min_dict, cats_dict, norm_dict = mg.train_hpl_proc(train_copy, verbose=verbose)
    valid_copy = mg.val_test_hpl_proc(valid_copy, train_copy, max_dict, min_dict, cats_dict, verbose=verbose)

    # fit to train
    S, VT, mean, std, mins, standardise = hpl_imp.fit(train_copy.values)
    
    # impute on train
    train_svdimp = hpl_imp.transform(train_copy.values, S, VT, mean, std, mins, standardise)
    train_svdimp = pd.DataFrame(train_svdimp)
    train_svdimp.index = train_copy.index
    train_svdimp.columns = train_copy.columns
    
    # impute on test
    valid_svdimp = hpl_imp.transform(valid_copy.values, S, VT, mean, std, mins, standardise)
    valid_svdimp = pd.DataFrame(valid_svdimp)
    valid_svdimp.index = valid_copy.index
    valid_svdimp.columns = valid_copy.columns
    
    # imputing changes some ids. Make the ids the originals again.
    train_svdimp['id'] = train_ids
    valid_svdimp['id'] = valid_ids
    
    train_y = eval_loan_info[eval_loan_info['id'].isin(train_ids)][target]
    valid_y = eval_loan_info[eval_loan_info['id'].isin(valid_ids)][target]
    
    return train_svdimp, train_y, valid_svdimp, valid_y, train_ids, valid_ids

# make a crude test set for now
def get_split_date(df, date_column, quantile): 

    """
    https://stackoverflow.com/questions/31018622/pandas-quantile-function-for-dates
    Get the date on which to split a dataframe for timeseries splitting
    Adjusted coerce param to errors since SO is old.
    """ 

    # 1. convert date_column to datetime (useful in case it is a string) 
    # 2. convert into int (for sorting) 
    # 3. get the quantile 
    # 4. get the corresponding date
    # 5. return, pray that it works 

    quantile_date = pd.to_datetime(df[date_column], errors = 'raise').astype('int64').quantile(q=quantile)#.astype('datetime64[ns]')

    return pd.to_datetime(quantile_date)

def split_out_traintestable_loans(df, eval_df, oldness_thrsh=.9):
    '''Can train/test on loans that pass the oldness_thrsh or have status paid/defaulted/charged_off'''
    old_enough_ids = eval_df[(eval_df['maturity_time_stat_adj'] >= oldness_thrsh) | 
                                    (eval_df['maturity_paid_stat_adj'] >= oldness_thrsh) | 
                                    (eval_df['loan_status'].isin(['paid', 'defaulted', 'charged_off']))]['id'].unique()
    df = df[df['id'].isin(old_enough_ids)]
    eval_df = eval_df[eval_df['id'].isin(old_enough_ids)]
    return df, eval_df


def add_custom_lc_features(df):
    # added features
    df['monthly_inc'] = df['annual_inc'] / 12
    df['dti_w_loan'] = (df['dti'] * df['monthly_inc'] +
                                    df['installment']) / df['monthly_inc']
    df['delinq_to_monthly_inc'] = df['delinq_amnt'] / \
        df['monthly_inc']
    df['tot_cur_bal_to_monthly_inc'] = df['tot_cur_bal'] / \
        df['monthly_inc']
    df['loan_to_inc'] = df['loan_amount'] / \
        df['monthly_inc']