jmhsi
/
lendingclub
mirror of https://github.com/jmhsi/lendingclub.git


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
128

	
129

	
130

	
131

	
132

	
133

	
134

	
135

	
136

	
137

	
138

	
139

	
140

	
141

	
            '''
Defines the Model class and methods.
Usage:
    Instantiate a model:
        baseline = Model('baseline')
    Get scores for a dataframe:
        scores = baseline.score(df)
'''


import os

import numpy as np
import pandas as pd
# from pandas.testing import assert_frame_equal
from catboost import CatBoostClassifier, CatBoostRegressor
from joblib import load
import pickle

import j_utils.munging as mg
from lendingclub import config
from lendingclub.modeling import score_utils as scr_util

# Short module-level aliases for the directories defined in lendingclub.config.
# project dir; Model joins 'models' onto it to build its basempath attribute
ppath = config.prj_dir
# data dir
dpath = config.data_dir
# dir that the saved model files and processing artifacts are loaded from
mpath = config.modeling_dir


class Model():
    '''
    Loads a saved model chosen by name and scores loan dataframes with it.

    Recognised names:
        'baseline', 'A'..'G': a pickled object that is used as the seed of
            a numpy RandomState; scores are random draws, and the grade
            models zero out every row whose 'grade' is not their letter
        'logistic_regr': joblib-loaded model + processing artifacts
        'catboost_clf': CatBoostClassifier + processing artifacts
        'catboost_regr': CatBoostRegressor + processing artifacts
        'catboost_both': both catboost models, scores combined via
            score_utils
    Any other name loads nothing; score() then prints a message and
    returns None.

    Also handles data preprocessing (see proc_data) and caches the
    processed dataframe between score() calls on equal dataframes.
    '''
    _GRADE_NAMES = ('A', 'B', 'C', 'D', 'E', 'F', 'G')
    # names whose "model" is only a seed for random scores
    _SEEDED_NAMES = ('baseline',) + _GRADE_NAMES
    # names scored through predict_proba
    _PROBA_NAMES = ('logistic_regr', 'catboost_clf')
    # columns that score() appends to proc_df for 'catboost_both'
    _SCORE_COLS = ('catboost_clf', 'catboost_regr', 'catboost_regr_scl')

    def __init__(self, name: str):
        self.name = name
        self.basempath = os.path.join(ppath, 'models')
        self.mpath = mpath
        self.proc_arti = None         # artifacts unpacked into mg.val_test_proc
        self.data_is_procced = False  # True while proc_df matches self.df
        self.proc_df = None           # cached processed copy of self.df
        self.m = None                 # single model (or seed for baseline/grades)
        self.df = None                # last dataframe handed to score()
        self.m_clf = None             # classifier, 'catboost_both' only
        self.m_regr = None            # regressor, 'catboost_both' only
        self.prng = None              # RandomState, set by baseline/grade scoring
        self.load_model()

    @staticmethod
    def _load_proc_arti(model_name):
        '''Loads the processing artifacts saved alongside model_name.'''
        return load(os.path.join(mpath, '{0}_model_proc_arti.pkl'.format(model_name)))

    @staticmethod
    def _load_catboost(model, model_name):
        '''Loads the saved .cb file for model_name into model, returns model.'''
        model.load_model(os.path.join(mpath, '{0}_model.cb'.format(model_name)))
        return model

    def load_model(self):
        '''
        Loads the model (and processing artifacts where the model type has
        them) selected by self.name. Unrecognised names load nothing.
        '''
        if self.name in self._SEEDED_NAMES:
            # NOTE: pickle.load can execute arbitrary code; the modeling
            # directory must only contain trusted files.
            with open(os.path.join(mpath, '{0}_model.pkl'.format(self.name)), 'rb') as file:
                self.m = pickle.load(file)
        elif self.name == 'logistic_regr':
            self.m = load(os.path.join(mpath, '{0}_model.pkl'.format(self.name)))
            self.proc_arti = self._load_proc_arti(self.name)
        elif self.name == 'catboost_clf':
            self.m = self._load_catboost(CatBoostClassifier(), self.name)
            self.proc_arti = self._load_proc_arti(self.name)
        elif self.name == 'catboost_regr':
            self.m = self._load_catboost(CatBoostRegressor(), self.name)
            self.proc_arti = self._load_proc_arti(self.name)
        elif self.name == 'catboost_both':
            self.m_clf = self._load_catboost(CatBoostClassifier(), 'catboost_clf')
            self.m_regr = self._load_catboost(CatBoostRegressor(), 'catboost_regr')
            # can take either for proc data, its same process
            self.proc_arti = self._load_proc_arti('catboost_regr')

    def proc_data(self):
        '''
        Process dataframe appropriately for the model type, set self.proc_df.
        Works on a copy of self.df, so the dataframe passed to score() is
        not handed to the processing routine directly.
        '''
        self.proc_df = mg.val_test_proc(self.df.copy(), *self.proc_arti)
        self.data_is_procced = True

    def score(self, df: pd.DataFrame, return_all=False, random_penalty=False, clf_cutoff=.90, optimized=True):
        '''
        Given a dataframe (base_loan_info, non imputed or scaled or normalized)
        return scores. Imputation, Scaling, and Normalizing will be handled
        inside this method to match that done at training

        HIGHER SCORES SHOULD BE BETTER (for classification, want prob of
        not defaulting)

        df: dataframe to score; baseline/grade models need its length and
            (grade models only) its 'grade' column
        return_all: only read for 'catboost_both'; when True returns the
            tuple (combined, regr, clf) scores instead of just combined
        random_penalty, clf_cutoff, optimized: accepted for interface
            compatibility; not read anywhere in this method

        Returns an array of scores, or None (after printing a message)
        when self.name is not a recognised model name.
        '''
        if self.df is None or not self.df.equals(df):
            # no previous df, or previous df doesn't match the df that was
            # just passed to be scored: cached processing is stale
            self.df = df
            self.data_is_procced = False

        if not self.data_is_procced and any(n in self.name for n in ['regr', 'clf', 'both']):
            self.proc_data()

        if self.name in self._SEEDED_NAMES:
            return self._seeded_scores(df)
        if self.name in self._PROBA_NAMES:
            # column 0 is used as the probability of not defaulting
            return self.m.predict_proba(self.proc_df)[:, 0]
        if self.name == 'catboost_regr':
            return self.m.predict(self.proc_df)
        if self.name == 'catboost_both':
            return self._combined_scores(return_all)
        print('unknown model??')
        return None

    def _seeded_scores(self, df):
        '''Random scores for baseline/grade models, zeroed off-grade.'''
        # re-seeding on every call makes repeated scoring reproducible
        self.prng = np.random.RandomState(self.m)
        scores = self.prng.random(len(self.df))
        if self.name in self._GRADE_NAMES:
            off_grade = np.asarray(df['grade'] != self.name, dtype=bool)
            scores[off_grade] = 0
        return scores

    def _combined_scores(self, return_all):
        '''Classifier + regressor scores combined via score_utils.'''
        # An earlier catboost_both call on the same data leaves its score
        # columns in the cached proc_df; they must not be fed back to the
        # models as if they were input features.
        stale = [c for c in self._SCORE_COLS if c in self.proc_df.columns]
        features = self.proc_df.drop(columns=stale) if stale else self.proc_df

        clf_scores = self.m_clf.predict_proba(features)[:, 0]
        regr_scores = self.m_regr.predict(features)

        # linearly combine clf and regr scaled, using clf_wt in scr_utils
        clf_wt_scorer = scr_util.combined_score(scr_util.clf_wt)
        self.proc_df['catboost_clf'] = clf_scores
        self.proc_df['catboost_regr'] = regr_scores
        self.proc_df['catboost_regr_scl'] = scr_util.scale_cb_regr_score(self.proc_df)
        comb_scores = clf_wt_scorer('catboost_clf', 'catboost_regr_scl', self.proc_df)
        if return_all:
            return comb_scores, regr_scores, clf_scores
        return comb_scores
        
def load_scored_df():
    '''
    Reads the evaluation loans dataframe from config.data_dir.
    Prefers 'scored_eval_loan_info.fth'; when that file is not on disk,
    reads 'eval_loan_info.fth' instead. Nothing is written by this function.
    '''
    scored_fth = os.path.join(config.data_dir, 'scored_eval_loan_info.fth')
    unscored_fth = os.path.join(config.data_dir, 'eval_loan_info.fth')
    chosen = scored_fth if os.path.exists(scored_fth) else unscored_fth
    return pd.read_feather(chosen)