Rishav-hub
/
dvc-nlp-demo
mirror of https://github.com/Rishav-hub/dvc-nlp-demo.git


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
            import argparse
import os
import math
from tqdm import tqdm
import logging
from src.utils.all_utils import read_yaml, create_directory, save_reports
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, roc_curve

STAGE = "Four"  # stage label interpolated into the start/completion log lines

# logging.basicConfig opens the log file as soon as it builds the file
# handler, which fails with FileNotFoundError when the parent directory is
# missing. Create the directory up front so a fresh checkout can run.
os.makedirs("logs", exist_ok=True)

logging.basicConfig(
    filename=os.path.join("logs", 'running_logs.log'),
    level=logging.INFO,
    format="[%(asctime)s: %(levelname)s: %(module)s]: %(message)s",
    filemode="a",  # append, so earlier runs' records are kept
)

def main(config_path):
    """Score the saved model on the featurized test matrix and save reports.

    Reads the configuration at ``config_path``, loads the featurized test
    matrix and the trained model with joblib, derives average precision and
    ROC AUC from the model's predicted probabilities, and hands three
    payloads (scalar scores, precision-recall points, ROC points) to
    ``save_reports``.

    Args:
        config_path: Location of the configuration file passed to
            ``read_yaml``. The loaded config must provide ``artifacts``
            (``ARTIFACTS_DIR``, ``FEATURIZED_DATA``, ``FEATURIZED_OUT_TEST``,
            ``MODEL_DIR``, ``MODEL_NAME``), ``plots`` (``PRC``, ``ROC``) and
            ``metrics`` (``SCORES``).
    """
    cfg = read_yaml(config_path)

    artifact_cfg = cfg["artifacts"]
    artifacts_root = artifact_cfg["ARTIFACTS_DIR"]
    test_matrix_path = os.path.join(
        artifacts_root,
        artifact_cfg["FEATURIZED_DATA"],
        artifact_cfg["FEATURIZED_OUT_TEST"],
    )
    trained_model_path = os.path.join(
        artifacts_root,
        artifact_cfg["MODEL_DIR"],
        artifact_cfg["MODEL_NAME"],
    )

    # NOTE(review): joblib.load unpickles its input; only point these paths
    # at artifacts produced by trusted pipeline stages.
    test_matrix = joblib.load(test_matrix_path)
    classifier = joblib.load(trained_model_path)

    # Column 1 carries the labels and columns 2 onward the features.
    # presumably column 0 is a record id -- confirm against the featurization stage.
    label_column = test_matrix[:, 1].toarray()
    labels = np.squeeze(label_column)
    features = test_matrix[:, 2:]

    # Only column 1 of the per-class probabilities is used as the score.
    predictions = classifier.predict_proba(features)[:, 1]

    plot_cfg = cfg['plots']
    prc_report_path = plot_cfg['PRC']
    roc_report_path = plot_cfg['ROC']
    scores_report_path = cfg['metrics']['SCORES']

    # Scalar summary metrics.
    score_payload = dict(
        avg_prec=average_precision_score(labels, predictions),
        roc_auc=roc_auc_score(labels, predictions),
    )
    save_reports(score_payload, scores_report_path)

    # Precision-recall curve. zip stops at its shortest input, so any
    # precision/recall entry without a matching threshold is dropped.
    precision, recall, prc_thresholds = precision_recall_curve(labels, predictions)

    # Keep every stride-th point so that at most 1000 points are reported.
    stride = math.ceil(len(prc_thresholds) / 1000)
    prc_triples = list(zip(precision, recall, prc_thresholds))
    prc_rows = [
        dict(precision=p, recall=r, threshold=t)
        for p, r, t in prc_triples[::stride]
    ]
    save_reports({"prc": prc_rows}, prc_report_path)

    # ROC curve; every returned point is reported.
    fpr, tpr, roc_thresholds = roc_curve(labels, predictions)
    roc_rows = [
        dict(fpr=false_pos, tpr=true_pos, threshold=t)
        for false_pos, true_pos, t in zip(fpr, tpr, roc_thresholds)
    ]
    save_reports({"roc": roc_rows}, roc_report_path)


if __name__ == '__main__':
    # Command-line entry point: resolve the config path, run the stage, and
    # bracket the run with start/completion log records.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", "-c",
        default="config/config.yaml",
        help="path to the configuration file (default: %(default)s)",
    )
    parsed_args = parser.parse_args()

    try:
        logging.info("\n********************")
        logging.info(f">>>>> stage {STAGE} started <<<<<")
        main(config_path=parsed_args.config)
        logging.info(f">>>>> stage {STAGE} completed!<<<<<\n")
    except Exception as e:
        # Record the failure with its traceback, then re-raise with a bare
        # `raise` so the original exception propagates unchanged.
        logging.exception(e)
        raise