1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src import *
from config import *
# Directory that holds train.csv / test.csv for this experiment.
# DATA_DIR is provided by the star-import from `config`.
dataset_path = os.path.join(DATA_DIR, "01_mobile_price_classification")
# Experiment configuration consumed by the script section of this file.
#   num_col_names   -- columns cast to float and passed to the numeric
#                      preprocessing helpers.
#   cat_col_names   -- columns cast to "category" and passed to the
#                      categorical preprocessing helpers.
#   target_col_name -- label column, kept as a one-element list.
#   n_splits / shuffle / SEED -- forwarded to data_splitting(); SEED is also
#                      passed to Trainer as random_state.
cfg = {
    "num_col_names": [
        "battery_power",
        "clock_speed",
        "int_memory",
        "m_dep",
        "mobile_wt",
        "px_height",
        "px_width",
        "ram",
        "sc_h",
        "sc_w",
        "talk_time",
    ],
    "cat_col_names": [
        "blue",
        "dual_sim",
        "fc",
        "four_g",
        "n_cores",
        "pc",
        "three_g",
        "touch_screen",
        "wifi",
    ],
    "target_col_name": ["price_range"],
    "n_splits": 5,
    "shuffle": True,
    "SEED": 1234,
}
def read_data(dataset_path) -> pd.DataFrame:
    """Load the train and test CSV files into one combined DataFrame.

    Reads ``train.csv`` and ``test.csv`` from ``dataset_path``, tags every
    row with a ``"type"`` column (``"train"`` or ``"test"``), stacks the two
    frames row-wise and drops the ``"id"`` column.

    Args:
        dataset_path: Directory containing ``train.csv`` and ``test.csv``.
            May be a string or any ``os.PathLike`` object.

    Returns:
        The concatenated frame. Row labels are kept from the individual
        files, so index values can repeat between train and test rows.

    Raises:
        KeyError: If the combined frame has no ``"id"`` column.
    """
    # os.path.join copes with trailing separators and PathLike inputs,
    # unlike plain string concatenation.
    train = pd.read_csv(os.path.join(dataset_path, "train.csv"))
    test = pd.read_csv(os.path.join(dataset_path, "test.csv"))

    # Remember each row's origin so the frames can be separated again later.
    train["type"] = "train"
    test["type"] = "test"

    df = pd.concat([train, test], axis=0)
    df = df.drop(columns="id")
    return df


if __name__ == "__main__":
    # End-to-end script: load the data, preprocess the combined frame,
    # split it, and build the Trainer. The preprocessing helpers, Trainer and
    # get_model come from the star-import of `src`.

    ###################
    # Read Data
    ###################
    # Train and test rows arrive in one frame, distinguished by "type".
    df = read_data(dataset_path)

    # Cast columns to the dtypes declared in the configuration.
    df[cfg["num_col_names"]] = df[cfg["num_col_names"]].astype("float")
    df[cfg["cat_col_names"]] = df[cfg["cat_col_names"]].astype("category")
    df[cfg["target_col_name"]] = df[cfg["target_col_name"]].astype("category")

    ###################
    # Preprocessing
    ###################
    # NOTE(review): every step below runs on train and test rows together;
    # confirm that sharing fitted statistics across both sets is intended.
    df = categorical_imputer(
        df=df,
        cat_col_names=cfg["cat_col_names"],
    )
    # Rare-label encoding is currently disabled:
    # df = rarelabel_encoder(
    #     df=df,
    #     cat_col_names=cfg["cat_col_names"]
    # )

    df = ordinal_encoder(
        df=df,
        cat_col_names=cfg["cat_col_names"],
    )
    df = equal_freq_discretiser(
        df=df,
        num_col_names=cfg["num_col_names"],
    )
    df = variable_transformer(
        df=df,
        num_col_names=cfg["num_col_names"],
        variable_type="power_transformer",
    )
    df = censor_outliers(
        df=df,
        num_col_names=cfg["num_col_names"],
    )
    df = drop_constant_features(df)

    ###################
    # Train Test Split
    ###################
    # Separate the rows by origin again; the helper column is no longer needed.
    train = df[df["type"] == "train"].drop(columns="type")
    test = df[df["type"] == "test"].drop(columns="type")

    # NOTE(review): `test` is rebound here, so the rows that came from
    # test.csv are no longer reachable afterwards -- confirm this is intended.
    train, val, test = data_splitting(
        df=target_transformer(df=train, target=cfg["target_col_name"]),
        target=cfg["target_col_name"],
        n_splits=cfg["n_splits"],
        shuffle=cfg["shuffle"],
        random_state=cfg["SEED"],
    )

    ###################
    # Train and Evaluate
    ###################
    # NOTE(review): the trainer is only constructed in the visible code; no
    # later call on it is shown here.
    trainer = Trainer(
        model=get_model(),
        target=cfg["target_col_name"],
        model_path="./model",
        logs_path="./logs",
        random_state=cfg["SEED"],
    )
|