tuning.py

from dataclasses import dataclass, field
from typing import Callable, Dict

import lightgbm as lgb
import numpy as np
import optuna
from optuna import Study, Trial
from optuna.integration import lightgbm as optuna_lgb
from sklearn.base import TransformerMixin

from yspecies.models import Metrics, ModelFactory
from yspecies.partition import ExpressionPartitions
from yspecies.utils import *


@dataclass
class SpecializedTuningResults:
    '''
    Originally used with LightGBMTuner, but then we decided to get rid of it until its bugs are fixed
    '''
    best_params: dict
    best_score: float

    def print_info(self):
        print("Best score:", self.best_score)
        print("Best params:", self.best_params)
        print("  Params: ")
        for key, value in self.best_params.items():
            print("    {}: {}".format(key, value))


@dataclass
class LightGBMTuner(TransformerMixin):
    '''
    It is somewhat buggy, see https://github.com/optuna/optuna/issues/1602#issuecomment-670937574
    I had to switch to GeneralTuner while they are fixing it
    '''
    time_budget_seconds: int
    parameters: Dict = field(default_factory=lambda: {
        'boosting_type': 'dart',
        'objective': 'regression',
        'metric': 'huber'
    })
    num_boost_round: int = 500
    seed: int = 42

    def fit(self, partitions: ExpressionPartitions, y=None) -> SpecializedTuningResults:
        cat = partitions.categorical_index if partitions.features.has_categorical else "auto"
        lgb_train = lgb.Dataset(partitions.X, partitions.Y, categorical_feature=cat, free_raw_data=False)
        # LightGBMTunerCV comes from optuna.integration.lightgbm, not from plain lightgbm
        tuner = optuna_lgb.LightGBMTunerCV(
            self.parameters, lgb_train,
            verbose_eval=self.num_boost_round,
            folds=partitions.folds,
            time_budget=self.time_budget_seconds,
            num_boost_round=self.num_boost_round
        )
        tuner.tune_bagging()
        tuner.tune_feature_fraction()
        tuner.tune_min_data_in_leaf()
        tuner.tune_feature_fraction_stage2()
        tuner.run()
        return SpecializedTuningResults(tuner.best_params, tuner.best_score)
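
# A hedged usage sketch for LightGBMTuner (it assumes an ExpressionPartitions
# instance called `partitions`, built elsewhere in the yspecies pipeline and
# not part of this file):
#
#   tuner = LightGBMTuner(time_budget_seconds=600)
#   results = tuner.fit(partitions)
#   results.print_info()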

@dataclass
class CrossValidator(TransformerMixin):
    '''
    Transformer that does cross-validation
    '''
    num_boost_round: int = 500
    seed: int = 42
    parameters: Dict = field(default_factory=lambda: {
        'boosting_type': 'dart',
        'objective': 'regression',
        'metric': {'mae', 'mse', 'huber'},
        'max_leaves': 20,
        'max_depth': 3,
        'learning_rate': 0.07,
        'feature_fraction': 0.8,
        'bagging_fraction': 1,
        'min_data_in_leaf': 6,
        'lambda_l1': 0.9,
        'lambda_l2': 0.9,
        'verbose': -1
    })

    def fit(self, partitions: ExpressionPartitions, y=None) -> Dict:
        cat = partitions.categorical_index if partitions.features.has_categorical else "auto"
        lgb_train = lgb.Dataset(partitions.X, partitions.Y, categorical_feature=cat, free_raw_data=False)
        eval_hist = lgb.cv(self.parameters,
                           lgb_train,
                           folds=partitions.folds,
                           metrics=["mae", "mse", "huber"],
                           categorical_feature=cat,
                           show_stdv=True,
                           verbose_eval=self.num_boost_round,
                           seed=self.seed,
                           num_boost_round=self.num_boost_round)
        return eval_hist
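
# A hedged usage sketch for CrossValidator: fit returns the raw lgb.cv history,
# a dict keyed like "huber-mean" / "huber-stdv" with one value per boosting
# round (again assuming a prepared `partitions` object):
#
#   cross = CrossValidator(num_boost_round=500, seed=42)
#   eval_hist = cross.fit(partitions)
#   best_round = int(np.argmin(eval_hist["huber-mean"]))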

@dataclass
class TuningResults:
    best_params: dict
    train_metrics: Metrics = None
    validation_metrics: Metrics = None


@dataclass
class GeneralTuner(TransformerMixin):
    '''
    Tunes hyperparameters with a plain optuna study over repeated cross-validation
    '''
    num_boost_round: int = 500
    seed: int = 42
    # time_budget_seconds: int = 600
    to_optimize: str = "huber"
    n_trials: int = 10
    n_jobs: int = -1
    num_boost_round_train: int = 1000
    repeats: int = 10
    study: Study = field(default_factory=lambda: optuna.create_study(direction='minimize'))
    parameters: Callable[[Trial], Dict] = None
    best_model: lgb.Booster = None
    best_params: dict = None

    def default_parameters(self, trial: Trial) -> Dict:
        return {
            'objective': 'regression',
            'metric': {'mae', 'mse', 'huber'},
            'verbosity': -1,
            'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
            'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 4.0),
            'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 4.0),
            'max_leaves': trial.suggest_int('max_leaves', 15, 40),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.04, 0.2),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 4, 10),
            'verbose': -1
        }

    def cv(self, partitions: ExpressionPartitions, trial: Trial) -> Dict:
        params = self.default_parameters(trial) if self.parameters is None else self.parameters(trial)
        cross = CrossValidator(self.num_boost_round, self.seed, parameters=params)
        return cross.fit(partitions)

    def fit(self, partitions: ExpressionPartitions, y=None) -> dict:
        def objective(trial: Trial):
            values: np.ndarray = np.zeros(self.repeats)
            for i in range(self.repeats):
                eval_hist = self.cv(partitions, trial)
                values[i] = np.array(eval_hist[f"{self.to_optimize}-mean"]).min()
            return np.average(values)

        self.study.optimize(objective, show_progress_bar=False, n_trials=self.n_trials, n_jobs=self.n_jobs, gc_after_trial=True)
        self.best_params = self.study.best_params
        print(f"best_params: {self.best_params}")
        return self.best_params

    def transform(self, partitions: ExpressionPartitions) -> TuningResults:
        assert self.best_params is not None, "best params are not known - the model must be fit first!"
        if partitions.nhold_out > 0:
            factory = ModelFactory(parameters=self.best_params)
            self.best_model = factory.regression_model(partitions.cv_merged_x, partitions.hold_out_x,
                                                       partitions.cv_merged_y, partitions.hold_out_y,
                                                       partitions.categorical_index, num_boost_round=self.num_boost_round_train)
            train_prediction = self.best_model.predict(partitions.cv_merged_x, num_iteration=self.best_model.best_iteration)
            test_prediction = self.best_model.predict(partitions.hold_out_x, num_iteration=self.best_model.best_iteration)
            train_metrics = Metrics.calculate(train_prediction, partitions.cv_merged_y)
            test_metrics = Metrics.calculate(test_prediction, partitions.hold_out_y)
        else:
            train_metrics = None
            test_metrics = None
        return TuningResults(self.study.best_params, train_metrics, test_metrics)
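
# A hedged usage sketch for GeneralTuner: fit runs the optuna study (minimizing
# the mean huber score across repeated CV runs) and transform retrains the best
# model and scores it against the hold-out partition; it assumes `partitions`
# has nhold_out > 0 so that hold-out metrics can be computed:
#
#   tuner = GeneralTuner(n_trials=10, repeats=10)
#   best_params = tuner.fit(partitions)
#   results = tuner.transform(partitions)
#   print(results.train_metrics, results.validation_metrics)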