tune.py

import sys
from pathlib import Path
from typing import Union, List, Tuple
from dataclasses import replace
import click


def get_local_path():
    debug_local = True  # to use the local version
    local = (Path(".") / "yspecies").resolve()
    if debug_local and local.exists():
        #sys.path.insert(0, Path(".").as_posix())
        sys.path.insert(0, local.as_posix())
        print("extending paths with local yspecies")
        print(sys.path)
    return local


#@click.group(invoke_without_command=True)
@click.command()
@click.option('--trait', default="mtGC", help='trait name')
@click.option('--metrics', default="r2_huber_kendall", help='metrics names')
@click.option('--trials', default=200, help='number of trials in hyperparameter optimization')
@click.option('--folds', default=5, help='number of folds in cross-validation')
@click.option('--hold_outs', default=1, help='number of hold-outs in cross-validation')
@click.option('--repeats', default=5, help='number of times to repeat validation')
@click.option('--not_validated_species', default=True, help='whether to include not-validated species')
@click.option('--threads', default=1, help='number of threads (1 by default); -1 tries to use all cores, which can be dangerous memory-wise')
@click.option('--debug_local', default=True, help='use the local yspecies checkout for debugging')
def tune(trait: str, metrics: str, trials: int, folds: int, hold_outs: int, repeats: int, not_validated_species: Union[bool, List[str]], threads: int, debug_local: bool):
    print(f"starting hyperparameter optimization script with {trials} trials, {folds} folds and {hold_outs} hold-outs!")
    local = get_local_path()
    importance_type = "split"
    lgb_params = {
        "bagging_fraction": 0.9522534844058304,
        "boosting_type": "dart",
        "objective": "regression",
        "feature_fraction": 0.42236910941558053,
        "lambda_l1": 0.020847266580277746,
        "lambda_l2": 2.8448564854773326,
        "learning_rate": 0.11484015430016059,
        "max_depth": 3,
        "max_leaves": 35,
        "min_data_in_leaf": 9,
        "num_iterations": 150
    }
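    # Note: lgb_params appears to be kept only for reference; it is not used again
    # in this script. The actual search space is defined in objective_parameters below.
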
    life_history = ["lifespan", "mass_g", "mtGC", "metabolic_rate", "temperature", "gestation_days"]

    # yspecies imports are deferred until after get_local_path() has run, so that a
    # local checkout (if present) takes precedence over an installed package
    from sklearn.pipeline import Pipeline
    from yspecies.workflow import Repeat, Collect
    from yspecies.config import Locations, DataLoader
    from yspecies.preprocess import FeatureSelection, DataExtractor
    from yspecies.partition import DataPartitioner, PartitionParameters
    from yspecies.selection import ShapSelector
    from yspecies.tuning import Tune
    from yspecies.results import FeatureSummary, FeatureResults
    import optuna
    from optuna import Trial
    import pprint
    pp = pprint.PrettyPrinter(indent=4)

    # ### Loading data ###
    # Load the species/genes/expressions selected by the select_samples.py notebook
    locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")
    default_selection = FeatureSelection(
        samples=["tissue", "species"],      # samples metadata to include
        species=[],                         # species metadata other than the Y label to include
        exclude_from_training=["species"],  # exclude some fields from LightGBM training
        to_predict=trait,                   # column to predict
        categorical=["tissue"],
        select_by="shap",
        importance_type=importance_type,
        feature_perturbation="tree_path_dependent"
    )
    loader = DataLoader(locations, default_selection)
    selections = loader.load_life_history()
    to_select = selections[trait]

    # ## Setting up ShapSelector ##
    # Deciding on selection parameters (which fields to include, exclude, predict)
    partition_params = PartitionParameters(folds, hold_outs, 2, 42)
    selection = FeatureSelection(
        samples=["tissue", "species"],      # samples metadata to include
        species=[],                         # species metadata other than the Y label to include
        exclude_from_training=["species"],  # exclude some fields from LightGBM training
        to_predict=trait,                   # column to predict
        categorical=["tissue"],
        select_by="shap",
        importance_type="split"
    )
    url = 'sqlite:///' + str((locations.interim.optimization / f"{trait}.sqlite").absolute())
    print('loading (if exists) study from ' + url)
    storage = optuna.storages.RDBStorage(
        url=url
        #engine_kwargs={'check_same_thread': False}
    )
    study = optuna.multi_objective.study.create_study(
        directions=['maximize', 'minimize', 'maximize'],
        storage=storage,
        study_name=f"{trait}_{metrics}",
        load_if_exists=True
    )
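    # the three directions line up with the tuple returned by get_objectives below:
    # maximize R2, minimize huber loss, maximize mean absolute kendall tau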

    def objective_parameters(trial: Trial) -> dict:
        return {
            'objective': 'regression',
            'metric': {'mae', 'mse', 'huber'},
            'verbosity': -1,
            'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
            'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 3.0),
            'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 3.0),
            'max_leaves': trial.suggest_int("max_leaves", 15, 25),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 1.0),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 8),
            'drop_rate': trial.suggest_uniform('drop_rate', 0.1, 0.3),
            "verbose": -1
        }

    optimization_parameters = objective_parameters
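    # Tune receives this function as parameters_space; presumably it is called once
    # per Optuna trial to sample a LightGBM configuration from the ranges above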

    from yspecies.workflow import SplitReduce

    def side(i: int):
        # debug helper: prints the repeat index that is about to become the partition seed
        print(i)
        return i

    prepare_partition = SplitReduce(
        outputs=DataPartitioner(),
        split=lambda x: [(x[0], replace(partition_params, seed=side(x[2])))],
        reduce=lambda x, output: (output[0], x[1])
    )
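    # the seed comes from the repeat index (x[2]) injected by Repeat below, so each
    # repeat partitions the data into different folds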

    partition_and_cv = Pipeline(
        [
            ("prepare partition", prepare_partition),
            ("shap_computation", ShapSelector())  # ('crossvalidator', CrossValidator())
        ]
    )

    def get_objectives(results: List[FeatureResults]) -> Tuple[float, float, float]:
        summary = FeatureSummary(results)
        return (summary.metrics_average.R2, summary.metrics_average.huber, summary.kendall_tau_abs_mean)

    partition_and_cv_repeat = Pipeline(
        [
            ("repeat_cv_pipe", Repeat(partition_and_cv, repeats, lambda x, i: [x[0], x[1], i])),
            ("collect_mean", Collect(fold=lambda outputs: get_objectives(outputs)))
        ]
    )

    p = Pipeline([
        ('extractor', DataExtractor()),
        ('tune', Tune(partition_and_cv_repeat, study=study, n_trials=trials, parameters_space=optimization_parameters))
    ])
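    # full pipeline: extract the data, then let Optuna drive the repeated
    # partition + SHAP cross-validation and average the three objectives per trial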

    from yspecies.tuning import MultiObjectiveResults
    results: MultiObjectiveResults = p.fit_transform(to_select)
    best = results.best_trials

    import json
    for i, t in enumerate(best):
        trait_path = locations.metrics.optimization / trait
        if not trait_path.exists():
            trait_path.mkdir()
        path = trait_path / f"{i}.json"
        print(f"writing parameters to {path}")
        with open(path, 'w') as f:
            params = t.params
            values = t.values
            # one file per best trial: trial number, its parameters, and the three metrics
            to_write = {
                "number": t.number,
                "params": params,
                "metrics": {"R2": values[0], "huber": values[1], "kendall_tau": values[2]}
            }
            json.dump(to_write, f, sort_keys=True, indent=4)
    print(f"FINISHED HYPER OPTIMIZING {trait}")


@click.command()
@click.option('--life_history', default=["lifespan", "mass_g", "gestation_days", "mtGC", "metabolic_rate", "temperature"], help='life history traits to tune')
@click.option('--metrics', default="r2_huber_kendall", help='metrics names')
@click.option('--trials', default=10, help='number of trials in hyperparameter optimization')
@click.option('--folds', default=5, help='number of folds in cross-validation')
@click.option('--hold_outs', default=1, help='number of hold-outs in cross-validation')
@click.option('--repeats', default=5, help='number of times to repeat validation')
@click.option('--not_validated_species', default=True, help='whether to include not-validated species')
@click.option('--threads', default=1, help='number of threads (1 by default); -1 tries to use all cores, which can be dangerous memory-wise')
@click.option('--debug_local', default=True, help='use the local yspecies checkout for debugging')
def tune_all(life_history: List[str], metrics: str, trials: int, folds: int, hold_outs: int, repeats: int, not_validated_species: Union[bool, List[str]], threads: int, debug_local: bool):
    for trait in life_history:
        # tune is a click.Command; call the wrapped function through .callback
        # instead of invoking the command object (which expects CLI-style arguments)
        tune.callback(trait, metrics, trials, folds, hold_outs, repeats, not_validated_species, threads, debug_local)


if __name__ == "__main__":
    tune()
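
# Note: only tune() is wired up as the entry point; tune_all is defined but not
# reachable from the command line unless the commented-out click.group is restored.
#
# Example invocation (a sketch; flag values are illustrative and assume the yspecies
# package and its expected ./data layout are available):
#
#   python tune.py --trait lifespan --trials 50 --folds 5 --repeats 3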