1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
|
- import sys
- from pathlib import Path
- from typing import Union, List, Tuple
- from dataclasses import replace
- import click
def get_local_path(debug_local: bool = True) -> Path:
    """Make a local ./yspecies checkout importable and return its path.

    If *debug_local* is True (the default, preserving the original hard-coded
    behavior) and a ``yspecies`` folder exists in the current directory, it is
    prepended to ``sys.path`` so the local checkout shadows any installed
    version of the package.

    :param debug_local: when False, never touch ``sys.path`` (useful for
        running against the installed package).
    :return: the resolved path of ``./yspecies`` (returned whether or not the
        folder exists).
    """
    local = (Path(".") / "yspecies").resolve()
    if debug_local and local.exists():
        # insert at position 0 so the local checkout wins over site-packages
        sys.path.insert(0, local.as_posix())
        print("extending pathes with local yspecies")
        print(sys.path)
    return local
#@click.group(invoke_without_command=True)
@click.command()
@click.option('--trait', default="mtGC", help='trait name')
@click.option('--metrics', default="r2_huber_kendall", help='metrics names')
@click.option('--trials', default=200, help='Number of trials in hyper optimization')
@click.option('--folds', default=5, help='Number of folds in cross-validation')
@click.option('--hold_outs', default=1, help='Number of hold outs in cross-validation')
@click.option('--repeats', default=5, help="number of times to repeat validation")
@click.option('--not_validated_species', default=True, help="not_validated_species")
@click.option('--threads', default=1, help="number of threads (1 by default). If you put -1 it will try to utilize all cores, however it can be dangerous memorywise")
@click.option('--debug_local', default=True, help="debug local")
def tune(trait: str, metrics: str, trials: int, folds: int, hold_outs: int, repeats: int, not_validated_species: Union[bool, List[str]], threads: int, debug_local: bool):
    """Hyper-optimize feature-selection parameters for a single life-history trait.

    Builds a pipeline (data extraction -> seeded partitioning -> SHAP-based
    cross-validated selection), repeats it ``repeats`` times per trial, and
    drives it with a persistent Optuna multi-objective study (maximize R2,
    minimize huber, maximize kendall tau) for ``trials`` trials. The best
    (Pareto-front) trials are written as JSON files under
    ``locations.metrics.optimization/<trait>/``.

    NOTE(review): ``not_validated_species`` and ``threads`` are accepted but
    never read below; the locals ``local``, ``lgb_params``, ``life_history``
    and ``selection`` are also unused — kept as-is to preserve behavior.
    """
    print(f"starting hyperparameters optimization script with {trials} trials, {folds} folds and {hold_outs} hold outs!")
    # Side effect: may prepend a local yspecies checkout to sys.path so the
    # deferred `from yspecies...` imports below pick it up.
    local = get_local_path()
    importance_type = "split"
    # Reference LightGBM parameters (unused in this function — presumably the
    # result of a previous optimization run, kept for reference; confirm).
    lgb_params = {"bagging_fraction": 0.9522534844058304,
                  "boosting_type": "dart",
                  "objective": "regression",
                  "feature_fraction": 0.42236910941558053,
                  "lambda_l1": 0.020847266580277746,
                  "lambda_l2": 2.8448564854773326,
                  "learning_rate": 0.11484015430016059,
                  "max_depth": 3,
                  "max_leaves": 35,
                  "min_data_in_leaf": 9,
                  "num_iterations": 150
                  }
    life_history = ["lifespan", "mass_g", "mtGC", "metabolic_rate", "temperature", "gestation_days"]
    # Imports are deliberately deferred until after get_local_path() ran, so a
    # local yspecies checkout shadows any installed version.
    from sklearn.pipeline import Pipeline
    from yspecies.workflow import Repeat, Collect
    from yspecies.config import Locations, DataLoader
    from yspecies.preprocess import FeatureSelection, DataExtractor
    from yspecies.partition import DataPartitioner, PartitionParameters
    from yspecies.selection import ShapSelector
    from yspecies.tuning import Tune
    from yspecies.results import FeatureSummary, FeatureResults
    import optuna
    from optuna import Trial
    import pprint
    pp = pprint.PrettyPrinter(indent=4)

    # ### Loading data ###
    # Let's load data from species/genes/expressions selected by select_samples.py notebook
    from pathlib import Path
    # Resolve the project root whether the script runs from the repo root
    # (./data exists) or from a subdirectory (../).
    locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")
    default_selection = FeatureSelection(
        samples = ["tissue","species"], #samples metadata to include
        species = [], #species metadata other then Y label to include
        exclude_from_training = ["species"], #exclude some fields from LightGBM training
        to_predict = trait, #column to predict
        categorical = ["tissue"],
        select_by = "shap",
        importance_type = importance_type,
        feature_perturbation = "tree_path_dependent"
    )
    loader = DataLoader(locations, default_selection)
    selections = loader.load_life_history()
    # Pick the (expression data, selection) bundle for the requested trait.
    to_select = selections[trait]

    # ## Setting up ShapSelector ##
    # Deciding on selection parameters (which fields to include, exclude, predict)
    # NOTE(review): the literals 2 and 42 look like positional
    # species-in-validation and seed arguments — confirm against
    # PartitionParameters' signature.
    partition_params = PartitionParameters(folds, hold_outs, 2, 42)
    # NOTE(review): `selection` is never used afterwards (the loader already
    # received `default_selection`); kept to preserve behavior.
    selection = FeatureSelection(
        samples = ["tissue","species"], #samples metadata to include
        species = [], #species metadata other then Y label to include
        exclude_from_training = ["species"], #exclude some fields from LightGBM training
        to_predict = trait, #column to predict
        categorical = ["tissue"],
        select_by = "shap",
        importance_type = "split"
    )
    # Persist the study in a per-trait sqlite file so optimization can be
    # resumed across runs (load_if_exists=True below).
    url = f'sqlite:///' +str((locations.interim.optimization / f"{trait}.sqlite").absolute())
    print('loading (if exists) study from '+url)
    storage = optuna.storages.RDBStorage(
        url=url
        #engine_kwargs={'check_same_thread': False}
    )
    # Three objectives, in the order produced by get_objectives():
    # maximize R2, minimize huber loss, maximize kendall tau.
    study = optuna.multi_objective.study.create_study(directions=['maximize','minimize','maximize'], storage = storage, study_name = f"{trait}_{metrics}", load_if_exists = True)

    def objective_parameters(trial: Trial) -> dict:
        """Sample one LightGBM configuration from the Optuna search space."""
        return {
            'objective': 'regression',
            'metric': {'mae', 'mse', 'huber'},
            'verbosity': -1,
            'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
            'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 3.0),
            'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 3.0),
            'max_leaves': trial.suggest_int("max_leaves", 15, 25),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 1.0),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 8),
            'drop_rate': trial.suggest_uniform('drop_rate', 0.1, 0.3),
            "verbose": -1
        }

    optimization_parameters = objective_parameters
    from yspecies.workflow import SplitReduce

    def side(i: int):
        # Debug passthrough: prints the repeat index that becomes the
        # partition seed, then returns it unchanged.
        print(i)
        return i

    # Re-partition the data with a per-repeat seed (x[2] is the repeat index
    # supplied by Repeat below), then pair the partitions with the trial's
    # LightGBM parameters x[1].
    prepare_partition = SplitReduce(
        outputs=DataPartitioner(),
        split=lambda x: [(x[0], replace(partition_params, seed=side(x[2])))],
        reduce=lambda x, output: (output[0], x[1])
    )
    partition_and_cv = Pipeline(
        [
            ("prepare partition", prepare_partition),
            ("shap_computation", ShapSelector()) #('crossvalidator', CrossValidator())
        ]
    )

    def get_objectives(results: List[FeatureResults]) -> Tuple[float, float, float]:
        """Fold per-repeat results into the study's three objective values."""
        summary = FeatureSummary(results)
        return (summary.metrics_average.R2, summary.metrics_average.huber, summary.kendall_tau_abs_mean)

    # Run partition+CV `repeats` times (forwarding the repeat index i as the
    # seed source) and collect the averaged objectives for Optuna.
    partition_and_cv_repeat = Pipeline([
        ("repeat_cv_pipe", Repeat(partition_and_cv, repeats, lambda x, i: [x[0], x[1], i] )),
        ("collect_mean", Collect(fold=lambda outputs: get_objectives(outputs)))
    ]
    )
    p = Pipeline([
        ('extractor', DataExtractor()),
        ('tune', Tune(partition_and_cv_repeat, study=study, n_trials=trials, parameters_space=optimization_parameters))
    ])
    from yspecies.tuning import MultiObjectiveResults
    results: MultiObjectiveResults = p.fit_transform(to_select)
    best = results.best_trials
    import json
    # Persist every Pareto-optimal trial as <trait>/<index>.json containing its
    # trial number, sampled parameters, and the three objective values.
    for i, t in enumerate(best):
        trait_path = locations.metrics.optimization / trait
        if not trait_path.exists():
            trait_path.mkdir()
        path = trait_path / f"{str(i)}.json"
        print(f"writing parameters to {path}")
        with open(path, 'w') as f:
            params = t.params
            values = t.values
            to_write = {"number": t.number,"params": params, "metrics": {"R2":values[0], "huber": values[1], "kendall_tau": values[2]}}
            json.dump(to_write, f, sort_keys=True, indent=4)
    print(f"FINISHED HYPER OPTIMIZING {trait}")
@click.command()
@click.option('--life_history', default=["lifespan", "mass_g", "gestation_days", "mtGC", "metabolic_rate", "temperature"], help='life history list')
@click.option('--metrics', default="r2_huber_kendall", help='metrics names')
@click.option('--trials', default=10, help='Number of trials in hyper optimization')
@click.option('--folds', default=5, help='Number of folds in cross-validation')
@click.option('--hold_outs', default=1, help='Number of hold outs in cross-validation')
@click.option('--repeats', default=5, help="number of times to repeat validation")
@click.option('--not_validated_species', default=True, help="not_validated_species")
@click.option('--threads', default=1, help="number of threads (1 by default). If you put -1 it will try to utilize all cores, however it can be dangerous memorywise")
@click.option('--debug_local', default=True, help="debug local")
def tune_all(life_history: List[str], metrics: str, trials: int, folds: int, hold_outs: int, repeats: int, not_validated_species: Union[bool, List[str]], threads: int, debug_local: bool):
    """Run hyperparameter optimization sequentially for every trait in *life_history*.

    All tuning parameters are forwarded unchanged to the per-trait ``tune``
    logic; each trait gets its own Optuna study and JSON output folder.

    FIX: after the ``@click.command()`` decoration, ``tune`` is a
    ``click.Command`` object, so calling ``tune(trait, ...)`` would invoke
    click's CLI entry point (treating the first positional argument as argv)
    instead of the tuning function. ``tune.callback`` is the original
    undecorated function, which accepts these arguments directly.
    """
    for trait in life_history:
        tune.callback(trait, metrics, trials, folds, hold_outs, repeats, not_validated_species, threads, debug_local)
if __name__ == "__main__":
    # Script entry point: click parses sys.argv and invokes the `tune` command.
    # NOTE(review): `tune_all` is defined but not reachable from this entry
    # point — presumably invoked from a separate script/console entry; confirm.
    tune()
|