Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

tune.py 5.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
  1. import sys
  2. from dataclasses import replace
  3. import click
  4. import optuna
  5. from optuna import Trial
  6. from sklearn.pipeline import Pipeline
  7. from yspecies.config import *
  8. from yspecies.dataset import ExpressionDataset
  9. from yspecies.partition import DataPartitioner
  10. from yspecies.partition import PartitionParameters
  11. from yspecies.preprocess import FeatureSelection, DataExtractor
  12. from yspecies.workflow import TupleWith, Repeat
  13. from yspecies.tuning import CrossValidator, ResultsCV
  14. def get_local_path():
  15. debug_local = True #to use local version
  16. local = (Path("..") / "yspecies").resolve()
  17. if debug_local and local.exists():
  18. sys.path.insert(0, Path("..").as_posix())
  19. #sys.path.insert(0, local.as_posix())
  20. print("extending pathes with local yspecies")
  21. print(sys.path)
  22. return local
  23. @click.command()
  24. @click.option('--name', default="general_tuner", help='study name')
  25. @click.option('--trials', default=10, help='Number of trials in hyper optimization')
  26. @click.option("--loss", default="huber", help="loss type (huber, l1, l2), huber by default")
  27. @click.option('--folds', default=5, help='Number of folds in cross-validation')
  28. @click.option('--hold_outs', default=1, help='Number of hold outs in cross-validation')
  29. @click.option('--threads', default=1, help="number of threads (1 by default). If you put -1 it will try to utilize all cores, however it can be dangerous memorywise")
  30. @click.option('--species_in_validation', default=3, help="species_in_validation")
  31. @click.option('--not_validated_species', default="", help="not_validated_species")
  32. @click.option('--repeats', default=10, help="number of times to repeat validation")
  33. def tune(name: str, trials: int, loss: str,
  34. folds: int, hold_outs: int, threads: int,
  35. species_in_validation: int, not_validated_species: str,
  36. repeats: int):
  37. print(f"starting hyperparameters optimization script with {trials} trials, {folds} folds and {hold_outs} hold outs!")
  38. local = get_local_path()
  39. if not_validated_species is None or not_validated_species == "":
  40. not_validated_species = []
  41. elif type(not_validated_species) is str:
  42. not_validated_species = [not_validated_species]
  43. else:
  44. not_validated_species = not_validated_species
  45. locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")
  46. data = ExpressionDataset.from_folder(locations.interim.selected)
  47. ### PIPELINE ###
  48. number_of_folds = 5
  49. partition_params = PartitionParameters(number_of_folds, 0, 2, [], 42)
  50. lgb_params = {"bagging_fraction": 0.9522534844058304,
  51. "boosting_type": "dart",
  52. "objective": "regression",
  53. "feature_fraction": 0.42236910941558053,
  54. "lambda_l1": 0.020847266580277746,
  55. "lambda_l2": 2.8448564854773326,
  56. "learning_rate": 0.11484015430016059,
  57. "max_depth": 3,
  58. "max_leaves": 35,
  59. "min_data_in_leaf": 9}
  60. partition_cv_pipe = Pipeline([
  61. ('partitioner', DataPartitioner()),
  62. ('prepare_for_partitioning', TupleWith(lgb_params)),
  63. ('crossvalidator', CrossValidator())
  64. ]
  65. )
  66. repeated_cv = Repeat(partition_shap_pipe, repeats, lambda x,i: (x[0], replace(x[1], seed = i)))
  67. selection_pipeline = Pipeline([
  68. ('extractor', DataExtractor()),
  69. ('prepare_for_partitioning', TupleWith(partition_params)), # to extract the data required for ML from the dataset
  70. ("partition_shap", repeated_cv)]
  71. )
  72. ### SELECTION PARAMS ###
  73. selection = select_lifespan = FeatureSelection(
  74. samples = ["tissue","species"], #samples metadata to include
  75. species = [], #species metadata other then Y label to include
  76. exclude_from_training = ["species"], #exclude some fields from LightGBM training
  77. to_predict = "lifespan", #column to predict
  78. categorical = ["tissue"])
  79. select_lifespan = selection
  80. select_mass = replace(selection, to_predict = "mass_g")
  81. select_gestation = replace(selection, to_predict = "gestation")
  82. select_mtgc = replace(selection, to_predict = "mtgc")
  83. ext = Pipeline([
  84. ('extractor', DataExtractor(selection)), # to extract the data required for ML from the dataset
  85. ("partitioner", DataPartitioner(n_folds = folds, n_hold_out = hold_outs, species_in_validation=species_in_validation, not_validated_species = not_validated_species))
  86. ])
  87. stage_one_lifespan = selection_pipeline.fit_transform((data, select_lifespan))
  88. type(stage_one_lifespan)
  89. url = f'sqlite:///' +str((locations.metrics.lifespan / "study.sqlite").absolute())
  90. print('loading (if exists) study from '+url)
  91. storage = optuna.storages.RDBStorage(
  92. url=url
  93. #engine_kwargs={'check_same_thread': False}
  94. )
  95. study = optuna.create_study(storage, study_name="general_tuner", direction='minimize', load_if_exists=True)
  96. def objective_parameters(trial: Trial) -> dict:
  97. return {
  98. 'objective': 'regression',
  99. 'metric': {'mae', 'mse', 'huber'},
  100. 'verbosity': -1,
  101. 'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
  102. 'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 4.0),
  103. 'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 4.0),
  104. 'max_leaves': trial.suggest_int("max_leaves", 15, 25),
  105. 'max_depth': trial.suggest_int('max_depth', 3, 8),
  106. 'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 1.0),
  107. 'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 1.0),
  108. 'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
  109. 'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 8),
  110. "verbose": -1
  111. }
  112. if __name__ == "__main__":
  113. tune()
  114. #light_tune()
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...