1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
- from functools import cached_property
- from sklearn.pipeline import Pipeline
- from yspecies.config import *
- from yspecies.partition import DataPartitioner
- from yspecies.partition import PartitionParameters
- from yspecies.preprocess import DataExtractor
- from yspecies.explanations import ShapSelector, FeatureSummary
- from yspecies.selection import CrossValidator
- from yspecies.tuning import MultiObjectiveResults
- from yspecies.workflow import TupleWith, Repeat, Collect
@dataclass
class PipelineFactory:
    """Factory for the scikit-learn pipelines used in SHAP-based feature selection.

    Attributes:
        locations: Project locations object; ``locations.interim.optimization``
            is the folder searched for ``<trait>.sqlite`` study databases.
        repeats: Number of repetitions handed to ``Repeat`` in
            :meth:`make_repeated_shap_pipeline`.
        n_folds: Forwarded to ``PartitionParameters``.
        n_hold_out: Forwarded to ``PartitionParameters``.
    """

    locations: Locations
    repeats: int = 10
    n_folds: int = 5
    n_hold_out: int = 1

    @cached_property
    def partition_parameters(self):
        """Partition parameters shared by every pipeline built by this factory.

        Computed once per instance (``cached_property``), so later changes to
        ``n_folds`` / ``n_hold_out`` are not reflected.
        """
        # NOTE(review): the trailing 2 and 42 are positional arguments of
        # PartitionParameters; 42 is presumably the seed that
        # make_repeated_shap_pipeline overrides per repeat - confirm.
        return PartitionParameters(self.n_folds, self.n_hold_out, 2, 42)

    def load_study(self, path: Path, study_name: str):
        """Open the multi-objective optuna study stored in the SQLite file at ``path``.

        The study is created with ``load_if_exists=True``, so an existing study
        called ``study_name`` is reused and a missing one is created.

        Args:
            path: Location of the SQLite database file.
            study_name: Name of the study inside that database.

        Returns:
            The optuna multi-objective study with directions
            ``['maximize', 'minimize', 'maximize']``.
        """
        url = "sqlite:///" + str(path.absolute())
        print(f"loading (if exists) study from {url}")
        storage = optuna.storages.RDBStorage(url=url)
        return optuna.multi_objective.study.create_study(
            directions=['maximize', 'minimize', 'maximize'],
            storage=storage,
            study_name=study_name,
            load_if_exists=True,
        )

    @staticmethod
    def _default_params() -> dict:
        """Return the hard-coded model parameters used when no tuned trial exists."""
        return {
            "bagging_fraction": 0.9522534844058304,
            "boosting_type": "dart",
            "objective": "regression",
            "feature_fraction": 0.42236910941558053,
            "lambda_l1": 0.020847266580277746,
            "lambda_l2": 2.8448564854773326,
            "learning_rate": 0.11484015430016059,
            "max_depth": 3,
            "max_leaves": 35,
            "min_data_in_leaf": 9,
            "num_iterations": 250,
            "metrics": ["l1", "l2", "huber"],
        }

    @staticmethod
    def _tuned_params(study, opt_metrics: str) -> dict:
        """Pick the best parameters from ``study`` according to ``opt_metrics``."""
        results: MultiObjectiveResults = MultiObjectiveResults.from_study(study)
        if opt_metrics == "huber":
            _, params = results.best_metrics_params_huber()
        elif opt_metrics == "kendall_tau":
            _, params = results.best_metrics_params_kendall_tau()
        else:
            # Any other value (including typos) selects the R2-optimal trial.
            _, params = results.best_metrics_params_r2()
        params["verbose"] = -1
        params.setdefault("early_stopping_round", 10)
        return params

    def _extraction_steps(self) -> list:
        """Return the two leading steps shared by the full (non-partial) pipelines."""
        return [
            ('extractor', DataExtractor()),
            # Attaches the partition parameters to the extracted data.
            ('prepare_for_partitioning', TupleWith(self.partition_parameters)),
        ]

    def make_partition_shap_pipe(self, trait: str = None, study_name: str = None, study_path: Path = None, opt_metrics: str = "huber"):
        """Build the partition -> cross-validation -> SHAP sub-pipeline.

        Model parameters come from the optuna study when it has at least one
        Pareto-front trial; otherwise built-in defaults are used.

        Args:
            trait: Trait name; used to derive the default study name and the
                default database path ``<trait>.sqlite``.
            study_name: Study name; defaults to ``<trait>_r2_huber_kendall``.
            study_path: SQLite database path; defaults to
                ``locations.interim.optimization / "<trait>.sqlite"``.
            opt_metrics: ``"huber"`` or ``"kendall_tau"``; any other value
                selects the R2-optimal parameters.

        Returns:
            A ``Pipeline`` with the steps ``partitioner``,
            ``prepare_for_selection``, ``cross_validation`` and
            ``shap_computation``.

        Raises:
            AssertionError: If both ``trait`` and ``study_path`` are None.
        """
        assert trait is not None or study_path is not None, "either trait or study path should be not None"
        if study_name is None:
            # Without a trait the name used to come out as "None_r2_huber_kendall".
            # The default database is "<trait>.sqlite", so the stem of an
            # explicit path is the closest stand-in for the trait.
            name_root = trait if trait is not None else study_path.stem
            study_name = f"{name_root}_r2_huber_kendall"
        if study_path is None:
            study_path = self.locations.interim.optimization / (trait + ".sqlite")

        study = self.load_study(study_path, study_name)
        if study.get_pareto_front_trials():
            params = self._tuned_params(study, opt_metrics)
        else:
            print("FALLING BACK TO DEFAULT PARAMETERS")
            params = self._default_params()

        return Pipeline([
            ("partitioner", DataPartitioner()),
            ('prepare_for_selection', TupleWith(params)),
            ("cross_validation", CrossValidator()),
            ("shap_computation", ShapSelector()),
        ])

    def make_shap_pipeline(self, trait: str = None, study_name: str = None, study_path: Path = None, opt_metrics: str = "huber"):
        """Build the single-run pipeline: extraction, partitioning, CV and SHAP.

        Arguments are forwarded unchanged to :meth:`make_partition_shap_pipe`.
        """
        partition_shap_pipe = self.make_partition_shap_pipe(
            trait, study_name, study_path=study_path, opt_metrics=opt_metrics
        )
        return Pipeline(
            self._extraction_steps()
            + [("partition_shap", partition_shap_pipe)]
        )

    def make_repeated_shap_pipeline(self, trait: str = None, study_name: str = None, study_path: Path = None, opt_metrics: str = "huber"):
        """Build the pipeline that repeats partition/CV/SHAP and summarizes the runs.

        The partition sub-pipeline is wrapped in ``Repeat`` with
        ``self.repeats`` repetitions; each repetition receives a copy of the
        parameters with ``seed`` set to the repeat index. The collected
        results are folded into a ``FeatureSummary``.

        Arguments are forwarded unchanged to :meth:`make_partition_shap_pipe`.
        """
        partition_shap_pipe = self.make_partition_shap_pipe(
            trait, study_name, study_path=study_path, opt_metrics=opt_metrics
        )
        # x is indexed as a pair; the second element is copied with seed=i.
        repeated_cv = Repeat(
            partition_shap_pipe,
            self.repeats,
            lambda x, i: (x[0], replace(x[1], seed=i)),
        )
        return Pipeline(
            self._extraction_steps()
            + [
                ("repeated_partition_shap", repeated_cv),
                ("summarize", Collect(fold=lambda results: FeatureSummary(results))),
            ]
        )
|