helpers.py

from dataclasses import dataclass, replace
from functools import cached_property
from pathlib import Path

import optuna
from sklearn.pipeline import Pipeline

from yspecies.config import *
from yspecies.partition import DataPartitioner
from yspecies.partition import PartitionParameters
from yspecies.preprocess import DataExtractor
from yspecies.explanations import ShapSelector, FeatureSummary
from yspecies.selection import CrossValidator
from yspecies.tuning import MultiObjectiveResults
from yspecies.workflow import TupleWith, Repeat, Collect


@dataclass
class PipelineFactory:
    """Builds the partition / cross-validation / SHAP pipelines used across the project."""

    locations: Locations
    repeats: int = 10
    n_folds: int = 5
    n_hold_out: int = 1

    @cached_property
    def partition_parameters(self):
        return PartitionParameters(self.n_folds, self.n_hold_out, 2, 42)

    def load_study(self, path: Path, study_name: str):
        # open (or create) a multi-objective Optuna study backed by a local sqlite file
        url = f"sqlite:///{path.absolute()}"
        print("loading (if exists) study from " + url)
        storage = optuna.storages.RDBStorage(
            url=url
            # engine_kwargs={'check_same_thread': False}
        )
        return optuna.multi_objective.study.create_study(
            directions=["maximize", "minimize", "maximize"],
            storage=storage,
            study_name=study_name,
            load_if_exists=True,
        )

    def make_partition_shap_pipe(self, trait: str = None, study_name: str = None,
                                 study_path: Path = None, opt_metrics: str = "huber"):
        assert trait is not None or study_path is not None, "either trait or study path should be not None"
        study_name = f"{trait}_r2_huber_kendall" if study_name is None else study_name
        study_path = self.locations.interim.optimization / (trait + ".sqlite") if study_path is None else study_path
        study = self.load_study(study_path, study_name)
        if len(study.get_pareto_front_trials()) > 0:
            # take the best hyperparameters found by the optimization, ranked by the requested metric
            optimization_results: MultiObjectiveResults = MultiObjectiveResults.from_study(study)
            if opt_metrics == "huber":
                metrics, params = optimization_results.best_metrics_params_huber()
            elif opt_metrics == "kendall_tau":
                metrics, params = optimization_results.best_metrics_params_kendall_tau()
            else:
                metrics, params = optimization_results.best_metrics_params_r2()
            params["verbose"] = -1
            if "early_stopping_round" not in params:
                params["early_stopping_round"] = 10
        else:
            print("FALLING BACK TO DEFAULT PARAMETERS")
            params = {
                "bagging_fraction": 0.9522534844058304,
                "boosting_type": "dart",
                "objective": "regression",
                "feature_fraction": 0.42236910941558053,
                "lambda_l1": 0.020847266580277746,
                "lambda_l2": 2.8448564854773326,
                "learning_rate": 0.11484015430016059,
                "max_depth": 3,
                "max_leaves": 35,
                "min_data_in_leaf": 9,
                "num_iterations": 250,
                "metrics": ["l1", "l2", "huber"],
            }
        return Pipeline([
            ("partitioner", DataPartitioner()),
            ("prepare_for_selection", TupleWith(params)),
            ("cross_validation", CrossValidator()),
            ("shap_computation", ShapSelector())
        ])

    def make_shap_pipeline(self, trait: str = None, study_name: str = None,
                           study_path: Path = None, opt_metrics: str = "huber"):
        partition_shap_pipe = self.make_partition_shap_pipe(trait, study_name, study_path, opt_metrics)
        return Pipeline([
            ("extractor", DataExtractor()),  # extracts the data required for ML from the dataset
            ("prepare_for_partitioning", TupleWith(self.partition_parameters)),
            ("partition_shap", partition_shap_pipe)
        ])

    def make_repeated_shap_pipeline(self, trait: str = None, study_name: str = None,
                                    study_path: Path = None, opt_metrics: str = "huber"):
        partition_shap_pipe = self.make_partition_shap_pipe(trait, study_name, study_path=study_path, opt_metrics=opt_metrics)
        # rerun partitioning + cross-validation `repeats` times, reseeding the partition parameters on each repeat
        repeated_cv = Repeat(partition_shap_pipe, self.repeats, lambda x, i: (x[0], replace(x[1], seed=i)))
        return Pipeline([
            ("extractor", DataExtractor()),  # extracts the data required for ML from the dataset
            ("prepare_for_partitioning", TupleWith(self.partition_parameters)),
            ("repeated_partition_shap", repeated_cv),
            ("summarize", Collect(fold=lambda results: FeatureSummary(results)))
        ])
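
Below is a minimal usage sketch (not part of helpers.py) of how PipelineFactory might be wired together. The Locations constructor argument, the trait name "lifespan", and the dataset loading step are assumptions for illustration; adapt them to the actual yspecies data layout.

# Hypothetical usage sketch; the data folder, trait name and dataset loading are placeholders.
from yspecies.config import Locations

locations = Locations("./data")                    # assumed: project data folder containing interim/optimization/*.sqlite studies
factory = PipelineFactory(locations, repeats=5)    # fewer repeats than the default 10 for a quicker run

# builds extractor -> partitioner -> cross-validation -> SHAP, repeated and summarized;
# falls back to default LightGBM parameters if the Optuna study has no Pareto-front trials
pipeline = factory.make_repeated_shap_pipeline(trait="lifespan")

dataset = ...  # placeholder: an expression dataset prepared with the yspecies preprocessing utilities
feature_summary = pipeline.fit_transform(dataset)  # the final Collect step folds results into a FeatureSummary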