aging-research
/
species


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
128

	
129

	
130

	
131

	
132

	
133

	
134

	
135

	
136

	
137

	
138

	
139

	
140

	
141

	
            import sys
from dataclasses import replace

import click
import optuna
from optuna import Trial
from sklearn.pipeline import Pipeline

from yspecies.config import *
from yspecies.dataset import ExpressionDataset
from yspecies.partition import DataPartitioner
from yspecies.partition import PartitionParameters
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.workflow import TupleWith, Repeat
from yspecies.tuning import CrossValidator, ResultsCV


def get_local_path():
    debug_local = True #to use local version
    local = (Path("..") / "yspecies").resolve()
    if debug_local and local.exists():
        sys.path.insert(0, Path("..").as_posix())
        #sys.path.insert(0, local.as_posix())
        print("extending pathes with local yspecies")
        print(sys.path)
    return local

@click.command()
@click.option('--name', default="general_tuner", help='study name')
@click.option('--trials', default=10, help='Number of trials in hyper optimization')
@click.option("--loss", default="huber", help="loss type (huber, l1, l2), huber by default")
@click.option('--folds', default=5, help='Number of folds in cross-validation')
@click.option('--hold_outs', default=1, help='Number of hold outs in cross-validation')
@click.option('--threads', default=1, help="number of threads (1 by default). If you put -1 it will try to utilize all cores, however it can be dangerous memorywise")
@click.option('--species_in_validation', default=3, help="species_in_validation")
@click.option('--not_validated_species', default="", help="not_validated_species")
@click.option('--repeats', default=10, help="number of times to repeat validation")
def tune(name: str, trials: int, loss: str,
         folds: int, hold_outs: int, threads: int,
         species_in_validation: int, not_validated_species: str,
         repeats: int):
    print(f"starting hyperparameters optimization script with {trials} trials, {folds} folds and {hold_outs} hold outs!")
    local = get_local_path()

    if not_validated_species is None or not_validated_species == "":
        not_validated_species = []
    elif type(not_validated_species) is str:
        not_validated_species = [not_validated_species]
    else:
        not_validated_species = not_validated_species


    locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")

    data = ExpressionDataset.from_folder(locations.interim.selected)

    ### PIPELINE ###

    number_of_folds = 5

    partition_params = PartitionParameters(number_of_folds, 0, 2, [],  42)

    lgb_params = {"bagging_fraction": 0.9522534844058304,
     "boosting_type": "dart",
     "objective": "regression",
     "feature_fraction": 0.42236910941558053,
     "lambda_l1": 0.020847266580277746,
     "lambda_l2": 2.8448564854773326,
     "learning_rate": 0.11484015430016059,
     "max_depth": 3,
     "max_leaves": 35,
     "min_data_in_leaf": 9}

    partition_cv_pipe = Pipeline([
        ('partitioner', DataPartitioner()),
        ('prepare_for_partitioning', TupleWith(lgb_params)),
        ('crossvalidator', CrossValidator())
    ]
    )
    repeated_cv =  Repeat(partition_shap_pipe, repeats, lambda x,i: (x[0], replace(x[1], seed = i)))
    selection_pipeline =  Pipeline([
        ('extractor', DataExtractor()),
        ('prepare_for_partitioning', TupleWith(partition_params)), # to extract the data required for ML from the dataset
        ("partition_shap", repeated_cv)]
    )


### SELECTION PARAMS ###

    selection = select_lifespan = FeatureSelection(
        samples = ["tissue","species"], #samples metadata to include
        species =  [], #species metadata other then Y label to include
        exclude_from_training = ["species"],  #exclude some fields from LightGBM training
        to_predict = "lifespan", #column to predict
        categorical = ["tissue"])

    select_lifespan = selection
    select_mass = replace(selection, to_predict = "mass_g")
    select_gestation = replace(selection, to_predict = "gestation")
    select_mtgc = replace(selection, to_predict = "mtgc")


    ext = Pipeline([
        ('extractor', DataExtractor(selection)), # to extract the data required for ML from the dataset
        ("partitioner", DataPartitioner(n_folds = folds, n_hold_out = hold_outs, species_in_validation=species_in_validation, not_validated_species = not_validated_species))
    ])

    stage_one_lifespan = selection_pipeline.fit_transform((data, select_lifespan))
    type(stage_one_lifespan)

    url = f'sqlite:///' +str((locations.metrics.lifespan / "study.sqlite").absolute())
    print('loading (if exists) study from '+url)
    storage = optuna.storages.RDBStorage(
        url=url
        #engine_kwargs={'check_same_thread': False}
    )
    study = optuna.create_study(storage, study_name="general_tuner", direction='minimize', load_if_exists=True)

    def objective_parameters(trial: Trial) -> dict:
        return {
            'objective': 'regression',
            'metric': {'mae', 'mse', 'huber'},
            'verbosity': -1,
            'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
            'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 4.0),
            'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 4.0),
            'max_leaves': trial.suggest_int("max_leaves", 15, 25),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 1.0),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 8),
            "verbose": -1
        }

if __name__ == "__main__":
    tune()
    #light_tune()