from super_gradients.training.utils import HpmStruct
from copy import deepcopy

DEFAULT_TRAINING_PARAMS = {
    "lr_warmup_epochs": 0,
    "lr_warmup_steps": 0,
    "lr_cooldown_epochs": 0,
    "warmup_initial_lr": None,
    "cosine_final_lr_ratio": 0.01,
    "optimizer": "SGD",
    "optimizer_params": {},
    "criterion_params": {},
    "ema": False,
    "batch_accumulate": 1,  # number of batches to accumulate before every backward pass
    "ema_params": {},
    "zero_weight_decay_on_bias_and_bn": False,
    "load_opt_params": True,
    "run_validation_freq": 1,
    "run_test_freq": 1,
    "save_model": True,
    "metric_to_watch": "Accuracy",
    "launch_tensorboard": False,
    "tb_files_user_prompt": False,  # asks the user for a TensorBoard deletion prompt
    "silent_mode": False,  # silences the print-outs
    "mixed_precision": False,
    "tensorboard_port": None,
    "save_ckpt_epoch_list": [],  # epoch indices at which the ckpt will be saved automatically
    "average_best_models": True,
    "dataset_statistics": False,  # add a dataset statistical analysis and sample images to TensorBoard
    "save_tensorboard_to_s3": False,
    "lr_schedule_function": None,
    "train_metrics_list": [],
    "valid_metrics_list": [],
    "greater_metric_to_watch_is_better": True,
    "precise_bn": False,
    "precise_bn_batch_size": None,
    "seed": 42,
    "lr_mode": None,
    "phase_callbacks": None,
    "log_installed_packages": True,
    "sg_logger": "base_sg_logger",
    "sg_logger_params": {
        "tb_files_user_prompt": False,  # asks the user for a TensorBoard deletion prompt
        "project_name": "",
        "launch_tensorboard": False,
        "tensorboard_port": None,
        "save_checkpoints_remote": False,  # upload checkpoint files to s3
        "save_tensorboard_remote": False,  # upload tensorboard files to s3
        "save_logs_remote": False,  # upload log files to s3
    },
    "warmup_mode": "LinearEpochLRWarmup",
    "step_lr_update_freq": None,
    "lr_updates": [],
    "initial_lr": None,
    "clip_grad_norm": None,
    "pre_prediction_callback": None,
    "ckpt_best_name": "ckpt_best.pth",
    "enable_qat": False,
    "resume": False,
    "resume_path": None,
    "ckpt_name": "ckpt_latest.pth",
    "resume_strict_load": False,
    "sync_bn": False,
    "kill_ddp_pgroup_on_end": True,  # whether to kill the DDP process group at the end of training
    "max_train_batches": None,  # for debugging: when not None, breaks out of the inner train loop
    # (i.e. iterating over train_loader) once this number of batches is reached
    "max_valid_batches": None,  # for debugging: when not None, breaks out of the inner valid loop
    # (i.e. iterating over valid_loader) once this number of batches is reached
    "resume_from_remote_sg_logger": False,  # when True, ckpt_name (the checkpoint filename to resume, ckpt_latest.pth
    # by default) is downloaded into the experiment checkpoints directory prior to loading weights, and training is
    # resumed from that checkpoint. The source is unique to every logger, and is currently supported for WandB loggers
    # only. Note that for this to work, the experiment must be run with sg_logger_params.save_checkpoints_remote=True.
    # For WandB loggers, one must also pass the run id through the wandb_id arg in sg_logger_params.
    "torch_compile": False,  # enable or disable the use of torch.compile to optimize the model
    "torch_compile_loss": False,  # enable or disable the use of torch.compile to optimize the loss
    "torch_compile_options": {
        "mode": "reduce-overhead",  # can be either "default", "reduce-overhead" or "max-autotune"
        "fullgraph": False,  # whether it is ok to break the model into several subgraphs
        "dynamic": False,  # use dynamic shape tracing
        "backend": "inductor",  # backend to be used
        "options": None,  # a dictionary of options to pass to the backend
        "disable": False,  # turn torch.compile() into a no-op for testing
    },  # torch.compile options from https://pytorch.org/docs/stable/generated/torch.compile.html
    "finetune": False,  # whether to freeze a fixed part of the model (supported only for models that implement
    # get_finetune_lr_dict, see SgModule.get_finetune_lr_dict; tailored for each model class)
}
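
# Per-optimizer default hyperparameters. Presumably these act as a fallback for (or are merged
# with) the user-supplied "optimizer_params" when the optimizer is built elsewhere in the
# library; the merge logic itself is not part of this file.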
DEFAULT_OPTIMIZER_PARAMS_SGD = {"weight_decay": 1e-4, "momentum": 0.9}
DEFAULT_OPTIMIZER_PARAMS_ADAM = {"weight_decay": 1e-4}
DEFAULT_OPTIMIZER_PARAMS_RMSPROP = {"weight_decay": 1e-4, "momentum": 0.9}
DEFAULT_OPTIMIZER_PARAMS_RMSPROPTF = {"weight_decay": 1e-4, "momentum": 0.9}

TRAINING_PARAM_SCHEMA = {
    "type": "object",
    "properties": {
        "max_epochs": {"type": "number", "minimum": 1},
        # FIXME: CHECK THE IMPORTANCE OF THE COMMENTED SCHEMA - AS IT CAUSES HYDRA USE TO CRASH
        # "lr_updates": {"type": "array", "minItems": 1},
        "lr_decay_factor": {"type": "number", "minimum": 0, "maximum": 1},
        "lr_warmup_epochs": {"type": "number", "minimum": 0, "maximum": 10},
        "initial_lr": {
            "anyOf": [
                {"type": ["number", "string", "boolean", "null"]},
                {"type": "object", "patternProperties": {"^[a-zA-Z0-9_.]+$": {"type": "number"}}, "additionalProperties": False},
            ]
        },
    },
    "if": {"properties": {"lr_mode": {"const": "StepLRScheduler"}}},
    "then": {"required": ["lr_updates", "lr_decay_factor"]},
    "required": ["max_epochs", "lr_mode", "initial_lr", "loss"],
}

class TrainingParams(HpmStruct):
    def __init__(self, **entries):
        # We initialize with the default training params, overridden by the provided entries.
        default_training_params = deepcopy(DEFAULT_TRAINING_PARAMS)
        super().__init__(**default_training_params)
        self.set_schema(TRAINING_PARAM_SCHEMA)
        if len(entries) > 0:
            self.override(**entries)

    def override(self, **entries):
        super().override(**entries)
        self.validate()
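
For illustration, the sketch below (not part of the file above) shows how TrainingParams is typically used: the defaults are merged with user-provided entries, and every override is re-validated against TRAINING_PARAM_SCHEMA. The import path and the specific lr_mode/loss string values are assumptions made for the example; only the required keys (max_epochs, lr_mode, initial_lr, loss) come from the schema itself.

from super_gradients.training.params import TrainingParams  # assumed import path for the class above

# Defaults from DEFAULT_TRAINING_PARAMS, overridden by the entries below (which satisfy the
# schema's required keys: max_epochs, lr_mode, initial_lr, loss).
params = TrainingParams(
    max_epochs=100,
    lr_mode="CosineLRScheduler",            # assumed scheduler name
    initial_lr=0.01,
    loss="LabelSmoothingCrossEntropyLoss",  # assumed loss name
)
print(params.optimizer)   # "SGD" -- inherited from DEFAULT_TRAINING_PARAMS
print(params.max_epochs)  # 100   -- user override

# override() re-runs validation; the schema's "if/then" clause means that with
# lr_mode="StepLRScheduler" both lr_updates and lr_decay_factor must be present.
params.override(lr_mode="StepLRScheduler", lr_updates=[30, 60, 90], lr_decay_factor=0.1)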