from super_gradients.training.utils import HpmStruct
from copy import deepcopy

DEFAULT_TRAINING_PARAMS = {
    "lr_warmup_epochs": 0,
    "lr_warmup_steps": 0,
    "lr_cooldown_epochs": 0,
    "warmup_initial_lr": None,
    "cosine_final_lr_ratio": 0.01,
    "optimizer": "SGD",
    "optimizer_params": {},
    "criterion_params": {},
    "ema": False,
    "batch_accumulate": 1,  # number of batches to accumulate before every backward pass
    "ema_params": {},
    "zero_weight_decay_on_bias_and_bn": False,
    "load_opt_params": True,
    "run_validation_freq": 1,
    "run_test_freq": 1,
    "save_model": True,
    "metric_to_watch": "Accuracy",
    "launch_tensorboard": False,
    "tb_files_user_prompt": False,  # asks the user for a TensorBoard deletion prompt
    "silent_mode": False,  # silences the print-outs
    "mixed_precision": False,
    "tensorboard_port": None,
    "save_ckpt_epoch_list": [],  # epoch indices at which the ckpt will be saved automatically
    "average_best_models": True,
    "dataset_statistics": False,  # add a dataset statistical analysis and sample images to TensorBoard
    "save_tensorboard_to_s3": False,
    "lr_schedule_function": None,
    "train_metrics_list": [],
    "valid_metrics_list": [],
    "greater_metric_to_watch_is_better": True,
    "precise_bn": False,
    "precise_bn_batch_size": None,
    "seed": 42,
    "lr_mode": None,
    "phase_callbacks": None,
    "log_installed_packages": True,
    "sg_logger": "base_sg_logger",
    "sg_logger_params": {
        "tb_files_user_prompt": False,  # asks the user for a TensorBoard deletion prompt
        "project_name": "",
        "launch_tensorboard": False,
        "tensorboard_port": None,
        "save_checkpoints_remote": False,  # upload checkpoint files to s3
        "save_tensorboard_remote": False,  # upload tensorboard files to s3
        "save_logs_remote": False,  # upload log files to s3
    },
    "warmup_mode": "LinearEpochLRWarmup",
    "step_lr_update_freq": None,
    "lr_updates": [],
    "initial_lr": None,
    "clip_grad_norm": None,
    "pre_prediction_callback": None,
    "ckpt_best_name": "ckpt_best.pth",
    "enable_qat": False,
    "resume": False,
    "resume_path": None,
    "ckpt_name": "ckpt_latest.pth",
    "resume_strict_load": False,
    "sync_bn": False,
    "kill_ddp_pgroup_on_end": True,  # whether to kill the DDP process group at the end of training
    "max_train_batches": None,  # for debugging: when not None, breaks out of the inner train loop
    # (i.e. iterating over train_loader) once this number of batches is reached
    "max_valid_batches": None,  # for debugging: when not None, breaks out of the inner valid loop
    # (i.e. iterating over valid_loader) once this number of batches is reached
    "resume_from_remote_sg_logger": False,  # when True, ckpt_name (the checkpoint filename to resume, ckpt_latest.pth
    # by default) is downloaded into the experiment checkpoints directory prior to loading weights, and training is
    # resumed from that checkpoint. The source is unique to every logger, and is currently supported for WandB loggers
    # only. Note that for this to work, the experiment must be run with sg_logger_params.save_checkpoints_remote=True.
    # For WandB loggers, one must also pass the run id through the wandb_id arg in sg_logger_params.
    "torch_compile": False,  # enable or disable the use of torch.compile to optimize the model
    "torch_compile_loss": False,  # enable or disable the use of torch.compile to optimize the loss
    "torch_compile_options": {
        "mode": "reduce-overhead",  # can be either "default", "reduce-overhead" or "max-autotune"
        "fullgraph": False,  # whether it is ok to break the model into several subgraphs
        "dynamic": False,  # use dynamic shape tracing
        "backend": "inductor",  # backend to be used
        "options": None,  # a dictionary of options to pass to the backend
        "disable": False,  # turn torch.compile() into a no-op for testing
    },  # torch.compile options from https://pytorch.org/docs/stable/generated/torch.compile.html
    "finetune": False,  # whether to freeze a fixed part of the model (supported only for models that implement
    # get_finetune_lr_dict, see SgModule.get_finetune_lr_dict; tailored for each model class)
}
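
# Per-optimizer default hyperparameters. Presumably these act as a fallback for (or are merged
# with) the user-supplied "optimizer_params" when the optimizer is built elsewhere in the
# library; the merge logic itself is not part of this file.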
DEFAULT_OPTIMIZER_PARAMS_SGD = {"weight_decay": 1e-4, "momentum": 0.9}
DEFAULT_OPTIMIZER_PARAMS_ADAM = {"weight_decay": 1e-4}
DEFAULT_OPTIMIZER_PARAMS_RMSPROP = {"weight_decay": 1e-4, "momentum": 0.9}
DEFAULT_OPTIMIZER_PARAMS_RMSPROPTF = {"weight_decay": 1e-4, "momentum": 0.9}

TRAINING_PARAM_SCHEMA = {
    "type": "object",
    "properties": {
        "max_epochs": {"type": "number", "minimum": 1},
        # FIXME: CHECK THE IMPORTANCE OF THE COMMENTED SCHEMA - AS IT CAUSES HYDRA USE TO CRASH
        # "lr_updates": {"type": "array", "minItems": 1},
        "lr_decay_factor": {"type": "number", "minimum": 0, "maximum": 1},
        "lr_warmup_epochs": {"type": "number", "minimum": 0, "maximum": 10},
        "initial_lr": {
            "anyOf": [
                {"type": ["number", "string", "boolean", "null"]},
                {"type": "object", "patternProperties": {"^[a-zA-Z0-9_.]+$": {"type": "number"}}, "additionalProperties": False},
            ]
        },
    },
    "if": {"properties": {"lr_mode": {"const": "StepLRScheduler"}}},
    "then": {"required": ["lr_updates", "lr_decay_factor"]},
    "required": ["max_epochs", "lr_mode", "initial_lr", "loss"],
}

class TrainingParams(HpmStruct):
    def __init__(self, **entries):
        # We initialize with the default training params, overridden by the provided entries.
        default_training_params = deepcopy(DEFAULT_TRAINING_PARAMS)
        super().__init__(**default_training_params)
        self.set_schema(TRAINING_PARAM_SCHEMA)
        if len(entries) > 0:
            self.override(**entries)

    def override(self, **entries):
        super().override(**entries)
        self.validate()
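
For illustration, the sketch below (not part of the file above) shows how TrainingParams is typically used: the defaults are merged with user-provided entries, and every override is re-validated against TRAINING_PARAM_SCHEMA. The import path and the specific lr_mode/loss string values are assumptions made for the example; only the required keys (max_epochs, lr_mode, initial_lr, loss) come from the schema itself.

from super_gradients.training.params import TrainingParams  # assumed import path for the class above

# Defaults from DEFAULT_TRAINING_PARAMS, overridden by the entries below (which satisfy the
# schema's required keys: max_epochs, lr_mode, initial_lr, loss).
params = TrainingParams(
    max_epochs=100,
    lr_mode="CosineLRScheduler",            # assumed scheduler name
    initial_lr=0.01,
    loss="LabelSmoothingCrossEntropyLoss",  # assumed loss name
)
print(params.optimizer)   # "SGD" -- inherited from DEFAULT_TRAINING_PARAMS
print(params.max_epochs)  # 100   -- user override

# override() re-runs validation; the schema's "if/then" clause means that with
# lr_mode="StepLRScheduler" both lr_updates and lr_decay_factor must be present.
params.override(lr_mode="StepLRScheduler", lr_updates=[30, 60, 90], lr_decay_factor=0.1)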