#670 add clearml & wandb

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-612-add_experiment_monitoring_tutp
from copy import deepcopy

from super_gradients.training.utils import HpmStruct

DEFAULT_TRAINING_PARAMS = {
    "lr_warmup_epochs": 0,
    "lr_warmup_steps": 0,
    "lr_cooldown_epochs": 0,
    "warmup_initial_lr": None,
    "cosine_final_lr_ratio": 0.01,
    "optimizer": "SGD",
    "optimizer_params": {},
    "criterion_params": {},
    "ema": False,
    "batch_accumulate": 1,  # number of batches to accumulate before every backward pass
    "ema_params": {},
    "zero_weight_decay_on_bias_and_bn": False,
    "load_opt_params": True,
    "run_validation_freq": 1,
    "save_model": True,
    "metric_to_watch": "Accuracy",
    "launch_tensorboard": False,
    "tb_files_user_prompt": False,  # ask the user before deleting existing TensorBoard files
    "silent_mode": False,  # silences the printouts
    "mixed_precision": False,
    "tensorboard_port": None,
    "save_ckpt_epoch_list": [],  # epoch indices at which a checkpoint is saved automatically
    "average_best_models": True,
    "dataset_statistics": False,  # add a dataset statistical analysis and sample images to TensorBoard
    "save_tensorboard_to_s3": False,
    "lr_schedule_function": None,
    "train_metrics_list": [],
    "valid_metrics_list": [],
    "greater_metric_to_watch_is_better": True,
    "precise_bn": False,
    "precise_bn_batch_size": None,
    "seed": 42,
    "lr_mode": None,
    "phase_callbacks": None,
    "log_installed_packages": True,
    "sg_logger": "base_sg_logger",
    "sg_logger_params": {
        "tb_files_user_prompt": False,  # ask the user before deleting existing TensorBoard files
        "project_name": "",
        "launch_tensorboard": False,
        "tensorboard_port": None,
        "save_checkpoints_remote": False,  # upload checkpoint files to S3
        "save_tensorboard_remote": False,  # upload TensorBoard files to S3
        "save_logs_remote": False,  # upload log files to S3
    },
    "warmup_mode": "linear_step",
    "step_lr_update_freq": None,
    "lr_updates": [],
    "clip_grad_norm": None,
    "pre_prediction_callback": None,
    "ckpt_best_name": "ckpt_best.pth",
    "enable_qat": False,
    "qat_params": {
        "start_epoch": 0,
        "quant_modules_calib_method": "percentile",
        "per_channel_quant_modules": False,
        "calibrate": True,
        "calibrated_model_path": None,
        "calib_data_loader": None,
        "num_calib_batches": 2,
        "percentile": 99.99,
    },
    "resume": False,
    "resume_path": None,
    "ckpt_name": "ckpt_latest.pth",
    "resume_strict_load": False,
    "sync_bn": False,
    "kill_ddp_pgroup_on_end": True,  # whether to kill the DDP process group at the end of training
    "max_train_batches": None,  # for debugging: when not None, breaks out of the inner train loop
    # (i.e. iterating over train_loader) once this number of batches is reached
    "max_valid_batches": None,  # for debugging: when not None, breaks out of the inner valid loop
    # (i.e. iterating over valid_loader) once this number of batches is reached
}

DEFAULT_OPTIMIZER_PARAMS_SGD = {"weight_decay": 1e-4, "momentum": 0.9}
DEFAULT_OPTIMIZER_PARAMS_ADAM = {"weight_decay": 1e-4}
DEFAULT_OPTIMIZER_PARAMS_RMSPROP = {"weight_decay": 1e-4, "momentum": 0.9}
DEFAULT_OPTIMIZER_PARAMS_RMSPROPTF = {"weight_decay": 1e-4, "momentum": 0.9}

TRAINING_PARAM_SCHEMA = {
    "type": "object",
    "properties": {
        "max_epochs": {"type": "number", "minimum": 1},
        # FIXME: check the importance of the commented schema, as it causes Hydra use to crash
        # "lr_updates": {"type": "array", "minItems": 1},
        "lr_decay_factor": {"type": "number", "minimum": 0, "maximum": 1},
        "lr_warmup_epochs": {"type": "number", "minimum": 0, "maximum": 10},
        "initial_lr": {"type": "number", "exclusiveMinimum": 0, "maximum": 10},
    },
    "if": {"properties": {"lr_mode": {"const": "step"}}},
    "then": {"required": ["lr_updates", "lr_decay_factor"]},
    "required": ["max_epochs", "lr_mode", "initial_lr", "loss"],
}


class TrainingParams(HpmStruct):
    def __init__(self, **entries):
        # Initialize from the default training params, then override with the provided entries
        default_training_params = deepcopy(DEFAULT_TRAINING_PARAMS)
        super().__init__(**default_training_params)
        self.set_schema(TRAINING_PARAM_SCHEMA)
        if len(entries) > 0:
            self.override(**entries)

    def override(self, **entries):
        super().override(**entries)
        self.validate()
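Since TrainingParams starts from a deep copy of DEFAULT_TRAINING_PARAMS and re-validates against TRAINING_PARAM_SCHEMA on every override, a caller only needs to supply the schema's required keys plus whatever it wants to change. A minimal usage sketch, not part of this diff; the module path in the import is an assumption about where this file lives:

from super_gradients.training.params import TrainingParams  # assumed module path

# Only the schema-required keys (max_epochs, lr_mode, initial_lr, loss) and the
# actual overrides are passed; every other field keeps its value from
# DEFAULT_TRAINING_PARAMS.
params = TrainingParams(
    max_epochs=50,
    lr_mode="step",
    lr_updates=[30, 40],  # required together with lr_decay_factor when lr_mode == "step"
    lr_decay_factor=0.1,
    initial_lr=0.1,
    loss="cross_entropy",
)
assert params.seed == 42  # untouched defaults are preserved

Note the schema's "if"/"then" clause: choosing lr_mode == "step" makes lr_updates and lr_decay_factor mandatory, so dropping either of them from the call above would raise a validation error.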
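Since this PR is about ClearML and W&B experiment monitoring, here is a sketch of routing a run to a tracker through the sg_logger fields shown above. The registered names "wandb_sg_logger" and "clearml_sg_logger" are assumptions inferred from the PR title, and the project name is hypothetical; check the sg_loggers registry of your installed version before relying on them:

monitored_params = TrainingParams(
    max_epochs=10,
    lr_mode="cosine",
    initial_lr=0.05,
    loss="cross_entropy",
    sg_logger="wandb_sg_logger",  # assumed name; "clearml_sg_logger" for ClearML
    sg_logger_params={
        "project_name": "experiment-monitoring-demo",  # hypothetical project name
        "save_checkpoints_remote": False,
        "save_tensorboard_remote": False,
        "save_logs_remote": False,
    },
)

HpmStruct.override is not shown in this file; if it performs a plain attribute update, the nested sg_logger_params dict is replaced wholesale rather than merged, so restate any nested key (e.g. launch_tensorboard) you still rely on.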