Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

#473 Add doc for clearML integration

Merged
Ghost merged 1 commits into Deci-AI:master from deci-ai:feature/SG-187-add_doc_clearml
@@ -360,6 +360,25 @@ train_params = { ... # training parameters
                }
                }
 ```
 ```
 
 
+### Integration to ClearML
+```python
+from super_gradients import Trainer
+
+# create a trainer object; see the Trainer class declaration for more parameters
+trainer = Trainer("experiment_name")
+
+train_params = { ... # training parameters
+                "sg_logger": "clearml_sg_logger", # ClearML Logger, see class ClearMLSGLogger for details
+                "sg_logger_params": # parameters that will be passed to __init__ of the logger
+                  {
+                    "project_name": "project_name", # ClearML project name
+                    "save_checkpoints_remote": True,
+                    "save_tensorboard_remote": True,
+                    "save_logs_remote": True,
+                  } 
+               }
+```
+
 
 
 ## Installation Methods
 ## Installation Methods
 __________________________________________________________________________________________________________
 __________________________________________________________________________________________________________
@@ -519,5 +538,3 @@ Features:
 ֿ
 ֿ
 
 
 Request free trial [here](https://bit.ly/3qO3icq) 
 Request free trial [here](https://bit.ly/3qO3icq) 
-
-
Discard
@@ -39,17 +39,20 @@ class ClearMLSGLogger(BaseSGLogger):
         save_logs_remote: bool = True,
         save_logs_remote: bool = True,
     ):
     ):
         """
         """
-
-        :param experiment_name:         Used for logging and loading purposes
-        :param s3_path:                 If set to 's3' (i.e. s3://my-bucket) saves the Checkpoints in AWS S3 otherwise saves the Checkpoints Locally
-        :param checkpoint_loaded:       If true, then old tensorboard files will *not* be deleted when tb_files_user_prompt=True
-        :param max_epochs:              Number of epochs planned for this training
-        :param tb_files_user_prompt:    Asks user for Tensorboard deletion prompt.
-        :param launch_tensorboard:      Whether to launch a TensorBoard process.
-        :param tensorboard_port:        Specific port number for the tensorboard to use when launched (when set to None, some free port number will be used)
-        :param save_checkpoints_remote: Saves checkpoints in s3.
-        :param save_tensorboard_remote: Saves tensorboard in s3.
-        :param save_logs_remote:        Saves log files in s3.
+        :param project_name: ClearML project name that can include many experiments
+        :param experiment_name: Used for logging and loading purposes
+        :param storage_location: If it starts with 's3' (e.g. s3://my-bucket), saves the checkpoints in AWS S3; otherwise saves the checkpoints locally
+        :param resumed: if true, then old tensorboard files will *not* be deleted when tb_files_user_prompt=True
+        :param training_params: training_params for the experiment.
+        :param checkpoints_dir_path: Local root directory path where all experiment logging directories will
+                                                 reside.
+        :param tb_files_user_prompt: Asks user for Tensorboard deletion prompt.
+        :param launch_tensorboard: Whether to launch a TensorBoard process.
+        :param tensorboard_port: Specific port number for the tensorboard to use when launched (when set to None, some free port
+                    number will be used)
+        :param save_checkpoints_remote: Saves checkpoints in ClearML server.
+        :param save_tensorboard_remote: Saves tensorboard in ClearML server.
+        :param save_logs_remote: Saves log files in ClearML server.
         """
         """
         self.s3_location_available = storage_location.startswith("s3")
         self.s3_location_available = storage_location.startswith("s3")
         super().__init__(
         super().__init__(
Discard
@@ -8,13 +8,14 @@ logger = get_logger(__name__)
 
 
 try:
 try:
     from deci_lab_client.client import DeciPlatformClient
     from deci_lab_client.client import DeciPlatformClient
+
     _imported_deci_lab_failure = None
     _imported_deci_lab_failure = None
 except (ImportError, NameError, ModuleNotFoundError) as import_err:
 except (ImportError, NameError, ModuleNotFoundError) as import_err:
     logger.warn("Failed to import deci_lab_client")
     logger.warn("Failed to import deci_lab_client")
     _imported_deci_lab_failure = import_err
     _imported_deci_lab_failure = import_err
 
 
-TENSORBOARD_EVENTS_PREFIX = 'events.out.tfevents'
-LOGS_PREFIX = 'log_'
+TENSORBOARD_EVENTS_PREFIX = "events.out.tfevents"
+LOGS_PREFIX = "log_"
 
 
 
 
 class DeciPlatformSGLogger(BaseSGLogger):
 class DeciPlatformSGLogger(BaseSGLogger):
@@ -27,9 +28,11 @@ class DeciPlatformSGLogger(BaseSGLogger):
 
 
         auth_token = os.getenv("DECI_PLATFORM_TOKEN")
         auth_token = os.getenv("DECI_PLATFORM_TOKEN")
         if auth_token is None:
         if auth_token is None:
-            raise ValueError('The environment variable "DECI_PLATFORM_TOKEN" is required in order to use '
-                             'DeciPlatformSGLogger. Please set it with your own credentials '
-                             '(available in https://console.deci.ai/settings)')
+            raise ValueError(
+                'The environment variable "DECI_PLATFORM_TOKEN" is required in order to use '
+                "DeciPlatformSGLogger. Please set it with your own credentials "
+                "(available in https://console.deci.ai/settings)"
+            )
 
 
         super().__init__(**kwargs)
         super().__init__(**kwargs)
         self.platform_client = DeciPlatformClient()
         self.platform_client = DeciPlatformClient()
@@ -47,7 +50,7 @@ class DeciPlatformSGLogger(BaseSGLogger):
 
 
         # Upload to Deci platform
         # Upload to Deci platform
         if not os.path.isdir(self.checkpoints_dir_path):
         if not os.path.isdir(self.checkpoints_dir_path):
-            raise ValueError('Provided directory does not exist')
+            raise ValueError("Provided directory does not exist")
 
 
         self._upload_latest_file_starting_with(start_with=TENSORBOARD_EVENTS_PREFIX)
         self._upload_latest_file_starting_with(start_with=TENSORBOARD_EVENTS_PREFIX)
         self._upload_latest_file_starting_with(start_with=LOGS_PREFIX)
         self._upload_latest_file_starting_with(start_with=LOGS_PREFIX)
@@ -61,9 +64,7 @@ class DeciPlatformSGLogger(BaseSGLogger):
         """
         """
 
 
         files_path = [
         files_path = [
-            os.path.join(self.checkpoints_dir_path, file_name)
-            for file_name in os.listdir(self.checkpoints_dir_path)
-            if file_name.startswith(start_with)
+            os.path.join(self.checkpoints_dir_path, file_name) for file_name in os.listdir(self.checkpoints_dir_path) if file_name.startswith(start_with)
         ]
         ]
 
 
         most_recent_file_path = max(files_path, key=os.path.getctime)
         most_recent_file_path = max(files_path, key=os.path.getctime)
Discard
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
  1. import os
  2. from super_gradients.training import Trainer, models
  3. from super_gradients.training.metrics.classification_metrics import Accuracy, Top5
  4. from super_gradients.training.dataloaders.dataloaders import cifar10_train, cifar10_val
  5. os.environ["DECI_PLATFORM_TOKEN"] = "XXX" # Replace XXX with your token
  6. trainer = Trainer(experiment_name='demo-deci-platform-logger')
  7. model = models.get("resnet18", num_classes=10)
  8. trainer.train(training_params={"max_epochs": 20,
  9. "lr_updates": [5, 10, 15],
  10. "lr_decay_factor": 0.1,
  11. "lr_mode": "step",
  12. "initial_lr": 0.1,
  13. "loss": "cross_entropy",
  14. "optimizer": "SGD",
  15. "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
  16. "train_metrics_list": [Accuracy(), Top5()],
  17. "valid_metrics_list": [Accuracy(), Top5()],
  18. "metric_to_watch": "Accuracy",
  19. "greater_metric_to_watch_is_better": True,
  20. "sg_logger": "deci_platform_sg_logger"},
  21. train_loader=cifar10_train(),
  22. valid_loader=cifar10_val())
Discard
    Discard
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    1. from super_gradients.training import Trainer, models
    2. from super_gradients.training.metrics.classification_metrics import Accuracy, Top5
    3. from super_gradients.training.dataloaders.dataloaders import cifar10_train, cifar10_val
    4. trainer = Trainer(experiment_name="demo-clearml-logger")
    5. model = models.get("resnet18", num_classes=10)
    6. training_params = {
    7. "max_epochs": 20,
    8. "lr_updates": [5, 10, 15],
    9. "lr_decay_factor": 0.1,
    10. "lr_mode": "step",
    11. "initial_lr": 0.1,
    12. "loss": "cross_entropy",
    13. "optimizer": "SGD",
    14. "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
    15. "train_metrics_list": [Accuracy(), Top5()],
    16. "valid_metrics_list": [Accuracy(), Top5()],
    17. "metric_to_watch": "Accuracy",
    18. "greater_metric_to_watch_is_better": True,
    19. "sg_logger": "clearml_sg_logger",
    20. "sg_logger_params": {
    21. "project_name": "project_name", # ClearML project name
    22. "save_checkpoints_remote": True,
    23. "save_tensorboard_remote": True,
    24. "save_logs_remote": True,
    25. },
    26. }
    27. trainer.train(model=model, training_params=training_params, train_loader=cifar10_train(), valid_loader=cifar10_val())
    Discard
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    1. import os
    2. from super_gradients.training import Trainer, models
    3. from super_gradients.training.metrics.classification_metrics import Accuracy, Top5
    4. from super_gradients.training.dataloaders.dataloaders import cifar10_train, cifar10_val
    5. os.environ["DECI_PLATFORM_TOKEN"] = "XXX" # Replace XXX with your token
    6. trainer = Trainer(experiment_name="demo-deci-platform-logger")
    7. model = models.get("resnet18", num_classes=10)
    8. training_params = {
    9. "max_epochs": 20,
    10. "lr_updates": [5, 10, 15],
    11. "lr_decay_factor": 0.1,
    12. "lr_mode": "step",
    13. "initial_lr": 0.1,
    14. "loss": "cross_entropy",
    15. "optimizer": "SGD",
    16. "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
    17. "train_metrics_list": [Accuracy(), Top5()],
    18. "valid_metrics_list": [Accuracy(), Top5()],
    19. "metric_to_watch": "Accuracy",
    20. "greater_metric_to_watch_is_better": True,
    21. "sg_logger": "deci_platform_sg_logger",
    22. }
    23. trainer.train(model=model, training_params=training_params, train_loader=cifar10_train(), valid_loader=cifar10_val())
    Discard