- #!/usr/bin/env python
- """ Single node distributed training.
- The program will dispatch distributed training on all available GPUs residing in a single node.
- Usage:
- python -m torch.distributed.launch --nproc_per_node=n distributed_training_imagenet.py
- where n is the number of GPUs required, e.g., n=8
- Important note: (1) in distributed training it is customary to specify learning rates and batch sizes per GPU.
- Whatever learning rate and schedule you specify will be applied to the each GPU individually.
- Since gradients are passed and summed (reduced) from all to all GPUs, the effective batch size is the
- batch you specify times the number of GPUs. In the literature there are several "best practices" to set
- learning rates and schedules for large batch sizes.
- Should be checked with. (2) The training protocol specified in this file for 8 GPUs are far from optimal.
- The best protocol should use cosine schedule.
- In the example below: for ImageNet training using Resnet50, when applied with n=8 should compute an Eopch in about
- 5min20sec with 8 V100 GPUs.
- Todo: (1) the code is more or less ready for multiple nodes, but I have not experimented with it at all.
- (2) detection and segmentation codes were not modified and should not work properly.
- Specifically, the analogue changes done in sg_classification_model should be done also in
- deci_segmentation_model and deci_detection_model
- """
- import super_gradients
- import torch.distributed
- from super_gradients.training.sg_model import MultiGPUMode
- from super_gradients.training import SgModel
- from super_gradients.training.datasets.dataset_interfaces import ImageNetDatasetInterface
- from super_gradients.common.aws_connection.aws_secrets_manager_connector import AWSSecretsManagerConnector
- from super_gradients.training.metrics.classification_metrics import Accuracy, Top5
- torch.backends.cudnn.benchmark = True
- super_gradients.init_trainer()
- # TODO: validate the hyperparameters with Ran to fix this example code
- train_params = {"max_epochs": 110,
-                 "lr_updates": [30, 60, 90],
-                 "lr_decay_factor": 0.1,
-                 "initial_lr": 0.6,
-                 "loss": "cross_entropy",
-                 "lr_mode": "step",
-                 # "initial_lr": 0.05 * 2,
-                 "lr_warmup_epochs": 5,
-                 # "criterion_params": {"smooth_eps": 0.1}}
-                 "mixed_precision": True,
-                 # "mixed_precision_opt_level": "O3",
-                 "optimizer_params": {"weight_decay": 0.000, "momentum": 0.9},
-                 # "optimizer_params": {"weight_decay": 0.0001, "momentum": 0.9}
-                 "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()],
-                 "loss_logging_items_names": ["Loss"], "metric_to_watch": "Accuracy",
-                 "greater_metric_to_watch_is_better": True}
- dataset_params = {"batch_size": 128}
- model_repo_bucket_name = AWSSecretsManagerConnector.get_secret_value_for_secret_key(aws_env='research',
-                                                                                      secret_name='training_secrets',
-                                                                                      secret_key='S3.MODEL_REPOSITORY_BUCKET_NAME')
- model = SgModel("test_checkpoints_resnet_8_gpus",
-                 model_checkpoints_location='s3://' + model_repo_bucket_name,
-                 multi_gpu=MultiGPUMode.DISTRIBUTED_DATA_PARALLEL
-                 )
- # FOR AWS
- dataset = ImageNetDatasetInterface(data_dir="/data/Imagenet", dataset_params=dataset_params)
- model.connect_dataset_interface(dataset, data_loader_num_workers=8)
- model.build_model("resnet50", load_checkpoint=False)
- model.train(training_params=train_params)
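
The docstring above notes that the effective batch size is the per-GPU batch size times the number of GPUs, and that learning rates for large batches should follow established best practices. Below is a minimal sketch of the commonly cited linear-scaling rule; the function name and baseline values are illustrative assumptions and are not part of the script above or of the SuperGradients API.

# Hypothetical sketch (not from the diff above): linear scaling of the learning
# rate with the effective (global) batch size in multi-GPU data-parallel training.

def scale_learning_rate(base_lr: float, base_batch_size: int,
                        per_gpu_batch_size: int, num_gpus: int) -> float:
    """Scale the learning rate linearly with the effective (global) batch size."""
    # Gradients are all-reduced across GPUs, so the global batch grows with the GPU count.
    effective_batch_size = per_gpu_batch_size * num_gpus
    return base_lr * effective_batch_size / base_batch_size

# Example (illustrative baseline): lr=0.1 at batch size 256; with 128 images per GPU
# on 8 GPUs the effective batch size is 1024, so the scaled learning rate is 0.4.
print(scale_learning_rate(base_lr=0.1, base_batch_size=256,
                          per_gpu_batch_size=128, num_gpus=8))  # -> 0.4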