#643 PPYolo-E

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-344-PP-Yolo-E-Training-Replicate-Recipe
42 changed files with 2307 additions and 94 deletions
  1. +9 / −0    src/super_gradients/common/object_names.py
  2. +0 / −8    src/super_gradients/recipes/arch_params/csp_resnet_arch_params.yaml
  3. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_l_arch_params.yaml
  4. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_m_arch_params.yaml
  5. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_s_arch_params.yaml
  6. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_x_arch_params.yaml
  7. +31 / −0   src/super_gradients/recipes/arch_params/ppyoloe_arch_params.yaml
  8. +9 / −0    src/super_gradients/recipes/arch_params/ppyoloe_l_arch_params.yaml
  9. +9 / −0    src/super_gradients/recipes/arch_params/ppyoloe_m_arch_params.yaml
  10. +9 / −0   src/super_gradients/recipes/arch_params/ppyoloe_s_arch_params.yaml
  11. +9 / −0   src/super_gradients/recipes/arch_params/ppyoloe_x_arch_params.yaml
  12. +61 / −0  src/super_gradients/recipes/coco2017_ppyoloe_l.yaml
  13. +61 / −0  src/super_gradients/recipes/coco2017_ppyoloe_m.yaml
  14. +57 / −0  src/super_gradients/recipes/coco2017_ppyoloe_s.yaml
  15. +59 / −0  src/super_gradients/recipes/coco2017_ppyoloe_x.yaml
  16. +97 / −0  src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml
  17. +60 / −0  src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml
  18. +4 / −0   src/super_gradients/training/dataloaders/__init__.py
  19. +38 / −19 src/super_gradients/training/dataloaders/dataloaders.py
  20. +8 / −6   src/super_gradients/training/datasets/detection_datasets/coco_detection.py
  21. +6 / −5   src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
  22. +2 / −0   src/super_gradients/training/losses/__init__.py
  23. +2 / −0   src/super_gradients/training/losses/all_losses.py
  24. +905 / −0 src/super_gradients/training/losses/ppyolo_loss.py
  25. +4 / −5   src/super_gradients/training/metrics/detection_metrics.py
  26. +6 / −1   src/super_gradients/training/models/all_architectures.py
  27. +30 / −3  src/super_gradients/training/models/detection_models/csp_resnet.py
  28. +4 / −0   src/super_gradients/training/models/detection_models/pp_yolo_e/__init__.py
  29. +185 / −0 src/super_gradients/training/models/detection_models/pp_yolo_e/pan.py
  30. +81 / −0  src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
  31. +78 / −0  src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
  32. +264 / −0 src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
  33. +6 / −0   src/super_gradients/training/pretrained_models.py
  34. +1 / −1   src/super_gradients/training/sg_trainer/sg_trainer.py
  35. +2 / −0   src/super_gradients/training/transforms/__init__.py
  36. +2 / −0   src/super_gradients/training/transforms/all_transforms.py
  37. +45 / −19 src/super_gradients/training/transforms/transforms.py
  38. +30 / −0  src/super_gradients/training/utils/bbox_utils.py
  39. +2 / −0   src/super_gradients/training/utils/callbacks/all_callbacks.py
  40. +1 / −1   src/super_gradients/training/utils/callbacks/callbacks.py
  41. +26 / −0  src/super_gradients/training/utils/callbacks/ppyoloe_switch_callback.py
  42. +104 / −6 src/super_gradients/training/utils/detection_utils.py
src/super_gradients/common/object_names.py

@@ -7,6 +7,7 @@ class Losses:
     SHELFNET_OHEM_LOSS = "shelfnet_ohem_loss"
     SHELFNET_SE_LOSS = "shelfnet_se_loss"
     YOLOX_LOSS = "yolox_loss"
+    PPYOLOE_LOSS = "ppyoloe_loss"
     YOLOX_FAST_LOSS = "yolox_fast_loss"
     SSD_LOSS = "ssd_loss"
     STDC_LOSS = "stdc_loss"
@@ -55,6 +56,8 @@ class Transforms:
     DetectionRescale = "DetectionRescale"
     DetectionPaddedRescale = "DetectionPaddedRescale"
     DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform"
+    DetectionNormalize = "DetectionNormalize"
+    #
     RandomResizedCropAndInterpolation = "RandomResizedCropAndInterpolation"
     RandAugmentTransform = "RandAugmentTransform"
     Lighting = "Lighting"
@@ -131,6 +134,8 @@ class Callbacks:
     EARLY_STOP = "EarlyStop"
     DETECTION_MULTISCALE_PREPREDICTION = "DetectionMultiscalePrePredictionCallback"
     YOLOX_TRAINING_STAGE_SWITCH = "YoloXTrainingStageSwitchCallback"
+    PPYOLOE_TRAINING_STAGE_SWITCH = "PPYoloETrainingStageSwitchCallback"
+    DETECTION_VISUALIZATION_CALLBACK = "DetectionVisualizationCallback"


 class LRSchedulers:
@@ -275,6 +280,10 @@ class Models:
     UNET_CUSTOM_CLS = "unet_custom_cls"
     STDC_CUSTOM = "stdc_custom"
     STDC_CUSTOM_CLS = "stdc_custom_cls"
+    PP_YOLOE_S = "ppyoloe_s"
+    PP_YOLOE_M = "ppyoloe_m"
+    PP_YOLOE_L = "ppyoloe_l"
+    PP_YOLOE_X = "ppyoloe_x"


 class ConcatenatedTensorFormats:
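The entries added above are plain string constants, so recipes and factories can refer to the new objects either through these classes or through the raw strings. A minimal illustration, with values taken directly from the diff:

from super_gradients.common.object_names import Callbacks, Losses, Models, Transforms

assert Losses.PPYOLOE_LOSS == "ppyoloe_loss"
assert Transforms.DetectionNormalize == "DetectionNormalize"
assert Callbacks.PPYOLOE_TRAINING_STAGE_SWITCH == "PPYoloETrainingStageSwitchCallback"
assert Models.PP_YOLOE_L == "ppyoloe_l"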
src/super_gradients/recipes/arch_params/csp_resnet_arch_params.yaml
layers: [3, 6, 6, 3]                # model's structure
channels: [64, 128, 256, 512, 1024] # number of output channels for stem and consecutive feature maps
activation: silu                    # activation function used throughout the model
return_idx: [1, 2, 3]               # indexes of feature maps to output
use_large_stem: True                # If True, uses 3 conv+bn+act instead of 2 in stem blocks
width_mult:                         # scaling factor for the number of channels
depth_mult:                         # scaling factor for the number of layers
use_alpha: False                    # If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
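The empty width_mult / depth_mult keys are the hooks that the per-size files below fill in. A rough illustration of what the two multipliers do (a sketch only; the exact rounding rules live in csp_resnet.py and may differ):

import math

base_channels = [64, 128, 256, 512, 1024]
base_layers = [3, 6, 6, 3]

def scale(width_mult: float, depth_mult: float):
    # width_mult shrinks/grows the channel counts, depth_mult the number of blocks per stage
    channels = [max(1, int(round(c * width_mult))) for c in base_channels]
    layers = [max(1, int(math.ceil(n * depth_mult))) for n in base_layers]
    return channels, layers

print(scale(0.50, 0.33))  # ([32, 64, 128, 256, 512], [1, 2, 2, 1]) -- roughly the "s" scaling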
src/super_gradients/recipes/arch_params/csp_resnet_l_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 1.0
width_mult: 1.0
src/super_gradients/recipes/arch_params/csp_resnet_m_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 0.67
width_mult: 0.75
src/super_gradients/recipes/arch_params/csp_resnet_s_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 0.33
width_mult: 0.50
src/super_gradients/recipes/arch_params/csp_resnet_x_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 1.33
width_mult: 1.25
src/super_gradients/recipes/arch_params/ppyoloe_arch_params.yaml
depth_mult:
width_mult:

num_classes: 80

backbone:
  layers: [3, 6, 6, 3]                # Backbone's structure
  channels: [64, 128, 256, 512, 1024] # Number of output channels for stem and consecutive feature maps
  activation: silu
  return_idx: [1, 2, 3]               # Indexes of feature maps to output; indices 1, 2, 3 correspond to feature maps of stride 8, 16, 32
  use_large_stem: True                # If True, uses 3 conv+bn+act instead of 2 in stem blocks
  use_alpha: False                    # If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
  pretrained_weights:

neck:
  in_channels: [256, 512, 1024]
  out_channels: [768, 384, 192]
  activation: silu
  block_num: 3
  stage_num: 1
  spp: True

head:
  in_channels: [768, 384, 192]
  activation: silu
  fpn_strides: [32, 16, 8]
  grid_cell_scale: 5.0
  grid_cell_offset: 0.5
  reg_max: 16                         # Number of bins for size prediction
  eval_size:                          # Size of the image for evaluation. Setting this value can be beneficial for inference speed since anchors will not be regenerated for each forward call.
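reg_max controls how many bins the head uses to represent each box-edge distance. A hedged sketch of the DFL-style decoding this implies (illustration only; the actual decoding lives in the new pp_yolo_head.py / ppyolo_loss.py):

import torch

reg_max = 16                               # as in the head config above
edge_logits = torch.randn(4, reg_max + 1)  # hypothetical logits for the (l, t, r, b) edges of one anchor
bins = torch.arange(reg_max + 1, dtype=torch.float32)
# each edge distance is decoded as the expectation of a softmax distribution over the bins
distances = (edge_logits.softmax(dim=-1) * bins).sum(dim=-1)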
src/super_gradients/recipes/arch_params/ppyoloe_l_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 1.0
width_mult: 1.0

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_l_pretrained.pth
src/super_gradients/recipes/arch_params/ppyoloe_m_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 0.67
width_mult: 0.75

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_m_pretrained.pth
src/super_gradients/recipes/arch_params/ppyoloe_s_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 0.33
width_mult: 0.50

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_s_pretrained.pth
src/super_gradients/recipes/arch_params/ppyoloe_x_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 1.33
width_mult: 1.25

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_x_pretrained.pth
src/super_gradients/recipes/coco2017_ppyoloe_l.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Checkpoints + tensorboards: https://deci-pretrained-models.s3.amazonaws.com/ppyoloe_coco/
# Recipe runs with batch size = 20 X 8 GPUs = 160.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_l_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 20

training_hyperparams:
  resume: ${resume}
  mixed_precision: True
  initial_lr: 1e-3

architecture: pp_yoloe_l

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
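Outside of the recipe, the same architecture should be reachable through the models factory. A hedged sketch (the models.get call and its keyword are assumed from the existing SuperGradients factory API; the name comes from the Models constants added in object_names.py):

from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.PP_YOLOE_L, num_classes=80)  # "ppyoloe_l", 80 COCO classes as in this recipe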
src/super_gradients/recipes/coco2017_ppyoloe_m.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Checkpoints + tensorboards: https://deci-pretrained-models.s3.amazonaws.com/ppyoloe_coco/
# Recipe runs with batch size = 24 X 8 GPUs = 192.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_m_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 24

training_hyperparams:
  resume: ${resume}
  mixed_precision: True
  initial_lr: 1e-3

architecture: pp_yoloe_m

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
src/super_gradients/recipes/coco2017_ppyoloe_s.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Recipe runs with batch size = 32 X 8 GPUs = 256.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_s_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 32

training_hyperparams:
  resume: ${resume}
  mixed_precision: True

architecture: pp_yoloe_s

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
src/super_gradients/recipes/coco2017_ppyoloe_x.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Checkpoints + tensorboards: https://deci-pretrained-models.s3.amazonaws.com/ppyoloe_coco/
# Recipe runs with batch size = 16 X 8 GPUs = 128.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_x_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 16

training_hyperparams:
  resume: ${resume}
  mixed_precision: True

architecture: pp_yoloe_x

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml
train_dataset_params:
  data_dir: /data/coco                  # root path to coco data
  subdir: images/train2017              # sub directory of data_dir containing the train data
  json_file: instances_train2017.json   # path to the coco train json file, data_dir/annotations/train_json_file
  input_dim:                            # None - do not resize the dataset on load
  cache_dir:
  cache: False
  transforms:
    - DetectionRandomAffine:
        degrees: 0                      # rotation degrees, randomly sampled from [-degrees, degrees]
        translate: 0.25                 # image translation fraction
        scales: [0.5, 1.5]              # random rescale range (keeps size by padding/cropping) after mosaic transform
        shear: 0.0                      # shear degrees, randomly sampled from [-degrees, degrees]
        target_size:
        filter_box_candidates: True     # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio
        wh_thr: 2                       # edge size threshold when filter_box_candidates = True (pixels)
        area_thr: 0.1                   # threshold for area ratio between the original and the transformed bbox, when filter_box_candidates = True
        ar_thr: 20                      # aspect ratio threshold when filter_box_candidates = True
    - DetectionRandomRotate90:
        prob: 0.5
    - DetectionRGB2BGR:
        prob: 0.25
    - DetectionHSV:
        prob: 0.5                       # probability to apply HSV transform
        hgain: 18                       # HSV transform hue gain (randomly sampled from [-hgain, hgain])
        sgain: 30                       # HSV transform saturation gain (randomly sampled from [-sgain, sgain])
        vgain: 30                       # HSV transform value gain (randomly sampled from [-vgain, vgain])
    - DetectionHorizontalFlip:
        prob: 0.5                       # probability to apply horizontal flip
    - DetectionMixup:
        input_dim:
        mixup_scale: [0.5, 1.5]         # random rescale range for the additional sample in mixup
        prob: 0.5                       # probability to apply per-sample mixup
        flip_prob: 0.5                  # probability to apply horizontal flip
    - DetectionNormalize:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
    - DetectionTargetsFormatTransform:
        max_targets: 256
        output_format: LABEL_CXCYWH
  tight_box_rotation: False
  class_inclusion_list:
  max_num_samples:
  with_crowd: False

train_dataloader_params:
  batch_size: 32
  num_workers: 8
  shuffle: True
  drop_last: True
  # Disable pin_memory due to the presence of PPYoloECollateFN, which uses random resize during training
  pin_memory: False
  worker_init_fn:
    _target_: super_gradients.training.utils.utils.load_func
    dotpath: super_gradients.training.datasets.datasets_utils.worker_init_reset_seed
  collate_fn: # collate function for the train set
    _target_: super_gradients.training.utils.detection_utils.PPYoloECollateFN
    random_resize_sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
    random_resize_modes:
      - 0 # cv::INTER_NEAREST
      - 1 # cv::INTER_LINEAR
      - 2 # cv::INTER_CUBIC
      - 3 # cv::INTER_AREA
      - 4 # cv::INTER_LANCZOS4

val_dataset_params:
  data_dir: /data/coco                  # root path to coco data
  subdir: images/val2017                # sub directory of data_dir containing the validation data
  json_file: instances_val2017.json     # path to the coco validation json file, data_dir/annotations/val_json_file
  input_dim:
  cache_dir:
  cache: False
  transforms:
    - DetectionRescale:
        output_shape: [640, 640]
    - DetectionNormalize:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
    - DetectionTargetsFormatTransform:
        max_targets: 256
        output_format: LABEL_CXCYWH
  tight_box_rotation: False
  class_inclusion_list:
  max_num_samples:
  with_crowd: False

val_dataloader_params:
  batch_size: 64
  num_workers: 8
  drop_last: False
  shuffle: False
  pin_memory: False
  collate_fn: # collate function for the validation set
    _target_: super_gradients.training.utils.detection_utils.PPYoloECollateFN

_convert_: all
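The DetectionNormalize entries above use the standard ImageNet statistics in 0-255 RGB space; conceptually the transform amounts to a per-channel (x - mean) / std. A minimal sketch of that arithmetic (not the library implementation):

import numpy as np

mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)

def normalize(image: np.ndarray) -> np.ndarray:
    # image is expected as HWC, RGB, values in 0-255
    return (image.astype(np.float32) - mean) / std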
src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml
defaults:
  - default_train_params

max_epochs: 500
static_assigner_end_epoch: 150

warmup_mode: "linear_batch_step"
warmup_initial_lr: 1e-6
lr_warmup_steps: 1000
lr_warmup_epochs: 0

initial_lr: 2e-3
lr_mode: cosine
cosine_final_lr_ratio: 0.1

zero_weight_decay_on_bias_and_bn: False
batch_accumulate: 1

save_ckpt_epoch_list: [200, 250, 300, 350, 400, 450]

loss:
  ppyoloe_loss:
    num_classes: ${arch_params.num_classes}
    reg_max: ${arch_params.head.reg_max}

optimizer: AdamW
optimizer_params:
  weight_decay: 0.0001

ema: True
ema_params:
  decay: 0.9997
  decay_type: threshold

mixed_precision: False
sync_bn: True

valid_metrics_list:
  - DetectionMetrics:
      score_thres: 0.1
      top_k_predictions: 300
      num_cls: ${arch_params.num_classes}
      normalize_targets: True
      post_prediction_callback:
        _target_: super_gradients.training.models.detection_models.pp_yolo_e.PPYoloEPostPredictionCallback
        score_threshold: 0.01
        nms_top_k: 1000
        max_predictions: 300
        nms_threshold: 0.7

pre_prediction_callback:

phase_callbacks:
  - PPYoloETrainingStageSwitchCallback:
      static_assigner_end_epoch: ${training_hyperparams.static_assigner_end_epoch}

metric_to_watch: 'mAP@0.50:0.95'
greater_metric_to_watch_is_better: True

_convert_: all
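The loss section above maps to the PPYoloELoss class registered by this PR. It can also be constructed directly; a hedged sketch using only the kwargs that appear in the recipe (any other constructor arguments are assumed to keep their defaults):

from super_gradients.training.losses import PPYoloELoss

criterion = PPYoloELoss(num_classes=80, reg_max=16)  # matches ${arch_params.num_classes} / ${arch_params.head.reg_max}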
src/super_gradients/training/dataloaders/__init__.py
@@ -5,6 +5,8 @@ from .dataloaders import (
     coco2017_val_yolox,
     coco2017_train_ssd_lite_mobilenet_v2,
     coco2017_val_ssd_lite_mobilenet_v2,
+    coco2017_train_ppyoloe,
+    coco2017_val_ppyoloe,
     imagenet_train,
     imagenet_val,
     imagenet_efficientnet_train,
@@ -58,6 +60,8 @@ __all__ = [
     "coco2017_val_yolox",
     "coco2017_train_ssd_lite_mobilenet_v2",
     "coco2017_val_ssd_lite_mobilenet_v2",
+    "coco2017_train_ppyoloe",
+    "coco2017_val_ppyoloe",
     "imagenet_train",
     "imagenet_val",
     "imagenet_efficientnet_train",
src/super_gradients/training/dataloaders/dataloaders.py
@@ -1,29 +1,29 @@
 import os.path
-import pkg_resources
 from typing import Dict

 import hydra
-from hydra import compose, initialize_config_dir
-from hydra.core.global_hydra import GlobalHydra
-
 import numpy as np
-import torch
-from torch.utils.data import BatchSampler, DataLoader, TensorDataset
-
+import pkg_resources
 import super_gradients
-
-from super_gradients.training.datasets.detection_datasets.pascal_voc_detection import (
-    PascalVOCUnifiedDetectionTrainDataset,
-    PascalVOCDetectionDataset,
-)
-from super_gradients.training.utils import get_param
+import torch
+from hydra import compose, initialize_config_dir
+from hydra.core.global_hydra import GlobalHydra
+from super_gradients.common.abstractions.abstract_logger import get_logger
 from super_gradients.common.environment.path_utils import normalize_path
+from super_gradients.common.factories.collate_functions_factory import CollateFunctionsFactory
+from super_gradients.common.factories.datasets_factory import DatasetsFactory
+from super_gradients.common.factories.samplers_factory import SamplersFactory
 from super_gradients.training.datasets import ImageNetDataset
-from super_gradients.training.datasets.detection_datasets import COCODetectionDataset
 from super_gradients.training.datasets.classification_datasets.cifar import (
     Cifar10,
     Cifar100,
 )
+from super_gradients.training.datasets.detection_datasets import COCODetectionDataset
+from super_gradients.training.datasets.detection_datasets.pascal_voc_detection import (
+    PascalVOCUnifiedDetectionTrainDataset,
+    PascalVOCDetectionDataset,
+)
+from super_gradients.training.datasets.pose_estimation_datasets import COCOKeypointsDataset
 from super_gradients.training.datasets.segmentation_datasets import (
     CityscapesDataset,
     CoCoSegmentationDataSet,
@@ -32,16 +32,13 @@ from super_gradients.training.datasets.segmentation_datasets import (
     SuperviselyPersonsDataset,
     MapillaryDataset,
 )
-from super_gradients.common.factories.collate_functions_factory import CollateFunctionsFactory
-from super_gradients.common.factories.samplers_factory import SamplersFactory
+from super_gradients.training.utils import get_param
 from super_gradients.training.utils.distributed_training_utils import (
     wait_for_the_master,
     get_local_rank,
 )
-from super_gradients.common.abstractions.abstract_logger import get_logger
 from super_gradients.training.utils.utils import override_default_params_without_nones
-from super_gradients.common.factories.datasets_factory import DatasetsFactory
-from super_gradients.training.datasets.pose_estimation_datasets import COCOKeypointsDataset
+from torch.utils.data import BatchSampler, DataLoader, TensorDataset

 logger = get_logger(__name__)

@@ -163,6 +160,26 @@ def coco2017_val(dataset_params: Dict = None, dataloader_params: Dict = None):
     )


+def coco2017_train_ppyoloe(dataset_params: Dict = None, dataloader_params: Dict = None):
+    return get_data_loader(
+        config_name="coco_detection_ppyoloe_dataset_params",
+        dataset_cls=COCODetectionDataset,
+        train=True,
+        dataset_params=dataset_params,
+        dataloader_params=dataloader_params,
+    )
+
+
+def coco2017_val_ppyoloe(dataset_params: Dict = None, dataloader_params: Dict = None):
+    return get_data_loader(
+        config_name="coco_detection_ppyoloe_dataset_params",
+        dataset_cls=COCODetectionDataset,
+        train=False,
+        dataset_params=dataset_params,
+        dataloader_params=dataloader_params,
+    )
+
+
 def coco2017_train_yolox(dataset_params: Dict = None, dataloader_params: Dict = None):
     return coco2017_train(dataset_params, dataloader_params)

@@ -646,6 +663,8 @@ ALL_DATALOADERS = {
     "coco2017_val": coco2017_val,
     "coco2017_train_yolox": coco2017_train_yolox,
     "coco2017_val_yolox": coco2017_val_yolox,
+    "coco2017_train_ppyoloe": coco2017_train_ppyoloe,
+    "coco2017_val_ppyoloe": coco2017_val_ppyoloe,
     "coco2017_train_ssd_lite_mobilenet_v2": coco2017_train_ssd_lite_mobilenet_v2,
     "coco2017_val_ssd_lite_mobilenet_v2": coco2017_val_ssd_lite_mobilenet_v2,
     "coco2017_pose_train": coco2017_pose_train,
src/super_gradients/training/datasets/detection_datasets/coco_detection.py
@@ -167,13 +167,15 @@ class COCODetectionDataset(DetectionDataset):
             crowd_target[ix, 0:4] = annotation["clean_bbox"]
             crowd_target[ix, 4] = cls

-        r = min(self.input_dim[0] / height, self.input_dim[1] / width)
-        target[:, :4] *= r
-        crowd_target[:, :4] *= r
-        target_segmentation *= r
-
         initial_img_shape = (height, width)
-        resized_img_shape = (int(height * r), int(width * r))
+        if self.input_dim is not None:
+            r = min(self.input_dim[0] / height, self.input_dim[1] / width)
+            target[:, :4] *= r
+            crowd_target[:, :4] *= r
+            target_segmentation *= r
+            resized_img_shape = (int(height * r), int(width * r))
+        else:
+            resized_img_shape = initial_img_shape

         file_name = img_metadata["file_name"] if "file_name" in img_metadata else "{:012}".format(img_id) + ".jpg"
         img_path = os.path.join(self.data_dir, self.subdir, file_name)
src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
@@ -67,7 +67,7 @@ class DetectionDataset(Dataset):
     def __init__(
         self,
         data_dir: str,
-        input_dim: tuple,
+        input_dim: Optional[Tuple[int, int]],
         original_target_format: DetectionTargetsFormat,
         max_num_samples: int = None,
         cache: bool = False,
@@ -278,11 +278,12 @@ class DetectionDataset(Dataset):
         """
         img = self._load_image(index)

-        r = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
-        desired_size = (int(img.shape[1] * r), int(img.shape[0] * r))
+        if self.input_dim is not None:
+            r = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
+            desired_size = (int(img.shape[1] * r), int(img.shape[0] * r))
+            img = cv2.resize(src=img, dsize=desired_size, interpolation=cv2.INTER_LINEAR).astype(np.uint8)

-        resized_img = cv2.resize(src=img, dsize=desired_size, interpolation=cv2.INTER_LINEAR).astype(np.uint8)
-        return resized_img
+        return img

     def _load_image(self, index: int) -> np.ndarray:
         """Loads image at index with its original resolution.
src/super_gradients/training/losses/__init__.py
@@ -9,6 +9,7 @@ from super_gradients.training.losses.ssd_loss import SSDLoss
 from super_gradients.training.losses.bce_dice_loss import BCEDiceLoss
 from super_gradients.training.losses.dice_ce_edge_loss import DiceCEEdgeLoss
 from super_gradients.training.losses.all_losses import LOSSES, Losses
+from super_gradients.training.losses.ppyolo_loss import PPYoloELoss

 __all__ = [
     "LOSSES",
@@ -24,4 +25,5 @@ __all__ = [
     "BCEDiceLoss",
     "KDLogitsLoss",
     "DiceCEEdgeLoss",
+    "PPYoloELoss",
 ]
src/super_gradients/training/losses/all_losses.py
@@ -13,6 +13,7 @@ from super_gradients.training.losses import (
     DiceCEEdgeLoss,
 )
 from super_gradients.training.losses.stdc_loss import STDCLoss
+from super_gradients.training.losses.ppyolo_loss import PPYoloELoss


 LOSSES = {
@@ -28,4 +29,5 @@ LOSSES = {
     Losses.BCE_DICE_LOSS: BCEDiceLoss,
     Losses.KD_LOSS: KDLogitsLoss,
     Losses.DICE_CE_EDGE_LOSS: DiceCEEdgeLoss,
+    Losses.PPYOLOE_LOSS: PPYoloELoss,
 }
src/super_gradients/training/losses/ppyolo_loss.py
from typing import Mapping, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn, Tensor

import super_gradients
from super_gradients.training.datasets.data_formats.bbox_formats.cxcywh import cxcywh_to_xyxy
from super_gradients.training.utils.bbox_utils import batch_distance2bbox
from super_gradients.training.utils.distributed_training_utils import (
    get_world_size,
)
def batch_iou_similarity(box1, box2, eps=1e-9):
    """Calculate iou of box1 and box2 in batch. Bboxes are expected to be in x1y1x2y2 format.
    Args:
        box1 (Tensor): box with the shape [N, M1, 4]
        box2 (Tensor): box with the shape [N, M2, 4]
    Return:
        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
    """
    box1 = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
    box2 = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
    px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
    gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
    x1y1 = torch.maximum(px1y1, gx1y1)
    x2y2 = torch.minimum(px2y2, gx2y2)
    overlap = (x2y2 - x1y1).clip(0).prod(-1)
    area1 = (px2y2 - px1y1).clip(0).prod(-1)
    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
    union = area1 + area2 - overlap + eps
    return overlap / union
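# Illustrative usage of batch_iou_similarity (an aside, not part of ppyolo_loss.py):
#
#     >>> b1 = torch.tensor([[[0.0, 0.0, 2.0, 2.0]]])  # [N=1, M1=1, 4]
#     >>> b2 = torch.tensor([[[1.0, 1.0, 3.0, 3.0]]])  # [N=1, M2=1, 4]
#     >>> batch_iou_similarity(b1, b2)                 # intersection 1, union 7
#     tensor([[[0.1429]]])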
def iou_similarity(box1, box2, eps=1e-10):
    """
    Calculate iou of box1 and box2. Bboxes are expected to be in x1y1x2y2 format.
    Args:
        box1 (Tensor): box with the shape [M1, 4]
        box2 (Tensor): box with the shape [M2, 4]
    Return:
        iou (Tensor): iou between box1 and box2 with the shape [M1, M2]
    """
    box1 = box1.unsqueeze(1)  # [M1, 4] -> [M1, 1, 4]
    box2 = box2.unsqueeze(0)  # [M2, 4] -> [1, M2, 4]
    px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4]
    gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4]
    x1y1 = torch.maximum(px1y1, gx1y1)
    x2y2 = torch.minimum(px2y2, gx2y2)
    overlap = (x2y2 - x1y1).clip(0).prod(-1)
    area1 = (px2y2 - px1y1).clip(0).prod(-1)
    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
    union = area1 + area2 - overlap + eps
    return overlap / union
def bbox_overlaps(bboxes1, bboxes2, mode="iou", is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes.
    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.
    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "iof" (intersection over
            foreground).
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.
    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
    """
    assert mode in ["iou", "iof", "giou"], "Unsupported mode {}".format(mode)
    # Either the boxes are empty or the length of the boxes' last dimension is 4
    assert bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0
    assert bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return np.random.random(batch_shape + (rows,))
        else:
            return np.random.random(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = (rb - lt).clip(min=0)  # [B, rows, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ["iou", "giou"]:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == "giou":
            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = np.maximum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = np.minimum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ["iou", "giou"]:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == "giou":
            enclosed_lt = np.minimum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])
            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])

    eps = np.array([eps])
    union = np.maximum(union, eps)
    ious = overlap / union
    if mode in ["iou", "iof"]:
        return ious
    # calculate gious
    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = np.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
def topk_(input, k, axis=1, largest=True):
    x = -input if largest else input

    if axis == 0:
        row_index = np.arange(input.shape[1 - axis])
        topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
        topk_data = x[topk_index, row_index]

        topk_index_sort = np.argsort(topk_data, axis=axis)
        topk_data_sort = topk_data[topk_index_sort, row_index]
        topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index]
    else:
        column_index = np.arange(x.shape[1 - axis])[:, None]
        topk_index = np.argpartition(x, k, axis=axis)[:, 0:k]
        topk_data = x[column_index, topk_index]
        topk_data = -topk_data if largest else topk_data
        topk_index_sort = np.argsort(topk_data, axis=axis)
        topk_data_sort = topk_data[column_index, topk_index_sort]
        topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]

    return topk_data_sort, topk_index_sort
def compute_max_iou_anchor(ious: Tensor) -> Tensor:
    r"""
    For each anchor, find the GT with the largest IOU.
    Args:
        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
    Returns:
        is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    num_max_boxes = ious.shape[-2]
    max_iou_index = ious.argmax(dim=-2)
    is_max_iou: Tensor = torch.nn.functional.one_hot(max_iou_index, num_max_boxes).permute([0, 2, 1])
    return is_max_iou.type_as(ious)
def check_points_inside_bboxes(points: Tensor, bboxes, center_radius_tensor=None, eps=1e-9):
    r"""
    Args:
        points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
        bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format
        center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None.
        eps (float): Default: 1e-9
    Returns:
        is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    points = points.unsqueeze(0).unsqueeze(0)
    x, y = points.chunk(2, dim=-1)
    xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, dim=-1)
    # check whether `points` is in `bboxes`
    left = x - xmin
    top = y - ymin
    right = xmax - x
    bottom = ymax - y
    delta_ltrb = torch.cat([left, top, right, bottom], dim=-1)
    is_in_bboxes = delta_ltrb.min(dim=-1).values > eps
    if center_radius_tensor is not None:
        # check whether `points` is in `center_radius`
        center_radius_tensor = center_radius_tensor.unsqueeze(0).unsqueeze(0)
        cx = (xmin + xmax) * 0.5
        cy = (ymin + ymax) * 0.5
        left = x - (cx - center_radius_tensor)
        top = y - (cy - center_radius_tensor)
        right = (cx + center_radius_tensor) - x
        bottom = (cy + center_radius_tensor) - y
        delta_ltrb_c = torch.cat([left, top, right, bottom], dim=-1)
        is_in_center = delta_ltrb_c.min(dim=-1).values > eps
        return (torch.logical_and(is_in_bboxes, is_in_center), torch.logical_or(is_in_bboxes, is_in_center))

    return is_in_bboxes.type_as(bboxes)
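# Illustrative usage of check_points_inside_bboxes (an aside, not part of ppyolo_loss.py):
#
#     >>> pts = torch.tensor([[5.0, 5.0], [20.0, 20.0]])    # [L=2, 2] anchor centers
#     >>> boxes = torch.tensor([[[0.0, 0.0, 10.0, 10.0]]])  # [B=1, n=1, 4]
#     >>> check_points_inside_bboxes(pts, boxes)            # only the first point falls inside
#     tensor([[[1., 0.]]])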
def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
    r"""
    Args:
        metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
        topk (int): The number of top elements to look for along the axis.
        largest (bool): largest is a flag, if set to true,
            algorithm will sort by descending order, otherwise sort by
            ascending order. Default: True
        topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask,
            Default: None
        eps (float): Default: 1e-9
    Returns:
        is_in_topk (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    num_anchors = metrics.shape[-1]
    topk_metrics, topk_idxs = torch.topk(metrics, topk, dim=-1, largest=largest)
    if topk_mask is None:
        topk_mask = (topk_metrics.max(dim=-1, keepdim=True).values > eps).type_as(metrics)
    is_in_topk = torch.nn.functional.one_hot(topk_idxs, num_anchors).sum(dim=-2).type_as(metrics)
    return is_in_topk * topk_mask
def bbox_center(boxes):
    """Get bbox centers from boxes.
    Args:
        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
    Returns:
        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
    """
    boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2
    boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2
    return torch.stack([boxes_cx, boxes_cy], dim=-1)
def compute_max_iou_gt(ious):
    r"""
    For each GT, find the anchor with the largest IOU.
    Args:
        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
    Returns:
        is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    num_anchors = ious.shape[-1]
    max_iou_index = ious.argmax(dim=-1)
    is_max_iou = torch.nn.functional.one_hot(max_iou_index, num_anchors)
    # cast back to the dtype of `ious` (torch tensors have no .astype method)
    return is_max_iou.type_as(ious)
class ATSSAssigner(nn.Module):
    """Bridging the Gap Between Anchor-based and Anchor-free Detection
    via Adaptive Training Sample Selection
    """

    __shared__ = ["num_classes"]

    def __init__(self, topk=9, num_classes=80, force_gt_matching=False, eps=1e-9):
        """
        :param topk: Maximum number of anchors that is selected for each gt box
        :param num_classes:
        :param force_gt_matching: Guarantee that each gt box is matched to at least one anchor.
            If two gt boxes match to the same anchor, the one with the larger area will be selected.
            And the second-best anchor will be assigned to the other gt box.
        :param eps: Small constant for numerical stability
        """
        super(ATSSAssigner, self).__init__()
        self.topk = topk
        self.num_classes = num_classes
        self.force_gt_matching = force_gt_matching
        self.eps = eps

    def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, pad_gt_mask):
        gt2anchor_distances_list = torch.split(gt2anchor_distances, num_anchors_list, dim=-1)
        num_anchors_index = np.cumsum(num_anchors_list).tolist()
        num_anchors_index = [
            0,
        ] + num_anchors_index[:-1]
        is_in_topk_list = []
        topk_idxs_list = []
        for distances, anchors_index in zip(gt2anchor_distances_list, num_anchors_index):
            num_anchors = distances.shape[-1]
            _, topk_idxs = torch.topk(distances, self.topk, dim=-1, largest=False)
            topk_idxs_list.append(topk_idxs + anchors_index)
            is_in_topk = torch.nn.functional.one_hot(topk_idxs, num_anchors).sum(dim=-2).type_as(gt2anchor_distances)
            is_in_topk_list.append(is_in_topk * pad_gt_mask)
        is_in_topk_list = torch.cat(is_in_topk_list, dim=-1)
        topk_idxs_list = torch.cat(topk_idxs_list, dim=-1)
        return is_in_topk_list, topk_idxs_list
  265. @torch.no_grad()
  266. def forward(
  267. self,
  268. anchor_bboxes,
  269. num_anchors_list,
  270. gt_labels,
  271. gt_bboxes,
  272. pad_gt_mask,
  273. bg_index,
  274. gt_scores=None,
  275. pred_bboxes=None,
  276. ):
  277. r"""This code is based on
  278. https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
  279. The assignment is done in following steps
  280. 1. compute iou between all bbox (bbox of all pyramid levels) and gt
  281. 2. compute center distance between all bbox and gt
  282. 3. on each pyramid level, for each gt, select k bbox whose center
  283. are closest to the gt center, so we total select k*l bbox as
  284. candidates for each gt
  285. 4. get corresponding iou for the these candidates, and compute the
  286. mean and std, set mean + std as the iou threshold
  287. 5. select these candidates whose iou are greater than or equal to
  288. the threshold as positive
  289. 6. limit the positive sample's center in gt
  290. 7. if an anchor box is assigned to multiple gts, the one with the
  291. highest iou will be selected.
  292. Args:
  293. anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4),
  294. "xmin, xmax, ymin, ymax" format
  295. num_anchors_list (List): num of anchors in each level
  296. gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  297. gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
  298. pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  299. bg_index (int): background index
  300. gt_scores (Tensor|None, float32) Score of gt_bboxes,
  301. shape(B, n, 1), if None, then it will initialize with one_hot label
  302. pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4)
  303. Returns:
  304. assigned_labels (Tensor): (B, L)
  305. assigned_bboxes (Tensor): (B, L, 4)
  306. assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious
  307. """
  308. assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3
  309. num_anchors, _ = anchor_bboxes.shape
  310. batch_size, num_max_boxes, _ = gt_bboxes.shape
  311. # negative batch
  312. if num_max_boxes == 0:
  313. assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=anchor_bboxes.device)
  314. assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=anchor_bboxes.device)
  315. assigned_scores = torch.zeros([batch_size, num_anchors, self.num_classes], device=anchor_bboxes.device)
  316. return assigned_labels, assigned_bboxes, assigned_scores
  317. # 1. compute iou between gt and anchor bbox, [B, n, L]
  318. ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
  319. ious = ious.reshape([batch_size, -1, num_anchors])
  320. # 2. compute center distance between all anchors and gt, [B, n, L]
  321. gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1)
  322. anchor_centers = bbox_center(anchor_bboxes)
  323. # gt2anchor_distances = (
  324. # (gt_centers - anchor_centers.unsqueeze(0)).norm(2, dim=-1).reshape([batch_size, -1, num_anchors])
  325. # )
  326. gt2anchor_distances = torch.norm(gt_centers - anchor_centers.unsqueeze(0), p=2, dim=-1).reshape([batch_size, -1, num_anchors])
  327. # 3. on each pyramid level, selecting top-k closest candidates
  328. # based on the center distance, [B, n, L]
  329. is_in_topk, topk_idxs = self._gather_topk_pyramid(gt2anchor_distances, num_anchors_list, pad_gt_mask)
  330. # 4. get corresponding iou for the these candidates, and compute the
  331. # mean and std, 5. set mean + std as the iou threshold
  332. iou_candidates = ious * is_in_topk
  333. iou_threshold = torch.gather(iou_candidates.flatten(end_dim=-2), dim=1, index=topk_idxs.flatten(end_dim=-2))
  334. iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])
  335. iou_threshold = iou_threshold.mean(dim=-1, keepdim=True) + iou_threshold.std(dim=-1, keepdim=True)
  336. is_in_topk = torch.where(iou_candidates > iou_threshold, is_in_topk, torch.zeros_like(is_in_topk))
  337. # 6. check the positive sample's center in gt, [B, n, L]
  338. is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
  339. # select positive sample, [B, n, L]
  340. mask_positive = is_in_topk * is_in_gts * pad_gt_mask
  341. # 7. if an anchor box is assigned to multiple gts,
  342. # the one with the highest iou will be selected.
  343. mask_positive_sum = mask_positive.sum(dim=-2)
  344. if mask_positive_sum.max() > 1:
  345. mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
  346. is_max_iou = compute_max_iou_anchor(ious)
  347. mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
  348. mask_positive_sum = mask_positive.sum(dim=-2)
  349. # 8. make sure every gt_bbox matches the anchor
  350. if self.force_gt_matching:
  351. is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask
  352. mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile([1, num_max_boxes, 1])
  353. mask_positive = torch.where(mask_max_iou, is_max_iou, mask_positive)
  354. mask_positive_sum = mask_positive.sum(dim=-2)
  355. assigned_gt_index = mask_positive.argmax(dim=-2)
  356. # assigned target
  357. batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
  358. assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
  359. assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
  360. assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
  361. assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))
  362. # assigned_bboxes = torch.gather(gt_bboxes.reshape([-1, 4]), index=assigned_gt_index.flatten(), dim=0)
  363. assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
  364. assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
  365. assigned_scores = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1).float()
  366. ind = list(range(self.num_classes + 1))
  367. ind.remove(bg_index)
  368. assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device), dim=-1)
  369. if pred_bboxes is not None:
  370. # assigned iou
  371. ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive
  372. ious = ious.max(dim=-2).values.unsqueeze(-1)
  373. assigned_scores *= ious
  374. elif gt_scores is not None:
  375. gather_scores = torch.gather(gt_scores.flatten(), index=assigned_gt_index.flatten(), dim=0)
  376. gather_scores = gather_scores.reshape([batch_size, num_anchors])
  377. gather_scores = torch.where(mask_positive_sum > 0, gather_scores, torch.zeros_like(gather_scores))
  378. assigned_scores *= gather_scores.unsqueeze(-1)
  379. return assigned_labels, assigned_bboxes, assigned_scores
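To make the adaptive thresholding of steps 4-5 concrete, here is a minimal toy sketch (values invented, not from the recipe) of how one gt box derives its IoU threshold from its k*l distance-selected candidates:

```python
import torch

# IoUs between one gt box and its k*l distance-selected candidate anchors (toy values)
candidate_ious = torch.tensor([0.62, 0.55, 0.48, 0.41, 0.12, 0.05])

# ATSS threshold: mean + std of the candidate IoUs, computed per gt box
iou_threshold = candidate_ious.mean() + candidate_ious.std()

# candidates clearing the adaptive threshold become positives (before the center-in-gt check)
positive_mask = candidate_ious >= iou_threshold
```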
  380. class TaskAlignedAssigner(nn.Module):
  381. """TOOD: Task-aligned One-stage Object Detection"""
  382. def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
  383. """
  384. :param topk: Maximum number of anchors that is selected for each gt box
  385. :param alpha: Power factor for class probabilities of predicted boxes (used to compute the alignment metric)
  386. :param beta: Power factor for IoU score of predicted boxes (used to compute the alignment metric)
  387. :param eps: Small constant for numerical stability
  388. """
  389. super(TaskAlignedAssigner, self).__init__()
  390. self.topk = topk
  391. self.alpha = alpha
  392. self.beta = beta
  393. self.eps = eps
  394. @torch.no_grad()
  395. def forward(
  396. self,
  397. pred_scores,
  398. pred_bboxes,
  399. anchor_points,
  400. num_anchors_list,
  401. gt_labels,
  402. gt_bboxes,
  403. pad_gt_mask,
  404. bg_index,
  405. gt_scores=None,
  406. ):
  407. r"""This code is based on
  408. https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
  409. The assignment is done in the following steps:
  410. 1. compute the alignment metric between all bboxes (bboxes of all pyramid levels) and gt
  411. 2. select the top-k bboxes as candidates for each gt
  412. 3. limit the positive sample's center to lie inside the gt (because the anchor-free detector
  413. can only predict positive distances)
  414. 4. if an anchor box is assigned to multiple gts, the one with the
  415. highest iou will be selected.
  416. Args:
  417. pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
  418. pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
  419. anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
  420. num_anchors_list (List): num of anchors in each level, shape(L)
  421. gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  422. gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
  423. pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  424. bg_index (int): background index
  425. gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)
  426. Returns:
  427. assigned_labels (Tensor): (B, L)
  428. assigned_bboxes (Tensor): (B, L, 4)
  429. assigned_scores (Tensor): (B, L, C)
  430. """
  431. assert pred_scores.ndim == pred_bboxes.ndim
  432. assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3
  433. batch_size, num_anchors, num_classes = pred_scores.shape
  434. _, num_max_boxes, _ = gt_bboxes.shape
  435. # negative batch
  436. if num_max_boxes == 0:
  437. assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=pred_scores.device)
  438. assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=pred_scores.device)
  439. assigned_scores = torch.zeros([batch_size, num_anchors, num_classes], device=pred_scores.device)
  440. return assigned_labels, assigned_bboxes, assigned_scores
  441. # compute iou between gt and pred bbox, [B, n, L]
  442. ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
  443. # gather pred bboxes class score
  444. pred_scores = torch.permute(pred_scores, [0, 2, 1])
  445. batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
  446. gt_labels_ind = torch.stack([batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], dim=-1)
  447. bbox_cls_scores = pred_scores[gt_labels_ind[..., 0], gt_labels_ind[..., 1]]
  448. # compute alignment metrics, [B, n, L]
  449. alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(self.beta)
  450. # check the positive sample's center in gt, [B, n, L]
  451. is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
  452. # select topk largest alignment metrics pred bbox as candidates
  453. # for each gt, [B, n, L]
  454. is_in_topk = gather_topk_anchors(alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)
  455. # select positive sample, [B, n, L]
  456. mask_positive = is_in_topk * is_in_gts * pad_gt_mask
  457. # if an anchor box is assigned to multiple gts,
  458. # the one with the highest iou will be selected, [B, n, L]
  459. mask_positive_sum = mask_positive.sum(dim=-2)
  460. if mask_positive_sum.max() > 1:
  461. mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
  462. is_max_iou = compute_max_iou_anchor(ious)
  463. mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
  464. mask_positive_sum = mask_positive.sum(dim=-2)
  465. assigned_gt_index = mask_positive.argmax(dim=-2)
  466. # assigned target
  467. assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
  468. assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
  469. assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
  470. assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))
  471. assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
  472. assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
  473. assigned_scores = torch.nn.functional.one_hot(assigned_labels, num_classes + 1)
  474. ind = list(range(num_classes + 1))
  475. ind.remove(bg_index)
  476. assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device, dtype=torch.long), dim=-1)
  477. # rescale alignment metrics
  478. alignment_metrics *= mask_positive
  479. max_metrics_per_instance = alignment_metrics.max(dim=-1, keepdim=True).values
  480. max_ious_per_instance = (ious * mask_positive).max(dim=-1, keepdim=True).values
  481. alignment_metrics = alignment_metrics / (max_metrics_per_instance + self.eps) * max_ious_per_instance
  482. alignment_metrics = alignment_metrics.max(dim=-2).values.unsqueeze(-1)
  483. assigned_scores = assigned_scores * alignment_metrics
  484. return assigned_labels, assigned_bboxes, assigned_scores
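As a toy illustration (values invented) of the task-alignment metric used above: candidates for each gt are ranked by cls_score**alpha * iou**beta, with the default alpha=1.0, beta=6.0:

```python
import torch

cls_score = torch.tensor([0.9, 0.6, 0.3])  # predicted probability of the gt class for three anchors
iou = torch.tensor([0.5, 0.8, 0.9])        # IoU of each predicted box with the gt box

alpha, beta = 1.0, 6.0
alignment_metric = cls_score.pow(alpha) * iou.pow(beta)

# the top-k anchors by this metric become the candidates for the gt (k=13 by default)
topk_indices = alignment_metric.topk(k=2).indices
```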
  485. class GIoULoss(object):
  486. """
  487. Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630
  488. Args:
  489. loss_weight (float): giou loss weight, default as 1
  490. eps (float): epsilon to avoid divide by zero, default as 1e-10
  491. reduction (string): Options are "none", "mean" and "sum". default as none
  492. """
  493. def __init__(self, loss_weight=1.0, eps=1e-10, reduction="none"):
  494. self.loss_weight = loss_weight
  495. self.eps = eps
  496. assert reduction in ("none", "mean", "sum")
  497. self.reduction = reduction
  498. def bbox_overlap(self, box1, box2, eps=1e-10):
  499. """calculate the iou of box1 and box2
  500. Args:
  501. box1 (Tensor): box1 with the shape (..., 4)
  502. box2 (Tensor): box2 with the shape (..., 4)
  503. eps (float): epsilon to avoid divide by zero
  504. Return:
  505. iou (Tensor): iou of box1 and box2
  506. overlap (Tensor): overlap of box1 and box2
  507. union (Tensor): union of box1 and box2
  508. """
  509. x1, y1, x2, y2 = box1
  510. x1g, y1g, x2g, y2g = box2
  511. xkis1 = torch.maximum(x1, x1g)
  512. ykis1 = torch.maximum(y1, y1g)
  513. xkis2 = torch.minimum(x2, x2g)
  514. ykis2 = torch.minimum(y2, y2g)
  515. w_inter = (xkis2 - xkis1).clip(0)
  516. h_inter = (ykis2 - ykis1).clip(0)
  517. overlap = w_inter * h_inter
  518. area1 = (x2 - x1) * (y2 - y1)
  519. area2 = (x2g - x1g) * (y2g - y1g)
  520. union = area1 + area2 - overlap + eps
  521. iou = overlap / union
  522. return iou, overlap, union
  523. def __call__(self, pbox: Tensor, gbox: Tensor, iou_weight=1.0, loc_reweight=None):
  524. # x1, y1, x2, y2 = torch.split(pbox, split_size_or_sections=4, dim=-1)
  525. # x1g, y1g, x2g, y2g = torch.split(gbox, split_size_or_sections=4, dim=-1)
  526. x1, y1, x2, y2 = pbox.chunk(4, dim=-1)
  527. x1g, y1g, x2g, y2g = gbox.chunk(4, dim=-1)
  528. box1 = [x1, y1, x2, y2]
  529. box2 = [x1g, y1g, x2g, y2g]
  530. iou, overlap, union = self.bbox_overlap(box1, box2, self.eps)
  531. xc1 = torch.minimum(x1, x1g)
  532. yc1 = torch.minimum(y1, y1g)
  533. xc2 = torch.maximum(x2, x2g)
  534. yc2 = torch.maximum(y2, y2g)
  535. area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps
  536. miou = iou - ((area_c - union) / area_c)
  537. if loc_reweight is not None:
  538. loc_reweight = torch.reshape(loc_reweight, shape=(-1, 1))
  539. loc_thresh = 0.9
  540. giou = 1 - (1 - loc_thresh) * miou - loc_thresh * miou * loc_reweight
  541. else:
  542. giou = 1 - miou
  543. if self.reduction == "none":
  544. loss = giou
  545. elif self.reduction == "sum":
  546. loss = torch.sum(giou * iou_weight)
  547. else:
  548. loss = torch.mean(giou * iou_weight)
  549. return loss * self.loss_weight
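A standalone sketch (not part of the file) of the GIoU term the class above computes, on one toy pair of (x1, y1, x2, y2) boxes:

```python
import torch

pred = torch.tensor([[10.0, 10.0, 50.0, 50.0]])
gt = torch.tensor([[20.0, 20.0, 60.0, 60.0]])

x1, y1, x2, y2 = pred.unbind(-1)
x1g, y1g, x2g, y2g = gt.unbind(-1)

# intersection, union and plain IoU
inter = (torch.minimum(x2, x2g) - torch.maximum(x1, x1g)).clamp(0) * (torch.minimum(y2, y2g) - torch.maximum(y1, y1g)).clamp(0)
union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - inter
iou = inter / union

# smallest enclosing box and the GIoU penalty term
area_c = (torch.maximum(x2, x2g) - torch.minimum(x1, x1g)) * (torch.maximum(y2, y2g) - torch.minimum(y1, y1g))
giou = iou - (area_c - union) / area_c
loss = 1.0 - giou  # matches the "none" reduction of GIoULoss
```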
  550. class PPYoloELoss(nn.Module):
  551. def __init__(
  552. self,
  553. num_classes: int,
  554. use_varifocal_loss: bool = True,
  555. use_static_assigner: bool = True,
  556. reg_max: int = 16,
  557. classification_loss_weight: float = 1.0,
  558. iou_loss_weight: float = 2.5,
  559. dfl_loss_weight: float = 0.5,
  560. ):
  561. """
  562. :param num_classes: Number of classes
  563. :param use_varifocal_loss: Whether to use Varifocal loss for classification loss; otherwise use Focal loss
  564. :param use_static_assigner: Whether to use the static (ATSS) assigner or the Task-Aligned assigner
  565. :param classification_loss_weight: Classification loss weight
  566. :param iou_loss_weight: IoU loss weight
  567. :param dfl_loss_weight: DFL loss weight
  568. :param reg_max: Number of regression bins (Must match the number of bins in the PPYoloE head)
  569. """
  570. super().__init__()
  571. self.use_varifocal_loss = use_varifocal_loss
  572. self.classification_loss_weight = classification_loss_weight
  573. self.dfl_loss_weight = dfl_loss_weight
  574. self.iou_loss_weight = iou_loss_weight
  575. self.iou_loss = GIoULoss()
  576. self.static_assigner = ATSSAssigner(topk=9, num_classes=num_classes)
  577. self.assigner = TaskAlignedAssigner(topk=13, alpha=1.0, beta=6.0)
  578. self.use_static_assigner = use_static_assigner
  579. self.reg_max = reg_max
  580. self.num_classes = num_classes
  581. # Same as in PPYoloE head
  582. proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
  583. self.register_buffer("proj_conv", proj)
  584. @torch.no_grad()
  585. def _yolox_targets_to_ppyolo(self, targets: torch.Tensor, batch_size: int) -> Mapping[str, torch.Tensor]:
  586. """
  587. Convert targets from YoloX format to PPYolo since it's the easiest (not the cleanest) way to
  588. have PP Yolo training & metrics computed
  589. :param targets: (N, 6) format of bboxes is meant to be LABEL_CXCYWH (index, c, cx, cy, w, h)
  590. :return: (Dictionary [str,Tensor]) with keys:
  591. - gt_class: (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  592. - gt_bbox: (Tensor, float32): Ground truth bboxes, shape(B, n, 4) in x1y1x2y2 format
  593. - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  594. """
  595. image_index = targets[:, 0]
  596. gt_class = targets[:, 1:2].long()
  597. gt_bbox = cxcywh_to_xyxy(targets[:, 2:6], image_shape=None)
  598. per_image_class = []
  599. per_image_bbox = []
  600. per_image_pad_mask = []
  601. max_boxes = 0
  602. for i in range(batch_size):
  603. mask = image_index == i
  604. image_labels = gt_class[mask]
  605. image_bboxes = gt_bbox[mask, :]
  606. valid_bboxes = image_bboxes.sum(dim=1, keepdims=True) > 0
  607. per_image_class.append(image_labels)
  608. per_image_bbox.append(image_bboxes)
  609. per_image_pad_mask.append(valid_bboxes)
  610. max_boxes = max(max_boxes, mask.sum().item())
  611. for i in range(batch_size):
  612. elements_to_pad = max_boxes - len(per_image_class[i])
  613. padding_left = 0
  614. padding_right = 0
  615. padding_top = 0
  616. padding_bottom = elements_to_pad
  617. pad = padding_left, padding_right, padding_top, padding_bottom
  618. per_image_class[i] = F.pad(per_image_class[i], pad, mode="constant", value=0)
  619. per_image_bbox[i] = F.pad(per_image_bbox[i], pad, mode="constant", value=0)
  620. per_image_pad_mask[i] = F.pad(per_image_pad_mask[i], pad, mode="constant", value=0)
  621. return {
  622. "gt_class": torch.stack(per_image_class, dim=0),
  623. "gt_bbox": torch.stack(per_image_bbox, dim=0),
  624. "pad_gt_mask": torch.stack(per_image_pad_mask, dim=0),
  625. }
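For reference, a toy example (made-up values) of the YoloX-style target layout this conversion consumes and the shapes it produces; nothing here calls the private method itself:

```python
import torch

# YoloX-style targets: one row per box, columns (image_index, label, cx, cy, w, h)
targets = torch.tensor([
    [0.0, 3.0, 120.0, 96.0, 40.0, 32.0],   # image 0: one box
    [1.0, 1.0, 64.0, 80.0, 20.0, 24.0],    # image 1: two boxes
    [1.0, 7.0, 300.0, 210.0, 88.0, 64.0],
])

# After the conversion with batch_size=2 and max_boxes=2:
#   gt_class    -> (2, 2, 1)  (image 0 padded with a zero row)
#   gt_bbox     -> (2, 2, 4)  in x1y1x2y2
#   pad_gt_mask -> (2, 2, 1)  with 0 marking the padded row
```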
  626. def forward(
  627. self,
  628. outputs: Union[
  629. Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor], Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]]
  630. ],
  631. targets: Tensor,
  632. ) -> Mapping[str, Tensor]:
  633. """
  634. :param outputs: Tuple of pred_scores, pred_distri, anchors, anchor_points, num_anchors_list, stride_tensor
  635. :param targets: (Dictionary [str,Tensor]) with keys:
  636. - gt_class: (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  637. - gt_bbox: (Tensor, float32): Ground truth bboxes, shape(B, n, 4) in x1y1x2y2 format
  638. - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  639. :return:
  640. """
  641. # in test/eval mode the model outputs a tuple where the second item is the raw predictions
  642. if isinstance(outputs, tuple) and len(outputs) == 2:
  643. # keep only the raw predictions; the first element holds the decoded (bboxes, scores)
  644. _, predictions = outputs
  645. else:
  646. predictions = outputs
  647. (
  648. pred_scores,
  649. pred_distri,
  650. anchors,
  651. anchor_points,
  652. num_anchors_list,
  653. stride_tensor,
  654. ) = predictions
  655. targets = self._yolox_targets_to_ppyolo(targets, batch_size=pred_scores.size(0)) # yolox -> ppyolo
  656. anchor_points_s = anchor_points / stride_tensor
  657. pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)
  658. gt_labels = targets["gt_class"]
  659. gt_bboxes = targets["gt_bbox"]
  660. pad_gt_mask = targets["pad_gt_mask"]
  661. # label assignment
  662. if self.use_static_assigner:
  663. assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(
  664. anchor_bboxes=anchors,
  665. num_anchors_list=num_anchors_list,
  666. gt_labels=gt_labels,
  667. gt_bboxes=gt_bboxes,
  668. pad_gt_mask=pad_gt_mask,
  669. bg_index=self.num_classes,
  670. pred_bboxes=pred_bboxes.detach() * stride_tensor,
  671. )
  672. alpha_l = 0.25
  673. else:
  674. assigned_labels, assigned_bboxes, assigned_scores = self.assigner(
  675. pred_scores=pred_scores.detach().sigmoid(), # during training the head outputs raw logits for numerical stability, so apply sigmoid here
  676. pred_bboxes=pred_bboxes.detach() * stride_tensor,
  677. anchor_points=anchor_points,
  678. num_anchors_list=num_anchors_list,
  679. gt_labels=gt_labels,
  680. gt_bboxes=gt_bboxes,
  681. pad_gt_mask=pad_gt_mask,
  682. bg_index=self.num_classes,
  683. )
  684. alpha_l = -1
  685. # rescale bbox
  686. assigned_bboxes /= stride_tensor
  687. # cls loss
  688. if self.use_varifocal_loss:
  689. one_hot_label = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1)[..., :-1]
  690. loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label)
  691. else:
  692. loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)
  693. assigned_scores_sum = assigned_scores.sum()
  694. if super_gradients.is_distributed():
  695. torch.distributed.all_reduce(assigned_scores_sum, op=torch.distributed.ReduceOp.SUM)
  696. assigned_scores_sum /= get_world_size()
  697. assigned_scores_sum = torch.clip(assigned_scores_sum, min=1.0)
  698. loss_cls /= assigned_scores_sum
  699. loss_iou, loss_dfl = self._bbox_loss(
  700. pred_distri,
  701. pred_bboxes,
  702. anchor_points_s,
  703. assigned_labels,
  704. assigned_bboxes,
  705. assigned_scores,
  706. assigned_scores_sum,
  707. )
  708. loss = self.classification_loss_weight * loss_cls + self.iou_loss_weight * loss_iou + self.dfl_loss_weight * loss_dfl
  709. log_losses = torch.stack([loss_cls.detach(), loss_iou.detach(), loss_dfl.detach(), loss.detach()])
  710. return loss, log_losses
  711. @property
  712. def component_names(self):
  713. return ["loss_cls", "loss_iou", "loss_dfl", "loss"]
  714. def _df_loss(self, pred_dist: Tensor, target: Tensor) -> Tensor:
  715. target_left = target.long()
  716. target_right = target_left + 1
  717. weight_left = target_right.float() - target
  718. weight_right = 1 - weight_left
  719. # [B,L,C] -> [B,C,L] to make compatible with torch.nn.functional.cross_entropy
  720. # which expects channel dim to be at index 1
  721. pred_dist = torch.moveaxis(pred_dist, -1, 1)
  722. loss_left = torch.nn.functional.cross_entropy(pred_dist, target_left, reduction="none") * weight_left
  723. loss_right = torch.nn.functional.cross_entropy(pred_dist, target_right, reduction="none") * weight_right
  724. return (loss_left + loss_right).mean(dim=-1, keepdim=True)
  725. def _bbox_loss(
  726. self,
  727. pred_dist,
  728. pred_bboxes,
  729. anchor_points,
  730. assigned_labels,
  731. assigned_bboxes,
  732. assigned_scores,
  733. assigned_scores_sum,
  734. ):
  735. # select positive samples mask
  736. mask_positive = assigned_labels != self.num_classes
  737. num_pos = mask_positive.sum()
  738. # pos/neg loss
  739. if num_pos > 0:
  740. # l1 + iou
  741. bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])
  742. pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4])
  743. assigned_bboxes_pos = torch.masked_select(assigned_bboxes, bbox_mask).reshape([-1, 4])
  744. bbox_weight = torch.masked_select(assigned_scores.sum(-1), mask_positive).unsqueeze(-1)
  745. loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight
  746. loss_iou = loss_iou.sum() / assigned_scores_sum
  747. dist_mask = mask_positive.unsqueeze(-1).tile([1, 1, (self.reg_max + 1) * 4])
  748. pred_dist_pos = torch.masked_select(pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1])
  749. assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes)
  750. assigned_ltrb_pos = torch.masked_select(assigned_ltrb, bbox_mask).reshape([-1, 4])
  751. loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos) * bbox_weight
  752. loss_dfl = loss_dfl.sum() / assigned_scores_sum
  753. else:
  754. loss_iou = torch.zeros([], device=pred_bboxes.device)
  755. loss_dfl = pred_dist.sum() * 0.0
  756. return loss_iou, loss_dfl
  757. def _bbox_decode(self, anchor_points: Tensor, pred_dist: Tensor):
  758. b, l, *_ = pred_dist.size()
  759. pred_dist = torch.softmax(pred_dist.reshape([b, l, 4, self.reg_max + 1]), dim=-1)
  760. pred_dist = torch.nn.functional.conv2d(pred_dist.permute(0, 3, 1, 2), self.proj_conv).squeeze(1)
  761. return batch_distance2bbox(anchor_points, pred_dist)
  762. def _bbox2distance(self, points, bbox):
  763. x1y1, x2y2 = torch.split(bbox, 2, -1)
  764. lt = points - x1y1
  765. rb = x2y2 - points
  766. return torch.cat([lt, rb], dim=-1).clip(0, self.reg_max - 0.01)
  767. @staticmethod
  768. def _focal_loss(pred_logits: Tensor, label: Tensor, alpha=0.25, gamma=2.0) -> Tensor:
  769. pred_score = pred_logits.sigmoid()
  770. weight = (pred_score - label).pow(gamma)
  771. if alpha > 0:
  772. alpha_t = alpha * label + (1 - alpha) * (1 - label)
  773. weight *= alpha_t
  774. loss = -weight * (label * torch.nn.functional.logsigmoid(pred_logits) + (1 - label) * torch.nn.functional.logsigmoid(-pred_logits))
  775. return loss.sum()
  776. @staticmethod
  777. def _varifocal_loss(pred_logits: Tensor, gt_score: Tensor, label: Tensor, alpha=0.75, gamma=2.0) -> Tensor:
  778. pred_score = pred_logits.sigmoid()
  779. weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
  780. loss = -weight * (gt_score * torch.nn.functional.logsigmoid(pred_logits) + (1 - gt_score) * torch.nn.functional.logsigmoid(-pred_logits))
  781. return loss.sum()
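As a rough end-to-end usage sketch of this loss (import paths follow this PR's file list; the head channels, feature sizes and targets below are illustrative stand-ins, not the recipe values):

```python
import torch
from super_gradients.training.losses.ppyolo_loss import PPYoloELoss
from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead

# a bare head standing in for the full model; in train mode it returns the raw predictions the loss expects
head = PPYOLOEHead(num_classes=80, in_channels=(192, 384, 768)).train()
feats = [torch.randn(2, 192, 8, 8), torch.randn(2, 384, 16, 16), torch.randn(2, 768, 32, 32)]  # fake neck outputs, 256x256 input
raw_predictions = head(feats)

criterion = PPYoloELoss(num_classes=80)
targets = torch.tensor([
    [0.0, 3.0, 128.0, 128.0, 80.0, 64.0],   # (image_index, label, cx, cy, w, h) in pixels
    [1.0, 7.0, 64.0, 96.0, 40.0, 48.0],
])
loss, log_items = criterion(raw_predictions, targets)
loss.backward()
```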
@@ -84,13 +84,12 @@ class DetectionMetrics(Metric):
         Apply NMS and match all the predictions and targets of a given batch, and update the metric state accordingly.

         :param preds :        Raw output of the model, the format might change from one model to another, but has to fit
-                                the input format of the post_prediction_callback
-        :param target:        Targets for all images of shape (total_num_targets, 6)
-                                format:  (index, x, y, w, h, label) where x,y,w,h are in range [0,1]
+                                the input format of the post_prediction_callback (cx,cy,wh)
+        :param target:        Targets for all images of shape (total_num_targets, 6) LABEL_CXCYWH
+                                format:  (index, label, cx, cy, w, h)
         :param device:        Device to run on
         :param inputs:        Input image tensor of shape (batch_size, n_img, height, width)
-        :param crowd_targets: Crowd targets for all images of shape (total_num_targets, 6)
-                                 format:  (index, x, y, w, h, label) where x,y,w,h are in range [0,1]
+        :param crowd_targets: Crowd targets for all images of shape (total_num_targets, 6), LABEL_CXCYWH
         """
         self.iou_thresholds = self.iou_thresholds.to(device)
         _, _, height, width = inputs.shape
@@ -1,3 +1,4 @@
+from super_gradients.common.object_names import Models
 from super_gradients.training.models import ResNeXt50, ResNeXt101, GoogleNetV1
 from super_gradients.training.models.classification_models import repvgg, efficientnet, densenet, resnet, regnet
 from super_gradients.training.models.classification_models.mobilenetv2 import MobileNetV2Base, MobileNetV2_135, CustomMobileNetV2
@@ -12,6 +13,7 @@ from super_gradients.training.models.classification_models.shufflenetv2 import (
 from super_gradients.training.models.classification_models.vit import ViTBase, ViTLarge, ViTHuge
 from super_gradients.training.models.detection_models.csp_darknet53 import CSPDarknet53
 from super_gradients.training.models.detection_models.darknet53 import Darknet53
+from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_e import PPYoloE_M, PPYoloE_L, PPYoloE_X, PPYoloE_S
 from super_gradients.training.models.detection_models.ssd import SSDMobileNetV1, SSDLiteMobileNetV2
 from super_gradients.training.models.detection_models.yolox import YoloX_N, YoloX_T, YoloX_S, YoloX_M, YoloX_L, YoloX_X, CustomYoloX
 from super_gradients.training.models.segmentation_models.ddrnet import DDRNet23, DDRNet23Slim, AnyBackBoneDDRNet23, DDRNet39
@@ -30,7 +32,6 @@ from super_gradients.training.models.kd_modules.kd_module import KDModule
 from super_gradients.training.models.classification_models.beit import BeitBasePatch16_224, BeitLargePatch16_224
 from super_gradients.training.models.segmentation_models.ppliteseg import PPLiteSegT, PPLiteSegB
 from super_gradients.training.models.segmentation_models.unet import UNetCustom, UnetClassification
-from super_gradients.common.object_names import Models

 ARCHITECTURES = {
     Models.RESNET18: resnet.ResNet18,
@@ -135,6 +136,10 @@ ARCHITECTURES = {
     Models.CUSTOM_ANYNET: regnet.CustomAnyNet,
     Models.UNET_CUSTOM: UNetCustom,
     Models.UNET_CUSTOM_CLS: UnetClassification,
+    Models.PP_YOLOE_S: PPYoloE_S,
+    Models.PP_YOLOE_M: PPYoloE_M,
+    Models.PP_YOLOE_L: PPYoloE_L,
+    Models.PP_YOLOE_X: PPYoloE_X,
 }

 KD_ARCHITECTURES = {Models.KD_MODULE: KDModule}
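With these entries registered, the new variants resolve through the architectures dictionary like any other model; a minimal sketch (key names taken from the diff above, factory internals not shown here):

```python
from super_gradients.common.object_names import Models
from super_gradients.training.models.all_architectures import ARCHITECTURES

model_cls = ARCHITECTURES[Models.PP_YOLOE_L]   # -> PPYoloE_L, the same lookup the model factory performs
```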
@@ -1,5 +1,7 @@
 import collections
-from typing import List, Type, Tuple
+import os.path
+from pathlib import Path
+from typing import List, Type, Tuple, Union, Optional

 import torch
 from super_gradients.common.decorators.factory_decorator import resolve_param
@@ -8,7 +10,9 @@ from torch import nn, Tensor

 from super_gradients.modules import RepVGGBlock, EffectiveSEBlock, ConvBNAct

-__all__ = ["CSPResNet"]
+__all__ = ["CSPResNet", "CSPResNetBasicBlock"]
+
+from super_gradients.training.utils.distributed_training_utils import wait_for_the_master, get_local_rank


 class CSPResNetBasicBlock(nn.Module):
@@ -98,7 +102,7 @@ class CSPResStage(nn.Module):
             x = self.conv_down(x)
         y1 = self.conv1(x)
         y2 = self.blocks(self.conv2(x))
-        y = torch.concat([y1, y2], dim=1)
+        y = torch.cat([y1, y2], dim=1)
         y = self.attn(y)
         y = self.conv3(y)
         return y
@@ -120,6 +124,7 @@ class CSPResNet(nn.Module):
         width_mult: float,
         depth_mult: float,
         use_alpha: bool,
+        pretrained_weights: Optional[str] = None,
     ):
         """

@@ -131,6 +136,7 @@ class CSPResNet(nn.Module):
         :param width_mult: Scaling factor for a number of channels
         :param depth_mult: Scaling factor for a number of blocks in each stage
         :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
+        :param pretrained_weights:
         """
         super().__init__()
         channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
@@ -198,6 +204,16 @@ class CSPResNet(nn.Module):
         self._out_strides = [4 * 2**i for i in range(n)]
         self.return_idx = return_idx

+        if pretrained_weights:
+            if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
+                state_dict = torch.load(str(pretrained_weights), map_location="cpu")
+            elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
+                with wait_for_the_master(get_local_rank()):
+                    state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
+            else:
+                raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
+            self.load_state_dict(state_dict)
+
     def forward(self, x: Tensor) -> List[Tensor]:
         x = self.stem(x)
         outs = []
@@ -207,3 +223,14 @@ class CSPResNet(nn.Module):
                 outs.append(x)

         return outs
+
+    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
+        """
+        Prepare the model to be converted to ONNX or other frameworks.
+        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
+        with convertible substitutes and remove all auxiliary or training related parts.
+        :param input_size: [H,W]
+        """
+        for module in self.modules():
+            if isinstance(module, RepVGGBlock):
+                module.fuse_block_residual_branches()
  1. from .pp_yolo_e import PPYoloE
  2. from .post_prediction_callback import PPYoloEPostPredictionCallback
  3. __all__ = ["PPYoloE", "PPYoloEPostPredictionCallback"]
  1. import collections
  2. from typing import Type, Tuple, List
  3. import torch
  4. from super_gradients.common.decorators.factory_decorator import resolve_param
  5. from super_gradients.common.factories.activations_type_factory import ActivationsTypeFactory
  6. from torch import nn, Tensor
  7. from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBasicBlock
  8. from super_gradients.modules import ConvBNAct
  9. __all__ = ["CustomCSPPAN"]
  10. class SPP(nn.Module):
  11. def __init__(
  12. self,
  13. in_channels: int,
  14. out_channels: int,
  15. kernel_size: int,
  16. pool_size: Tuple[int, ...],
  17. activation_type: Type[nn.Module],
  18. ):
  19. super().__init__()
  20. mid_channels = in_channels * (1 + len(pool_size))
  21. pools = []
  22. for i, size in enumerate(pool_size):
  23. pool = nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2, ceil_mode=False)
  24. pools.append(pool)
  25. self.pool = nn.ModuleList(pools)
  26. self.conv = ConvBNAct(mid_channels, out_channels, kernel_size, padding=kernel_size // 2, activation_type=activation_type, stride=1, bias=False)
  27. def forward(self, x: Tensor) -> Tensor:
  28. outs = [x]
  29. for pool in self.pool:
  30. outs.append(pool(x))
  31. y = torch.cat(outs, dim=1)
  32. y = self.conv(y)
  33. return y
  34. class CSPStage(nn.Module):
  35. def __init__(self, in_channels: int, out_channels: int, n, activation_type: Type[nn.Module], spp: bool):
  36. super().__init__()
  37. ch_mid = int(out_channels // 2)
  38. self.conv1 = ConvBNAct(in_channels, ch_mid, kernel_size=1, padding=0, activation_type=activation_type, stride=1, bias=False)
  39. self.conv2 = ConvBNAct(in_channels, ch_mid, kernel_size=1, padding=0, activation_type=activation_type, stride=1, bias=False)
  40. convs = []
  41. next_ch_in = ch_mid
  42. for i in range(n):
  43. convs.append((str(i), CSPResNetBasicBlock(next_ch_in, ch_mid, activation_type=activation_type, use_residual_connection=False)))
  44. if i == (n - 1) // 2 and spp:
  45. convs.append(("spp", SPP(ch_mid, ch_mid, 1, (5, 9, 13), activation_type=activation_type)))
  46. next_ch_in = ch_mid
  47. self.convs = nn.Sequential(collections.OrderedDict(convs))
  48. self.conv3 = ConvBNAct(ch_mid * 2, out_channels, kernel_size=1, padding=0, activation_type=activation_type, stride=1, bias=False)
  49. def forward(self, x):
  50. y1 = self.conv1(x)
  51. y2 = self.conv2(x)
  52. y2 = self.convs(y2)
  53. y = torch.cat([y1, y2], dim=1)
  54. y = self.conv3(y)
  55. return y
  56. class CustomCSPPAN(nn.Module):
  57. @resolve_param("activation", ActivationsTypeFactory())
  58. def __init__(
  59. self,
  60. in_channels: Tuple[int, ...],
  61. out_channels: Tuple[int, ...],
  62. activation: Type[nn.Module],
  63. stage_num: int,
  64. block_num: int,
  65. spp: bool,
  66. width_mult: float,
  67. depth_mult: float,
  68. ):
  69. super().__init__()
  70. in_channels = [max(round(c * width_mult), 1) for c in in_channels]
  71. out_channels = [max(round(c * width_mult), 1) for c in out_channels]
  72. block_num = max(round(block_num * depth_mult), 1)
  73. self.num_blocks = len(in_channels)
  74. self._out_channels = out_channels
  75. in_channels = in_channels[::-1]
  76. fpn_stages = []
  77. fpn_routes = []
  78. ch_pre = None
  79. for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)):
  80. if i > 0:
  81. ch_in += ch_pre // 2
  82. stage = []
  83. for j in range(stage_num):
  84. stage.append(
  85. (
  86. str(j),
  87. CSPStage(
  88. ch_in if j == 0 else ch_out,
  89. ch_out,
  90. block_num,
  91. activation_type=activation,
  92. spp=(spp and i == 0),
  93. ),
  94. ),
  95. )
  96. fpn_stages.append(nn.Sequential(collections.OrderedDict(stage)))
  97. if i < self.num_blocks - 1:
  98. fpn_routes.append(
  99. ConvBNAct(in_channels=ch_out, out_channels=ch_out // 2, kernel_size=1, stride=1, padding=0, activation_type=activation, bias=False)
  100. )
  101. ch_pre = ch_out
  102. self.fpn_stages = nn.ModuleList(fpn_stages)
  103. self.fpn_routes = nn.ModuleList(fpn_routes)
  104. pan_stages = []
  105. pan_routes = []
  106. for i in reversed(range(self.num_blocks - 1)):
  107. pan_routes.append(
  108. ConvBNAct(
  109. in_channels=out_channels[i + 1],
  110. out_channels=out_channels[i + 1],
  111. kernel_size=3,
  112. stride=2,
  113. padding=1,
  114. activation_type=activation,
  115. bias=False,
  116. )
  117. )
  118. ch_in = out_channels[i] + out_channels[i + 1]
  119. ch_out = out_channels[i]
  120. stage = []
  121. for j in range(stage_num):
  122. stage.append(
  123. (
  124. str(j),
  125. CSPStage(
  126. ch_in if j == 0 else ch_out,
  127. ch_out,
  128. block_num,
  129. activation_type=activation,
  130. spp=False,
  131. ),
  132. ),
  133. )
  134. pan_stages.append(nn.Sequential(collections.OrderedDict(stage)))
  135. self.pan_stages = nn.ModuleList(pan_stages[::-1])
  136. self.pan_routes = nn.ModuleList(pan_routes[::-1])
  137. def forward(self, blocks: List[Tensor]) -> List[Tensor]:
  138. blocks = blocks[::-1]
  139. fpn_feats = []
  140. route = None
  141. for i, block in enumerate(blocks):
  142. if i > 0:
  143. block = torch.cat([route, block], dim=1)
  144. route = self.fpn_stages[i](block)
  145. fpn_feats.append(route)
  146. if i < self.num_blocks - 1:
  147. route = self.fpn_routes[i](route)
  148. route = torch.nn.functional.interpolate(route, scale_factor=2, mode="nearest")
  149. pan_feats = [
  150. fpn_feats[-1],
  151. ]
  152. route = fpn_feats[-1]
  153. for i in reversed(range(self.num_blocks - 1)):
  154. block = fpn_feats[i]
  155. route = self.pan_routes[i](route)
  156. block = torch.cat([route, block], dim=1)
  157. route = self.pan_stages[i](block)
  158. pan_feats.append(route)
  159. return pan_feats[::-1]
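A shape-check sketch for CustomCSPPAN (the channel/width settings and the "silu" activation key are illustrative assumptions, not necessarily the recipe values):

```python
import torch
from super_gradients.training.models.detection_models.pp_yolo_e.pan import CustomCSPPAN

neck = CustomCSPPAN(
    in_channels=(256, 512, 1024), out_channels=(768, 384, 192), activation="silu",
    stage_num=1, block_num=3, spp=True, width_mult=1.0, depth_mult=1.0,
)

# dummy backbone features for a 640x640 input at strides 8 / 16 / 32
c3, c4, c5 = torch.randn(1, 256, 80, 80), torch.randn(1, 512, 40, 40), torch.randn(1, 1024, 20, 20)
p5, p4, p3 = neck([c3, c4, c5])
# p5: (1, 768, 20, 20), p4: (1, 384, 40, 40), p3: (1, 192, 80, 80)
```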
  1. from typing import List
  2. import torch
  3. import torchvision
  4. from super_gradients.training.utils.detection_utils import DetectionPostPredictionCallback
  5. class PPYoloEPostPredictionCallback(DetectionPostPredictionCallback):
  6. """Non-Maximum Suppression (NMS) module"""
  7. def __init__(self, score_threshold: float, nms_threshold: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool = True):
  8. """
  9. :param score_threshold: Predictions confidence threshold. Predictions with score lower than score_threshold will not participate in Top-K & NMS
  10. :param nms_threshold: IoU threshold for the NMS step.
  11. :param nms_top_k: Number of predictions participating in NMS step
  12. :param max_predictions: maximum number of boxes to return after NMS step
  13. """
  14. super(PPYoloEPostPredictionCallback, self).__init__()
  15. self.score_threshold = score_threshold
  16. self.nms_threshold = nms_threshold
  17. self.nms_top_k = nms_top_k
  18. self.max_predictions = max_predictions
  19. self.multi_label_per_box = multi_label_per_box
  20. def forward(self, outputs, device: str):
  21. """
  22. :param outputs: Model output whose first element is a tuple of (bboxes, scores) of shape [B, Anchors, 4], [B, Anchors, C]
  23. :param device:
  24. :return:
  25. """
  26. nms_result = []
  27. # First is model predictions, second element of tuple is logits for loss computation
  28. predictions = outputs[0]
  29. for pred_bboxes, pred_scores in zip(*predictions):
  30. # pred_bboxes [Anchors, 4],
  31. # pred_scores [Anchors, C]
  32. # Filter all predictions by self.score_threshold
  33. if self.multi_label_per_box:
  34. i, j = (pred_scores > self.score_threshold).nonzero(as_tuple=False).T
  35. pred_bboxes = pred_bboxes[i]
  36. pred_cls_conf = pred_scores[i, j]
  37. pred_cls_label = j[:]
  38. else:
  39. pred_cls_conf, pred_cls_label = torch.max(pred_scores, dim=1)
  40. conf_mask = pred_cls_conf >= self.score_threshold
  41. pred_cls_conf = pred_cls_conf[conf_mask]
  42. pred_cls_label = pred_cls_label[conf_mask]
  43. pred_bboxes = pred_bboxes[conf_mask, :]
  44. # Filter all predictions by self.nms_top_k
  45. if pred_cls_conf.size(0) > self.nms_top_k:
  46. topk_candidates = torch.topk(pred_cls_conf, k=self.nms_top_k, largest=True)
  47. pred_cls_conf = pred_cls_conf[topk_candidates.indices]
  48. pred_cls_label = pred_cls_label[topk_candidates.indices]
  49. pred_bboxes = pred_bboxes[topk_candidates.indices, :]
  50. # NMS
  51. idx_to_keep = torchvision.ops.boxes.batched_nms(boxes=pred_bboxes, scores=pred_cls_conf, idxs=pred_cls_label, iou_threshold=self.nms_threshold)
  52. pred_cls_conf = pred_cls_conf[idx_to_keep].unsqueeze(-1)
  53. pred_cls_label = pred_cls_label[idx_to_keep].unsqueeze(-1)
  54. pred_bboxes = pred_bboxes[idx_to_keep, :]
  55. # nx6 (x1, y1, x2, y2, confidence, class) in pixel units
  56. final_boxes = torch.cat([pred_bboxes, pred_cls_conf, pred_cls_label], dim=1) # [N,6]
  57. nms_result.append(final_boxes)
  58. return self._filter_max_predictions(nms_result)
  59. def _filter_max_predictions(self, res: List) -> List:
  60. res[:] = [im[: self.max_predictions] if (im is not None and im.shape[0] > self.max_predictions) else im for im in res]
  61. return res
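A minimal sketch of running the callback on already-decoded predictions; random tensors stand in for the model's eval-mode output and the thresholds are illustrative:

```python
import torch
from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback

callback = PPYoloEPostPredictionCallback(score_threshold=0.25, nms_threshold=0.6, nms_top_k=1000, max_predictions=300)

# stand-ins for the eval-mode model output: decoded (bboxes, scores) plus the raw predictions (unused here)
xy = torch.rand(2, 8400, 2) * 600
wh = torch.rand(2, 8400, 2) * 40
pred_bboxes = torch.cat([xy, xy + wh], dim=-1)   # well-formed (x1, y1, x2, y2) in pixels
pred_scores = torch.rand(2, 8400, 80)            # class probabilities after sigmoid

detections = callback.forward(((pred_bboxes, pred_scores), None), device="cpu")
# detections: list of length B, each an (n, 6) tensor of (x1, y1, x2, y2, confidence, class)
```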
  1. from typing import Union
  2. from torch import Tensor
  3. from super_gradients.modules import RepVGGBlock
  4. from super_gradients.training.models.sg_module import SgModule
  5. from super_gradients.training.models.detection_models.csp_resnet import CSPResNet
  6. from super_gradients.training.models.detection_models.pp_yolo_e.pan import CustomCSPPAN
  7. from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead
  8. from super_gradients.training.utils import HpmStruct
  9. from super_gradients.training.models.arch_params_factory import get_arch_params
  10. class PPYoloE(SgModule):
  11. def __init__(self, arch_params):
  12. super().__init__()
  13. if isinstance(arch_params, HpmStruct):
  14. arch_params = arch_params.to_dict()
  15. self.backbone = CSPResNet(**arch_params["backbone"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
  16. self.neck = CustomCSPPAN(**arch_params["neck"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
  17. self.head = PPYOLOEHead(**arch_params["head"], width_mult=arch_params["width_mult"], num_classes=arch_params["num_classes"])
  18. def forward(self, x: Tensor):
  19. features = self.backbone(x)
  20. features = self.neck(features)
  21. return self.head(features)
  22. def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
  23. """
  24. Prepare the model to be converted to ONNX or other frameworks.
  25. Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
  26. with convertible substitutes and remove all auxiliary or training related parts.
  27. :param input_size: [H,W]
  28. """
  29. for module in self.modules():
  30. if isinstance(module, RepVGGBlock):
  31. module.prep_model_for_conversion(input_size)
  32. def replace_head(self, new_num_classes=None, new_head=None):
  33. if new_num_classes is None and new_head is None:
  34. raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
  35. if new_head is not None:
  36. self.head = new_head
  37. else:
  38. self.head.replace_num_classes(new_num_classes)
  39. class PPYoloE_S(PPYoloE):
  40. def __init__(self, arch_params):
  41. if isinstance(arch_params, HpmStruct):
  42. arch_params = arch_params.to_dict()
  43. arch_params = get_arch_params("ppyoloe_s_arch_params", arch_params)
  44. super().__init__(arch_params)
  45. class PPYoloE_M(PPYoloE):
  46. def __init__(self, arch_params):
  47. if isinstance(arch_params, HpmStruct):
  48. arch_params = arch_params.to_dict()
  49. arch_params = get_arch_params("ppyoloe_m_arch_params", arch_params)
  50. super().__init__(arch_params)
  51. class PPYoloE_L(PPYoloE):
  52. def __init__(self, arch_params):
  53. if isinstance(arch_params, HpmStruct):
  54. arch_params = arch_params.to_dict()
  55. arch_params = get_arch_params("ppyoloe_l_arch_params", arch_params)
  56. super().__init__(arch_params)
  57. class PPYoloE_X(PPYoloE):
  58. def __init__(self, arch_params):
  59. if isinstance(arch_params, HpmStruct):
  60. arch_params = arch_params.to_dict()
  61. arch_params = get_arch_params("ppyoloe_x_arch_params", arch_params)
  62. super().__init__(arch_params)
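A sketch of typical usage through the model factory (assuming models.get accepts the new Models.PP_YOLOE_* keys registered in this PR together with a num_classes override; not verified here):

```python
import torch
from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.PP_YOLOE_S, num_classes=80).eval()
(pred_bboxes, pred_scores), raw_predictions = model(torch.randn(1, 3, 640, 640))
# pred_bboxes: [1, Anchors, 4] in pixels, pred_scores: [1, Anchors, 80] after sigmoid

model.replace_head(new_num_classes=20)                   # e.g. before fine-tuning on a 20-class dataset
model.prep_model_for_conversion(input_size=(640, 640))   # fuse RepVGG branches prior to ONNX export
```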
  1. from typing import Tuple, Type
  2. import numpy as np
  3. import torch
  4. from super_gradients.common.decorators.factory_decorator import resolve_param
  5. from super_gradients.common.factories.activations_type_factory import ActivationsTypeFactory
  6. from super_gradients.training.utils.bbox_utils import batch_distance2bbox
  7. from torch import nn, Tensor
  8. from super_gradients.modules import ConvBNAct
  9. from super_gradients.training.utils.version_utils import torch_version_is_greater_or_equal
  10. def bias_init_with_prob(prior_prob=0.01):
  11. """initialize conv/fc bias value according to a given probability value."""
  12. bias_init = float(-np.log((1 - prior_prob) / prior_prob))
  13. return bias_init
  14. @torch.no_grad()
  15. def generate_anchors_for_grid_cell(feats: Tuple[Tensor, ...], fpn_strides: Tuple[int, ...], grid_cell_size=5.0, grid_cell_offset=0.5, dtype=torch.float):
  16. r"""
  17. Like ATSS, generate anchors based on grid size.
  18. Args:
  19. feats (List[Tensor]): shape[s, (b, c, h, w)]
  20. fpn_strides (tuple|list): shape[s], stride for each scale feature
  21. grid_cell_size (float): anchor size
  22. grid_cell_offset (float): The range is between 0 and 1.
  23. Returns:
  24. anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format.
  25. anchor_points (Tensor): shape[l, 2], "x, y" format.
  26. num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...].
  27. stride_tensor (Tensor): shape[l, 1], contains the stride for each scale.
  28. """
  29. assert len(feats) == len(fpn_strides)
  30. device = feats[0].device
  31. anchors = []
  32. anchor_points = []
  33. num_anchors_list = []
  34. stride_tensor = []
  35. for feat, stride in zip(feats, fpn_strides):
  36. _, _, h, w = feat.shape
  37. cell_half_size = grid_cell_size * stride * 0.5
  38. shift_x = (torch.arange(end=w) + grid_cell_offset) * stride
  39. shift_y = (torch.arange(end=h) + grid_cell_offset) * stride
  40. if torch_version_is_greater_or_equal(1, 10):
  41. # https://github.com/pytorch/pytorch/issues/50276
  42. shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
  43. else:
  44. shift_y, shift_x = torch.meshgrid(shift_y, shift_x)
  45. anchor = torch.stack(
  46. [shift_x - cell_half_size, shift_y - cell_half_size, shift_x + cell_half_size, shift_y + cell_half_size],
  47. dim=-1,
  48. ).to(dtype=dtype)
  49. anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
  50. anchors.append(anchor.reshape([-1, 4]))
  51. anchor_points.append(anchor_point.reshape([-1, 2]))
  52. num_anchors_list.append(len(anchors[-1]))
  53. stride_tensor.append(torch.full([num_anchors_list[-1], 1], stride, dtype=dtype))
  54. anchors = torch.cat(anchors).to(device)
  55. anchor_points = torch.cat(anchor_points).to(device)
  56. stride_tensor = torch.cat(stride_tensor).to(device)
  57. return anchors, anchor_points, num_anchors_list, stride_tensor
  58. class ESEAttn(nn.Module):
  59. def __init__(self, feat_channels: int, activation_type: Type[nn.Module]):
  60. super(ESEAttn, self).__init__()
  61. self.fc = nn.Conv2d(feat_channels, feat_channels, kernel_size=1)
  62. self.conv = ConvBNAct(feat_channels, feat_channels, kernel_size=1, padding=0, stride=1, activation_type=activation_type, bias=False)
  63. self._init_weights()
  64. def _init_weights(self):
  65. torch.nn.init.normal_(self.fc.weight, std=0.001)
  66. def forward(self, feat, avg_feat):
  67. weight = torch.sigmoid(self.fc(avg_feat))
  68. return self.conv(feat * weight)
  69. class PPYOLOEHead(nn.Module):
  70. @resolve_param("activation", ActivationsTypeFactory())
  71. def __init__(
  72. self,
  73. num_classes: int,
  74. in_channels: Tuple[int, int, int],
  75. activation: Type[nn.Module] = nn.SiLU,
  76. fpn_strides: Tuple[int, int, int] = (32, 16, 8),
  77. grid_cell_scale=5.0,
  78. grid_cell_offset=0.5,
  79. reg_max=16,
  80. eval_size: Tuple[int, int] = None,
  81. width_mult: float = 1.0,
  82. ):
  83. """
  84. :param num_classes:
  85. :param in_channels: Number of channels for each feature map (See width_mult)
  86. :param activation: Type of the activation used in module
  87. :param fpn_strides: Output strides of the feature maps from the neck
  88. :param grid_cell_scale: Anchor cell size, expressed as a multiple of the feature stride
  89. :param grid_cell_offset: Offset of the anchor center inside each grid cell, in cell units (between 0 and 1)
  90. :param reg_max: Number of DFL regression bins (must match the reg_max of the loss)
  91. :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
  92. since anchors will not be regenerated for each forward call.
  95. :param width_mult: A scaling factor applied to in_channels.
  96. """
  97. super(PPYOLOEHead, self).__init__()
  98. in_channels = [max(round(c * width_mult), 1) for c in in_channels]
  99. self.in_channels = tuple(in_channels)
  100. self.num_classes = num_classes
  101. self.fpn_strides = fpn_strides
  102. self.grid_cell_scale = grid_cell_scale
  103. self.grid_cell_offset = grid_cell_offset
  104. self.reg_max = reg_max
  105. self.eval_size = eval_size
  106. # stem
  107. self.stem_cls = nn.ModuleList()
  108. self.stem_reg = nn.ModuleList()
  109. for in_c in self.in_channels:
  110. self.stem_cls.append(ESEAttn(in_c, activation_type=activation))
  111. self.stem_reg.append(ESEAttn(in_c, activation_type=activation))
  112. # pred head
  113. self.pred_cls = nn.ModuleList()
  114. self.pred_reg = nn.ModuleList()
  115. for in_c in self.in_channels:
  116. self.pred_cls.append(nn.Conv2d(in_c, self.num_classes, 3, padding=1))
  117. self.pred_reg.append(nn.Conv2d(in_c, 4 * (self.reg_max + 1), 3, padding=1))
  118. # Do not apply quantization to this tensor
  119. proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
  120. self.register_buffer("proj_conv", proj, persistent=False)
  121. self._init_weights()
  122. def _init_weights(self):
  123. bias_cls = bias_init_with_prob(0.01)
  124. for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
  125. torch.nn.init.constant_(cls_.weight, 0.0)
  126. torch.nn.init.constant_(cls_.bias, bias_cls)
  127. torch.nn.init.constant_(reg_.weight, 0.0)
  128. torch.nn.init.constant_(reg_.bias, 1.0)
  129. if self.eval_size:
  130. anchor_points, stride_tensor = self._generate_anchors()
  131. self.anchor_points = anchor_points
  132. self.stride_tensor = stride_tensor
  133. def replace_num_classes(self, num_classes: int):
  134. bias_cls = bias_init_with_prob(0.01)
  135. self.pred_cls = nn.ModuleList()
  136. self.num_classes = num_classes
  137. for in_c in self.in_channels:
  138. predict_layer = nn.Conv2d(in_c, num_classes, 3, padding=1)
  139. torch.nn.init.constant_(predict_layer.weight, 0.0)
  140. torch.nn.init.constant_(predict_layer.bias, bias_cls)
  141. self.pred_cls.append(predict_layer)
  142. def forward_train(self, feats: Tuple[Tensor, ...]):
  143. anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(
  144. feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset
  145. )
  146. cls_score_list, reg_distri_list = [], []
  147. for i, feat in enumerate(feats):
  148. avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
  149. cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
  150. reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
  151. # cls and reg
  152. # Note we don't apply sigmoid on class predictions to ensure good numerical stability at loss computation
  153. cls_score_list.append(torch.permute(cls_logit.flatten(2), [0, 2, 1]))
  154. reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))
  155. cls_score_list = torch.cat(cls_score_list, dim=1)
  156. reg_distri_list = torch.cat(reg_distri_list, dim=1)
  157. return cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
  158. def forward_eval(self, feats: Tuple[Tensor, ...]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]:
  159. anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(
  160. feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset
  161. )
  162. cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []
  163. for i, feat in enumerate(feats):
  164. b, _, h, w = feat.shape
  165. height_mul_width = h * w
  166. avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
  167. cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
  168. reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
  169. reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))
  170. reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
  171. reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)
  172. # cls and reg
  173. cls_score_list.append(cls_logit.reshape([b, self.num_classes, height_mul_width]))
  174. reg_dist_reduced_list.append(reg_dist_reduced)
  175. cls_score_list = torch.cat(cls_score_list, dim=-1) # [B, C, Anchors]
  176. cls_score_list = torch.permute(cls_score_list, [0, 2, 1]) # [B, Anchors, C]
  177. reg_distri_list = torch.cat(reg_distri_list, dim=1) # [B, Anchors, 4 * (self.reg_max + 1)]
  178. reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1) # [B, Anchors, 4]
  179. # Decode bboxes
  180. # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
  181. if self.eval_size:
  182. anchor_points_inference, _ = self.anchor_points, self.stride_tensor
  183. else:
  184. anchor_points_inference, _ = self._generate_anchors(feats)
  185. pred_scores = cls_score_list.sigmoid()
  186. pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor # [B, Anchors, 4]
  187. decoded_predictions = pred_bboxes, pred_scores
  188. raw_predictions = cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
  189. return decoded_predictions, raw_predictions
  190. def _generate_anchors(self, feats=None, dtype=torch.float):
  191. # just use in eval time
  192. anchor_points = []
  193. stride_tensor = []
  194. for i, stride in enumerate(self.fpn_strides):
  195. if feats is not None:
  196. _, _, h, w = feats[i].shape
  197. else:
  198. h = int(self.eval_size[0] / stride)
  199. w = int(self.eval_size[1] / stride)
  200. shift_x = torch.arange(end=w) + self.grid_cell_offset
  201. shift_y = torch.arange(end=h) + self.grid_cell_offset
  202. if torch_version_is_greater_or_equal(1, 10):
  203. shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
  204. else:
  205. shift_y, shift_x = torch.meshgrid(shift_y, shift_x)
  206. anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
  207. anchor_points.append(anchor_point.reshape([-1, 2]))
  208. stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype))
  209. anchor_points = torch.cat(anchor_points)
  210. stride_tensor = torch.cat(stride_tensor)
  211. if feats is not None:
  212. anchor_points = anchor_points.to(feats[0].device)
  213. stride_tensor = stride_tensor.to(feats[0].device)
  214. return anchor_points, stride_tensor
  215. def forward(self, feats: Tuple[Tensor]):
  216. if self.training:
  217. return self.forward_train(feats)
  218. else:
  219. return self.forward_eval(feats)
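A note on the proj_conv reduction in forward_eval above: the regression branch predicts a discrete distribution over reg_max + 1 bins for each box side, and the decoded distance is the expectation of that distribution (the head initializes proj_conv with the bin indices 0..reg_max). A minimal standalone sketch of that reduction in plain PyTorch, with toy values only:

import torch

reg_max = 16
num_anchors = 2

# Toy distribution logits for the 4 sides (l, t, r, b) of each anchor: [Anchors, 4, reg_max + 1]
reg_logits = torch.randn(num_anchors, 4, reg_max + 1)

# Fixed projection 0..reg_max, playing the role of the proj_conv weights
proj = torch.arange(reg_max + 1, dtype=torch.float32)

# Softmax over the bins, then expected bin index -> one distance per side, in grid-cell units
prob = torch.softmax(reg_logits, dim=-1)   # [Anchors, 4, reg_max + 1]
distances = (prob * proj).sum(dim=-1)      # [Anchors, 4], "ltrb"

These per-side distances are what batch_distance2bbox later turns into x1y1x2y2 boxes, after scaling by the FPN stride.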
@@ -1,3 +1,6 @@
+# TODO: It would be nice to create keys here as: make_pretrained_model_key(Models.RESNET18, Dataset.COCO)
+# TODO: Not only would this reduce the risk of typos, it would also make it clearer how each key is constructed
+# TODO: and would allow "querying" pretrained models by dataset
 MODEL_URLS = {
     # RegNet-s
     "regnetY800_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/RegnetY800/average_model.pth",
@@ -51,6 +54,9 @@ MODEL_URLS = {
     "pp_lite_t_seg75_cityscapes": "https://deci-pretrained-models.s3.amazonaws.com/ppliteseg/cityscapes/pplite_t_seg75/average_model.pth",
     "pp_lite_b_seg50_cityscapes": "https://deci-pretrained-models.s3.amazonaws.com/ppliteseg/cityscapes/pplite_b_seg50/average_model.pth",
     "pp_lite_b_seg75_cityscapes": "https://deci-pretrained-models.s3.amazonaws.com/ppliteseg/cityscapes/pplite_b_seg75/average_model.pth",
+    #
+    "ppyoloe_s_coco": "https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/coco2017_ppyoloe_s.pth",
+    "ppyoloe_m_coco": "https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/coco2017_ppyoloe_m.pth",
 }
 
 PRETRAINED_NUM_CLASSES = {"imagenet": 1000, "imagenet21k": 21843, "coco_segmentation_subclass": 21, "cityscapes": 19, "coco": 80, "cifar10": 10}
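For reference, once these keys are registered the checkpoints can be pulled through the usual models.get API. A minimal sketch, assuming the architecture is registered under the name "ppyoloe_s" so that it matches the "ppyoloe_s_coco" key added above:

from super_gradients.training import models

# pretrained_weights="coco" resolves to the "ppyoloe_s_coco" entry added in MODEL_URLS
model = models.get("ppyoloe_s", pretrained_weights="coco")
model.eval()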
@@ -531,7 +531,7 @@ class Trainer:
         :param loss: The value computed by the loss function
         :param optimizer: An object that can perform a gradient step and zeroize model gradient
         :param epoch: number of epoch the training is on
-        :param batch_idx: number of iteration inside the current epoch
+        :param batch_idx: Zero-based number of iteration inside the current epoch
         :param context: current phase context
         :return:
         """
@@ -4,6 +4,7 @@ from super_gradients.training.transforms.transforms import (
     DetectionMosaic,
     DetectionRandomAffine,
     DetectionHSV,
+    DetectionRGB2BGR,
     DetectionPaddedRescale,
     DetectionTargetsFormatTransform,
     Standardize,
@@ -24,6 +25,7 @@ __all__ = [
     "DetectionMosaic",
     "DetectionRandomAffine",
     "DetectionHSV",
+    "DetectionRGB2BGR",
     "DetectionPaddedRescale",
     "DetectionTargetsFormatTransform",
     "imported_albumentations_failure",
@@ -27,6 +27,7 @@ from super_gradients.training.transforms.transforms import (
     DetectionRescale,
     DetectionPaddedRescale,
     DetectionTargetsFormatTransform,
+    DetectionNormalize,
     Standardize,
 )
 from torchvision.transforms import (
@@ -98,6 +99,7 @@ TRANSFORMS = {
     Transforms.DetectionRescale: DetectionRescale,
     Transforms.DetectionPaddedRescale: DetectionPaddedRescale,
     Transforms.DetectionTargetsFormatTransform: DetectionTargetsFormatTransform,
+    Transforms.DetectionNormalize: DetectionNormalize,
     Transforms.RandomResizedCropAndInterpolation: RandomResizedCropAndInterpolation,
     Transforms.RandAugmentTransform: rand_augment_transform,
     Transforms.Lighting: Lighting,
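With the registry entry in place, recipes can refer to the new transform by name. A small sketch of resolving it programmatically, assuming the Transforms name constants live in common/object_names.py (also touched by this PR) and using illustrative mean/std values:

from super_gradients.common.object_names import Transforms
from super_gradients.training.transforms.all_transforms import TRANSFORMS

normalize_cls = TRANSFORMS[Transforms.DetectionNormalize]
normalize = normalize_cls(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])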
@@ -1,13 +1,15 @@
 import collections
 import math
 import random
+from numbers import Number
 from typing import Optional, Union, Tuple, List, Sequence, Dict
 
+import cv2
+import numpy as np
 import torch.nn
 from PIL import Image, ImageFilter, ImageOps
 from torchvision import transforms as transforms
-import numpy as np
-import cv2
+
 from super_gradients.common.abstractions.abstract_logger import get_logger
 from super_gradients.common.decorators.factory_decorator import resolve_param
 from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory
@@ -541,7 +543,7 @@ class DetectionRandomAffine(DetectionTransform):
         translate=0.1,
         scales=0.1,
         shear=10,
-        target_size=(640, 640),
+        target_size: Optional[Tuple[int, int]] = (640, 640),
         filter_box_candidates: bool = False,
         wh_thr=2,
         ar_thr=20,
@@ -570,7 +572,7 @@ class DetectionRandomAffine(DetectionTransform):
                 sample["image"],
                 sample["target"],
                 sample.get("target_seg"),
-                target_size=self.target_size,
+                target_size=self.target_size or tuple(reversed(sample["image"].shape[:2])),
                 degrees=self.degrees,
                 translate=self.translate,
                 scales=self.scale,
@@ -616,6 +618,8 @@ class DetectionMixup(DetectionTransform):
     def __call__(self, sample: dict):
         if self.enable_mixup and random.random() < self.prob:
             origin_img, origin_labels = sample["image"], sample["target"]
+            target_dim = self.input_dim if self.input_dim is not None else sample["image"].shape[:2]
+
             cp_sample = sample["additional_samples"][0]
             img, cp_labels = cp_sample["image"], cp_sample["target"]
             cp_boxes = cp_labels[:, :4]
@@ -627,11 +631,11 @@ class DetectionMixup(DetectionTransform):
             jit_factor = random.uniform(*self.mixup_scale)
 
             if len(img.shape) == 3:
-                cp_img = np.ones((self.input_dim[0], self.input_dim[1], img.shape[2]), dtype=np.uint8) * self.border_value
+                cp_img = np.ones((target_dim[0], target_dim[1], 3), dtype=np.uint8) * self.border_value
             else:
-                cp_img = np.ones(self.input_dim, dtype=np.uint8) * self.border_value
+                cp_img = np.ones(target_dim, dtype=np.uint8) * self.border_value
 
-            cp_scale_ratio = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
+            cp_scale_ratio = min(target_dim[0] / img.shape[0], target_dim[1] / img.shape[1])
             resized_img = cv2.resize(
                 img,
                 (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
@@ -755,32 +759,32 @@ class DetectionHorizontalFlip(DetectionTransform):
 class DetectionRescale(DetectionTransform):
     """
     Resize image and bounding boxes to given image dimensions without preserving aspect ratio
+
     Attributes:
-        input_dim: (tuple) (rows, cols)
-        swap: image axis's to be rearranged.
+        output_shape: (tuple) (rows, cols)
+
     """
 
-    def __init__(self, input_dim: Tuple[int, int], swap=(2, 0, 1)):
+    def __init__(self, output_shape: Tuple[int, int]):
         super().__init__()
-        self.swap = swap
-        self.input_dim = input_dim
+        self.output_shape = output_shape
 
     def __call__(self, sample: Dict[str, np.array]):
         img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target")
 
         img_resized, scale_factors = self._rescale_image(img)
 
-        sample["image"] = img_resized.transpose(self.swap).astype(np.float32, copy=True)
+        sample["image"] = img_resized
         sample["target"] = self._rescale_target(targets, scale_factors)
         if crowd_targets is not None:
             sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors)
         return sample
 
     def _rescale_image(self, image):
-        sy, sx = self.input_dim[0] / image.shape[0], self.input_dim[1] / image.shape[1]
+        sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]
         resized_img = cv2.resize(
             image,
-            dsize=(int(self.input_dim[1]), int(self.input_dim[0])),
+            dsize=(int(self.output_shape[1]), int(self.output_shape[0])),
             interpolation=cv2.INTER_LINEAR,
         )
         scale_factors = sy, sx
@@ -789,8 +793,10 @@ class DetectionRescale(DetectionTransform):
     def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, float]) -> np.array:
         """SegRescale the target according to a coefficient used to rescale the image.
         This is done to have images and targets at the same scale.
+
         :param targets:  Target XYXY bboxes to rescale, shape (num_boxes, 5)
         :param r:        SegRescale coefficient that was applied to the image
+
         :return:         Rescaled targets, shape (num_boxes, 5)
         """
         sy, sx = scale_factors
@@ -829,13 +835,16 @@ class DetectionRandomRotate90(DetectionTransform):
     @classmethod
     def xyxy_bbox_rot90(cls, bboxes, factor: int, rows: int, cols: int):
         """Rotates a bounding box by 90 degrees CCW (see np.rot90)
+
         Args:
             bbox: A bounding box tuple (x_min, y_min, x_max, y_max).
             factor: Number of CCW rotations. Must be in set {0, 1, 2, 3} See np.rot90.
             rows: Image rows.
             cols: Image cols.
+
         Returns:
             tuple: A bounding box tuple (x_min, y_min, x_max, y_max).
+
         """
         x_min, y_min, x_max, y_max = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
 
@@ -855,8 +864,10 @@ class DetectionRandomRotate90(DetectionTransform):
 class DetectionRGB2BGR(DetectionTransform):
     """
     Detection change Red & Blue channel of the image
+
     Attributes:
         prob: (float) probability to apply the transform.
+
     """
 
     def __init__(self, prob: float = 0.5):
@@ -864,8 +875,8 @@ class DetectionRGB2BGR(DetectionTransform):
         self.prob = prob
 
     def __call__(self, sample: dict) -> dict:
-        if sample["image"].shape[2] != 3:
-            raise ValueError("DetectionRGB2BGR expects image to have 3 channels, got: " + str(sample["image"].shape[2]))
+        if sample["image"].shape[2] < 3:
+            raise ValueError("DetectionRGB2BGR transform expects at least 3 channels, got: " + str(sample["image"].shape[2]))
 
         if random.random() < self.prob:
             sample["image"] = sample["image"][..., ::-1]
@@ -912,6 +923,21 @@ class DetectionHSV(DetectionTransform):
         return sample
 
 
+class DetectionNormalize(DetectionTransform):
+    """
+    Normalize image by subtracting mean and dividing by std.
+    """
+
+    def __init__(self, mean, std):
+        super().__init__()
+        self.mean = np.array(list(mean)).reshape((1, 1, -1)).astype(np.float32)
+        self.std = np.array(list(std)).reshape((1, 1, -1)).astype(np.float32)
+
+    def __call__(self, sample: dict) -> dict:
+        sample["image"] = (sample["image"] - self.mean) / self.std
+        return sample
+
+
 class DetectionTargetsFormatTransform(DetectionTransform):
     """
     Detection targets format transform
@@ -1004,8 +1030,8 @@ def get_aug_params(value: Union[tuple, float], center: float = 0):
     :param center: float, defines center to subtract when value is float.
     :return: generated value
     """
-    if isinstance(value, float):
-        return random.uniform(center - value, center + value)
+    if isinstance(value, Number):
+        return random.uniform(center - float(value), center + float(value))
     elif len(value) == 2:
         return random.uniform(value[0], value[1])
     else:
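To see how the reworked DetectionRescale and the new DetectionRGB2BGR / DetectionNormalize fit together, here is a hedged end-to-end sketch on a toy sample (the mean/std values are illustrative only; the sample dict layout follows the transforms above):

import numpy as np
from super_gradients.training.transforms.transforms import DetectionNormalize, DetectionRGB2BGR, DetectionRescale

# Toy HWC image and a single XYXY + class target
sample = {
    "image": np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8).astype(np.float32),
    "target": np.array([[100.0, 120.0, 300.0, 340.0, 1.0]], dtype=np.float32),
}

for transform in [
    DetectionRescale(output_shape=(640, 640)),
    DetectionRGB2BGR(prob=1.0),
    DetectionNormalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
]:
    sample = transform(sample)

# The image stays HWC float32 here; moving channels to CHW is now left to the collate function.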
from typing import Optional

import torch
from torch import Tensor

__all__ = ["batch_distance2bbox"]


def batch_distance2bbox(points: Tensor, distance: Tensor, max_shapes: Optional[Tensor] = None) -> Tensor:
    """Decode distance prediction to bounding box for batch.

    Args:
        points (Tensor): [B, ..., 2], "xy" format
        distance (Tensor): [B, ..., 4], "ltrb" format
        max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.

    Returns:
        Tensor: Decoded bboxes, "x1y1x2y2" format.
    """
    lt, rb = torch.split(distance, 2, dim=-1)
    # When adding a tensor and a parameter, keep the parameter as the second operand
    x1y1 = -lt + points
    x2y2 = rb + points
    out_bbox = torch.cat([x1y1, x2y2], dim=-1)
    if max_shapes is not None:
        max_shapes = max_shapes.flip(-1).tile([1, 2])
        delta_dim = out_bbox.ndim - max_shapes.ndim
        for _ in range(delta_dim):
            max_shapes.unsqueeze_(1)
        out_bbox = torch.where(out_bbox < max_shapes, out_bbox, max_shapes)
        out_bbox = torch.where(out_bbox > 0, out_bbox, torch.zeros_like(out_bbox))
    return out_bbox
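A small usage sketch of batch_distance2bbox with made-up values, to make the ltrb -> x1y1x2y2 convention concrete:

import torch
from super_gradients.training.utils.bbox_utils import batch_distance2bbox

points = torch.tensor([[[10.0, 10.0], [20.0, 20.0]]])      # [B=1, 2, 2], "xy" anchor centers
distance = torch.tensor([[[2.0, 2.0, 3.0, 3.0],
                          [1.0, 1.0, 1.0, 1.0]]])           # [B=1, 2, 4], "ltrb"

boxes = batch_distance2bbox(points, distance)               # [B=1, 2, 4], "x1y1x2y2"
# boxes[0, 0] -> tensor([ 8.,  8., 13., 13.])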
@@ -15,6 +15,7 @@ from super_gradients.training.utils.callbacks.callbacks import (
     EpochStepWarmupLRCallback,
     BatchStepLinearWarmupLRCallback,
 )
+from super_gradients.training.utils.callbacks.ppyoloe_switch_callback import PPYoloETrainingStageSwitchCallback
 from super_gradients.training.utils.deprecated_utils import wrap_with_warning
 from super_gradients.training.utils.early_stopping import EarlyStop
 
@@ -27,6 +28,7 @@ CALLBACKS = {
     Callbacks.EARLY_STOP: EarlyStop,
     Callbacks.DETECTION_MULTISCALE_PREPREDICTION: DetectionMultiscalePrePredictionCallback,
     Callbacks.YOLOX_TRAINING_STAGE_SWITCH: YoloXTrainingStageSwitchCallback,
+    Callbacks.PPYOLOE_TRAINING_STAGE_SWITCH: PPYoloETrainingStageSwitchCallback,
 }
 
 
@@ -13,10 +13,10 @@ import torch
 from deprecate import deprecated
 
 from super_gradients.common.abstractions.abstract_logger import get_logger
+from super_gradients.common.plugins.deci_client import DeciClient
 from super_gradients.training.utils.callbacks.base_callbacks import PhaseCallback, PhaseContext, Phase, Callback
 from super_gradients.training.utils.detection_utils import DetectionVisualization, DetectionPostPredictionCallback
 from super_gradients.training.utils.segmentation_utils import BinarySegmentationVisualization
-from super_gradients.common.plugins.deci_client import DeciClient
 
 logger = get_logger(__name__)
 
from super_gradients.training.utils.callbacks import TrainingStageSwitchCallbackBase, PhaseContext


class PPYoloETrainingStageSwitchCallback(TrainingStageSwitchCallbackBase):
    """
    PPYoloETrainingStageSwitchCallback

    Training stage switch for PPYoloE training.
    Changes the static bbox assigner to a task-aligned assigner after a certain number of epochs has passed.
    """

    def __init__(
        self,
        static_assigner_end_epoch: int = 30,
    ):
        super().__init__(next_stage_start_epoch=static_assigner_end_epoch)

    def apply_stage_change(self, context: PhaseContext):
        from super_gradients.training.losses import PPYoloELoss

        if not isinstance(context.criterion, PPYoloELoss):
            raise RuntimeError(
                f"A criterion must be an instance of PPYoloELoss when using PPYoloETrainingStageSwitchCallback. Got criterion {repr(context.criterion)}"
            )
        context.criterion.use_static_assigner = False
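As a usage sketch, the callback is meant to be attached through the existing phase_callbacks training parameter; the epoch value below is just the default, and the actual recipe wiring lives in coco2017_ppyoloe_train_params.yaml:

from super_gradients.training.utils.callbacks.ppyoloe_switch_callback import PPYoloETrainingStageSwitchCallback

training_params = {
    # ... remaining PPYoloE training hyper-parameters ...
    "phase_callbacks": [PPYoloETrainingStageSwitchCallback(static_assigner_end_epoch=30)],
}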
@@ -1,19 +1,19 @@
 import math
 import os
 import pathlib
+import random
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Callable, List, Union, Tuple, Optional, Dict
 
 import cv2
 import matplotlib.pyplot as plt
-
 import numpy as np
 import torch
 import torchvision
+from omegaconf import ListConfig
 from torch import nn
 from torch.utils.data._utils.collate import default_collate
-from omegaconf import ListConfig
 
 
 class DetectionTargetsFormat(Enum):
@@ -683,6 +683,105 @@ class DetectionCollateFN:
         return torch.cat(targets_merged, 0)
 
 
+class PPYoloECollateFN:
+    """
+    Collate function for PPYoloE training
+    """
+
+    def __init__(self, random_resize_sizes: Union[List[int], None] = None, random_resize_modes: Union[List[int], None] = None):
+        """
+        Args:
+            random_resize_sizes: List of candidate sizes; one is picked at random per batch and applied to both image height and width
+            random_resize_modes: List of cv2 interpolation flags to randomly choose from when resizing
+        """
+        self.random_resize_sizes = random_resize_sizes
+        self.random_resize_modes = random_resize_modes
+
+    def __repr__(self):
+        return f"PPYoloECollateFN(random_resize_sizes={self.random_resize_sizes}, random_resize_modes={self.random_resize_modes})"
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __call__(self, data) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.random_resize_sizes is not None:
+            data = self.random_resize(data)
+
+        batch = default_collate(data)
+        ims, targets = batch
+        targets = self._format_targets(targets)
+        ims = torch.moveaxis(ims, -1, 1).float()
+
+        return ims, targets
+
+    def random_resize(self, batch):
+        target_size = random.choice(self.random_resize_sizes)
+        interpolation = random.choice(self.random_resize_modes)
+        batch = [self.random_resize_sample(sample, target_size, interpolation) for sample in batch]
+        return batch
+
+    def random_resize_sample(self, sample, target_size, interpolation):
+        if len(sample) == 2:
+            image, targets = sample  # TARGETS ARE IN LABEL_CXCYWH
+            with_crowd = False
+        elif len(sample) == 3:
+            image, targets, crowd_targets = sample
+            with_crowd = True
+        else:
+            raise RuntimeError(f"Unexpected number of elements in sample: {len(sample)}")
+
+        dsize = int(target_size), int(target_size)
+        scale_factors = target_size / image.shape[0], target_size / image.shape[1]
+
+        image = cv2.resize(
+            image,
+            dsize=dsize,
+            interpolation=interpolation,
+        )
+
+        sy, sx = scale_factors
+        targets[:, 1:5] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype)
+        if with_crowd:
+            crowd_targets[:, 1:5] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype)
+            return image, targets, crowd_targets
+
+        return image, targets
+
+    def _format_targets(self, targets: torch.Tensor) -> torch.Tensor:
+        """
+
+        :param targets:
+        :return: Tensor of shape [B, N, 6], where 6 elements are (index, c, cx, cy, w, h)
+        """
+        # Same collate as in YoloX. We convert to PPYoloTargets in the loss
+        nlabel = (targets.sum(dim=2) > 0).sum(dim=1)  # number of label per image
+        targets_merged = []
+        for i in range(targets.shape[0]):
+            targets_im = targets[i, : nlabel[i]]
+            batch_column = targets.new_ones((targets_im.shape[0], 1)) * i
+            targets_merged.append(torch.cat((batch_column, targets_im), 1))
+
+        return torch.cat(targets_merged, 0)
+
+
+class CrowdDetectionPPYoloECollateFN(PPYoloECollateFN):
+    """
+    Collate function for PPYoloE training with additional_batch_items that includes crowd targets
+    """
+
+    def __call__(self, data) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
+
+        if self.random_resize_sizes is not None:
+            data = self.random_resize(data)
+
+        batch = default_collate(data)
+        ims, targets, crowd_targets = batch
+        if ims.shape[3] == 3:
+            ims = torch.moveaxis(ims, -1, 1).float()
+
+        return ims, self._format_targets(targets), {"crowd_targets": self._format_targets(crowd_targets)}
+
+
 class CrowdDetectionCollateFN(DetectionCollateFN):
     """
     Collate function for Yolox training with additional_batch_items that includes crowd targets
@@ -807,7 +906,7 @@ def compute_img_detection_matching(
     :param preds:           Tensor of shape (num_img_predictions, 6)
                             format:     (x1, y1, x2, y2, confidence, class_label) where x1,y1,x2,y2 are according to image size
     :param targets:         targets for this image of shape (num_img_targets, 6)
-                            format:     (index, x, y, w, h, label) where x,y,w,h are in range [0,1]
+                            format:     (index, label, cx, cy, w, h) where cx, cy, w, h are in range [0,1]
     :param height:          dimensions of the image
     :param width:           dimensions of the image
     :param iou_thresholds:  Threshold to compute the mAP
@@ -858,9 +957,8 @@ def compute_img_detection_matching(
         # CHANGE bboxes TO FIT THE IMAGE SIZE
         change_bbox_bounds_for_image_size(preds, (height, width))
 
-        # if target_format == "xywh":
-        targets_box = convert_xywh_bbox_to_xyxy(targets_box)  # cxcywh2xyxy
-        crowd_target_box = convert_xywh_bbox_to_xyxy(crowd_target_box)  # convert_xywh_bbox_to_xyxy
+        targets_box = cxcywh2xyxy(targets_box)
+        crowd_target_box = cxcywh2xyxy(crowd_target_box)
 
         if denormalize_targets:
             targets_box[:, [0, 2]] *= width
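To make the target layout produced by the new collate functions concrete, here is a toy sketch of the _format_targets helper (an internal method, shown only to illustrate the flattening; import path assumes this diff is detection_utils.py): padded [B, N, 5] targets in (label, cx, cy, w, h) become a flat [num_targets, 6] tensor whose first column is the image index.

import torch
from super_gradients.training.utils.detection_utils import PPYoloECollateFN

collate = PPYoloECollateFN()  # no random resizing in this sketch

targets = torch.tensor(
    [
        [[1.0, 0.50, 0.50, 0.20, 0.20], [2.0, 0.25, 0.25, 0.10, 0.10]],   # image 0: two boxes
        [[3.0, 0.75, 0.75, 0.30, 0.30], [0.0, 0.00, 0.00, 0.00, 0.00]],   # image 1: one box + padding
    ]
)

flat = collate._format_targets(targets)
# flat.shape == (3, 6), rows are (image_index, label, cx, cy, w, h)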