#643 PPYolo-E

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-344-PP-Yolo-E-Training-Replicate-Recipe
42 changed files with 2307 additions and 94 deletions
  1. +9 / −0    src/super_gradients/common/object_names.py
  2. +0 / −8    src/super_gradients/recipes/arch_params/csp_resnet_arch_params.yaml
  3. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_l_arch_params.yaml
  4. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_m_arch_params.yaml
  5. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_s_arch_params.yaml
  6. +0 / −5    src/super_gradients/recipes/arch_params/csp_resnet_x_arch_params.yaml
  7. +31 / −0   src/super_gradients/recipes/arch_params/ppyoloe_arch_params.yaml
  8. +9 / −0    src/super_gradients/recipes/arch_params/ppyoloe_l_arch_params.yaml
  9. +9 / −0    src/super_gradients/recipes/arch_params/ppyoloe_m_arch_params.yaml
  10. +9 / −0   src/super_gradients/recipes/arch_params/ppyoloe_s_arch_params.yaml
  11. +9 / −0   src/super_gradients/recipes/arch_params/ppyoloe_x_arch_params.yaml
  12. +61 / −0  src/super_gradients/recipes/coco2017_ppyoloe_l.yaml
  13. +61 / −0  src/super_gradients/recipes/coco2017_ppyoloe_m.yaml
  14. +57 / −0  src/super_gradients/recipes/coco2017_ppyoloe_s.yaml
  15. +59 / −0  src/super_gradients/recipes/coco2017_ppyoloe_x.yaml
  16. +97 / −0  src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml
  17. +60 / −0  src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml
  18. +4 / −0   src/super_gradients/training/dataloaders/__init__.py
  19. +38 / −19 src/super_gradients/training/dataloaders/dataloaders.py
  20. +8 / −6   src/super_gradients/training/datasets/detection_datasets/coco_detection.py
  21. +6 / −5   src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
  22. +2 / −0   src/super_gradients/training/losses/__init__.py
  23. +2 / −0   src/super_gradients/training/losses/all_losses.py
  24. +905 / −0 src/super_gradients/training/losses/ppyolo_loss.py
  25. +4 / −5   src/super_gradients/training/metrics/detection_metrics.py
  26. +6 / −1   src/super_gradients/training/models/all_architectures.py
  27. +30 / −3  src/super_gradients/training/models/detection_models/csp_resnet.py
  28. +4 / −0   src/super_gradients/training/models/detection_models/pp_yolo_e/__init__.py
  29. +185 / −0 src/super_gradients/training/models/detection_models/pp_yolo_e/pan.py
  30. +81 / −0  src/super_gradients/training/models/detection_models/pp_yolo_e/post_prediction_callback.py
  31. +78 / −0  src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py
  32. +264 / −0 src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py
  33. +6 / −0   src/super_gradients/training/pretrained_models.py
  34. +1 / −1   src/super_gradients/training/sg_trainer/sg_trainer.py
  35. +2 / −0   src/super_gradients/training/transforms/__init__.py
  36. +2 / −0   src/super_gradients/training/transforms/all_transforms.py
  37. +45 / −19 src/super_gradients/training/transforms/transforms.py
  38. +30 / −0  src/super_gradients/training/utils/bbox_utils.py
  39. +2 / −0   src/super_gradients/training/utils/callbacks/all_callbacks.py
  40. +1 / −1   src/super_gradients/training/utils/callbacks/callbacks.py
  41. +26 / −0  src/super_gradients/training/utils/callbacks/ppyoloe_switch_callback.py
  42. +104 / −6 src/super_gradients/training/utils/detection_utils.py
src/super_gradients/common/object_names.py

@@ -7,6 +7,7 @@ class Losses:
     SHELFNET_OHEM_LOSS = "shelfnet_ohem_loss"
     SHELFNET_SE_LOSS = "shelfnet_se_loss"
     YOLOX_LOSS = "yolox_loss"
+    PPYOLOE_LOSS = "ppyoloe_loss"
     YOLOX_FAST_LOSS = "yolox_fast_loss"
     SSD_LOSS = "ssd_loss"
     STDC_LOSS = "stdc_loss"
@@ -55,6 +56,8 @@ class Transforms:
     DetectionRescale = "DetectionRescale"
     DetectionPaddedRescale = "DetectionPaddedRescale"
     DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform"
+    DetectionNormalize = "DetectionNormalize"
+    #
     RandomResizedCropAndInterpolation = "RandomResizedCropAndInterpolation"
     RandAugmentTransform = "RandAugmentTransform"
     Lighting = "Lighting"
@@ -131,6 +134,8 @@ class Callbacks:
     EARLY_STOP = "EarlyStop"
     DETECTION_MULTISCALE_PREPREDICTION = "DetectionMultiscalePrePredictionCallback"
     YOLOX_TRAINING_STAGE_SWITCH = "YoloXTrainingStageSwitchCallback"
+    PPYOLOE_TRAINING_STAGE_SWITCH = "PPYoloETrainingStageSwitchCallback"
+    DETECTION_VISUALIZATION_CALLBACK = "DetectionVisualizationCallback"


 class LRSchedulers:
@@ -275,6 +280,10 @@ class Models:
     UNET_CUSTOM_CLS = "unet_custom_cls"
     STDC_CUSTOM = "stdc_custom"
     STDC_CUSTOM_CLS = "stdc_custom_cls"
+    PP_YOLOE_S = "ppyoloe_s"
+    PP_YOLOE_M = "ppyoloe_m"
+    PP_YOLOE_L = "ppyoloe_l"
+    PP_YOLOE_X = "ppyoloe_x"


 class ConcatenatedTensorFormats:
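The entries added above are plain string constants, so recipes and factories can refer to the new objects either through these classes or through the raw strings. A minimal illustration, with values taken directly from the diff:

from super_gradients.common.object_names import Callbacks, Losses, Models, Transforms

assert Losses.PPYOLOE_LOSS == "ppyoloe_loss"
assert Transforms.DetectionNormalize == "DetectionNormalize"
assert Callbacks.PPYOLOE_TRAINING_STAGE_SWITCH == "PPYoloETrainingStageSwitchCallback"
assert Models.PP_YOLOE_L == "ppyoloe_l"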
src/super_gradients/recipes/arch_params/csp_resnet_arch_params.yaml
layers: [3, 6, 6, 3]                # model's structure
channels: [64, 128, 256, 512, 1024] # number of output channels for stem and consecutive feature maps
activation: silu                    # activation function used throughout the model
return_idx: [1, 2, 3]               # indexes of feature maps to output
use_large_stem: True                # If True, uses 3 conv+bn+act instead of 2 in stem blocks
width_mult:                         # scaling factor for the number of channels
depth_mult:                         # scaling factor for the number of layers
use_alpha: False                    # If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
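The empty width_mult / depth_mult keys are the hooks that the per-size files below fill in. A rough illustration of what the two multipliers do (a sketch only; the exact rounding rules live in csp_resnet.py and may differ):

import math

base_channels = [64, 128, 256, 512, 1024]
base_layers = [3, 6, 6, 3]

def scale(width_mult: float, depth_mult: float):
    # width_mult shrinks/grows the channel counts, depth_mult the number of blocks per stage
    channels = [max(1, int(round(c * width_mult))) for c in base_channels]
    layers = [max(1, int(math.ceil(n * depth_mult))) for n in base_layers]
    return channels, layers

print(scale(0.50, 0.33))  # ([32, 64, 128, 256, 512], [1, 2, 2, 1]) -- roughly the "s" scaling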
src/super_gradients/recipes/arch_params/csp_resnet_l_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 1.0
width_mult: 1.0
src/super_gradients/recipes/arch_params/csp_resnet_m_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 0.67
width_mult: 0.75
src/super_gradients/recipes/arch_params/csp_resnet_s_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 0.33
width_mult: 0.50
src/super_gradients/recipes/arch_params/csp_resnet_x_arch_params.yaml
defaults:
  - csp_resnet_arch_params

depth_mult: 1.33
width_mult: 1.25
src/super_gradients/recipes/arch_params/ppyoloe_arch_params.yaml
depth_mult:
width_mult:

num_classes: 80

backbone:
  layers: [3, 6, 6, 3]                # Backbone's structure
  channels: [64, 128, 256, 512, 1024] # Number of output channels for stem and consecutive feature maps
  activation: silu
  return_idx: [1, 2, 3]               # Indexes of feature maps to output; indices 1, 2, 3 correspond to feature maps of stride 8, 16, 32
  use_large_stem: True                # If True, uses 3 conv+bn+act instead of 2 in stem blocks
  use_alpha: False                    # If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
  pretrained_weights:

neck:
  in_channels: [256, 512, 1024]
  out_channels: [768, 384, 192]
  activation: silu
  block_num: 3
  stage_num: 1
  spp: True

head:
  in_channels: [768, 384, 192]
  activation: silu
  fpn_strides: [32, 16, 8]
  grid_cell_scale: 5.0
  grid_cell_offset: 0.5
  reg_max: 16                         # Number of bins for size prediction
  eval_size:                          # Size of the image for evaluation. Setting this value can be beneficial for inference speed since anchors will not be regenerated for each forward call.
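reg_max controls how many bins the head uses to represent each box-edge distance. A hedged sketch of the DFL-style decoding this implies (illustration only; the actual decoding lives in the new pp_yolo_head.py / ppyolo_loss.py):

import torch

reg_max = 16                               # as in the head config above
edge_logits = torch.randn(4, reg_max + 1)  # hypothetical logits for the (l, t, r, b) edges of one anchor
bins = torch.arange(reg_max + 1, dtype=torch.float32)
# each edge distance is decoded as the expectation of a softmax distribution over the bins
distances = (edge_logits.softmax(dim=-1) * bins).sum(dim=-1)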
src/super_gradients/recipes/arch_params/ppyoloe_l_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 1.0
width_mult: 1.0

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_l_pretrained.pth
src/super_gradients/recipes/arch_params/ppyoloe_m_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 0.67
width_mult: 0.75

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_m_pretrained.pth
src/super_gradients/recipes/arch_params/ppyoloe_s_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 0.33
width_mult: 0.50

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_s_pretrained.pth
src/super_gradients/recipes/arch_params/ppyoloe_x_arch_params.yaml
defaults:
  - ppyoloe_arch_params
  - _self_

depth_mult: 1.33
width_mult: 1.25

backbone:
  pretrained_weights: https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/CSPResNetb_x_pretrained.pth
src/super_gradients/recipes/coco2017_ppyoloe_l.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Checkpoints + tensorboards: https://deci-pretrained-models.s3.amazonaws.com/ppyoloe_coco/
# Recipe runs with batch size = 20 X 8 GPUs = 160.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_l_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 20

training_hyperparams:
  resume: ${resume}
  mixed_precision: True
  initial_lr: 1e-3

architecture: pp_yoloe_l

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
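Outside of the recipe, the same architecture should be reachable through the models factory. A hedged sketch (the models.get call and its keyword are assumed from the existing SuperGradients factory API; the name comes from the Models constants added in object_names.py):

from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.PP_YOLOE_L, num_classes=80)  # "ppyoloe_l", 80 COCO classes as in this recipe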
src/super_gradients/recipes/coco2017_ppyoloe_m.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Checkpoints + tensorboards: https://deci-pretrained-models.s3.amazonaws.com/ppyoloe_coco/
# Recipe runs with batch size = 24 X 8 GPUs = 192.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_m_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 24

training_hyperparams:
  resume: ${resume}
  mixed_precision: True
  initial_lr: 1e-3

architecture: pp_yoloe_m

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
src/super_gradients/recipes/coco2017_ppyoloe_s.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Recipe runs with batch size = 32 X 8 GPUs = 256.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_s_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 32

training_hyperparams:
  resume: ${resume}
  mixed_precision: True

architecture: pp_yoloe_s

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
src/super_gradients/recipes/coco2017_ppyoloe_x.yaml
# PP-Yolo-E Detection training on COCO2017 Dataset:
# PP-Yolo-E trained at 640x640 resolution
# Checkpoints + tensorboards: https://deci-pretrained-models.s3.amazonaws.com/ppyoloe_coco/
# Recipe runs with batch size = 16 X 8 GPUs = 128.
#
# Instructions:
#   0. Make sure the data is stored in dataset_params.data_dir, or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (see the README for details).
#   1. Move to the project root (where you will find the README and the src folder).
#   2. Run the command for the variant you want to train:
#       ppyoloe_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_s
#       ppyoloe_m: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_m
#       ppyoloe_l: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_l
#       ppyoloe_x: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ppyoloe_x
#
# Training times and accuracies (mAP@0.5-0.95, COCO API, confidence 0.001, IoU threshold 0.6, tested on 640x640 images):
#   ppyoloe_s: 37h on 8 NVIDIA GeForce RTX 3090, mAP: 42.52 (val)
#   ppyoloe_m: 58h on 8 NVIDIA GeForce RTX 3090, mAP: 47.11 (val)
#   ppyoloe_l: COMING SOON
#   ppyoloe_x: COMING SOON
#
defaults:
  - training_hyperparams: coco2017_ppyoloe_train_params
  - dataset_params: coco_detection_ppyoloe_dataset_params
  - arch_params: ppyoloe_x_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_

train_dataloader: coco2017_train_ppyoloe
val_dataloader: coco2017_val_ppyoloe

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 16

training_hyperparams:
  resume: ${resume}
  mixed_precision: True

architecture: pp_yoloe_x

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
ckpt_root_dir:

# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA
hydra:
  run:
    # Set the output directory (i.e. where the .hydra folder that logs all the input params will be generated)
    dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}}
src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml
train_dataset_params:
  data_dir: /data/coco                  # root path to coco data
  subdir: images/train2017              # sub directory of data_dir containing the train data
  json_file: instances_train2017.json   # path to the coco train json file, data_dir/annotations/train_json_file
  input_dim:                            # None - do not resize the dataset on load
  cache_dir:
  cache: False
  transforms:
    - DetectionRandomAffine:
        degrees: 0                      # rotation degrees, randomly sampled from [-degrees, degrees]
        translate: 0.25                 # image translation fraction
        scales: [0.5, 1.5]              # random rescale range (keeps size by padding/cropping) after mosaic transform
        shear: 0.0                      # shear degrees, randomly sampled from [-degrees, degrees]
        target_size:
        filter_box_candidates: True     # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio
        wh_thr: 2                       # edge size threshold when filter_box_candidates = True (pixels)
        area_thr: 0.1                   # threshold for area ratio between the original and the transformed bbox, when filter_box_candidates = True
        ar_thr: 20                      # aspect ratio threshold when filter_box_candidates = True
    - DetectionRandomRotate90:
        prob: 0.5
    - DetectionRGB2BGR:
        prob: 0.25
    - DetectionHSV:
        prob: 0.5                       # probability to apply HSV transform
        hgain: 18                       # HSV transform hue gain (randomly sampled from [-hgain, hgain])
        sgain: 30                       # HSV transform saturation gain (randomly sampled from [-sgain, sgain])
        vgain: 30                       # HSV transform value gain (randomly sampled from [-vgain, vgain])
    - DetectionHorizontalFlip:
        prob: 0.5                       # probability to apply horizontal flip
    - DetectionMixup:
        input_dim:
        mixup_scale: [0.5, 1.5]         # random rescale range for the additional sample in mixup
        prob: 0.5                       # probability to apply per-sample mixup
        flip_prob: 0.5                  # probability to apply horizontal flip
    - DetectionNormalize:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
    - DetectionTargetsFormatTransform:
        max_targets: 256
        output_format: LABEL_CXCYWH
  tight_box_rotation: False
  class_inclusion_list:
  max_num_samples:
  with_crowd: False

train_dataloader_params:
  batch_size: 32
  num_workers: 8
  shuffle: True
  drop_last: True
  # Disable pin_memory due to the presence of PPYoloECollateFN, which uses random resize during training
  pin_memory: False
  worker_init_fn:
    _target_: super_gradients.training.utils.utils.load_func
    dotpath: super_gradients.training.datasets.datasets_utils.worker_init_reset_seed
  collate_fn: # collate function for the train set
    _target_: super_gradients.training.utils.detection_utils.PPYoloECollateFN
    random_resize_sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
    random_resize_modes:
      - 0 # cv::INTER_NEAREST
      - 1 # cv::INTER_LINEAR
      - 2 # cv::INTER_CUBIC
      - 3 # cv::INTER_AREA
      - 4 # cv::INTER_LANCZOS4

val_dataset_params:
  data_dir: /data/coco                  # root path to coco data
  subdir: images/val2017                # sub directory of data_dir containing the validation data
  json_file: instances_val2017.json     # path to the coco validation json file, data_dir/annotations/val_json_file
  input_dim:
  cache_dir:
  cache: False
  transforms:
    - DetectionRescale:
        output_shape: [640, 640]
    - DetectionNormalize:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
    - DetectionTargetsFormatTransform:
        max_targets: 256
        output_format: LABEL_CXCYWH
  tight_box_rotation: False
  class_inclusion_list:
  max_num_samples:
  with_crowd: False

val_dataloader_params:
  batch_size: 64
  num_workers: 8
  drop_last: False
  shuffle: False
  pin_memory: False
  collate_fn: # collate function for the validation set
    _target_: super_gradients.training.utils.detection_utils.PPYoloECollateFN

_convert_: all
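The DetectionNormalize entries above use the standard ImageNet statistics in 0-255 RGB space; conceptually the transform amounts to a per-channel (x - mean) / std. A minimal sketch of that arithmetic (not the library implementation):

import numpy as np

mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)

def normalize(image: np.ndarray) -> np.ndarray:
    # image is expected as HWC, RGB, values in 0-255
    return (image.astype(np.float32) - mean) / std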
src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml
defaults:
  - default_train_params

max_epochs: 500
static_assigner_end_epoch: 150

warmup_mode: "linear_batch_step"
warmup_initial_lr: 1e-6
lr_warmup_steps: 1000
lr_warmup_epochs: 0

initial_lr: 2e-3
lr_mode: cosine
cosine_final_lr_ratio: 0.1

zero_weight_decay_on_bias_and_bn: False
batch_accumulate: 1

save_ckpt_epoch_list: [200, 250, 300, 350, 400, 450]

loss:
  ppyoloe_loss:
    num_classes: ${arch_params.num_classes}
    reg_max: ${arch_params.head.reg_max}

optimizer: AdamW
optimizer_params:
  weight_decay: 0.0001

ema: True
ema_params:
  decay: 0.9997
  decay_type: threshold

mixed_precision: False
sync_bn: True

valid_metrics_list:
  - DetectionMetrics:
      score_thres: 0.1
      top_k_predictions: 300
      num_cls: ${arch_params.num_classes}
      normalize_targets: True
      post_prediction_callback:
        _target_: super_gradients.training.models.detection_models.pp_yolo_e.PPYoloEPostPredictionCallback
        score_threshold: 0.01
        nms_top_k: 1000
        max_predictions: 300
        nms_threshold: 0.7

pre_prediction_callback:

phase_callbacks:
  - PPYoloETrainingStageSwitchCallback:
      static_assigner_end_epoch: ${training_hyperparams.static_assigner_end_epoch}

metric_to_watch: 'mAP@0.50:0.95'
greater_metric_to_watch_is_better: True

_convert_: all
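The loss section above maps to the PPYoloELoss class registered by this PR. It can also be constructed directly; a hedged sketch using only the kwargs that appear in the recipe (any other constructor arguments are assumed to keep their defaults):

from super_gradients.training.losses import PPYoloELoss

criterion = PPYoloELoss(num_classes=80, reg_max=16)  # matches ${arch_params.num_classes} / ${arch_params.head.reg_max}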
src/super_gradients/training/dataloaders/__init__.py
@@ -5,6 +5,8 @@ from .dataloaders import (
     coco2017_val_yolox,
     coco2017_train_ssd_lite_mobilenet_v2,
     coco2017_val_ssd_lite_mobilenet_v2,
+    coco2017_train_ppyoloe,
+    coco2017_val_ppyoloe,
     imagenet_train,
     imagenet_val,
     imagenet_efficientnet_train,
@@ -58,6 +60,8 @@ __all__ = [
     "coco2017_val_yolox",
     "coco2017_train_ssd_lite_mobilenet_v2",
     "coco2017_val_ssd_lite_mobilenet_v2",
+    "coco2017_train_ppyoloe",
+    "coco2017_val_ppyoloe",
     "imagenet_train",
     "imagenet_val",
     "imagenet_efficientnet_train",
src/super_gradients/training/dataloaders/dataloaders.py
@@ -1,29 +1,29 @@
 import os.path
-import pkg_resources
 from typing import Dict

 import hydra
-from hydra import compose, initialize_config_dir
-from hydra.core.global_hydra import GlobalHydra
-
 import numpy as np
-import torch
-from torch.utils.data import BatchSampler, DataLoader, TensorDataset
-
+import pkg_resources
 import super_gradients
-
-from super_gradients.training.datasets.detection_datasets.pascal_voc_detection import (
-    PascalVOCUnifiedDetectionTrainDataset,
-    PascalVOCDetectionDataset,
-)
-from super_gradients.training.utils import get_param
+import torch
+from hydra import compose, initialize_config_dir
+from hydra.core.global_hydra import GlobalHydra
+from super_gradients.common.abstractions.abstract_logger import get_logger
 from super_gradients.common.environment.path_utils import normalize_path
+from super_gradients.common.factories.collate_functions_factory import CollateFunctionsFactory
+from super_gradients.common.factories.datasets_factory import DatasetsFactory
+from super_gradients.common.factories.samplers_factory import SamplersFactory
 from super_gradients.training.datasets import ImageNetDataset
-from super_gradients.training.datasets.detection_datasets import COCODetectionDataset
 from super_gradients.training.datasets.classification_datasets.cifar import (
     Cifar10,
     Cifar100,
 )
+from super_gradients.training.datasets.detection_datasets import COCODetectionDataset
+from super_gradients.training.datasets.detection_datasets.pascal_voc_detection import (
+    PascalVOCUnifiedDetectionTrainDataset,
+    PascalVOCDetectionDataset,
+)
+from super_gradients.training.datasets.pose_estimation_datasets import COCOKeypointsDataset
 from super_gradients.training.datasets.segmentation_datasets import (
     CityscapesDataset,
     CoCoSegmentationDataSet,
@@ -32,16 +32,13 @@ from super_gradients.training.datasets.segmentation_datasets import (
     SuperviselyPersonsDataset,
     MapillaryDataset,
 )
-from super_gradients.common.factories.collate_functions_factory import CollateFunctionsFactory
-from super_gradients.common.factories.samplers_factory import SamplersFactory
+from super_gradients.training.utils import get_param
 from super_gradients.training.utils.distributed_training_utils import (
     wait_for_the_master,
     get_local_rank,
 )
-from super_gradients.common.abstractions.abstract_logger import get_logger
 from super_gradients.training.utils.utils import override_default_params_without_nones
-from super_gradients.common.factories.datasets_factory import DatasetsFactory
-from super_gradients.training.datasets.pose_estimation_datasets import COCOKeypointsDataset
+from torch.utils.data import BatchSampler, DataLoader, TensorDataset

 logger = get_logger(__name__)

@@ -163,6 +160,26 @@ def coco2017_val(dataset_params: Dict = None, dataloader_params: Dict = None):
     )


+def coco2017_train_ppyoloe(dataset_params: Dict = None, dataloader_params: Dict = None):
+    return get_data_loader(
+        config_name="coco_detection_ppyoloe_dataset_params",
+        dataset_cls=COCODetectionDataset,
+        train=True,
+        dataset_params=dataset_params,
+        dataloader_params=dataloader_params,
+    )
+
+
+def coco2017_val_ppyoloe(dataset_params: Dict = None, dataloader_params: Dict = None):
+    return get_data_loader(
+        config_name="coco_detection_ppyoloe_dataset_params",
+        dataset_cls=COCODetectionDataset,
+        train=False,
+        dataset_params=dataset_params,
+        dataloader_params=dataloader_params,
+    )
+
+
 def coco2017_train_yolox(dataset_params: Dict = None, dataloader_params: Dict = None):
     return coco2017_train(dataset_params, dataloader_params)

@@ -646,6 +663,8 @@ ALL_DATALOADERS = {
     "coco2017_val": coco2017_val,
     "coco2017_train_yolox": coco2017_train_yolox,
     "coco2017_val_yolox": coco2017_val_yolox,
+    "coco2017_train_ppyoloe": coco2017_train_ppyoloe,
+    "coco2017_val_ppyoloe": coco2017_val_ppyoloe,
     "coco2017_train_ssd_lite_mobilenet_v2": coco2017_train_ssd_lite_mobilenet_v2,
     "coco2017_val_ssd_lite_mobilenet_v2": coco2017_val_ssd_lite_mobilenet_v2,
     "coco2017_pose_train": coco2017_pose_train,
src/super_gradients/training/datasets/detection_datasets/coco_detection.py
@@ -167,13 +167,15 @@ class COCODetectionDataset(DetectionDataset):
             crowd_target[ix, 0:4] = annotation["clean_bbox"]
             crowd_target[ix, 4] = cls

-        r = min(self.input_dim[0] / height, self.input_dim[1] / width)
-        target[:, :4] *= r
-        crowd_target[:, :4] *= r
-        target_segmentation *= r
-
         initial_img_shape = (height, width)
-        resized_img_shape = (int(height * r), int(width * r))
+        if self.input_dim is not None:
+            r = min(self.input_dim[0] / height, self.input_dim[1] / width)
+            target[:, :4] *= r
+            crowd_target[:, :4] *= r
+            target_segmentation *= r
+            resized_img_shape = (int(height * r), int(width * r))
+        else:
+            resized_img_shape = initial_img_shape

         file_name = img_metadata["file_name"] if "file_name" in img_metadata else "{:012}".format(img_id) + ".jpg"
         img_path = os.path.join(self.data_dir, self.subdir, file_name)
src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
@@ -67,7 +67,7 @@ class DetectionDataset(Dataset):
     def __init__(
         self,
         data_dir: str,
-        input_dim: tuple,
+        input_dim: Optional[Tuple[int, int]],
         original_target_format: DetectionTargetsFormat,
         max_num_samples: int = None,
         cache: bool = False,
@@ -278,11 +278,12 @@ class DetectionDataset(Dataset):
         """
         img = self._load_image(index)

-        r = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
-        desired_size = (int(img.shape[1] * r), int(img.shape[0] * r))
+        if self.input_dim is not None:
+            r = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
+            desired_size = (int(img.shape[1] * r), int(img.shape[0] * r))
+            img = cv2.resize(src=img, dsize=desired_size, interpolation=cv2.INTER_LINEAR).astype(np.uint8)

-        resized_img = cv2.resize(src=img, dsize=desired_size, interpolation=cv2.INTER_LINEAR).astype(np.uint8)
-        return resized_img
+        return img

     def _load_image(self, index: int) -> np.ndarray:
         """Loads image at index with its original resolution.
src/super_gradients/training/losses/__init__.py
@@ -9,6 +9,7 @@ from super_gradients.training.losses.ssd_loss import SSDLoss
 from super_gradients.training.losses.bce_dice_loss import BCEDiceLoss
 from super_gradients.training.losses.dice_ce_edge_loss import DiceCEEdgeLoss
 from super_gradients.training.losses.all_losses import LOSSES, Losses
+from super_gradients.training.losses.ppyolo_loss import PPYoloELoss

 __all__ = [
     "LOSSES",
@@ -24,4 +25,5 @@ __all__ = [
     "BCEDiceLoss",
     "KDLogitsLoss",
     "DiceCEEdgeLoss",
+    "PPYoloELoss",
 ]
src/super_gradients/training/losses/all_losses.py
@@ -13,6 +13,7 @@ from super_gradients.training.losses import (
     DiceCEEdgeLoss,
 )
 from super_gradients.training.losses.stdc_loss import STDCLoss
+from super_gradients.training.losses.ppyolo_loss import PPYoloELoss


 LOSSES = {
@@ -28,4 +29,5 @@ LOSSES = {
     Losses.BCE_DICE_LOSS: BCEDiceLoss,
     Losses.KD_LOSS: KDLogitsLoss,
     Losses.DICE_CE_EDGE_LOSS: DiceCEEdgeLoss,
+    Losses.PPYOLOE_LOSS: PPYoloELoss,
 }
src/super_gradients/training/losses/ppyolo_loss.py
from typing import Mapping, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn, Tensor

import super_gradients
from super_gradients.training.datasets.data_formats.bbox_formats.cxcywh import cxcywh_to_xyxy
from super_gradients.training.utils.bbox_utils import batch_distance2bbox
from super_gradients.training.utils.distributed_training_utils import (
    get_world_size,
)
def batch_iou_similarity(box1, box2, eps=1e-9):
    """Calculate iou of box1 and box2 in batch. Bboxes are expected to be in x1y1x2y2 format.
    Args:
        box1 (Tensor): box with the shape [N, M1, 4]
        box2 (Tensor): box with the shape [N, M2, 4]
    Return:
        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
    """
    box1 = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
    box2 = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
    px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
    gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
    x1y1 = torch.maximum(px1y1, gx1y1)
    x2y2 = torch.minimum(px2y2, gx2y2)
    overlap = (x2y2 - x1y1).clip(0).prod(-1)
    area1 = (px2y2 - px1y1).clip(0).prod(-1)
    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
    union = area1 + area2 - overlap + eps
    return overlap / union
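# Illustrative usage of batch_iou_similarity (an aside, not part of ppyolo_loss.py):
#
#     >>> b1 = torch.tensor([[[0.0, 0.0, 2.0, 2.0]]])  # [N=1, M1=1, 4]
#     >>> b2 = torch.tensor([[[1.0, 1.0, 3.0, 3.0]]])  # [N=1, M2=1, 4]
#     >>> batch_iou_similarity(b1, b2)                 # intersection 1, union 7
#     tensor([[[0.1429]]])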
def iou_similarity(box1, box2, eps=1e-10):
    """
    Calculate iou of box1 and box2. Bboxes are expected to be in x1y1x2y2 format.
    Args:
        box1 (Tensor): box with the shape [M1, 4]
        box2 (Tensor): box with the shape [M2, 4]
    Return:
        iou (Tensor): iou between box1 and box2 with the shape [M1, M2]
    """
    box1 = box1.unsqueeze(1)  # [M1, 4] -> [M1, 1, 4]
    box2 = box2.unsqueeze(0)  # [M2, 4] -> [1, M2, 4]
    px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4]
    gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4]
    x1y1 = torch.maximum(px1y1, gx1y1)
    x2y2 = torch.minimum(px2y2, gx2y2)
    overlap = (x2y2 - x1y1).clip(0).prod(-1)
    area1 = (px2y2 - px1y1).clip(0).prod(-1)
    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
    union = area1 + area2 - overlap + eps
    return overlap / union
def bbox_overlaps(bboxes1, bboxes2, mode="iou", is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes.
    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.
    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "iof" (intersection over
            foreground).
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.
    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
    """
    assert mode in ["iou", "iof", "giou"], "Unsupported mode {}".format(mode)
    # Either the boxes are empty or the length of the boxes' last dimension is 4
    assert bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0
    assert bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return np.random.random(batch_shape + (rows,))
        else:
            return np.random.random(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = (rb - lt).clip(min=0)  # [B, rows, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ["iou", "giou"]:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == "giou":
            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = np.maximum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = np.minimum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ["iou", "giou"]:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == "giou":
            enclosed_lt = np.minimum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])
            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])

    eps = np.array([eps])
    union = np.maximum(union, eps)
    ious = overlap / union
    if mode in ["iou", "iof"]:
        return ious
    # calculate gious
    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = np.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
def topk_(input, k, axis=1, largest=True):
    x = -input if largest else input

    if axis == 0:
        row_index = np.arange(input.shape[1 - axis])
        topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
        topk_data = x[topk_index, row_index]

        topk_index_sort = np.argsort(topk_data, axis=axis)
        topk_data_sort = topk_data[topk_index_sort, row_index]
        topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index]
    else:
        column_index = np.arange(x.shape[1 - axis])[:, None]
        topk_index = np.argpartition(x, k, axis=axis)[:, 0:k]
        topk_data = x[column_index, topk_index]
        topk_data = -topk_data if largest else topk_data
        topk_index_sort = np.argsort(topk_data, axis=axis)
        topk_data_sort = topk_data[column_index, topk_index_sort]
        topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]

    return topk_data_sort, topk_index_sort
def compute_max_iou_anchor(ious: Tensor) -> Tensor:
    r"""
    For each anchor, find the GT with the largest IOU.
    Args:
        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
    Returns:
        is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    num_max_boxes = ious.shape[-2]
    max_iou_index = ious.argmax(dim=-2)
    is_max_iou: Tensor = torch.nn.functional.one_hot(max_iou_index, num_max_boxes).permute([0, 2, 1])
    return is_max_iou.type_as(ious)
def check_points_inside_bboxes(points: Tensor, bboxes, center_radius_tensor=None, eps=1e-9):
    r"""
    Args:
        points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
        bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format
        center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None.
        eps (float): Default: 1e-9
    Returns:
        is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    points = points.unsqueeze(0).unsqueeze(0)
    x, y = points.chunk(2, dim=-1)
    xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, dim=-1)
    # check whether `points` is in `bboxes`
    left = x - xmin
    top = y - ymin
    right = xmax - x
    bottom = ymax - y
    delta_ltrb = torch.cat([left, top, right, bottom], dim=-1)
    is_in_bboxes = delta_ltrb.min(dim=-1).values > eps
    if center_radius_tensor is not None:
        # check whether `points` is in `center_radius`
        center_radius_tensor = center_radius_tensor.unsqueeze(0).unsqueeze(0)
        cx = (xmin + xmax) * 0.5
        cy = (ymin + ymax) * 0.5
        left = x - (cx - center_radius_tensor)
        top = y - (cy - center_radius_tensor)
        right = (cx + center_radius_tensor) - x
        bottom = (cy + center_radius_tensor) - y
        delta_ltrb_c = torch.cat([left, top, right, bottom], dim=-1)
        is_in_center = delta_ltrb_c.min(dim=-1).values > eps
        return (torch.logical_and(is_in_bboxes, is_in_center), torch.logical_or(is_in_bboxes, is_in_center))

    return is_in_bboxes.type_as(bboxes)
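# Illustrative usage of check_points_inside_bboxes (an aside, not part of ppyolo_loss.py):
#
#     >>> pts = torch.tensor([[5.0, 5.0], [20.0, 20.0]])    # [L=2, 2] anchor centers
#     >>> boxes = torch.tensor([[[0.0, 0.0, 10.0, 10.0]]])  # [B=1, n=1, 4]
#     >>> check_points_inside_bboxes(pts, boxes)            # only the first point falls inside
#     tensor([[[1., 0.]]])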
def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
    r"""
    Args:
        metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
        topk (int): The number of top elements to look for along the axis.
        largest (bool): largest is a flag, if set to true,
            algorithm will sort by descending order, otherwise sort by
            ascending order. Default: True
        topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask,
            Default: None
        eps (float): Default: 1e-9
    Returns:
        is_in_topk (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    num_anchors = metrics.shape[-1]
    topk_metrics, topk_idxs = torch.topk(metrics, topk, dim=-1, largest=largest)
    if topk_mask is None:
        topk_mask = (topk_metrics.max(dim=-1, keepdim=True).values > eps).type_as(metrics)
    is_in_topk = torch.nn.functional.one_hot(topk_idxs, num_anchors).sum(dim=-2).type_as(metrics)
    return is_in_topk * topk_mask
def bbox_center(boxes):
    """Get bbox centers from boxes.
    Args:
        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
    Returns:
        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
    """
    boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2
    boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2
    return torch.stack([boxes_cx, boxes_cy], dim=-1)
def compute_max_iou_gt(ious):
    r"""
    For each GT, find the anchor with the largest IOU.
    Args:
        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
    Returns:
        is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
    """
    num_anchors = ious.shape[-1]
    max_iou_index = ious.argmax(dim=-1)
    is_max_iou = torch.nn.functional.one_hot(max_iou_index, num_anchors)
    # cast back to the dtype of `ious` (torch tensors have no .astype method)
    return is_max_iou.type_as(ious)
class ATSSAssigner(nn.Module):
    """Bridging the Gap Between Anchor-based and Anchor-free Detection
    via Adaptive Training Sample Selection
    """

    __shared__ = ["num_classes"]

    def __init__(self, topk=9, num_classes=80, force_gt_matching=False, eps=1e-9):
        """
        :param topk: Maximum number of anchors that is selected for each gt box
        :param num_classes:
        :param force_gt_matching: Guarantee that each gt box is matched to at least one anchor.
            If two gt boxes match to the same anchor, the one with the larger area will be selected.
            And the second-best anchor will be assigned to the other gt box.
        :param eps: Small constant for numerical stability
        """
        super(ATSSAssigner, self).__init__()
        self.topk = topk
        self.num_classes = num_classes
        self.force_gt_matching = force_gt_matching
        self.eps = eps

    def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, pad_gt_mask):
        gt2anchor_distances_list = torch.split(gt2anchor_distances, num_anchors_list, dim=-1)
        num_anchors_index = np.cumsum(num_anchors_list).tolist()
        num_anchors_index = [
            0,
        ] + num_anchors_index[:-1]
        is_in_topk_list = []
        topk_idxs_list = []
        for distances, anchors_index in zip(gt2anchor_distances_list, num_anchors_index):
            num_anchors = distances.shape[-1]
            _, topk_idxs = torch.topk(distances, self.topk, dim=-1, largest=False)
            topk_idxs_list.append(topk_idxs + anchors_index)
            is_in_topk = torch.nn.functional.one_hot(topk_idxs, num_anchors).sum(dim=-2).type_as(gt2anchor_distances)
            is_in_topk_list.append(is_in_topk * pad_gt_mask)
        is_in_topk_list = torch.cat(is_in_topk_list, dim=-1)
        topk_idxs_list = torch.cat(topk_idxs_list, dim=-1)
        return is_in_topk_list, topk_idxs_list
  265. @torch.no_grad()
  266. def forward(
  267. self,
  268. anchor_bboxes,
  269. num_anchors_list,
  270. gt_labels,
  271. gt_bboxes,
  272. pad_gt_mask,
  273. bg_index,
  274. gt_scores=None,
  275. pred_bboxes=None,
  276. ):
  277. r"""This code is based on
  278. https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
  279. The assignment is done in following steps
  280. 1. compute iou between all bbox (bbox of all pyramid levels) and gt
  281. 2. compute center distance between all bbox and gt
  282. 3. on each pyramid level, for each gt, select k bbox whose center
  283. are closest to the gt center, so we total select k*l bbox as
  284. candidates for each gt
  285. 4. get corresponding iou for the these candidates, and compute the
  286. mean and std, set mean + std as the iou threshold
  287. 5. select these candidates whose iou are greater than or equal to
  288. the threshold as positive
  289. 6. limit the positive sample's center in gt
  290. 7. if an anchor box is assigned to multiple gts, the one with the
  291. highest iou will be selected.
  292. Args:
  293. anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4),
  294. "xmin, xmax, ymin, ymax" format
  295. num_anchors_list (List): num of anchors in each level
  296. gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  297. gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
  298. pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  299. bg_index (int): background index
  300. gt_scores (Tensor|None, float32) Score of gt_bboxes,
  301. shape(B, n, 1), if None, then it will initialize with one_hot label
  302. pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4)
  303. Returns:
  304. assigned_labels (Tensor): (B, L)
  305. assigned_bboxes (Tensor): (B, L, 4)
  306. assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious
  307. """
  308. assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3
  309. num_anchors, _ = anchor_bboxes.shape
  310. batch_size, num_max_boxes, _ = gt_bboxes.shape
  311. # negative batch
  312. if num_max_boxes == 0:
  313. assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=anchor_bboxes.device)
  314. assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=anchor_bboxes.device)
  315. assigned_scores = torch.zeros([batch_size, num_anchors, self.num_classes], device=anchor_bboxes.device)
  316. return assigned_labels, assigned_bboxes, assigned_scores
  317. # 1. compute iou between gt and anchor bbox, [B, n, L]
  318. ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
  319. ious = ious.reshape([batch_size, -1, num_anchors])
  320. # 2. compute center distance between all anchors and gt, [B, n, L]
  321. gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1)
  322. anchor_centers = bbox_center(anchor_bboxes)
  323. # gt2anchor_distances = (
  324. # (gt_centers - anchor_centers.unsqueeze(0)).norm(2, dim=-1).reshape([batch_size, -1, num_anchors])
  325. # )
  326. gt2anchor_distances = torch.norm(gt_centers - anchor_centers.unsqueeze(0), p=2, dim=-1).reshape([batch_size, -1, num_anchors])
  327. # 3. on each pyramid level, selecting top-k closest candidates
  328. # based on the center distance, [B, n, L]
  329. is_in_topk, topk_idxs = self._gather_topk_pyramid(gt2anchor_distances, num_anchors_list, pad_gt_mask)
  330. # 4. get corresponding iou for the these candidates, and compute the
  331. # mean and std, 5. set mean + std as the iou threshold
  332. iou_candidates = ious * is_in_topk
  333. iou_threshold = torch.gather(iou_candidates.flatten(end_dim=-2), dim=1, index=topk_idxs.flatten(end_dim=-2))
  334. iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])
  335. iou_threshold = iou_threshold.mean(dim=-1, keepdim=True) + iou_threshold.std(dim=-1, keepdim=True)
  336. is_in_topk = torch.where(iou_candidates > iou_threshold, is_in_topk, torch.zeros_like(is_in_topk))
  337. # 6. check the positive sample's center in gt, [B, n, L]
  338. is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
  339. # select positive sample, [B, n, L]
  340. mask_positive = is_in_topk * is_in_gts * pad_gt_mask
  341. # 7. if an anchor box is assigned to multiple gts,
  342. # the one with the highest iou will be selected.
  343. mask_positive_sum = mask_positive.sum(dim=-2)
  344. if mask_positive_sum.max() > 1:
  345. mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
  346. is_max_iou = compute_max_iou_anchor(ious)
  347. mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
  348. mask_positive_sum = mask_positive.sum(dim=-2)
  349. # 8. make sure every gt_bbox matches the anchor
  350. if self.force_gt_matching:
  351. is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask
  352. mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile([1, num_max_boxes, 1])
  353. mask_positive = torch.where(mask_max_iou, is_max_iou, mask_positive)
  354. mask_positive_sum = mask_positive.sum(dim=-2)
  355. assigned_gt_index = mask_positive.argmax(dim=-2)
  356. # assigned target
  357. batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
  358. assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
  359. assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
  360. assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
  361. assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))
  362. # assigned_bboxes = torch.gather(gt_bboxes.reshape([-1, 4]), index=assigned_gt_index.flatten(), dim=0)
  363. assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
  364. assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
  365. assigned_scores = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1).float()
  366. ind = list(range(self.num_classes + 1))
  367. ind.remove(bg_index)
  368. assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device), dim=-1)
  369. if pred_bboxes is not None:
  370. # assigned iou
  371. ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive
  372. ious = ious.max(dim=-2).values.unsqueeze(-1)
  373. assigned_scores *= ious
  374. elif gt_scores is not None:
  375. gather_scores = torch.gather(gt_scores.flatten(), index=assigned_gt_index.flatten(), dim=0)
  376. gather_scores = gather_scores.reshape([batch_size, num_anchors])
  377. gather_scores = torch.where(mask_positive_sum > 0, gather_scores, torch.zeros_like(gather_scores))
  378. assigned_scores *= gather_scores.unsqueeze(-1)
  379. return assigned_labels, assigned_bboxes, assigned_scores
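To make the adaptive thresholding of steps 4-5 concrete, here is a minimal toy sketch (values invented, not from the recipe) of how one gt box derives its IoU threshold from its k*l distance-selected candidates:

```python
import torch

# IoUs between one gt box and its k*l distance-selected candidate anchors (toy values)
candidate_ious = torch.tensor([0.62, 0.55, 0.48, 0.41, 0.12, 0.05])

# ATSS threshold: mean + std of the candidate IoUs, computed per gt box
iou_threshold = candidate_ious.mean() + candidate_ious.std()

# candidates clearing the adaptive threshold become positives (before the center-in-gt check)
positive_mask = candidate_ious >= iou_threshold
```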
  380. class TaskAlignedAssigner(nn.Module):
  381. """TOOD: Task-aligned One-stage Object Detection"""
  382. def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
  383. """
  384. :param topk: Maximum number of anchors that is selected for each gt box
  385. :param alpha: Power factor for class probabilities of predicted boxes (used to compute the alignment metric)
  386. :param beta: Power factor for IoU score of predicted boxes (used to compute the alignment metric)
  387. :param eps: Small constant for numerical stability
  388. """
  389. super(TaskAlignedAssigner, self).__init__()
  390. self.topk = topk
  391. self.alpha = alpha
  392. self.beta = beta
  393. self.eps = eps
  394. @torch.no_grad()
  395. def forward(
  396. self,
  397. pred_scores,
  398. pred_bboxes,
  399. anchor_points,
  400. num_anchors_list,
  401. gt_labels,
  402. gt_bboxes,
  403. pad_gt_mask,
  404. bg_index,
  405. gt_scores=None,
  406. ):
  407. r"""This code is based on
  408. https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
  409. The assignment is done in the following steps:
  410. 1. compute the alignment metric between all bboxes (bboxes of all pyramid levels) and gt
  411. 2. select the top-k bboxes as candidates for each gt
  412. 3. limit the positive sample's center to lie inside the gt (because the anchor-free detector
  413. can only predict positive distances)
  414. 4. if an anchor box is assigned to multiple gts, the one with the
  415. highest iou will be selected.
  416. Args:
  417. pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
  418. pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
  419. anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
  420. num_anchors_list (List): num of anchors in each level, shape(L)
  421. gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  422. gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
  423. pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  424. bg_index (int): background index
  425. gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)
  426. Returns:
  427. assigned_labels (Tensor): (B, L)
  428. assigned_bboxes (Tensor): (B, L, 4)
  429. assigned_scores (Tensor): (B, L, C)
  430. """
  431. assert pred_scores.ndim == pred_bboxes.ndim
  432. assert gt_labels.ndim == gt_bboxes.ndim and gt_bboxes.ndim == 3
  433. batch_size, num_anchors, num_classes = pred_scores.shape
  434. _, num_max_boxes, _ = gt_bboxes.shape
  435. # negative batch
  436. if num_max_boxes == 0:
  437. assigned_labels = torch.full([batch_size, num_anchors], bg_index, dtype=torch.long, device=pred_scores.device)
  438. assigned_bboxes = torch.zeros([batch_size, num_anchors, 4], device=pred_scores.device)
  439. assigned_scores = torch.zeros([batch_size, num_anchors, num_classes], device=pred_scores.device)
  440. return assigned_labels, assigned_bboxes, assigned_scores
  441. # compute iou between gt and pred bbox, [B, n, L]
  442. ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
  443. # gather pred bboxes class score
  444. pred_scores = torch.permute(pred_scores, [0, 2, 1])
  445. batch_ind = torch.arange(end=batch_size, dtype=gt_labels.dtype, device=gt_labels.device).unsqueeze(-1)
  446. gt_labels_ind = torch.stack([batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], dim=-1)
  447. bbox_cls_scores = pred_scores[gt_labels_ind[..., 0], gt_labels_ind[..., 1]]
  448. # compute alignment metrics, [B, n, L]
  449. alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(self.beta)
  450. # check the positive sample's center in gt, [B, n, L]
  451. is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
  452. # select topk largest alignment metrics pred bbox as candidates
  453. # for each gt, [B, n, L]
  454. is_in_topk = gather_topk_anchors(alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)
  455. # select positive sample, [B, n, L]
  456. mask_positive = is_in_topk * is_in_gts * pad_gt_mask
  457. # if an anchor box is assigned to multiple gts,
  458. # the one with the highest iou will be selected, [B, n, L]
  459. mask_positive_sum = mask_positive.sum(dim=-2)
  460. if mask_positive_sum.max() > 1:
  461. mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
  462. is_max_iou = compute_max_iou_anchor(ious)
  463. mask_positive = torch.where(mask_multiple_gts, is_max_iou, mask_positive)
  464. mask_positive_sum = mask_positive.sum(dim=-2)
  465. assigned_gt_index = mask_positive.argmax(dim=-2)
  466. # assigned target
  467. assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
  468. assigned_labels = torch.gather(gt_labels.flatten(), index=assigned_gt_index.flatten(), dim=0)
  469. assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
  470. assigned_labels = torch.where(mask_positive_sum > 0, assigned_labels, torch.full_like(assigned_labels, bg_index))
  471. assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_index.flatten(), :]
  472. assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
  473. assigned_scores = torch.nn.functional.one_hot(assigned_labels, num_classes + 1)
  474. ind = list(range(num_classes + 1))
  475. ind.remove(bg_index)
  476. assigned_scores = torch.index_select(assigned_scores, index=torch.tensor(ind, device=assigned_scores.device, dtype=torch.long), dim=-1)
  477. # rescale alignment metrics
  478. alignment_metrics *= mask_positive
  479. max_metrics_per_instance = alignment_metrics.max(dim=-1, keepdim=True).values
  480. max_ious_per_instance = (ious * mask_positive).max(dim=-1, keepdim=True).values
  481. alignment_metrics = alignment_metrics / (max_metrics_per_instance + self.eps) * max_ious_per_instance
  482. alignment_metrics = alignment_metrics.max(dim=-2).values.unsqueeze(-1)
  483. assigned_scores = assigned_scores * alignment_metrics
  484. return assigned_labels, assigned_bboxes, assigned_scores
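As a toy illustration (values invented) of the task-alignment metric used above: candidates for each gt are ranked by cls_score**alpha * iou**beta, with the default alpha=1.0, beta=6.0:

```python
import torch

cls_score = torch.tensor([0.9, 0.6, 0.3])  # predicted probability of the gt class for three anchors
iou = torch.tensor([0.5, 0.8, 0.9])        # IoU of each predicted box with the gt box

alpha, beta = 1.0, 6.0
alignment_metric = cls_score.pow(alpha) * iou.pow(beta)

# the top-k anchors by this metric become the candidates for the gt (k=13 by default)
topk_indices = alignment_metric.topk(k=2).indices
```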
  485. class GIoULoss(object):
  486. """
  487. Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630
  488. Args:
  489. loss_weight (float): giou loss weight, default as 1
  490. eps (float): epsilon to avoid divide by zero, default as 1e-10
  491. reduction (string): Options are "none", "mean" and "sum". default as none
  492. """
  493. def __init__(self, loss_weight=1.0, eps=1e-10, reduction="none"):
  494. self.loss_weight = loss_weight
  495. self.eps = eps
  496. assert reduction in ("none", "mean", "sum")
  497. self.reduction = reduction
  498. def bbox_overlap(self, box1, box2, eps=1e-10):
  499. """calculate the iou of box1 and box2
  500. Args:
  501. box1 (Tensor): box1 with the shape (..., 4)
  502. box2 (Tensor): box2 with the shape (..., 4)
  503. eps (float): epsilon to avoid divide by zero
  504. Return:
  505. iou (Tensor): iou of box1 and box2
  506. overlap (Tensor): overlap of box1 and box2
  507. union (Tensor): union of box1 and box2
  508. """
  509. x1, y1, x2, y2 = box1
  510. x1g, y1g, x2g, y2g = box2
  511. xkis1 = torch.maximum(x1, x1g)
  512. ykis1 = torch.maximum(y1, y1g)
  513. xkis2 = torch.minimum(x2, x2g)
  514. ykis2 = torch.minimum(y2, y2g)
  515. w_inter = (xkis2 - xkis1).clip(0)
  516. h_inter = (ykis2 - ykis1).clip(0)
  517. overlap = w_inter * h_inter
  518. area1 = (x2 - x1) * (y2 - y1)
  519. area2 = (x2g - x1g) * (y2g - y1g)
  520. union = area1 + area2 - overlap + eps
  521. iou = overlap / union
  522. return iou, overlap, union
  523. def __call__(self, pbox: Tensor, gbox: Tensor, iou_weight=1.0, loc_reweight=None):
  524. # x1, y1, x2, y2 = torch.split(pbox, split_size_or_sections=4, dim=-1)
  525. # x1g, y1g, x2g, y2g = torch.split(gbox, split_size_or_sections=4, dim=-1)
  526. x1, y1, x2, y2 = pbox.chunk(4, dim=-1)
  527. x1g, y1g, x2g, y2g = gbox.chunk(4, dim=-1)
  528. box1 = [x1, y1, x2, y2]
  529. box2 = [x1g, y1g, x2g, y2g]
  530. iou, overlap, union = self.bbox_overlap(box1, box2, self.eps)
  531. xc1 = torch.minimum(x1, x1g)
  532. yc1 = torch.minimum(y1, y1g)
  533. xc2 = torch.maximum(x2, x2g)
  534. yc2 = torch.maximum(y2, y2g)
  535. area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps
  536. miou = iou - ((area_c - union) / area_c)
  537. if loc_reweight is not None:
  538. loc_reweight = torch.reshape(loc_reweight, shape=(-1, 1))
  539. loc_thresh = 0.9
  540. giou = 1 - (1 - loc_thresh) * miou - loc_thresh * miou * loc_reweight
  541. else:
  542. giou = 1 - miou
  543. if self.reduction == "none":
  544. loss = giou
  545. elif self.reduction == "sum":
  546. loss = torch.sum(giou * iou_weight)
  547. else:
  548. loss = torch.mean(giou * iou_weight)
  549. return loss * self.loss_weight
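A standalone sketch (not part of the file) of the GIoU term the class above computes, on one toy pair of (x1, y1, x2, y2) boxes:

```python
import torch

pred = torch.tensor([[10.0, 10.0, 50.0, 50.0]])
gt = torch.tensor([[20.0, 20.0, 60.0, 60.0]])

x1, y1, x2, y2 = pred.unbind(-1)
x1g, y1g, x2g, y2g = gt.unbind(-1)

# intersection, union and plain IoU
inter = (torch.minimum(x2, x2g) - torch.maximum(x1, x1g)).clamp(0) * (torch.minimum(y2, y2g) - torch.maximum(y1, y1g)).clamp(0)
union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - inter
iou = inter / union

# smallest enclosing box and the GIoU penalty term
area_c = (torch.maximum(x2, x2g) - torch.minimum(x1, x1g)) * (torch.maximum(y2, y2g) - torch.minimum(y1, y1g))
giou = iou - (area_c - union) / area_c
loss = 1.0 - giou  # matches the "none" reduction of GIoULoss
```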
  550. class PPYoloELoss(nn.Module):
  551. def __init__(
  552. self,
  553. num_classes: int,
  554. use_varifocal_loss: bool = True,
  555. use_static_assigner: bool = True,
  556. reg_max: int = 16,
  557. classification_loss_weight: float = 1.0,
  558. iou_loss_weight: float = 2.5,
  559. dfl_loss_weight: float = 0.5,
  560. ):
  561. """
  562. :param num_classes: Number of classes
  563. :param use_varifocal_loss: Whether to use Varifocal loss for classification loss; otherwise use Focal loss
  564. :param use_static_assigner: Whether to use the static (ATSS) assigner or the Task-Aligned assigner
  565. :param classification_loss_weight: Classification loss weight
  566. :param iou_loss_weight: IoU loss weight
  567. :param dfl_loss_weight: DFL loss weight
  568. :param reg_max: Number of regression bins (Must match the number of bins in the PPYoloE head)
  569. """
  570. super().__init__()
  571. self.use_varifocal_loss = use_varifocal_loss
  572. self.classification_loss_weight = classification_loss_weight
  573. self.dfl_loss_weight = dfl_loss_weight
  574. self.iou_loss_weight = iou_loss_weight
  575. self.iou_loss = GIoULoss()
  576. self.static_assigner = ATSSAssigner(topk=9, num_classes=num_classes)
  577. self.assigner = TaskAlignedAssigner(topk=13, alpha=1.0, beta=6.0)
  578. self.use_static_assigner = use_static_assigner
  579. self.reg_max = reg_max
  580. self.num_classes = num_classes
  581. # Same as in PPYoloE head
  582. proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
  583. self.register_buffer("proj_conv", proj)
  584. @torch.no_grad()
  585. def _yolox_targets_to_ppyolo(self, targets: torch.Tensor, batch_size: int) -> Mapping[str, torch.Tensor]:
  586. """
  587. Convert targets from YoloX format to PPYolo since it's the easiest (not the cleanest) way to
  588. have PP Yolo training & metrics computed
  589. :param targets: (N, 6) format of bboxes is meant to be LABEL_CXCYWH (index, c, cx, cy, w, h)
  590. :return: (Dictionary [str,Tensor]) with keys:
  591. - gt_class: (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  592. - gt_bbox: (Tensor, float32): Ground truth bboxes, shape(B, n, 4) in x1y1x2y2 format
  593. - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  594. """
  595. image_index = targets[:, 0]
  596. gt_class = targets[:, 1:2].long()
  597. gt_bbox = cxcywh_to_xyxy(targets[:, 2:6], image_shape=None)
  598. per_image_class = []
  599. per_image_bbox = []
  600. per_image_pad_mask = []
  601. max_boxes = 0
  602. for i in range(batch_size):
  603. mask = image_index == i
  604. image_labels = gt_class[mask]
  605. image_bboxes = gt_bbox[mask, :]
  606. valid_bboxes = image_bboxes.sum(dim=1, keepdims=True) > 0
  607. per_image_class.append(image_labels)
  608. per_image_bbox.append(image_bboxes)
  609. per_image_pad_mask.append(valid_bboxes)
  610. max_boxes = max(max_boxes, mask.sum().item())
  611. for i in range(batch_size):
  612. elements_to_pad = max_boxes - len(per_image_class[i])
  613. padding_left = 0
  614. padding_right = 0
  615. padding_top = 0
  616. padding_bottom = elements_to_pad
  617. pad = padding_left, padding_right, padding_top, padding_bottom
  618. per_image_class[i] = F.pad(per_image_class[i], pad, mode="constant", value=0)
  619. per_image_bbox[i] = F.pad(per_image_bbox[i], pad, mode="constant", value=0)
  620. per_image_pad_mask[i] = F.pad(per_image_pad_mask[i], pad, mode="constant", value=0)
  621. return {
  622. "gt_class": torch.stack(per_image_class, dim=0),
  623. "gt_bbox": torch.stack(per_image_bbox, dim=0),
  624. "pad_gt_mask": torch.stack(per_image_pad_mask, dim=0),
  625. }
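For reference, a toy example (made-up values) of the YoloX-style target layout this conversion consumes and the shapes it produces; nothing here calls the private method itself:

```python
import torch

# YoloX-style targets: one row per box, columns (image_index, label, cx, cy, w, h)
targets = torch.tensor([
    [0.0, 3.0, 120.0, 96.0, 40.0, 32.0],   # image 0: one box
    [1.0, 1.0, 64.0, 80.0, 20.0, 24.0],    # image 1: two boxes
    [1.0, 7.0, 300.0, 210.0, 88.0, 64.0],
])

# After the conversion with batch_size=2 and max_boxes=2:
#   gt_class    -> (2, 2, 1)  (image 0 padded with a zero row)
#   gt_bbox     -> (2, 2, 4)  in x1y1x2y2
#   pad_gt_mask -> (2, 2, 1)  with 0 marking the padded row
```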
  626. def forward(
  627. self,
  628. outputs: Union[
  629. Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor], Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]]
  630. ],
  631. targets: Tensor,
  632. ) -> Mapping[str, Tensor]:
  633. """
  634. :param outputs: Tuple of pred_scores, pred_distri, anchors, anchor_points, num_anchors_list, stride_tensor
  635. :param targets: (Dictionary [str,Tensor]) with keys:
  636. - gt_class: (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
  637. - gt_bbox: (Tensor, float32): Ground truth bboxes, shape(B, n, 4) in x1y1x2y2 format
  638. - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
  639. :return:
  640. """
  641. # in test/eval mode the model outputs a tuple where the second item is the raw predictions
  642. if isinstance(outputs, tuple) and len(outputs) == 2:
  643. # keep only the raw predictions; the first element holds the decoded (bboxes, scores)
  644. _, predictions = outputs
  645. else:
  646. predictions = outputs
  647. (
  648. pred_scores,
  649. pred_distri,
  650. anchors,
  651. anchor_points,
  652. num_anchors_list,
  653. stride_tensor,
  654. ) = predictions
  655. targets = self._yolox_targets_to_ppyolo(targets, batch_size=pred_scores.size(0)) # yolox -> ppyolo
  656. anchor_points_s = anchor_points / stride_tensor
  657. pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)
  658. gt_labels = targets["gt_class"]
  659. gt_bboxes = targets["gt_bbox"]
  660. pad_gt_mask = targets["pad_gt_mask"]
  661. # label assignment
  662. if self.use_static_assigner:
  663. assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(
  664. anchor_bboxes=anchors,
  665. num_anchors_list=num_anchors_list,
  666. gt_labels=gt_labels,
  667. gt_bboxes=gt_bboxes,
  668. pad_gt_mask=pad_gt_mask,
  669. bg_index=self.num_classes,
  670. pred_bboxes=pred_bboxes.detach() * stride_tensor,
  671. )
  672. alpha_l = 0.25
  673. else:
  674. assigned_labels, assigned_bboxes, assigned_scores = self.assigner(
  675. pred_scores=pred_scores.detach().sigmoid(), # during training the head outputs raw logits for numerical stability, so apply sigmoid here
  676. pred_bboxes=pred_bboxes.detach() * stride_tensor,
  677. anchor_points=anchor_points,
  678. num_anchors_list=num_anchors_list,
  679. gt_labels=gt_labels,
  680. gt_bboxes=gt_bboxes,
  681. pad_gt_mask=pad_gt_mask,
  682. bg_index=self.num_classes,
  683. )
  684. alpha_l = -1
  685. # rescale bbox
  686. assigned_bboxes /= stride_tensor
  687. # cls loss
  688. if self.use_varifocal_loss:
  689. one_hot_label = torch.nn.functional.one_hot(assigned_labels, self.num_classes + 1)[..., :-1]
  690. loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label)
  691. else:
  692. loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)
  693. assigned_scores_sum = assigned_scores.sum()
  694. if super_gradients.is_distributed():
  695. torch.distributed.all_reduce(assigned_scores_sum, op=torch.distributed.ReduceOp.SUM)
  696. assigned_scores_sum /= get_world_size()
  697. assigned_scores_sum = torch.clip(assigned_scores_sum, min=1.0)
  698. loss_cls /= assigned_scores_sum
  699. loss_iou, loss_dfl = self._bbox_loss(
  700. pred_distri,
  701. pred_bboxes,
  702. anchor_points_s,
  703. assigned_labels,
  704. assigned_bboxes,
  705. assigned_scores,
  706. assigned_scores_sum,
  707. )
  708. loss = self.classification_loss_weight * loss_cls + self.iou_loss_weight * loss_iou + self.dfl_loss_weight * loss_dfl
  709. log_losses = torch.stack([loss_cls.detach(), loss_iou.detach(), loss_dfl.detach(), loss.detach()])
  710. return loss, log_losses
  711. @property
  712. def component_names(self):
  713. return ["loss_cls", "loss_iou", "loss_dfl", "loss"]
  714. def _df_loss(self, pred_dist: Tensor, target: Tensor) -> Tensor:
  715. target_left = target.long()
  716. target_right = target_left + 1
  717. weight_left = target_right.float() - target
  718. weight_right = 1 - weight_left
  719. # [B,L,C] -> [B,C,L] to make compatible with torch.nn.functional.cross_entropy
  720. # which expects channel dim to be at index 1
  721. pred_dist = torch.moveaxis(pred_dist, -1, 1)
  722. loss_left = torch.nn.functional.cross_entropy(pred_dist, target_left, reduction="none") * weight_left
  723. loss_right = torch.nn.functional.cross_entropy(pred_dist, target_right, reduction="none") * weight_right
  724. return (loss_left + loss_right).mean(dim=-1, keepdim=True)
  725. def _bbox_loss(
  726. self,
  727. pred_dist,
  728. pred_bboxes,
  729. anchor_points,
  730. assigned_labels,
  731. assigned_bboxes,
  732. assigned_scores,
  733. assigned_scores_sum,
  734. ):
  735. # select positive samples mask
  736. mask_positive = assigned_labels != self.num_classes
  737. num_pos = mask_positive.sum()
  738. # pos/neg loss
  739. if num_pos > 0:
  740. # l1 + iou
  741. bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])
  742. pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4])
  743. assigned_bboxes_pos = torch.masked_select(assigned_bboxes, bbox_mask).reshape([-1, 4])
  744. bbox_weight = torch.masked_select(assigned_scores.sum(-1), mask_positive).unsqueeze(-1)
  745. loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight
  746. loss_iou = loss_iou.sum() / assigned_scores_sum
  747. dist_mask = mask_positive.unsqueeze(-1).tile([1, 1, (self.reg_max + 1) * 4])
  748. pred_dist_pos = torch.masked_select(pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1])
  749. assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes)
  750. assigned_ltrb_pos = torch.masked_select(assigned_ltrb, bbox_mask).reshape([-1, 4])
  751. loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos) * bbox_weight
  752. loss_dfl = loss_dfl.sum() / assigned_scores_sum
  753. else:
  754. loss_iou = torch.zeros([], device=pred_bboxes.device)
  755. loss_dfl = pred_dist.sum() * 0.0
  756. return loss_iou, loss_dfl
  757. def _bbox_decode(self, anchor_points: Tensor, pred_dist: Tensor):
  758. b, l, *_ = pred_dist.size()
  759. pred_dist = torch.softmax(pred_dist.reshape([b, l, 4, self.reg_max + 1]), dim=-1)
  760. pred_dist = torch.nn.functional.conv2d(pred_dist.permute(0, 3, 1, 2), self.proj_conv).squeeze(1)
  761. return batch_distance2bbox(anchor_points, pred_dist)
  762. def _bbox2distance(self, points, bbox):
  763. x1y1, x2y2 = torch.split(bbox, 2, -1)
  764. lt = points - x1y1
  765. rb = x2y2 - points
  766. return torch.cat([lt, rb], dim=-1).clip(0, self.reg_max - 0.01)
  767. @staticmethod
  768. def _focal_loss(pred_logits: Tensor, label: Tensor, alpha=0.25, gamma=2.0) -> Tensor:
  769. pred_score = pred_logits.sigmoid()
  770. weight = (pred_score - label).pow(gamma)
  771. if alpha > 0:
  772. alpha_t = alpha * label + (1 - alpha) * (1 - label)
  773. weight *= alpha_t
  774. loss = -weight * (label * torch.nn.functional.logsigmoid(pred_logits) + (1 - label) * torch.nn.functional.logsigmoid(-pred_logits))
  775. return loss.sum()
  776. @staticmethod
  777. def _varifocal_loss(pred_logits: Tensor, gt_score: Tensor, label: Tensor, alpha=0.75, gamma=2.0) -> Tensor:
  778. pred_score = pred_logits.sigmoid()
  779. weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
  780. loss = -weight * (gt_score * torch.nn.functional.logsigmoid(pred_logits) + (1 - gt_score) * torch.nn.functional.logsigmoid(-pred_logits))
  781. return loss.sum()
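As a rough end-to-end usage sketch of this loss (import paths follow this PR's file list; the head channels, feature sizes and targets below are illustrative stand-ins, not the recipe values):

```python
import torch
from super_gradients.training.losses.ppyolo_loss import PPYoloELoss
from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead

# a bare head standing in for the full model; in train mode it returns the raw predictions the loss expects
head = PPYOLOEHead(num_classes=80, in_channels=(192, 384, 768)).train()
feats = [torch.randn(2, 192, 8, 8), torch.randn(2, 384, 16, 16), torch.randn(2, 768, 32, 32)]  # fake neck outputs, 256x256 input
raw_predictions = head(feats)

criterion = PPYoloELoss(num_classes=80)
targets = torch.tensor([
    [0.0, 3.0, 128.0, 128.0, 80.0, 64.0],   # (image_index, label, cx, cy, w, h) in pixels
    [1.0, 7.0, 64.0, 96.0, 40.0, 48.0],
])
loss, log_items = criterion(raw_predictions, targets)
loss.backward()
```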
@@ -84,13 +84,12 @@ class DetectionMetrics(Metric):
         Apply NMS and match all the predictions and targets of a given batch, and update the metric state accordingly.

         :param preds :        Raw output of the model, the format might change from one model to another, but has to fit
-                                the input format of the post_prediction_callback
-        :param target:        Targets for all images of shape (total_num_targets, 6)
-                                format:  (index, x, y, w, h, label) where x,y,w,h are in range [0,1]
+                                the input format of the post_prediction_callback (cx,cy,wh)
+        :param target:        Targets for all images of shape (total_num_targets, 6) LABEL_CXCYWH
+                                format:  (index, label, cx, cy, w, h)
         :param device:        Device to run on
         :param inputs:        Input image tensor of shape (batch_size, n_img, height, width)
-        :param crowd_targets: Crowd targets for all images of shape (total_num_targets, 6)
-                                 format:  (index, x, y, w, h, label) where x,y,w,h are in range [0,1]
+        :param crowd_targets: Crowd targets for all images of shape (total_num_targets, 6), LABEL_CXCYWH
         """
         self.iou_thresholds = self.iou_thresholds.to(device)
         _, _, height, width = inputs.shape
@@ -1,3 +1,4 @@
+from super_gradients.common.object_names import Models
 from super_gradients.training.models import ResNeXt50, ResNeXt101, GoogleNetV1
 from super_gradients.training.models.classification_models import repvgg, efficientnet, densenet, resnet, regnet
 from super_gradients.training.models.classification_models.mobilenetv2 import MobileNetV2Base, MobileNetV2_135, CustomMobileNetV2
@@ -12,6 +13,7 @@ from super_gradients.training.models.classification_models.shufflenetv2 import (
 from super_gradients.training.models.classification_models.vit import ViTBase, ViTLarge, ViTHuge
 from super_gradients.training.models.detection_models.csp_darknet53 import CSPDarknet53
 from super_gradients.training.models.detection_models.darknet53 import Darknet53
+from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_e import PPYoloE_M, PPYoloE_L, PPYoloE_X, PPYoloE_S
 from super_gradients.training.models.detection_models.ssd import SSDMobileNetV1, SSDLiteMobileNetV2
 from super_gradients.training.models.detection_models.yolox import YoloX_N, YoloX_T, YoloX_S, YoloX_M, YoloX_L, YoloX_X, CustomYoloX
 from super_gradients.training.models.segmentation_models.ddrnet import DDRNet23, DDRNet23Slim, AnyBackBoneDDRNet23, DDRNet39
@@ -30,7 +32,6 @@ from super_gradients.training.models.kd_modules.kd_module import KDModule
 from super_gradients.training.models.classification_models.beit import BeitBasePatch16_224, BeitLargePatch16_224
 from super_gradients.training.models.segmentation_models.ppliteseg import PPLiteSegT, PPLiteSegB
 from super_gradients.training.models.segmentation_models.unet import UNetCustom, UnetClassification
-from super_gradients.common.object_names import Models

 ARCHITECTURES = {
     Models.RESNET18: resnet.ResNet18,
@@ -135,6 +136,10 @@ ARCHITECTURES = {
     Models.CUSTOM_ANYNET: regnet.CustomAnyNet,
     Models.UNET_CUSTOM: UNetCustom,
     Models.UNET_CUSTOM_CLS: UnetClassification,
+    Models.PP_YOLOE_S: PPYoloE_S,
+    Models.PP_YOLOE_M: PPYoloE_M,
+    Models.PP_YOLOE_L: PPYoloE_L,
+    Models.PP_YOLOE_X: PPYoloE_X,
 }

 KD_ARCHITECTURES = {Models.KD_MODULE: KDModule}
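With these entries registered, the new variants resolve through the architectures dictionary like any other model; a minimal sketch (key names taken from the diff above, factory internals not shown here):

```python
from super_gradients.common.object_names import Models
from super_gradients.training.models.all_architectures import ARCHITECTURES

model_cls = ARCHITECTURES[Models.PP_YOLOE_L]   # -> PPYoloE_L, the same lookup the model factory performs
```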
@@ -1,5 +1,7 @@
 import collections
-from typing import List, Type, Tuple
+import os.path
+from pathlib import Path
+from typing import List, Type, Tuple, Union, Optional

 import torch
 from super_gradients.common.decorators.factory_decorator import resolve_param
@@ -8,7 +10,9 @@ from torch import nn, Tensor

 from super_gradients.modules import RepVGGBlock, EffectiveSEBlock, ConvBNAct

-__all__ = ["CSPResNet"]
+__all__ = ["CSPResNet", "CSPResNetBasicBlock"]
+
+from super_gradients.training.utils.distributed_training_utils import wait_for_the_master, get_local_rank


 class CSPResNetBasicBlock(nn.Module):
@@ -98,7 +102,7 @@ class CSPResStage(nn.Module):
             x = self.conv_down(x)
         y1 = self.conv1(x)
         y2 = self.blocks(self.conv2(x))
-        y = torch.concat([y1, y2], dim=1)
+        y = torch.cat([y1, y2], dim=1)
         y = self.attn(y)
         y = self.conv3(y)
         return y
@@ -120,6 +124,7 @@ class CSPResNet(nn.Module):
         width_mult: float,
         depth_mult: float,
         use_alpha: bool,
+        pretrained_weights: Optional[str] = None,
     ):
         """

@@ -131,6 +136,7 @@ class CSPResNet(nn.Module):
         :param width_mult: Scaling factor for a number of channels
         :param depth_mult: Scaling factor for a number of blocks in each stage
         :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch in RepVGGBlock
+        :param pretrained_weights:
         """
         super().__init__()
         channels = [max(round(num_channels * width_mult), 1) for num_channels in channels]
@@ -198,6 +204,16 @@ class CSPResNet(nn.Module):
         self._out_strides = [4 * 2**i for i in range(n)]
         self.return_idx = return_idx

+        if pretrained_weights:
+            if isinstance(pretrained_weights, (str, Path)) and os.path.isfile(str(pretrained_weights)):
+                state_dict = torch.load(str(pretrained_weights), map_location="cpu")
+            elif isinstance(pretrained_weights, str) and pretrained_weights.startswith("https://"):
+                with wait_for_the_master(get_local_rank()):
+                    state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
+            else:
+                raise ValueError("pretrained_weights argument should be a path to local file or url to remote file")
+            self.load_state_dict(state_dict)
+
     def forward(self, x: Tensor) -> List[Tensor]:
         x = self.stem(x)
         outs = []
@@ -207,3 +223,14 @@ class CSPResNet(nn.Module):
                 outs.append(x)

         return outs
+
+    def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
+        """
+        Prepare the model to be converted to ONNX or other frameworks.
+        Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
+        with convertible substitutes and remove all auxiliary or training related parts.
+        :param input_size: [H,W]
+        """
+        for module in self.modules():
+            if isinstance(module, RepVGGBlock):
+                module.fuse_block_residual_branches()
  1. from .pp_yolo_e import PPYoloE
  2. from .post_prediction_callback import PPYoloEPostPredictionCallback
  3. __all__ = ["PPYoloE", "PPYoloEPostPredictionCallback"]
  1. import collections
  2. from typing import Type, Tuple, List
  3. import torch
  4. from super_gradients.common.decorators.factory_decorator import resolve_param
  5. from super_gradients.common.factories.activations_type_factory import ActivationsTypeFactory
  6. from torch import nn, Tensor
  7. from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBasicBlock
  8. from super_gradients.modules import ConvBNAct
  9. __all__ = ["CustomCSPPAN"]
  10. class SPP(nn.Module):
  11. def __init__(
  12. self,
  13. in_channels: int,
  14. out_channels: int,
  15. kernel_size: int,
  16. pool_size: Tuple[int, ...],
  17. activation_type: Type[nn.Module],
  18. ):
  19. super().__init__()
  20. mid_channels = in_channels * (1 + len(pool_size))
  21. pools = []
  22. for i, size in enumerate(pool_size):
  23. pool = nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2, ceil_mode=False)
  24. pools.append(pool)
  25. self.pool = nn.ModuleList(pools)
  26. self.conv = ConvBNAct(mid_channels, out_channels, kernel_size, padding=kernel_size // 2, activation_type=activation_type, stride=1, bias=False)
  27. def forward(self, x: Tensor) -> Tensor:
  28. outs = [x]
  29. for pool in self.pool:
  30. outs.append(pool(x))
  31. y = torch.cat(outs, dim=1)
  32. y = self.conv(y)
  33. return y
  34. class CSPStage(nn.Module):
  35. def __init__(self, in_channels: int, out_channels: int, n, activation_type: Type[nn.Module], spp: bool):
  36. super().__init__()
  37. ch_mid = int(out_channels // 2)
  38. self.conv1 = ConvBNAct(in_channels, ch_mid, kernel_size=1, padding=0, activation_type=activation_type, stride=1, bias=False)
  39. self.conv2 = ConvBNAct(in_channels, ch_mid, kernel_size=1, padding=0, activation_type=activation_type, stride=1, bias=False)
  40. convs = []
  41. next_ch_in = ch_mid
  42. for i in range(n):
  43. convs.append((str(i), CSPResNetBasicBlock(next_ch_in, ch_mid, activation_type=activation_type, use_residual_connection=False)))
  44. if i == (n - 1) // 2 and spp:
  45. convs.append(("spp", SPP(ch_mid, ch_mid, 1, (5, 9, 13), activation_type=activation_type)))
  46. next_ch_in = ch_mid
  47. self.convs = nn.Sequential(collections.OrderedDict(convs))
  48. self.conv3 = ConvBNAct(ch_mid * 2, out_channels, kernel_size=1, padding=0, activation_type=activation_type, stride=1, bias=False)
  49. def forward(self, x):
  50. y1 = self.conv1(x)
  51. y2 = self.conv2(x)
  52. y2 = self.convs(y2)
  53. y = torch.cat([y1, y2], dim=1)
  54. y = self.conv3(y)
  55. return y
  56. class CustomCSPPAN(nn.Module):
  57. @resolve_param("activation", ActivationsTypeFactory())
  58. def __init__(
  59. self,
  60. in_channels: Tuple[int, ...],
  61. out_channels: Tuple[int, ...],
  62. activation: Type[nn.Module],
  63. stage_num: int,
  64. block_num: int,
  65. spp: bool,
  66. width_mult: float,
  67. depth_mult: float,
  68. ):
  69. super().__init__()
  70. in_channels = [max(round(c * width_mult), 1) for c in in_channels]
  71. out_channels = [max(round(c * width_mult), 1) for c in out_channels]
  72. block_num = max(round(block_num * depth_mult), 1)
  73. self.num_blocks = len(in_channels)
  74. self._out_channels = out_channels
  75. in_channels = in_channels[::-1]
  76. fpn_stages = []
  77. fpn_routes = []
  78. ch_pre = None
  79. for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)):
  80. if i > 0:
  81. ch_in += ch_pre // 2
  82. stage = []
  83. for j in range(stage_num):
  84. stage.append(
  85. (
  86. str(j),
  87. CSPStage(
  88. ch_in if j == 0 else ch_out,
  89. ch_out,
  90. block_num,
  91. activation_type=activation,
  92. spp=(spp and i == 0),
  93. ),
  94. ),
  95. )
  96. fpn_stages.append(nn.Sequential(collections.OrderedDict(stage)))
  97. if i < self.num_blocks - 1:
  98. fpn_routes.append(
  99. ConvBNAct(in_channels=ch_out, out_channels=ch_out // 2, kernel_size=1, stride=1, padding=0, activation_type=activation, bias=False)
  100. )
  101. ch_pre = ch_out
  102. self.fpn_stages = nn.ModuleList(fpn_stages)
  103. self.fpn_routes = nn.ModuleList(fpn_routes)
  104. pan_stages = []
  105. pan_routes = []
  106. for i in reversed(range(self.num_blocks - 1)):
  107. pan_routes.append(
  108. ConvBNAct(
  109. in_channels=out_channels[i + 1],
  110. out_channels=out_channels[i + 1],
  111. kernel_size=3,
  112. stride=2,
  113. padding=1,
  114. activation_type=activation,
  115. bias=False,
  116. )
  117. )
  118. ch_in = out_channels[i] + out_channels[i + 1]
  119. ch_out = out_channels[i]
  120. stage = []
  121. for j in range(stage_num):
  122. stage.append(
  123. (
  124. str(j),
  125. CSPStage(
  126. ch_in if j == 0 else ch_out,
  127. ch_out,
  128. block_num,
  129. activation_type=activation,
  130. spp=False,
  131. ),
  132. ),
  133. )
  134. pan_stages.append(nn.Sequential(collections.OrderedDict(stage)))
  135. self.pan_stages = nn.ModuleList(pan_stages[::-1])
  136. self.pan_routes = nn.ModuleList(pan_routes[::-1])
  137. def forward(self, blocks: List[Tensor]) -> List[Tensor]:
  138. blocks = blocks[::-1]
  139. fpn_feats = []
  140. route = None
  141. for i, block in enumerate(blocks):
  142. if i > 0:
  143. block = torch.cat([route, block], dim=1)
  144. route = self.fpn_stages[i](block)
  145. fpn_feats.append(route)
  146. if i < self.num_blocks - 1:
  147. route = self.fpn_routes[i](route)
  148. route = torch.nn.functional.interpolate(route, scale_factor=2, mode="nearest")
  149. pan_feats = [
  150. fpn_feats[-1],
  151. ]
  152. route = fpn_feats[-1]
  153. for i in reversed(range(self.num_blocks - 1)):
  154. block = fpn_feats[i]
  155. route = self.pan_routes[i](route)
  156. block = torch.cat([route, block], dim=1)
  157. route = self.pan_stages[i](block)
  158. pan_feats.append(route)
  159. return pan_feats[::-1]
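A shape-check sketch for CustomCSPPAN (the channel/width settings and the "silu" activation key are illustrative assumptions, not necessarily the recipe values):

```python
import torch
from super_gradients.training.models.detection_models.pp_yolo_e.pan import CustomCSPPAN

neck = CustomCSPPAN(
    in_channels=(256, 512, 1024), out_channels=(768, 384, 192), activation="silu",
    stage_num=1, block_num=3, spp=True, width_mult=1.0, depth_mult=1.0,
)

# dummy backbone features for a 640x640 input at strides 8 / 16 / 32
c3, c4, c5 = torch.randn(1, 256, 80, 80), torch.randn(1, 512, 40, 40), torch.randn(1, 1024, 20, 20)
p5, p4, p3 = neck([c3, c4, c5])
# p5: (1, 768, 20, 20), p4: (1, 384, 40, 40), p3: (1, 192, 80, 80)
```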
  1. from typing import List
  2. import torch
  3. import torchvision
  4. from super_gradients.training.utils.detection_utils import DetectionPostPredictionCallback
  5. class PPYoloEPostPredictionCallback(DetectionPostPredictionCallback):
  6. """Non-Maximum Suppression (NMS) module"""
  7. def __init__(self, score_threshold: float, nms_threshold: float, nms_top_k: int, max_predictions: int, multi_label_per_box: bool = True):
  8. """
  9. :param score_threshold: Predictions confidence threshold. Predictions with score lower than score_threshold will not participate in Top-K & NMS
  10. :param nms_threshold: IoU threshold for the NMS step.
  11. :param nms_top_k: Number of predictions participating in NMS step
  12. :param max_predictions: maximum number of boxes to return after NMS step
  13. """
  14. super(PPYoloEPostPredictionCallback, self).__init__()
  15. self.score_threshold = score_threshold
  16. self.nms_threshold = nms_threshold
  17. self.nms_top_k = nms_top_k
  18. self.max_predictions = max_predictions
  19. self.multi_label_per_box = multi_label_per_box
  20. def forward(self, outputs, device: str):
  21. """
  22. :param outputs: Model output whose first element is a tuple of (bboxes, scores) of shape [B, Anchors, 4], [B, Anchors, C]
  23. :param device:
  24. :return:
  25. """
  26. nms_result = []
  27. # First is model predictions, second element of tuple is logits for loss computation
  28. predictions = outputs[0]
  29. for pred_bboxes, pred_scores in zip(*predictions):
  30. # pred_bboxes [Anchors, 4],
  31. # pred_scores [Anchors, C]
  32. # Filter all predictions by self.score_threshold
  33. if self.multi_label_per_box:
  34. i, j = (pred_scores > self.score_threshold).nonzero(as_tuple=False).T
  35. pred_bboxes = pred_bboxes[i]
  36. pred_cls_conf = pred_scores[i, j]
  37. pred_cls_label = j[:]
  38. else:
  39. pred_cls_conf, pred_cls_label = torch.max(pred_scores, dim=1)
  40. conf_mask = pred_cls_conf >= self.score_threshold
  41. pred_cls_conf = pred_cls_conf[conf_mask]
  42. pred_cls_label = pred_cls_label[conf_mask]
  43. pred_bboxes = pred_bboxes[conf_mask, :]
  44. # Filter all predictions by self.nms_top_k
  45. if pred_cls_conf.size(0) > self.nms_top_k:
  46. topk_candidates = torch.topk(pred_cls_conf, k=self.nms_top_k, largest=True)
  47. pred_cls_conf = pred_cls_conf[topk_candidates.indices]
  48. pred_cls_label = pred_cls_label[topk_candidates.indices]
  49. pred_bboxes = pred_bboxes[topk_candidates.indices, :]
  50. # NMS
  51. idx_to_keep = torchvision.ops.boxes.batched_nms(boxes=pred_bboxes, scores=pred_cls_conf, idxs=pred_cls_label, iou_threshold=self.nms_threshold)
  52. pred_cls_conf = pred_cls_conf[idx_to_keep].unsqueeze(-1)
  53. pred_cls_label = pred_cls_label[idx_to_keep].unsqueeze(-1)
  54. pred_bboxes = pred_bboxes[idx_to_keep, :]
  55. # nx6 (x1, y1, x2, y2, confidence, class) in pixel units
  56. final_boxes = torch.cat([pred_bboxes, pred_cls_conf, pred_cls_label], dim=1) # [N,6]
  57. nms_result.append(final_boxes)
  58. return self._filter_max_predictions(nms_result)
  59. def _filter_max_predictions(self, res: List) -> List:
  60. res[:] = [im[: self.max_predictions] if (im is not None and im.shape[0] > self.max_predictions) else im for im in res]
  61. return res
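A minimal sketch of running the callback on already-decoded predictions; random tensors stand in for the model's eval-mode output and the thresholds are illustrative:

```python
import torch
from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback

callback = PPYoloEPostPredictionCallback(score_threshold=0.25, nms_threshold=0.6, nms_top_k=1000, max_predictions=300)

# stand-ins for the eval-mode model output: decoded (bboxes, scores) plus the raw predictions (unused here)
xy = torch.rand(2, 8400, 2) * 600
wh = torch.rand(2, 8400, 2) * 40
pred_bboxes = torch.cat([xy, xy + wh], dim=-1)   # well-formed (x1, y1, x2, y2) in pixels
pred_scores = torch.rand(2, 8400, 80)            # class probabilities after sigmoid

detections = callback.forward(((pred_bboxes, pred_scores), None), device="cpu")
# detections: list of length B, each an (n, 6) tensor of (x1, y1, x2, y2, confidence, class)
```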
  1. from typing import Union
  2. from torch import Tensor
  3. from super_gradients.modules import RepVGGBlock
  4. from super_gradients.training.models.sg_module import SgModule
  5. from super_gradients.training.models.detection_models.csp_resnet import CSPResNet
  6. from super_gradients.training.models.detection_models.pp_yolo_e.pan import CustomCSPPAN
  7. from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead
  8. from super_gradients.training.utils import HpmStruct
  9. from super_gradients.training.models.arch_params_factory import get_arch_params
  10. class PPYoloE(SgModule):
  11. def __init__(self, arch_params):
  12. super().__init__()
  13. if isinstance(arch_params, HpmStruct):
  14. arch_params = arch_params.to_dict()
  15. self.backbone = CSPResNet(**arch_params["backbone"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
  16. self.neck = CustomCSPPAN(**arch_params["neck"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
  17. self.head = PPYOLOEHead(**arch_params["head"], width_mult=arch_params["width_mult"], num_classes=arch_params["num_classes"])
  18. def forward(self, x: Tensor):
  19. features = self.backbone(x)
  20. features = self.neck(features)
  21. return self.head(features)
  22. def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs):
  23. """
  24. Prepare the model to be converted to ONNX or other frameworks.
  25. Typically, this function will freeze the size of layers which is otherwise flexible, replace some modules
  26. with convertible substitutes and remove all auxiliary or training related parts.
  27. :param input_size: [H,W]
  28. """
  29. for module in self.modules():
  30. if isinstance(module, RepVGGBlock):
  31. module.prep_model_for_conversion(input_size)
  32. def replace_head(self, new_num_classes=None, new_head=None):
  33. if new_num_classes is None and new_head is None:
  34. raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
  35. if new_head is not None:
  36. self.head = new_head
  37. else:
  38. self.head.replace_num_classes(new_num_classes)
  39. class PPYoloE_S(PPYoloE):
  40. def __init__(self, arch_params):
  41. if isinstance(arch_params, HpmStruct):
  42. arch_params = arch_params.to_dict()
  43. arch_params = get_arch_params("ppyoloe_s_arch_params", arch_params)
  44. super().__init__(arch_params)
  45. class PPYoloE_M(PPYoloE):
  46. def __init__(self, arch_params):
  47. if isinstance(arch_params, HpmStruct):
  48. arch_params = arch_params.to_dict()
  49. arch_params = get_arch_params("ppyoloe_m_arch_params", arch_params)
  50. super().__init__(arch_params)
  51. class PPYoloE_L(PPYoloE):
  52. def __init__(self, arch_params):
  53. if isinstance(arch_params, HpmStruct):
  54. arch_params = arch_params.to_dict()
  55. arch_params = get_arch_params("ppyoloe_l_arch_params", arch_params)
  56. super().__init__(arch_params)
  57. class PPYoloE_X(PPYoloE):
  58. def __init__(self, arch_params):
  59. if isinstance(arch_params, HpmStruct):
  60. arch_params = arch_params.to_dict()
  61. arch_params = get_arch_params("ppyoloe_x_arch_params", arch_params)
  62. super().__init__(arch_params)
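A sketch of typical usage through the model factory (assuming models.get accepts the new Models.PP_YOLOE_* keys registered in this PR together with a num_classes override; not verified here):

```python
import torch
from super_gradients.training import models
from super_gradients.common.object_names import Models

model = models.get(Models.PP_YOLOE_S, num_classes=80).eval()
(pred_bboxes, pred_scores), raw_predictions = model(torch.randn(1, 3, 640, 640))
# pred_bboxes: [1, Anchors, 4] in pixels, pred_scores: [1, Anchors, 80] after sigmoid

model.replace_head(new_num_classes=20)                   # e.g. before fine-tuning on a 20-class dataset
model.prep_model_for_conversion(input_size=(640, 640))   # fuse RepVGG branches prior to ONNX export
```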
  1. from typing import Tuple, Type
  2. import numpy as np
  3. import torch
  4. from super_gradients.common.decorators.factory_decorator import resolve_param
  5. from super_gradients.common.factories.activations_type_factory import ActivationsTypeFactory
  6. from super_gradients.training.utils.bbox_utils import batch_distance2bbox
  7. from torch import nn, Tensor
  8. from super_gradients.modules import ConvBNAct
  9. from super_gradients.training.utils.version_utils import torch_version_is_greater_or_equal
  10. def bias_init_with_prob(prior_prob=0.01):
  11. """initialize conv/fc bias value according to a given probability value."""
  12. bias_init = float(-np.log((1 - prior_prob) / prior_prob))
  13. return bias_init
  14. @torch.no_grad()
  15. def generate_anchors_for_grid_cell(feats: Tuple[Tensor, ...], fpn_strides: Tuple[int, ...], grid_cell_size=5.0, grid_cell_offset=0.5, dtype=torch.float):
  16. r"""
  17. Like ATSS, generate anchors based on grid size.
  18. Args:
  19. feats (List[Tensor]): shape[s, (b, c, h, w)]
  20. fpn_strides (tuple|list): shape[s], stride for each scale feature
  21. grid_cell_size (float): anchor size
  22. grid_cell_offset (float): The range is between 0 and 1.
  23. Returns:
  24. anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format.
  25. anchor_points (Tensor): shape[l, 2], "x, y" format.
  26. num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...].
  27. stride_tensor (Tensor): shape[l, 1], contains the stride for each scale.
  28. """
  29. assert len(feats) == len(fpn_strides)
  30. device = feats[0].device
  31. anchors = []
  32. anchor_points = []
  33. num_anchors_list = []
  34. stride_tensor = []
  35. for feat, stride in zip(feats, fpn_strides):
  36. _, _, h, w = feat.shape
  37. cell_half_size = grid_cell_size * stride * 0.5
  38. shift_x = (torch.arange(end=w) + grid_cell_offset) * stride
  39. shift_y = (torch.arange(end=h) + grid_cell_offset) * stride
  40. if torch_version_is_greater_or_equal(1, 10):
  41. # https://github.com/pytorch/pytorch/issues/50276
  42. shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
  43. else:
  44. shift_y, shift_x = torch.meshgrid(shift_y, shift_x)
  45. anchor = torch.stack(
  46. [shift_x - cell_half_size, shift_y - cell_half_size, shift_x + cell_half_size, shift_y + cell_half_size],
  47. dim=-1,
  48. ).to(dtype=dtype)
  49. anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
  50. anchors.append(anchor.reshape([-1, 4]))
  51. anchor_points.append(anchor_point.reshape([-1, 2]))
  52. num_anchors_list.append(len(anchors[-1]))
  53. stride_tensor.append(torch.full([num_anchors_list[-1], 1], stride, dtype=dtype))
  54. anchors = torch.cat(anchors).to(device)
  55. anchor_points = torch.cat(anchor_points).to(device)
  56. stride_tensor = torch.cat(stride_tensor).to(device)
  57. return anchors, anchor_points, num_anchors_list, stride_tensor
  58. class ESEAttn(nn.Module):
  59. def __init__(self, feat_channels: int, activation_type: Type[nn.Module]):
  60. super(ESEAttn, self).__init__()
  61. self.fc = nn.Conv2d(feat_channels, feat_channels, kernel_size=1)
  62. self.conv = ConvBNAct(feat_channels, feat_channels, kernel_size=1, padding=0, stride=1, activation_type=activation_type, bias=False)
  63. self._init_weights()
  64. def _init_weights(self):
  65. torch.nn.init.normal_(self.fc.weight, std=0.001)
  66. def forward(self, feat, avg_feat):
  67. weight = torch.sigmoid(self.fc(avg_feat))
  68. return self.conv(feat * weight)
  69. class PPYOLOEHead(nn.Module):
  70. @resolve_param("activation", ActivationsTypeFactory())
  71. def __init__(
  72. self,
  73. num_classes: int,
  74. in_channels: Tuple[int, int, int],
  75. activation: Type[nn.Module] = nn.SiLU,
  76. fpn_strides: Tuple[int, int, int] = (32, 16, 8),
  77. grid_cell_scale=5.0,
  78. grid_cell_offset=0.5,
  79. reg_max=16,
  80. eval_size: Tuple[int, int] = None,
  81. width_mult: float = 1.0,
  82. ):
  83. """
  84. :param num_classes:
  85. :param in_channels: Number of channels for each feature map (See width_mult)
  86. :param activation: Type of the activation used in module
  87. :param fpn_strides: Output strides of the feature maps from the neck
  88. :param grid_cell_scale: Anchor cell size, expressed as a multiple of the feature stride
  89. :param grid_cell_offset: Offset of the anchor center inside each grid cell, in cell units (between 0 and 1)
  90. :param reg_max: Number of DFL regression bins (must match the reg_max of the loss)
  91. :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
  92. since anchors will not be regenerated for each forward call.
  95. :param width_mult: A scaling factor applied to in_channels.
  96. """
  97. super(PPYOLOEHead, self).__init__()
  98. in_channels = [max(round(c * width_mult), 1) for c in in_channels]
  99. self.in_channels = tuple(in_channels)
  100. self.num_classes = num_classes
  101. self.fpn_strides = fpn_strides
  102. self.grid_cell_scale = grid_cell_scale
  103. self.grid_cell_offset = grid_cell_offset
  104. self.reg_max = reg_max
  105. self.eval_size = eval_size
  106. # stem
  107. self.stem_cls = nn.ModuleList()
  108. self.stem_reg = nn.ModuleList()
  109. for in_c in self.in_channels:
  110. self.stem_cls.append(ESEAttn(in_c, activation_type=activation))
  111. self.stem_reg.append(ESEAttn(in_c, activation_type=activation))
  112. # pred head
  113. self.pred_cls = nn.ModuleList()
  114. self.pred_reg = nn.ModuleList()
  115. for in_c in self.in_channels:
  116. self.pred_cls.append(nn.Conv2d(in_c, self.num_classes, 3, padding=1))
  117. self.pred_reg.append(nn.Conv2d(in_c, 4 * (self.reg_max + 1), 3, padding=1))
  118. # Do not apply quantization to this tensor
  119. proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
  120. self.register_buffer("proj_conv", proj, persistent=False)
  121. self._init_weights()
  122. def _init_weights(self):
  123. bias_cls = bias_init_with_prob(0.01)
  124. for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
  125. torch.nn.init.constant_(cls_.weight, 0.0)
  126. torch.nn.init.constant_(cls_.bias, bias_cls)
  127. torch.nn.init.constant_(reg_.weight, 0.0)
  128. torch.nn.init.constant_(reg_.bias, 1.0)
  129. if self.eval_size:
  130. anchor_points, stride_tensor = self._generate_anchors()
  131. self.anchor_points = anchor_points
  132. self.stride_tensor = stride_tensor
  133. def replace_num_classes(self, num_classes: int):
  134. bias_cls = bias_init_with_prob(0.01)
  135. self.pred_cls = nn.ModuleList()
  136. self.num_classes = num_classes
  137. for in_c in self.in_channels:
  138. predict_layer = nn.Conv2d(in_c, num_classes, 3, padding=1)
  139. torch.nn.init.constant_(predict_layer.weight, 0.0)
  140. torch.nn.init.constant_(predict_layer.bias, bias_cls)
  141. self.pred_cls.append(predict_layer)
  142. def forward_train(self, feats: Tuple[Tensor, ...]):
  143. anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(
  144. feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset
  145. )
  146. cls_score_list, reg_distri_list = [], []
  147. for i, feat in enumerate(feats):
  148. avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
  149. cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
  150. reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
  151. # cls and reg
  152. # Note we don't apply sigmoid on class predictions to ensure good numerical stability at loss computation
  153. cls_score_list.append(torch.permute(cls_logit.flatten(2), [0, 2, 1]))
  154. reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))
  155. cls_score_list = torch.cat(cls_score_list, dim=1)
  156. reg_distri_list = torch.cat(reg_distri_list, dim=1)
  157. return cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
  158. def forward_eval(self, feats: Tuple[Tensor, ...]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]:
  159. anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(
  160. feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset
  161. )
  162. cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []
  163. for i, feat in enumerate(feats):
  164. b, _, h, w = feat.shape
  165. height_mul_width = h * w
  166. avg_feat = torch.nn.functional.adaptive_avg_pool2d(feat, (1, 1))
  167. cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat)
  168. reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
  169. reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))
  170. reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
  171. reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)
  172. # cls and reg
  173. cls_score_list.append(cls_logit.reshape([b, self.num_classes, height_mul_width]))
  174. reg_dist_reduced_list.append(reg_dist_reduced)
  175. cls_score_list = torch.cat(cls_score_list, dim=-1) # [B, C, Anchors]
  176. cls_score_list = torch.permute(cls_score_list, [0, 2, 1]) # [B, Anchors, C]
  177. reg_distri_list = torch.cat(reg_distri_list, dim=1) # [B, Anchors, 4 * (self.reg_max + 1)]
  178. reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1) # [B, Anchors, 4]
  179. # Decode bboxes
  180. # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
  181. if self.eval_size:
  182. anchor_points_inference, _ = self.anchor_points, self.stride_tensor
  183. else:
  184. anchor_points_inference, _ = self._generate_anchors(feats)
  185. pred_scores = cls_score_list.sigmoid()
  186. pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor # [B, Anchors, 4]
  187. decoded_predictions = pred_bboxes, pred_scores
  188. raw_predictions = cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
  189. return decoded_predictions, raw_predictions
  190. def _generate_anchors(self, feats=None, dtype=torch.float):
  191. # just use in eval time
  192. anchor_points = []
  193. stride_tensor = []
  194. for i, stride in enumerate(self.fpn_strides):
  195. if feats is not None:
  196. _, _, h, w = feats[i].shape
  197. else:
  198. h = int(self.eval_size[0] / stride)
  199. w = int(self.eval_size[1] / stride)
  200. shift_x = torch.arange(end=w) + self.grid_cell_offset
  201. shift_y = torch.arange(end=h) + self.grid_cell_offset
  202. if torch_version_is_greater_or_equal(1, 10):
  203. shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
  204. else:
  205. shift_y, shift_x = torch.meshgrid(shift_y, shift_x)
  206. anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
  207. anchor_points.append(anchor_point.reshape([-1, 2]))
  208. stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype))
  209. anchor_points = torch.cat(anchor_points)
  210. stride_tensor = torch.cat(stride_tensor)
  211. if feats is not None:
  212. anchor_points = anchor_points.to(feats[0].device)
  213. stride_tensor = stride_tensor.to(feats[0].device)
  214. return anchor_points, stride_tensor
  215. def forward(self, feats: Tuple[Tensor]):
  216. if self.training:
  217. return self.forward_train(feats)
  218. else:
  219. return self.forward_eval(feats)
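A note on the proj_conv reduction in forward_eval above: the regression branch predicts a discrete distribution over reg_max + 1 bins for each box side, and the decoded distance is the expectation of that distribution (the head initializes proj_conv with the bin indices 0..reg_max). A minimal standalone sketch of that reduction in plain PyTorch, with toy values only:

import torch

reg_max = 16
num_anchors = 2

# Toy distribution logits for the 4 sides (l, t, r, b) of each anchor: [Anchors, 4, reg_max + 1]
reg_logits = torch.randn(num_anchors, 4, reg_max + 1)

# Fixed projection 0..reg_max, playing the role of the proj_conv weights
proj = torch.arange(reg_max + 1, dtype=torch.float32)

# Softmax over the bins, then expected bin index -> one distance per side, in grid-cell units
prob = torch.softmax(reg_logits, dim=-1)   # [Anchors, 4, reg_max + 1]
distances = (prob * proj).sum(dim=-1)      # [Anchors, 4], "ltrb"

These per-side distances are what batch_distance2bbox later turns into x1y1x2y2 boxes, after scaling by the FPN stride.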
@@ -1,3 +1,6 @@
+# TODO: It would be nice to create keys here as: make_pretrained_model_key(Models.RESNET18, Dataset.COCO)
+# TODO: Not only would this reduce the risk of typos, it would also make it clearer how each key is constructed
+# TODO: and would allow "querying" pretrained models by dataset
 MODEL_URLS = {
     # RegNet-s
     "regnetY800_imagenet": "https://deci-pretrained-models.s3.amazonaws.com/RegnetY800/average_model.pth",
@@ -51,6 +54,9 @@ MODEL_URLS = {
     "pp_lite_t_seg75_cityscapes": "https://deci-pretrained-models.s3.amazonaws.com/ppliteseg/cityscapes/pplite_t_seg75/average_model.pth",
     "pp_lite_b_seg50_cityscapes": "https://deci-pretrained-models.s3.amazonaws.com/ppliteseg/cityscapes/pplite_b_seg50/average_model.pth",
     "pp_lite_b_seg75_cityscapes": "https://deci-pretrained-models.s3.amazonaws.com/ppliteseg/cityscapes/pplite_b_seg75/average_model.pth",
+    #
+    "ppyoloe_s_coco": "https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/coco2017_ppyoloe_s.pth",
+    "ppyoloe_m_coco": "https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/coco2017_ppyoloe_m.pth",
 }
 
 PRETRAINED_NUM_CLASSES = {"imagenet": 1000, "imagenet21k": 21843, "coco_segmentation_subclass": 21, "cityscapes": 19, "coco": 80, "cifar10": 10}
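For reference, once these keys are registered the checkpoints can be pulled through the usual models.get API. A minimal sketch, assuming the architecture is registered under the name "ppyoloe_s" so that it matches the "ppyoloe_s_coco" key added above:

from super_gradients.training import models

# pretrained_weights="coco" resolves to the "ppyoloe_s_coco" entry added in MODEL_URLS
model = models.get("ppyoloe_s", pretrained_weights="coco")
model.eval()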
@@ -531,7 +531,7 @@ class Trainer:
         :param loss: The value computed by the loss function
         :param optimizer: An object that can perform a gradient step and zeroize model gradient
         :param epoch: number of epoch the training is on
-        :param batch_idx: number of iteration inside the current epoch
+        :param batch_idx: Zero-based number of iteration inside the current epoch
         :param context: current phase context
         :return:
         """
@@ -4,6 +4,7 @@ from super_gradients.training.transforms.transforms import (
     DetectionMosaic,
     DetectionRandomAffine,
     DetectionHSV,
+    DetectionRGB2BGR,
     DetectionPaddedRescale,
     DetectionTargetsFormatTransform,
     Standardize,
@@ -24,6 +25,7 @@ __all__ = [
     "DetectionMosaic",
     "DetectionRandomAffine",
     "DetectionHSV",
+    "DetectionRGB2BGR",
     "DetectionPaddedRescale",
     "DetectionTargetsFormatTransform",
     "imported_albumentations_failure",
@@ -27,6 +27,7 @@ from super_gradients.training.transforms.transforms import (
     DetectionRescale,
     DetectionPaddedRescale,
     DetectionTargetsFormatTransform,
+    DetectionNormalize,
     Standardize,
 )
 from torchvision.transforms import (
@@ -98,6 +99,7 @@ TRANSFORMS = {
     Transforms.DetectionRescale: DetectionRescale,
     Transforms.DetectionPaddedRescale: DetectionPaddedRescale,
     Transforms.DetectionTargetsFormatTransform: DetectionTargetsFormatTransform,
+    Transforms.DetectionNormalize: DetectionNormalize,
     Transforms.RandomResizedCropAndInterpolation: RandomResizedCropAndInterpolation,
     Transforms.RandAugmentTransform: rand_augment_transform,
     Transforms.Lighting: Lighting,
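With the registry entry in place, recipes can refer to the new transform by name. A small sketch of resolving it programmatically, assuming the Transforms name constants live in common/object_names.py (also touched by this PR) and using illustrative mean/std values:

from super_gradients.common.object_names import Transforms
from super_gradients.training.transforms.all_transforms import TRANSFORMS

normalize_cls = TRANSFORMS[Transforms.DetectionNormalize]
normalize = normalize_cls(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])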
@@ -1,13 +1,15 @@
 import collections
 import math
 import random
+from numbers import Number
 from typing import Optional, Union, Tuple, List, Sequence, Dict
 
+import cv2
+import numpy as np
 import torch.nn
 from PIL import Image, ImageFilter, ImageOps
 from torchvision import transforms as transforms
-import numpy as np
-import cv2
+
 from super_gradients.common.abstractions.abstract_logger import get_logger
 from super_gradients.common.decorators.factory_decorator import resolve_param
 from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory
@@ -541,7 +543,7 @@ class DetectionRandomAffine(DetectionTransform):
         translate=0.1,
         scales=0.1,
         shear=10,
-        target_size=(640, 640),
+        target_size: Optional[Tuple[int, int]] = (640, 640),
         filter_box_candidates: bool = False,
         wh_thr=2,
         ar_thr=20,
@@ -570,7 +572,7 @@ class DetectionRandomAffine(DetectionTransform):
                 sample["image"],
                 sample["target"],
                 sample.get("target_seg"),
-                target_size=self.target_size,
+                target_size=self.target_size or tuple(reversed(sample["image"].shape[:2])),
                 degrees=self.degrees,
                 translate=self.translate,
                 scales=self.scale,
@@ -616,6 +618,8 @@ class DetectionMixup(DetectionTransform):
     def __call__(self, sample: dict):
         if self.enable_mixup and random.random() < self.prob:
             origin_img, origin_labels = sample["image"], sample["target"]
+            target_dim = self.input_dim if self.input_dim is not None else sample["image"].shape[:2]
+
             cp_sample = sample["additional_samples"][0]
             img, cp_labels = cp_sample["image"], cp_sample["target"]
             cp_boxes = cp_labels[:, :4]
@@ -627,11 +631,11 @@ class DetectionMixup(DetectionTransform):
             jit_factor = random.uniform(*self.mixup_scale)
 
             if len(img.shape) == 3:
-                cp_img = np.ones((self.input_dim[0], self.input_dim[1], img.shape[2]), dtype=np.uint8) * self.border_value
+                cp_img = np.ones((target_dim[0], target_dim[1], 3), dtype=np.uint8) * self.border_value
             else:
-                cp_img = np.ones(self.input_dim, dtype=np.uint8) * self.border_value
+                cp_img = np.ones(target_dim, dtype=np.uint8) * self.border_value
 
-            cp_scale_ratio = min(self.input_dim[0] / img.shape[0], self.input_dim[1] / img.shape[1])
+            cp_scale_ratio = min(target_dim[0] / img.shape[0], target_dim[1] / img.shape[1])
             resized_img = cv2.resize(
                 img,
                 (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
@@ -755,32 +759,32 @@ class DetectionHorizontalFlip(DetectionTransform):
 class DetectionRescale(DetectionTransform):
     """
     Resize image and bounding boxes to given image dimensions without preserving aspect ratio
+
     Attributes:
-        input_dim: (tuple) (rows, cols)
-        swap: image axis's to be rearranged.
+        output_shape: (tuple) (rows, cols)
+
     """
 
-    def __init__(self, input_dim: Tuple[int, int], swap=(2, 0, 1)):
+    def __init__(self, output_shape: Tuple[int, int]):
         super().__init__()
-        self.swap = swap
-        self.input_dim = input_dim
+        self.output_shape = output_shape
 
     def __call__(self, sample: Dict[str, np.array]):
         img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target")
 
         img_resized, scale_factors = self._rescale_image(img)
 
-        sample["image"] = img_resized.transpose(self.swap).astype(np.float32, copy=True)
+        sample["image"] = img_resized
         sample["target"] = self._rescale_target(targets, scale_factors)
         if crowd_targets is not None:
             sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors)
         return sample
 
     def _rescale_image(self, image):
-        sy, sx = self.input_dim[0] / image.shape[0], self.input_dim[1] / image.shape[1]
+        sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]
         resized_img = cv2.resize(
             image,
-            dsize=(int(self.input_dim[1]), int(self.input_dim[0])),
+            dsize=(int(self.output_shape[1]), int(self.output_shape[0])),
             interpolation=cv2.INTER_LINEAR,
         )
         scale_factors = sy, sx
@@ -789,8 +793,10 @@ class DetectionRescale(DetectionTransform):
     def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, float]) -> np.array:
         """SegRescale the target according to a coefficient used to rescale the image.
         This is done to have images and targets at the same scale.
+
         :param targets:  Target XYXY bboxes to rescale, shape (num_boxes, 5)
         :param r:        SegRescale coefficient that was applied to the image
+
         :return:         Rescaled targets, shape (num_boxes, 5)
         """
         sy, sx = scale_factors
@@ -829,13 +835,16 @@ class DetectionRandomRotate90(DetectionTransform):
     @classmethod
     def xyxy_bbox_rot90(cls, bboxes, factor: int, rows: int, cols: int):
         """Rotates a bounding box by 90 degrees CCW (see np.rot90)
+
         Args:
             bbox: A bounding box tuple (x_min, y_min, x_max, y_max).
             factor: Number of CCW rotations. Must be in set {0, 1, 2, 3} See np.rot90.
             rows: Image rows.
             cols: Image cols.
+
         Returns:
             tuple: A bounding box tuple (x_min, y_min, x_max, y_max).
+
         """
         x_min, y_min, x_max, y_max = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
 
@@ -855,8 +864,10 @@ class DetectionRandomRotate90(DetectionTransform):
 class DetectionRGB2BGR(DetectionTransform):
     """
     Detection change Red & Blue channel of the image
+
     Attributes:
         prob: (float) probability to apply the transform.
+
     """
 
     def __init__(self, prob: float = 0.5):
@@ -864,8 +875,8 @@ class DetectionRGB2BGR(DetectionTransform):
         self.prob = prob
 
     def __call__(self, sample: dict) -> dict:
-        if sample["image"].shape[2] != 3:
-            raise ValueError("DetectionRGB2BGR expects image to have 3 channels, got: " + str(sample["image"].shape[2]))
+        if sample["image"].shape[2] < 3:
+            raise ValueError("DetectionRGB2BGR transform expects at least 3 channels, got: " + str(sample["image"].shape[2]))
 
         if random.random() < self.prob:
             sample["image"] = sample["image"][..., ::-1]
@@ -912,6 +923,21 @@ class DetectionHSV(DetectionTransform):
         return sample
 
 
+class DetectionNormalize(DetectionTransform):
+    """
+    Normalize image by subtracting mean and dividing by std.
+    """
+
+    def __init__(self, mean, std):
+        super().__init__()
+        self.mean = np.array(list(mean)).reshape((1, 1, -1)).astype(np.float32)
+        self.std = np.array(list(std)).reshape((1, 1, -1)).astype(np.float32)
+
+    def __call__(self, sample: dict) -> dict:
+        sample["image"] = (sample["image"] - self.mean) / self.std
+        return sample
+
+
 class DetectionTargetsFormatTransform(DetectionTransform):
     """
     Detection targets format transform
@@ -1004,8 +1030,8 @@ def get_aug_params(value: Union[tuple, float], center: float = 0):
     :param center: float, defines center to subtract when value is float.
     :return: generated value
     """
-    if isinstance(value, float):
-        return random.uniform(center - value, center + value)
+    if isinstance(value, Number):
+        return random.uniform(center - float(value), center + float(value))
     elif len(value) == 2:
         return random.uniform(value[0], value[1])
     else:
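To see how the reworked DetectionRescale and the new DetectionRGB2BGR / DetectionNormalize fit together, here is a hedged end-to-end sketch on a toy sample (the mean/std values are illustrative only; the sample dict layout follows the transforms above):

import numpy as np
from super_gradients.training.transforms.transforms import DetectionNormalize, DetectionRGB2BGR, DetectionRescale

# Toy HWC image and a single XYXY + class target
sample = {
    "image": np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8).astype(np.float32),
    "target": np.array([[100.0, 120.0, 300.0, 340.0, 1.0]], dtype=np.float32),
}

for transform in [
    DetectionRescale(output_shape=(640, 640)),
    DetectionRGB2BGR(prob=1.0),
    DetectionNormalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
]:
    sample = transform(sample)

# The image stays HWC float32 here; moving channels to CHW is now left to the collate function.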
from typing import Optional

import torch
from torch import Tensor

__all__ = ["batch_distance2bbox"]


def batch_distance2bbox(points: Tensor, distance: Tensor, max_shapes: Optional[Tensor] = None) -> Tensor:
    """Decode distance prediction to bounding box for batch.

    Args:
        points (Tensor): [B, ..., 2], "xy" format
        distance (Tensor): [B, ..., 4], "ltrb" format
        max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.

    Returns:
        Tensor: Decoded bboxes, "x1y1x2y2" format.
    """
    lt, rb = torch.split(distance, 2, dim=-1)
    # When adding a tensor and a parameter, keep the parameter as the second operand
    x1y1 = -lt + points
    x2y2 = rb + points
    out_bbox = torch.cat([x1y1, x2y2], dim=-1)
    if max_shapes is not None:
        max_shapes = max_shapes.flip(-1).tile([1, 2])
        delta_dim = out_bbox.ndim - max_shapes.ndim
        for _ in range(delta_dim):
            max_shapes.unsqueeze_(1)
        out_bbox = torch.where(out_bbox < max_shapes, out_bbox, max_shapes)
        out_bbox = torch.where(out_bbox > 0, out_bbox, torch.zeros_like(out_bbox))
    return out_bbox
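A small usage sketch of batch_distance2bbox with made-up values, to make the ltrb -> x1y1x2y2 convention concrete:

import torch
from super_gradients.training.utils.bbox_utils import batch_distance2bbox

points = torch.tensor([[[10.0, 10.0], [20.0, 20.0]]])      # [B=1, 2, 2], "xy" anchor centers
distance = torch.tensor([[[2.0, 2.0, 3.0, 3.0],
                          [1.0, 1.0, 1.0, 1.0]]])           # [B=1, 2, 4], "ltrb"

boxes = batch_distance2bbox(points, distance)               # [B=1, 2, 4], "x1y1x2y2"
# boxes[0, 0] -> tensor([ 8.,  8., 13., 13.])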
@@ -15,6 +15,7 @@ from super_gradients.training.utils.callbacks.callbacks import (
     EpochStepWarmupLRCallback,
     BatchStepLinearWarmupLRCallback,
 )
+from super_gradients.training.utils.callbacks.ppyoloe_switch_callback import PPYoloETrainingStageSwitchCallback
 from super_gradients.training.utils.deprecated_utils import wrap_with_warning
 from super_gradients.training.utils.early_stopping import EarlyStop
 
@@ -27,6 +28,7 @@ CALLBACKS = {
     Callbacks.EARLY_STOP: EarlyStop,
     Callbacks.DETECTION_MULTISCALE_PREPREDICTION: DetectionMultiscalePrePredictionCallback,
     Callbacks.YOLOX_TRAINING_STAGE_SWITCH: YoloXTrainingStageSwitchCallback,
+    Callbacks.PPYOLOE_TRAINING_STAGE_SWITCH: PPYoloETrainingStageSwitchCallback,
 }
 
 
@@ -13,10 +13,10 @@ import torch
 from deprecate import deprecated
 
 from super_gradients.common.abstractions.abstract_logger import get_logger
+from super_gradients.common.plugins.deci_client import DeciClient
 from super_gradients.training.utils.callbacks.base_callbacks import PhaseCallback, PhaseContext, Phase, Callback
 from super_gradients.training.utils.detection_utils import DetectionVisualization, DetectionPostPredictionCallback
 from super_gradients.training.utils.segmentation_utils import BinarySegmentationVisualization
-from super_gradients.common.plugins.deci_client import DeciClient
 
 logger = get_logger(__name__)
 
from super_gradients.training.utils.callbacks import TrainingStageSwitchCallbackBase, PhaseContext


class PPYoloETrainingStageSwitchCallback(TrainingStageSwitchCallbackBase):
    """
    PPYoloETrainingStageSwitchCallback

    Training stage switch for PPYoloE training.
    Changes the static bbox assigner to a task-aligned assigner after a certain number of epochs has passed.
    """

    def __init__(
        self,
        static_assigner_end_epoch: int = 30,
    ):
        super().__init__(next_stage_start_epoch=static_assigner_end_epoch)

    def apply_stage_change(self, context: PhaseContext):
        from super_gradients.training.losses import PPYoloELoss

        if not isinstance(context.criterion, PPYoloELoss):
            raise RuntimeError(
                f"A criterion must be an instance of PPYoloELoss when using PPYoloETrainingStageSwitchCallback. Got criterion {repr(context.criterion)}"
            )
        context.criterion.use_static_assigner = False
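As a usage sketch, the callback is meant to be attached through the existing phase_callbacks training parameter; the epoch value below is just the default, and the actual recipe wiring lives in coco2017_ppyoloe_train_params.yaml:

from super_gradients.training.utils.callbacks.ppyoloe_switch_callback import PPYoloETrainingStageSwitchCallback

training_params = {
    # ... remaining PPYoloE training hyper-parameters ...
    "phase_callbacks": [PPYoloETrainingStageSwitchCallback(static_assigner_end_epoch=30)],
}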
@@ -1,19 +1,19 @@
 import math
 import os
 import pathlib
+import random
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Callable, List, Union, Tuple, Optional, Dict
 
 import cv2
 import matplotlib.pyplot as plt
-
 import numpy as np
 import torch
 import torchvision
+from omegaconf import ListConfig
 from torch import nn
 from torch.utils.data._utils.collate import default_collate
-from omegaconf import ListConfig
 
 
 class DetectionTargetsFormat(Enum):
@@ -683,6 +683,105 @@ class DetectionCollateFN:
         return torch.cat(targets_merged, 0)
 
 
+class PPYoloECollateFN:
+    """
+    Collate function for PPYoloE training
+    """
+
+    def __init__(self, random_resize_sizes: Union[List[int], None] = None, random_resize_modes: Union[List[int], None] = None):
+        """
+        Args:
+            random_resize_sizes: List of candidate sizes; one is picked at random per batch and applied to both image height and width
+            random_resize_modes: List of cv2 interpolation flags to randomly choose from when resizing
+        """
+        self.random_resize_sizes = random_resize_sizes
+        self.random_resize_modes = random_resize_modes
+
+    def __repr__(self):
+        return f"PPYoloECollateFN(random_resize_sizes={self.random_resize_sizes}, random_resize_modes={self.random_resize_modes})"
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __call__(self, data) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.random_resize_sizes is not None:
+            data = self.random_resize(data)
+
+        batch = default_collate(data)
+        ims, targets = batch
+        targets = self._format_targets(targets)
+        ims = torch.moveaxis(ims, -1, 1).float()
+
+        return ims, targets
+
+    def random_resize(self, batch):
+        target_size = random.choice(self.random_resize_sizes)
+        interpolation = random.choice(self.random_resize_modes)
+        batch = [self.random_resize_sample(sample, target_size, interpolation) for sample in batch]
+        return batch
+
+    def random_resize_sample(self, sample, target_size, interpolation):
+        if len(sample) == 2:
+            image, targets = sample  # TARGETS ARE IN LABEL_CXCYWH
+            with_crowd = False
+        elif len(sample) == 3:
+            image, targets, crowd_targets = sample
+            with_crowd = True
+        else:
+            raise RuntimeError(f"Unexpected number of elements in sample: {len(sample)}")
+
+        dsize = int(target_size), int(target_size)
+        scale_factors = target_size / image.shape[0], target_size / image.shape[1]
+
+        image = cv2.resize(
+            image,
+            dsize=dsize,
+            interpolation=interpolation,
+        )
+
+        sy, sx = scale_factors
+        targets[:, 1:5] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype)
+        if with_crowd:
+            crowd_targets[:, 1:5] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype)
+            return image, targets, crowd_targets
+
+        return image, targets
+
+    def _format_targets(self, targets: torch.Tensor) -> torch.Tensor:
+        """
+
+        :param targets:
+        :return: Tensor of shape [B, N, 6], where 6 elements are (index, c, cx, cy, w, h)
+        """
+        # Same collate as in YoloX. We convert to PPYoloTargets in the loss
+        nlabel = (targets.sum(dim=2) > 0).sum(dim=1)  # number of label per image
+        targets_merged = []
+        for i in range(targets.shape[0]):
+            targets_im = targets[i, : nlabel[i]]
+            batch_column = targets.new_ones((targets_im.shape[0], 1)) * i
+            targets_merged.append(torch.cat((batch_column, targets_im), 1))
+
+        return torch.cat(targets_merged, 0)
+
+
+class CrowdDetectionPPYoloECollateFN(PPYoloECollateFN):
+    """
+    Collate function for PPYoloE training with additional_batch_items that includes crowd targets
+    """
+
+    def __call__(self, data) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
+
+        if self.random_resize_sizes is not None:
+            data = self.random_resize(data)
+
+        batch = default_collate(data)
+        ims, targets, crowd_targets = batch
+        if ims.shape[3] == 3:
+            ims = torch.moveaxis(ims, -1, 1).float()
+
+        return ims, self._format_targets(targets), {"crowd_targets": self._format_targets(crowd_targets)}
+
+
 class CrowdDetectionCollateFN(DetectionCollateFN):
     """
     Collate function for Yolox training with additional_batch_items that includes crowd targets
@@ -807,7 +906,7 @@ def compute_img_detection_matching(
     :param preds:           Tensor of shape (num_img_predictions, 6)
                             format:     (x1, y1, x2, y2, confidence, class_label) where x1,y1,x2,y2 are according to image size
     :param targets:         targets for this image of shape (num_img_targets, 6)
-                            format:     (index, x, y, w, h, label) where x,y,w,h are in range [0,1]
+                            format:     (index, label, cx, cy, w, h) where cx, cy, w, h are in range [0,1]
     :param height:          dimensions of the image
     :param width:           dimensions of the image
     :param iou_thresholds:  Threshold to compute the mAP
@@ -858,9 +957,8 @@ def compute_img_detection_matching(
         # CHANGE bboxes TO FIT THE IMAGE SIZE
         change_bbox_bounds_for_image_size(preds, (height, width))
 
-        # if target_format == "xywh":
-        targets_box = convert_xywh_bbox_to_xyxy(targets_box)  # cxcywh2xyxy
-        crowd_target_box = convert_xywh_bbox_to_xyxy(crowd_target_box)  # convert_xywh_bbox_to_xyxy
+        targets_box = cxcywh2xyxy(targets_box)
+        crowd_target_box = cxcywh2xyxy(crowd_target_box)
 
         if denormalize_targets:
             targets_box[:, [0, 2]] *= width
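To make the target layout produced by the new collate functions concrete, here is a toy sketch of the _format_targets helper (an internal method, shown only to illustrate the flattening; import path assumes this diff is detection_utils.py): padded [B, N, 5] targets in (label, cx, cy, w, h) become a flat [num_targets, 6] tensor whose first column is the image index.

import torch
from super_gradients.training.utils.detection_utils import PPYoloECollateFN

collate = PPYoloECollateFN()  # no random resizing in this sketch

targets = torch.tensor(
    [
        [[1.0, 0.50, 0.50, 0.20, 0.20], [2.0, 0.25, 0.25, 0.10, 0.10]],   # image 0: two boxes
        [[3.0, 0.75, 0.75, 0.30, 0.30], [0.0, 0.00, 0.00, 0.00, 0.00]],   # image 1: one box + padding
    ]
)

flat = collate._format_targets(targets)
# flat.shape == (3, 6), rows are (image_index, label, cx, cy, w, h)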