#875 Feature/SG-761 YOLO-NAS

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-761-yolo-nas
41 changed files with 1714 additions and 97 deletions
 1. src/super_gradients/common/object_names.py (+5 −2)
 2. src/super_gradients/examples/predict/detection_predict.py (+2 −2)
 3. src/super_gradients/examples/predict/detection_predict_image_folder.py (+2 −2)
 4. src/super_gradients/examples/predict/detection_predict_streaming.py (+2 −2)
 5. src/super_gradients/examples/predict/detection_predict_video.py (+2 −2)
 6. src/super_gradients/module_interfaces/__init__.py (+2 −2)
 7. src/super_gradients/module_interfaces/module_interfaces.py (+28 −0)
 8. src/super_gradients/modules/__init__.py (+27 −0)
 9. src/super_gradients/modules/base_modules.py (+27 −0)
10. src/super_gradients/modules/detection_modules.py (+19 −26)
11. src/super_gradients/modules/head_replacement_utils.py (+47 −0)
12. src/super_gradients/modules/pose_estimation_modules.py (+1 −1)
13. src/super_gradients/recipes/arch_params/yolo_nas_l_arch_params.yaml (+112 −0)
14. src/super_gradients/recipes/arch_params/yolo_nas_m_arch_params.yaml (+112 −0)
15. src/super_gradients/recipes/arch_params/yolo_nas_s_arch_params.yaml (+112 −0)
16. src/super_gradients/recipes/coco2017_yolo_nas_s.yaml (+43 −0)
17. src/super_gradients/recipes/dataset_params/coco_detection_yolo_nas_dataset_params.yaml (+6 −5)
18. src/super_gradients/recipes/dataset_params/roboflow_detection_dataset_params.yaml (+25 −8)
19. src/super_gradients/recipes/roboflow_yolo_nas_m.yaml (+92 −0)
20. src/super_gradients/recipes/roboflow_yolo_nas_s.yaml (+92 −0)
21. src/super_gradients/recipes/roboflow_yolo_nas_s_qat.yaml (+18 −0)
22. src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml (+56 −0)
23. src/super_gradients/training/dataloaders/__init__.py (+4 −4)
24. src/super_gradients/training/dataloaders/dataloaders.py (+6 −6)
25. src/super_gradients/training/datasets/detection_datasets/roboflow/metadata.py (+2 −2)
26. src/super_gradients/training/models/__init__.py (+27 −9)
27. src/super_gradients/training/models/detection_models/csp_darknet53.py (+19 −7)
28. src/super_gradients/training/models/detection_models/customizable_detector.py (+4 −0)
29. src/super_gradients/training/models/detection_models/pp_yolo_e/__init__.py (+2 −2)
30. src/super_gradients/training/models/detection_models/pp_yolo_e/pan.py (+4 −4)
31. src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py (+3 −2)
32. src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py (+2 −1)
33. src/super_gradients/training/models/detection_models/yolo_nas/__init__.py (+26 −0)
34. src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py (+270 −0)
35. src/super_gradients/training/models/detection_models/yolo_nas/panneck.py (+64 −0)
36. src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py (+90 −0)
37. src/super_gradients/training/models/detection_models/yolo_nas/yolo_stages.py (+332 −0)
38. src/super_gradients/training/pipelines/pipelines.py (+7 −2)
39. src/super_gradients/training/pretrained_models.py (+4 −0)
40. src/super_gradients/training/processing/processing.py (+6 −6)
41. src/super_gradients/training/utils/checkpoint_utils.py (+10 −0)
src/super_gradients/common/object_names.py

@@ -304,6 +304,9 @@ class Models:
     DEKR_W32_NO_DC = "dekr_w32_no_dc"
     POSE_PP_YOLO_L = "pose_ppyolo_l"
     POSE_DDRNET_39 = "pose_ddrnet39"
+    YOLO_NAS_S = "yolo_nas_s"
+    YOLO_NAS_M = "yolo_nas_m"
+    YOLO_NAS_L = "yolo_nas_l"


 class ConcatenatedTensorFormats:
@@ -326,8 +329,8 @@ class Dataloaders:
     COCO2017_VAL = "coco2017_val"
     COCO2017_TRAIN_YOLOX = "coco2017_train_yolox"
     COCO2017_VAL_YOLOX = "coco2017_val_yolox"
-    COCO2017_TRAIN_DECIYOLO = "coco2017_train_deci_yolo"
-    COCO2017_VAL_DECIYOLO = "coco2017_val_deci_yolo"
+    COCO2017_TRAIN_YOLO_NAS = "coco2017_train_yolo_nas"
+    COCO2017_VAL_YOLO_NAS = "coco2017_val_yolo_nas"
     COCO2017_TRAIN_PPYOLOE = "coco2017_train_ppyoloe"
     COCO2017_VAL_PPYOLOE = "coco2017_val_ppyoloe"
     COCO2017_TRAIN_SSD_LITE_MOBILENET_V2 = "coco2017_train_ssd_lite_mobilenet_v2"
src/super_gradients/examples/predict/detection_predict.py

@@ -1,8 +1,8 @@
 from super_gradients.common.object_names import Models
 from super_gradients.training import models

-# Note that currently only YoloX and PPYoloE are supported.
-model = models.get(Models.PP_YOLOE_S, pretrained_weights="coco")
+# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
+model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

 IMAGES = [
     "../../../../documentation/source/images/examples/countryside.jpg",
src/super_gradients/examples/predict/detection_predict_image_folder.py

@@ -1,8 +1,8 @@
 from super_gradients.common.object_names import Models
 from super_gradients.training import models

-# Note that currently only YoloX and PPYoloE are supported.
-model = models.get(Models.YOLOX_N, pretrained_weights="coco")
+# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
+model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

 image_folder_path = "../../../../documentation/source/images/examples"
src/super_gradients/examples/predict/detection_predict_streaming.py

@@ -2,8 +2,8 @@ import torch
 from super_gradients.common.object_names import Models
 from super_gradients.training import models

-# Note that currently only YoloX and PPYoloE are supported.
-model = models.get(Models.YOLOX_N, pretrained_weights="coco")
+# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
+model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

 # We want to use cuda if available to speed up inference.
 model = model.to("cuda" if torch.cuda.is_available() else "cpu")
src/super_gradients/examples/predict/detection_predict_video.py

@@ -3,8 +3,8 @@ import torch
 from super_gradients.common.object_names import Models
 from super_gradients.training import models

-# Note that currently only YoloX and PPYoloE are supported.
-model = models.get(Models.YOLOX_N, pretrained_weights="coco")
+# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
+model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

 # We want to use cuda if available to speed up inference.
 model = model.to("cuda" if torch.cuda.is_available() else "cpu")
src/super_gradients/module_interfaces/__init__.py

@@ -1,3 +1,3 @@
-from .module_interfaces import HasPredict, HasPreprocessingParams
+from .module_interfaces import HasPredict, HasPreprocessingParams, SupportsReplaceNumClasses

-__all__ = ["HasPredict", "HasPreprocessingParams"]
+__all__ = ["HasPredict", "HasPreprocessingParams", "SupportsReplaceNumClasses"]
src/super_gradients/module_interfaces/module_interfaces.py

@@ -1,3 +1,6 @@
+from typing import Callable
+
+from torch import nn
 from typing_extensions import Protocol, runtime_checkable


@@ -31,3 +34,28 @@ class HasPredict(Protocol):

     def predict_webcam(self, *args, **kwargs):
         ...
+
+
+@runtime_checkable
+class SupportsReplaceNumClasses(Protocol):
+    """
+    Protocol interface for modules that support replacing the number of classes.
+    Derived classes should implement the `replace_num_classes` method.
+
+    This interface class serves the purpose of explicitly indicating whether a class supports optimized head replacement:
+
+    >>> class PredictionHead(nn.Module, SupportsReplaceNumClasses):
+    >>>    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module] = None):
+    >>>       ...
+    """
+
+    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
+        """
+        Replace the number of classes in the module.
+
+        :param num_classes: New number of classes.
+        :param compute_new_weights_fn: (callable) An optional function that computes the new weights for the new classes.
+            It takes existing nn.Module and returns a new one.
+        :return: None
+        """
+        ...
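A minimal sketch of how this protocol is meant to be consumed (the DummyHead class and channel sizes below are illustrative, not part of the PR). Because the protocol is decorated with @runtime_checkable, a plain isinstance check is enough and no inheritance is required:

from torch import nn

from super_gradients.module_interfaces import SupportsReplaceNumClasses


class DummyHead(nn.Module):
    # Illustrative head: a single 1x1 conv mapping features to per-class logits.
    def __init__(self, in_channels: int, num_classes: int):
        super().__init__()
        self.cls_pred = nn.Conv2d(in_channels, num_classes, kernel_size=1)

    def replace_num_classes(self, num_classes, compute_new_weights_fn):
        # Delegate construction of the new classifier layer to the provided callable.
        self.cls_pred = compute_new_weights_fn(self.cls_pred, num_classes)


head = DummyHead(in_channels=64, num_classes=80)
assert isinstance(head, SupportsReplaceNumClasses)  # structural check: the method exists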
src/super_gradients/modules/__init__.py

@@ -16,7 +16,23 @@ from super_gradients.modules.skip_connections import (
 from super_gradients.common.abstractions.abstract_logger import get_logger
 from super_gradients.common.registry.registry import ALL_DETECTION_MODULES

+from super_gradients.modules.base_modules import BaseDetectionModule
+from super_gradients.modules.detection_modules import (
+    PANNeck,
+    NHeads,
+    MultiOutputBackbone,
+    NStageBackbone,
+    MobileNetV1Backbone,
+    MobileNetV2Backbone,
+    SSDNeck,
+    SSDInvertedResidualNeck,
+    SSDBottleneckNeck,
+    SSDHead,
+)
+from super_gradients.module_interfaces import SupportsReplaceNumClasses
+
 __all__ = [
+    "BaseDetectionModule",
     "ALL_DETECTION_MODULES",
     "PixelShuffle",
     "AntiAliasDownsample",
@@ -33,6 +49,17 @@ __all__ = [
     "BackboneInternalSkipConnection",
     "HeadInternalSkipConnection",
     "LightweightDEKRHead",
+    "PANNeck",
+    "NHeads",
+    "MultiOutputBackbone",
+    "NStageBackbone",
+    "MobileNetV1Backbone",
+    "MobileNetV2Backbone",
+    "SSDNeck",
+    "SSDInvertedResidualNeck",
+    "SSDBottleneckNeck",
+    "SSDHead",
+    "SupportsReplaceNumClasses",
 ]

 logger = get_logger(__name__)
src/super_gradients/modules/base_modules.py (new file)

from abc import abstractmethod, ABC
from typing import Union, List

from torch import nn

__all__ = ["BaseDetectionModule"]


class BaseDetectionModule(nn.Module, ABC):
    """
    An interface for a module that is easy to integrate into a model with complex connections
    """

    def __init__(self, in_channels: Union[List[int], int], **kwargs):
        """
        :param in_channels: defines channels of tensor(s) that will be accepted by a module in forward
        """
        super().__init__()
        self.in_channels = in_channels

    @property
    @abstractmethod
    def out_channels(self) -> Union[List[int], int]:
        """
        :return: channels of tensor(s) that will be returned by a module in forward
        """
        raise NotImplementedError()
src/super_gradients/modules/detection_modules.py

@@ -1,37 +1,30 @@
+from abc import ABC, abstractmethod
 from typing import Union, List
-from abc import abstractmethod, ABC

 import torch
-from torch import nn
-from omegaconf.listconfig import ListConfig
 from omegaconf import DictConfig
-
+from omegaconf.listconfig import ListConfig
 from super_gradients.common.registry.registry import register_detection_module
+from super_gradients.modules.base_modules import BaseDetectionModule
+from super_gradients.modules.multi_output_modules import MultiOutputModule
+from super_gradients.training.models import MobileNet, MobileNetV2
 from super_gradients.training.models.classification_models.mobilenetv2 import InvertedResidual
 from super_gradients.training.utils.utils import HpmStruct
-from super_gradients.training.models import MobileNet, MobileNetV2
-from super_gradients.modules.multi_output_modules import MultiOutputModule
-
-
-class BaseDetectionModule(nn.Module, ABC):
-    """
-    An interface for a module that is easy to integrate into a model with complex connections
-    """
-
-    def __init__(self, in_channels: Union[List[int], int], **kwargs):
-        """
-        :param in_channels: defines channels of tensor(s) that will be accepted by a module in forward
-        """
-        super().__init__()
-        self.in_channels = in_channels
+from torch import nn

-    @property
-    @abstractmethod
-    def out_channels(self) -> Union[List[int], int]:
-        """
-        :return: channels of tensor(s) that will be returned by a module in forward
-        """
-        raise NotImplementedError()
+__all__ = [
+    "PANNeck",
+    "NHeads",
+    "MultiOutputBackbone",
+    "NStageBackbone",
+    "MobileNetV1Backbone",
+    "MobileNetV2Backbone",
+    "SSDNeck",
+    "SSDInvertedResidualNeck",
+    "SSDBottleneckNeck",
+    "SSDHead",
+    "BaseDetectionModule",
+]


 @register_detection_module()
src/super_gradients/modules/head_replacement_utils.py (new file)

from typing import Union

import torch
from torch import nn

__all__ = ["replace_num_classes_with_random_weights"]


def replace_num_classes_with_random_weights(module: Union[nn.Conv2d, nn.Linear, nn.Module], num_classes: int) -> nn.Module:
    """
    Replace the number of classes in the module with random weights.
    This is useful for replacing the output layer of a detection/classification head.
    This implementation supports Conv2d and Linear layers.
    Returned module will have the same device and dtype as the original module.
    Random weights are initialized with the same mean and std as the original weights.

    :param module: (nn.Module) Module to replace the number of classes in.
    :param num_classes: New number of classes.
    :return: nn.Module
    """
    if isinstance(module, nn.Conv2d):
        new_module = nn.Conv2d(
            module.in_channels,
            num_classes,
            kernel_size=module.kernel_size,
            stride=module.stride,
            padding=module.padding,
            dilation=module.dilation,
            groups=module.groups,
            bias=module.bias is not None,
            device=module.weight.device,
            dtype=module.weight.dtype,
        )
        torch.nn.init.normal_(new_module.weight, mean=module.weight.mean().item(), std=module.weight.std(dim=(0, 1, 2, 3)).item())
        if module.bias is not None:
            torch.nn.init.normal_(new_module.bias, mean=module.bias.mean().item(), std=module.bias.std(dim=0).item())
        return new_module
    elif isinstance(module, nn.Linear):
        new_module = nn.Linear(module.in_features, num_classes, device=module.weight.device, dtype=module.weight.dtype, bias=module.bias is not None)
        # Linear weights are 2D (out_features, in_features), so the std is taken over dims (0, 1).
        torch.nn.init.normal_(new_module.weight, mean=module.weight.mean().item(), std=module.weight.std(dim=(0, 1)).item())
        if module.bias is not None:
            torch.nn.init.normal_(new_module.bias, mean=module.bias.mean().item(), std=module.bias.std(dim=0).item())
        return new_module
    else:
        raise ValueError(f"Module {module} does not support replacing the number of classes")
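A quick sanity-check sketch for the helper above (layer sizes are illustrative):

from torch import nn

from super_gradients.modules.head_replacement_utils import replace_num_classes_with_random_weights

# e.g. an 80-class classifier branch of a detection head, re-targeted to 20 classes
old = nn.Conv2d(256, 80, kernel_size=1)
new = replace_num_classes_with_random_weights(old, num_classes=20)

assert new.out_channels == 20
assert new.weight.dtype == old.weight.dtype and new.weight.device == old.weight.device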
src/super_gradients/modules/pose_estimation_modules.py

@@ -5,7 +5,7 @@ from super_gradients.common.decorators.factory_decorator import resolve_param
 from super_gradients.common.factories.activations_type_factory import ActivationsTypeFactory
 from torch import nn, Tensor

-from super_gradients.modules.detection_modules import BaseDetectionModule
+from super_gradients.modules.base_modules import BaseDetectionModule
 from super_gradients.common.registry.registry import register_detection_module
src/super_gradients/recipes/arch_params/yolo_nas_l_arch_params.yaml (new file)

backbone:
  NStageBackbone:
    stem:
      YoloNASStem:
        out_channels: 48

    stages:
      - YoloNASStage:
          out_channels: 96
          num_blocks: 2
          activation_type: relu
          hidden_channels: 96
          concat_intermediates: True
      - YoloNASStage:
          out_channels: 192
          num_blocks: 3
          activation_type: relu
          hidden_channels: 128
          concat_intermediates: True
      - YoloNASStage:
          out_channels: 384
          num_blocks: 5
          activation_type: relu
          hidden_channels: 256
          concat_intermediates: True
      - YoloNASStage:
          out_channels: 768
          num_blocks: 2
          activation_type: relu
          hidden_channels: 512
          concat_intermediates: True

    context_module:
      SPP:
        output_channels: 768
        activation_type: relu
        k: [5,9,13]

    out_layers: [stage1, stage2, stage3, context_module]

neck:
  YoloNASPANNeckWithC2:
    neck1:
      YoloNASUpStage:
        out_channels: 192
        num_blocks: 4
        hidden_channels: 128
        width_mult: 1
        depth_mult: 1
        activation_type: relu
        reduce_channels: True

    neck2:
      YoloNASUpStage:
        out_channels: 96
        num_blocks: 4
        hidden_channels: 128
        width_mult: 1
        depth_mult: 1
        activation_type: relu
        reduce_channels: True

    neck3:
      YoloNASDownStage:
        out_channels: 192
        num_blocks: 4
        hidden_channels: 128
        activation_type: relu
        width_mult: 1
        depth_mult: 1

    neck4:
      YoloNASDownStage:
        out_channels: 384
        num_blocks: 4
        hidden_channels: 256
        activation_type: relu
        width_mult: 1
        depth_mult: 1

heads:
  NDFLHeads:
    num_classes: 80
    reg_max: 16

    heads_list:
      - YoloNASDFLHead:
          inter_channels: 128
          width_mult: 1
          first_conv_group_size: 0
          stride: 8
      - YoloNASDFLHead:
          inter_channels: 256
          width_mult: 1
          first_conv_group_size: 0
          stride: 16
      - YoloNASDFLHead:
          inter_channels: 512
          width_mult: 1
          first_conv_group_size: 0
          stride: 32

bn_eps: 1e-3
bn_momentum: 0.03
inplace_act: True

_convert_: all
src/super_gradients/recipes/arch_params/yolo_nas_m_arch_params.yaml (new file)

backbone:
  NStageBackbone:
    stem:
      YoloNASStem:
        out_channels: 48

    stages:
      - YoloNASStage:
          out_channels: 96
          num_blocks: 2
          activation_type: relu
          hidden_channels: 64
          concat_intermediates: True
      - YoloNASStage:
          out_channels: 192
          num_blocks: 3
          activation_type: relu
          hidden_channels: 128
          concat_intermediates: True
      - YoloNASStage:
          out_channels: 384
          num_blocks: 5
          activation_type: relu
          hidden_channels: 256
          concat_intermediates: True
      - YoloNASStage:
          out_channels: 768
          num_blocks: 2
          activation_type: relu
          hidden_channels: 384
          concat_intermediates: False

    context_module:
      SPP:
        output_channels: 768
        activation_type: relu
        k: [5,9,13]

    out_layers: [stage1, stage2, stage3, context_module]

neck:
  YoloNASPANNeckWithC2:
    neck1:
      YoloNASUpStage:
        out_channels: 192
        num_blocks: 2
        hidden_channels: 192
        width_mult: 1
        depth_mult: 1
        activation_type: relu
        reduce_channels: True

    neck2:
      YoloNASUpStage:
        out_channels: 96
        num_blocks: 3
        hidden_channels: 64
        width_mult: 1
        depth_mult: 1
        activation_type: relu
        reduce_channels: True

    neck3:
      YoloNASDownStage:
        out_channels: 192
        num_blocks: 2
        hidden_channels: 192
        activation_type: relu
        width_mult: 1
        depth_mult: 1

    neck4:
      YoloNASDownStage:
        out_channels: 384
        num_blocks: 3
        hidden_channels: 256
        activation_type: relu
        width_mult: 1
        depth_mult: 1

heads:
  NDFLHeads:
    num_classes: 80
    reg_max: 16

    heads_list:
      - YoloNASDFLHead:
          inter_channels: 128
          width_mult: 0.75
          first_conv_group_size: 0
          stride: 8
      - YoloNASDFLHead:
          inter_channels: 256
          width_mult: 0.75
          first_conv_group_size: 0
          stride: 16
      - YoloNASDFLHead:
          inter_channels: 512
          width_mult: 0.75
          first_conv_group_size: 0
          stride: 32

bn_eps: 1e-3
bn_momentum: 0.03
inplace_act: True

_convert_: all
src/super_gradients/recipes/arch_params/yolo_nas_s_arch_params.yaml (new file)

backbone:
  NStageBackbone:
    stem:
      YoloNASStem:
        out_channels: 48

    stages:
      - YoloNASStage:
          out_channels: 96
          num_blocks: 2
          activation_type: relu
          hidden_channels: 32
          concat_intermediates: False
      - YoloNASStage:
          out_channels: 192
          num_blocks: 3
          activation_type: relu
          hidden_channels: 64
          concat_intermediates: False
      - YoloNASStage:
          out_channels: 384
          num_blocks: 5
          activation_type: relu
          hidden_channels: 96
          concat_intermediates: False
      - YoloNASStage:
          out_channels: 768
          num_blocks: 2
          activation_type: relu
          hidden_channels: 192
          concat_intermediates: False

    context_module:
      SPP:
        output_channels: 768
        activation_type: relu
        k: [5,9,13]

    out_layers: [stage1, stage2, stage3, context_module]

neck:
  YoloNASPANNeckWithC2:
    neck1:
      YoloNASUpStage:
        out_channels: 192
        num_blocks: 2
        hidden_channels: 64
        width_mult: 1
        depth_mult: 1
        activation_type: relu
        reduce_channels: True

    neck2:
      YoloNASUpStage:
        out_channels: 96
        num_blocks: 2
        hidden_channels: 48
        width_mult: 1
        depth_mult: 1
        activation_type: relu
        reduce_channels: True

    neck3:
      YoloNASDownStage:
        out_channels: 192
        num_blocks: 2
        hidden_channels: 64
        activation_type: relu
        width_mult: 1
        depth_mult: 1

    neck4:
      YoloNASDownStage:
        out_channels: 384
        num_blocks: 2
        hidden_channels: 64
        activation_type: relu
        width_mult: 1
        depth_mult: 1

heads:
  NDFLHeads:
    num_classes: 80
    reg_max: 16

    heads_list:
      - YoloNASDFLHead:
          inter_channels: 128
          width_mult: 0.5
          first_conv_group_size: 0
          stride: 8
      - YoloNASDFLHead:
          inter_channels: 256
          width_mult: 0.5
          first_conv_group_size: 0
          stride: 16
      - YoloNASDFLHead:
          inter_channels: 512
          width_mult: 0.5
          first_conv_group_size: 0
          stride: 32

bn_eps: 1e-3
bn_momentum: 0.03
inplace_act: True

_convert_: all
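The three arch-params files above back the Models.YOLO_NAS_S/M/L names registered in object_names.py. A minimal usage sketch (the num_classes override mirrors the recipes; the exact set of supported kwargs is an assumption):

from super_gradients.common.object_names import Models
from super_gradients.training import models

# COCO-pretrained weights, as in the updated predict examples above.
model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco")

# Or a fresh model with a custom class count.
model = models.get(Models.YOLO_NAS_S, num_classes=20)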
src/super_gradients/recipes/coco2017_yolo_nas_s.yaml (new file)

# YoloNAS-S Detection training on COCO2017 Dataset:
# This training recipe is for demonstration purposes only. Pretrained models were trained using a different recipe.
# So it will not be possible to reproduce the results of the pretrained models using this recipe.

# Instructions:
#   0. Make sure that the data is stored in dataset_params.dataset_dir or add "dataset_params.data_dir=<PATH-TO-DATASET>" at the end of the command below (feel free to check ReadMe)
#   1. Move to the project root (where you will find the ReadMe and src folder)
#   2. Run the command you want:
#      yolo_nas_s: python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolo_nas_s
#

defaults:
  - training_hyperparams: coco2017_yolo_nas_train_params
  - dataset_params: coco_detection_yolo_nas_dataset_params
  - arch_params: yolo_nas_s_arch_params
  - checkpoint_params: default_checkpoint_params
  - _self_
  - variable_setup

train_dataloader: coco2017_train_yolo_nas
val_dataloader: coco2017_val_yolo_nas

load_checkpoint: False
resume: False

dataset_params:
  train_dataloader_params:
    batch_size: 32

arch_params:
  num_classes: 80

training_hyperparams:
  resume: ${resume}
  mixed_precision: True

architecture: yolo_nas_s

multi_gpu: DDP
num_gpus: 8

experiment_suffix: ""
experiment_name: coco2017_${architecture}${experiment_suffix}
src/super_gradients/recipes/dataset_params/coco_detection_yolo_nas_dataset_params.yaml

@@ -30,15 +30,15 @@ train_dataset_params:
         mixup_scale: [ 0.5, 1.5 ]         # random rescale range for the additional sample in mixup
         prob: 0.5                       # probability to apply per-sample mixup
         flip_prob: 0.5                  # probability to apply horizontal flip
-    - DetectionStandardizeImage:
-        max_value: 255.
     - DetectionPaddedRescale:
         input_dim: [640, 640]
         max_targets: 120
         pad_value: 114
+    - DetectionStandardize:
+        max_value: 255.
     - DetectionTargetsFormatTransform:
         max_targets: 256
-        output_format: LABEL_NORMALIZED_CXCYWH
+        output_format: LABEL_CXCYWH

   tight_box_rotation: False
   class_inclusion_list:
@@ -67,13 +67,13 @@ val_dataset_params:
     - DetectionPadToSize:
         output_size: [640, 640]
         pad_value: 114
-    - DetectionStandardizeImage:
+    - DetectionStandardize:
         max_value: 255.
     - DetectionImagePermute
     - DetectionTargetsFormatTransform:
         max_targets: 50
         input_dim: [640, 640]
-        output_format: LABEL_NORMALIZED_CXCYWH
+        output_format: LABEL_CXCYWH
   tight_box_rotation: False
   class_inclusion_list:
   max_num_samples:
@@ -83,6 +83,7 @@ val_dataloader_params:
   batch_size: 25
   num_workers: 8
   drop_last: False
+  shuffle: False
   pin_memory: True
   collate_fn:
     _target_: super_gradients.training.utils.detection_utils.CrowdDetectionCollateFN
src/super_gradients/recipes/dataset_params/roboflow_detection_dataset_params.yaml

@@ -9,18 +9,27 @@ train_dataset_params:
   input_dim: [640, 640]
   cache_dir:
   cache: False
+  ignore_empty_annotations: False
   transforms:
+    - DetectionMosaic:
+        input_dim: ${dataset_params.train_dataset_params.input_dim}
+        prob: 1.
     - DetectionRandomAffine:
         degrees: 0.                  # rotation degrees, randomly sampled from [-degrees, degrees]
         translate: 0.1                # image translation fraction
         scales: [ 0.5, 1.5 ]              # random rescale range (keeps size by padding/cropping) after mosaic transform.
         shear: 0.0                    # shear degrees, randomly sampled from [-degrees, degrees]
         target_size: ${dataset_params.train_dataset_params.input_dim}
-        filter_box_candidates: True   # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio.
+        filter_box_candidates: False  # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio.
         wh_thr: 2                     # edge size threshold when filter_box_candidates = True (pixels)
         area_thr: 0.1                 # threshold for area ratio between original image and the transformed one, when filter_box_candidates = True
         ar_thr: 20                    # aspect ratio threshold when filter_box_candidates = True
         border_value: 128
+#    - DetectionMixup:
+#        input_dim: ${dataset_params.train_dataset_params.input_dim}
+#        mixup_scale: [ 0.5, 1.5 ]         # random rescale range for the additional sample in mixup
+#        prob: 1.0                       # probability to apply per-sample mixup
+#        flip_prob: 0.5                  # probability to apply horizontal flip
     - DetectionHSV:
         prob: 1.0                       # probability to apply HSV transform
         hgain: 5                        # HSV transform hue gain (randomly sampled from [-hgain, hgain])
@@ -30,8 +39,11 @@ train_dataset_params:
         prob: 0.5                       # probability to apply horizontal flip
     - DetectionPaddedRescale:
         input_dim: ${dataset_params.train_dataset_params.input_dim}
-        max_targets: 120
+        max_targets: 300
+    - DetectionStandardize:
+        max_value: 255.
     - DetectionTargetsFormatTransform:
+        max_targets: 300
         input_dim: ${dataset_params.train_dataset_params.input_dim}
         output_format: LABEL_CXCYWH
   tight_box_rotation: False
@@ -43,8 +55,8 @@ train_dataset_params:
 train_dataloader_params:
   shuffle: True
   batch_size: 16
-  num_workers: 0
-  sampler:
+  min_samples: 512
+  num_workers: 4
   drop_last: False
   pin_memory: True
   worker_init_fn:
@@ -60,11 +72,16 @@ val_dataset_params:
   input_dim: [640, 640]
   cache_dir:
   cache: False
+  ignore_empty_annotations: False
   transforms:
   - DetectionPaddedRescale:
       input_dim: ${dataset_params.val_dataset_params.input_dim}
+      max_targets: 300
+      pad_value: 114
+  - DetectionStandardize:
+      max_value: 255.
   - DetectionTargetsFormatTransform:
-      max_targets: 50
+      max_targets: 300
       input_dim: ${dataset_params.val_dataset_params.input_dim}
       output_format: LABEL_CXCYWH
   tight_box_rotation: False
@@ -74,10 +91,10 @@ val_dataset_params:
   verbose: 0

 val_dataloader_params:
-  batch_size: 64
-  num_workers: 0
-  sampler:
+  batch_size: 32
+  num_workers: 4
   drop_last: False
+  shuffle: False
   pin_memory: True
   collate_fn: # collate function for valset
     _target_: super_gradients.training.utils.detection_utils.CrowdDetectionCollateFN
src/super_gradients/recipes/roboflow_yolo_nas_m.yaml (new file)

# A recipe to fine-tune YoloNAS on Roboflow datasets.
# Checkout the datasets at https://universe.roboflow.com/roboflow-100?ref=blog.roboflow.com
#
# `dataset_name` refers to the official name of the dataset.
# You can find it in the url of the dataset: https://universe.roboflow.com/roboflow-100/digits-t2eg6 -> digits-t2eg6
#
# Example: python -m super_gradients.train_from_recipe --config-name=roboflow_yolo_nas_m dataset_name=digits-t2eg6

defaults:
  - training_hyperparams: coco2017_yolo_nas_train_params
  - dataset_params: roboflow_detection_dataset_params
  - checkpoint_params: default_checkpoint_params
  - arch_params: yolo_nas_m_arch_params
  - _self_
  - variable_setup

train_dataloader: roboflow_train_yolox
val_dataloader: roboflow_val_yolox

dataset_name: ??? # Placeholder for the name of the dataset you want to use (e.g. "digits-t2eg6")

dataset_params:
  dataset_name: ${dataset_name}
  train_dataloader_params:
    batch_size: 12
  val_dataloader_params:
    batch_size: 16

num_classes: ${roboflow_dataset_num_classes:${dataset_name}}

architecture: yolo_nas_m
arch_params:
  num_classes: ${num_classes}

load_checkpoint: False
checkpoint_params:
  pretrained_weights: coco

result_path: # By defaults saves results in checkpoints directory

resume: False
training_hyperparams:
  resume: ${resume}
  zero_weight_decay_on_bias_and_bn: True

  lr_warmup_epochs: 3
  warmup_mode: linear_epoch_step
  initial_lr: 4e-4
  cosine_final_lr_ratio: 0.1
  optimizer_params:
    weight_decay: 0.0001

  ema: True
  ema_params:
    decay: 0.9

  max_epochs: 100
  mixed_precision: True

  criterion_params:
    num_classes: ${num_classes}

  phase_callbacks: []

  loss:
    ppyoloe_loss:
      num_classes: ${num_classes}
      reg_max: 16

  valid_metrics_list:
    - DetectionMetrics_050:
        score_thres: 0.1
        top_k_predictions: 300
        num_cls: ${num_classes}
        normalize_targets: True
        post_prediction_callback:
          _target_: super_gradients.training.models.detection_models.pp_yolo_e.PPYoloEPostPredictionCallback
          score_threshold: 0.01
          nms_top_k: 1000
          max_predictions: 300
          nms_threshold: 0.7

  metric_to_watch: 'mAP@0.50'

multi_gpu: Off
num_gpus: 1

experiment_suffix: ""
experiment_name: ${architecture}_roboflow_${dataset_name}${experiment_suffix}
src/super_gradients/recipes/roboflow_yolo_nas_s.yaml (new file)

# A recipe to fine-tune YoloNAS on Roboflow datasets.
# Checkout the datasets at https://universe.roboflow.com/roboflow-100?ref=blog.roboflow.com
#
# `dataset_name` refers to the official name of the dataset.
# You can find it in the url of the dataset: https://universe.roboflow.com/roboflow-100/digits-t2eg6 -> digits-t2eg6
#
# Example: python -m super_gradients.train_from_recipe --config-name=roboflow_yolo_nas_s dataset_name=digits-t2eg6

defaults:
  - training_hyperparams: coco2017_yolo_nas_train_params
  - dataset_params: roboflow_detection_dataset_params
  - checkpoint_params: default_checkpoint_params
  - arch_params: yolo_nas_s_arch_params
  - _self_
  - variable_setup

train_dataloader: roboflow_train_yolox
val_dataloader: roboflow_val_yolox

dataset_name: ??? # Placeholder for the name of the dataset you want to use (e.g. "digits-t2eg6")

dataset_params:
  dataset_name: ${dataset_name}
  train_dataloader_params:
    batch_size: 16
  val_dataloader_params:
    batch_size: 16

num_classes: ${roboflow_dataset_num_classes:${dataset_name}}

architecture: yolo_nas_s
arch_params:
  num_classes: ${num_classes}

load_checkpoint: False
checkpoint_params:
  pretrained_weights: coco

result_path: # By defaults saves results in checkpoints directory

resume: False
training_hyperparams:
  resume: ${resume}
  zero_weight_decay_on_bias_and_bn: True

  lr_warmup_epochs: 3
  warmup_mode: linear_epoch_step
  initial_lr: 5e-4
  cosine_final_lr_ratio: 0.1
  optimizer_params:
    weight_decay: 0.0001

  ema: True
  ema_params:
    decay: 0.9

  max_epochs: 100
  mixed_precision: True

  criterion_params:
    num_classes: ${num_classes}

  phase_callbacks: []

  loss:
    ppyoloe_loss:
      num_classes: ${num_classes}
      reg_max: 16

  valid_metrics_list:
    - DetectionMetrics_050:
        score_thres: 0.1
        top_k_predictions: 300
        num_cls: ${num_classes}
        normalize_targets: True
        post_prediction_callback:
          _target_: super_gradients.training.models.detection_models.pp_yolo_e.PPYoloEPostPredictionCallback
          score_threshold: 0.01
          nms_top_k: 1000
          max_predictions: 300
          nms_threshold: 0.7

  metric_to_watch: 'mAP@0.50'

multi_gpu: Off
num_gpus: 1

experiment_suffix: ""
experiment_name: ${architecture}_roboflow_${dataset_name}${experiment_suffix}
src/super_gradients/recipes/roboflow_yolo_nas_s_qat.yaml (new file)

defaults:
  - roboflow_yolo_nas_s
  - quantization_params: default_quantization_params
  - _self_

checkpoint_params:
  checkpoint_path: ???
  strict_load: no_key_matching

pre_launch_callbacks_list:
  - QATRecipeModificationCallback:
      batch_size_divisor: 2
      max_epochs_divisor: 10
      lr_decay_factor: 0.01
      warmup_epochs_divisor: 10
      cosine_final_lr_ratio: 0.01
      disable_phase_callbacks: True
      disable_augmentations: False
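Note for reviewers: `checkpoint_path: ???` is a mandatory Hydra value, so this recipe cannot run without an explicit override. A plausible launch, assuming the same entry point as the other Roboflow recipes above (the QAT flow may use a dedicated entry point; the dataset name and path are placeholders):

python -m super_gradients.train_from_recipe --config-name=roboflow_yolo_nas_s_qat \
    dataset_name=digits-t2eg6 \
    checkpoint_params.checkpoint_path=/path/to/ckpt_best.pth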
src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml (new file)

defaults:
  - default_train_params

max_epochs: 300

warmup_mode: "linear_batch_step"
warmup_initial_lr: 1e-6
lr_warmup_steps: 1000
lr_warmup_epochs: 0

initial_lr: 2e-4
lr_mode: cosine
cosine_final_lr_ratio: 0.1

zero_weight_decay_on_bias_and_bn: True
batch_accumulate: 1

save_ckpt_epoch_list: [100, 200, 250]

loss:
  ppyoloe_loss:
    use_static_assigner: False
    num_classes: ${arch_params.num_classes}
    reg_max: 16

optimizer: AdamW
optimizer_params:
  weight_decay: 0.00001

ema: True
ema_params:
  decay: 0.9997
  decay_type: threshold

mixed_precision: False
sync_bn: True

valid_metrics_list:
  - DetectionMetrics:
      score_thres: 0.1
      top_k_predictions: 300
      num_cls: ${arch_params.num_classes}
      normalize_targets: True
      post_prediction_callback:
        _target_: super_gradients.training.models.detection_models.pp_yolo_e.PPYoloEPostPredictionCallback
        score_threshold: 0.01
        nms_top_k: 1000
        max_predictions: 300
        nms_threshold: 0.7

pre_prediction_callback:

metric_to_watch: 'mAP@0.50:0.95'
greater_metric_to_watch_is_better: True

_convert_: all
src/super_gradients/training/dataloaders/__init__.py

@@ -9,8 +9,8 @@ from .dataloaders import (
     coco2017_val_ppyoloe,
     coco2017_pose_train,
     coco2017_pose_val,
-    coco2017_train_deci_yolo,
-    coco2017_val_deci_yolo,
+    coco2017_train_yolo_nas,
+    coco2017_val_yolo_nas,
     imagenet_train,
     imagenet_val,
     imagenet_efficientnet_train,
@@ -68,8 +68,8 @@ __all__ = [
     "coco2017_val_ppyoloe",
     "coco2017_pose_train",
     "coco2017_pose_val",
-    "coco2017_train_deci_yolo",
-    "coco2017_val_deci_yolo",
+    "coco2017_train_yolo_nas",
+    "coco2017_val_yolo_nas",
     "imagenet_train",
     "imagenet_val",
     "imagenet_efficientnet_train",
src/super_gradients/training/dataloaders/dataloaders.py

@@ -172,10 +172,10 @@ def coco2017_val(dataset_params: Dict = None, dataloader_params: Dict = None) ->
     )


-@register_dataloader(Dataloaders.COCO2017_TRAIN_DECIYOLO)
-def coco2017_train_deci_yolo(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
+@register_dataloader(Dataloaders.COCO2017_TRAIN_YOLO_NAS)
+def coco2017_train_yolo_nas(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
     return get_data_loader(
-        config_name="coco_detection_deci_yolo_dataset_params",
+        config_name="coco_detection_yolo_nas_dataset_params",
         dataset_cls=COCODetectionDataset,
         train=True,
         dataset_params=dataset_params,
@@ -183,10 +183,10 @@ def coco2017_train_deci_yolo(dataset_params: Dict = None, dataloader_params: Dic
     )


-@register_dataloader(Dataloaders.COCO2017_VAL_DECIYOLO)
-def coco2017_val_deci_yolo(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
+@register_dataloader(Dataloaders.COCO2017_VAL_YOLO_NAS)
+def coco2017_val_yolo_nas(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
     return get_data_loader(
-        config_name="coco_detection_deci_yolo_dataset_params",
+        config_name="coco_detection_yolo_nas_dataset_params",
         dataset_cls=COCODetectionDataset,
         train=False,
         dataset_params=dataset_params,
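A short sketch of the renamed loaders in use (override values are illustrative). Both factories resolve coco_detection_yolo_nas_dataset_params by default, with any passed dicts applied on top of the config:

from super_gradients.training.dataloaders import coco2017_train_yolo_nas, coco2017_val_yolo_nas

train_loader = coco2017_train_yolo_nas(dataloader_params={"batch_size": 16})
val_loader = coco2017_val_yolo_nas()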
src/super_gradients/training/datasets/detection_datasets/roboflow/metadata.py

@@ -33,7 +33,7 @@ DATASETS_METADATA = {
     "underwater-objects-5v7p8": {"category": "underwater", "train": 5320, "test": 760, "valid": 1520, "size": 7600, "num_classes": 5, "num_classes_found": 5},
     "coral-lwptl": {"category": "underwater", "train": 427, "test": 74, "valid": 93, "size": 594, "num_classes": 14, "num_classes_found": 14},
     "tweeter-posts": {"category": "documents", "train": 87, "test": 9, "valid": 21, "size": 117, "num_classes": 2, "num_classes_found": 2},
-    "tweeter-profile": {"category": "documents", "train": 425, "test": 61, "valid": 121, "size": 607, "num_classes": 1, "num_classes_found": 0},
+    "tweeter-profile": {"category": "documents", "train": 425, "test": 61, "valid": 121, "size": 607, "num_classes": 1, "num_classes_found": 1},
     "document-parts": {"category": "documents", "train": 906, "test": 150, "valid": 318, "size": 1374, "num_classes": 2, "num_classes_found": 2},
     "activity-diagrams-qdobr": {"category": "documents", "train": 259, "test": 45, "valid": 74, "size": 378, "num_classes": 19, "num_classes_found": 19},
     "signatures-xc8up": {"category": "documents", "train": 257, "test": 37, "valid": 74, "size": 368, "num_classes": 1, "num_classes_found": 1},
@@ -148,7 +148,7 @@ _NUM_CLASSES_FOUND = {
     "underwater-objects-5v7p8": 5,
     "coral-lwptl": 14,
     "tweeter-posts": 2,
-    "tweeter-profile": 0,
+    "tweeter-profile": 1,
     "document-parts": 2,
     "activity-diagrams-qdobr": 19,
     "signatures-xc8up": 1,
src/super_gradients/training/models/__init__.py

@@ -62,13 +62,26 @@ from super_gradients.training.models.classification_models.vgg import VGG
 from super_gradients.training.models.classification_models.vit import ViT, ViTBase, ViTLarge, ViTHuge

 # Detection models
-from super_gradients.training.models.detection_models.csp_darknet53 import CSPDarknet53
-from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_e import PPYoloE, PPYoloE_S, PPYoloE_M, PPYoloE_L, PPYoloE_X
+from super_gradients.training.models.detection_models.csp_darknet53 import CSPDarknet53, SPP
+from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloE, PPYoloE_S, PPYoloE_M, PPYoloE_L, PPYoloE_X
 from super_gradients.training.models.detection_models.darknet53 import Darknet53, Darknet53Base
 from super_gradients.training.models.detection_models.ssd import SSDMobileNetV1, SSDLiteMobileNetV2
 from super_gradients.training.models.detection_models.yolo_base import YoloBase, YoloPostPredictionCallback
 from super_gradients.training.models.detection_models.yolox import YoloX_N, YoloX_T, YoloX_S, YoloX_M, YoloX_L, YoloX_X, CustomYoloX
 from super_gradients.training.models.detection_models.customizable_detector import CustomizableDetector
+from super_gradients.training.models.detection_models.yolo_nas import (
+    YoloNASStage,
+    YoloNASStem,
+    YoloNASDownStage,
+    YoloNASUpStage,
+    YoloNASBottleneck,
+    YoloNASDFLHead,
+    NDFLHeads,
+    YoloNASPANNeckWithC2,
+    YoloNAS_S,
+    YoloNAS_M,
+    YoloNAS_L,
+)

 # Segmentation models
 from super_gradients.training.models.segmentation_models.shelfnet import (
@@ -96,7 +109,6 @@ from super_gradients.training.models.segmentation_models.stdc import (
     STDCSegmentationBase,
     CustomSTDCSegmentation,
 )
-from super_gradients.training.models.segmentation_models.segformer import SegFormerB0, SegFormerB1, SegFormerB2, SegFormerB3, SegFormerB4, SegFormerB5

 # Pose estimation
 from super_gradients.training.models.pose_estimation_models.pose_ppyolo import PosePPYoloL
@@ -116,6 +128,18 @@ from super_gradients.common.object_names import Models
 from super_gradients.common.registry.registry import ARCHITECTURES

 __all__ = [
+    "SPP",
+    "YoloNAS_S",
+    "YoloNAS_M",
+    "YoloNAS_L",
+    "YoloNASStage",
+    "YoloNASUpStage",
+    "YoloNASStem",
+    "YoloNASDownStage",
+    "YoloNASDFLHead",
+    "YoloNASBottleneck",
+    "NDFLHeads",
+    "YoloNASPANNeckWithC2",
     "SgModule",
     "Beit",
     "BeitLargePatch16_224",
@@ -259,10 +283,4 @@ __all__ = [
     "ARCHITECTURES",
     "Models",
     "user_models",
-    "SegFormerB0",
-    "SegFormerB1",
-    "SegFormerB2",
-    "SegFormerB3",
-    "SegFormerB4",
-    "SegFormerB5",
 ]
src/super_gradients/training/models/detection_models/csp_darknet53.py

@@ -7,9 +7,11 @@ from typing import Tuple, Type
 import torch
 import torch.nn as nn

+from super_gradients.common.decorators.factory_decorator import resolve_param
+from super_gradients.common.factories.activations_type_factory import ActivationsTypeFactory
 from super_gradients.common.object_names import Models
-from super_gradients.common.registry.registry import register_model
-from super_gradients.modules import Residual, Conv
+from super_gradients.common.registry.registry import register_model, register_detection_module
+from super_gradients.modules import Residual, Conv, BaseDetectionModule
 from super_gradients.modules.utils import width_multiplier
 from super_gradients.training.models.sg_module import SgModule
 from super_gradients.training.utils.utils import get_param, HpmStruct
@@ -127,13 +129,16 @@ class BottleneckCSP(nn.Module):
         return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))


-class SPP(nn.Module):
+@register_detection_module()
+class SPP(BaseDetectionModule):
     # SPATIAL PYRAMID POOLING LAYER
-    def __init__(self, input_channels, output_channels, k: Tuple, activation_type: Type[nn.Module]):
-        super().__init__()
+    @resolve_param("activation_type", ActivationsTypeFactory())
+    def __init__(self, in_channels, output_channels, k: Tuple, activation_type: Type[nn.Module]):
+        super().__init__(in_channels)
+        self._output_channels = output_channels

-        hidden_channels = input_channels // 2
-        self.cv1 = Conv(input_channels, hidden_channels, 1, 1, activation_type)
+        hidden_channels = in_channels // 2
+        self.cv1 = Conv(in_channels, hidden_channels, 1, 1, activation_type)
         self.cv2 = Conv(hidden_channels * (len(k) + 1), output_channels, 1, 1, activation_type)
         self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

@@ -141,6 +146,13 @@
         x = self.cv1(x)
         return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))

+    @property
+    def out_channels(self):
+        """
+        :return: channels of tensor(s) that will be returned by a module in forward
+        """
+        return self._output_channels
+

 class ViewModule(nn.Module):
     """
src/super_gradients/training/models/detection_models/customizable_detector.py

@@ -12,6 +12,8 @@ from omegaconf import DictConfig

 from super_gradients.common.decorators.factory_decorator import resolve_param
 from super_gradients.common.factories.processing_factory import ProcessingFactory
+from super_gradients.module_interfaces import SupportsReplaceNumClasses
+from super_gradients.modules.head_replacement_utils import replace_num_classes_with_random_weights
 from super_gradients.training.utils.utils import HpmStruct
 from super_gradients.training.models.sg_module import SgModule
 import super_gradients.common.factories.detection_modules_factory as det_factory
@@ -102,6 +104,8 @@ class CustomizableDetector(SgModule):
             raise ValueError("At least one of new_num_classes, new_head must be given to replace output layer.")
         if new_head is not None:
             self.heads = new_head
+        elif isinstance(self.heads, SupportsReplaceNumClasses):
+            self.heads.replace_num_classes(new_num_classes, replace_num_classes_with_random_weights)
         else:
             factory = det_factory.DetectionModulesFactory()
             self.heads_params = factory.insert_module_param(self.heads_params, "num_classes", new_num_classes)
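With this change, heads implementing SupportsReplaceNumClasses keep the pretrained backbone and neck weights and re-initialize only the class-dependent layers. A minimal fine-tuning sketch (assuming the enclosing method is CustomizableDetector.replace_head, consistent with the new_num_classes/new_head parameters visible in this hunk):

from super_gradients.common.object_names import Models
from super_gradients.training import models

model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco")
model.replace_head(new_num_classes=3)  # re-targets the head to a 3-class problem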
src/super_gradients/training/models/detection_models/pp_yolo_e/__init__.py

@@ -1,4 +1,4 @@
-from .pp_yolo_e import PPYoloE
+from .pp_yolo_e import PPYoloE, PPYoloE_S, PPYoloE_M, PPYoloE_L, PPYoloE_X
 from .post_prediction_callback import PPYoloEPostPredictionCallback

-__all__ = ["PPYoloE", "PPYoloEPostPredictionCallback"]
+__all__ = ["PPYoloE", "PPYoloEPostPredictionCallback", "PPYoloE_L", "PPYoloE_M", "PPYoloE_S", "PPYoloE_X"]
@@ -10,10 +10,10 @@ from super_gradients.common.factories.activations_type_factory import Activation
 from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBasicBlock
 from super_gradients.modules import ConvBNAct

-__all__ = ["CustomCSPPAN"]
+__all__ = ["PPYoloECSPPAN"]


-class SPP(nn.Module):
+class PPYoloESPP(nn.Module):
     def __init__(
         self,
         in_channels: int,
@@ -52,7 +52,7 @@ class CSPStage(nn.Module):
         for i in range(n):
             convs.append((str(i), CSPResNetBasicBlock(next_ch_in, ch_mid, activation_type=activation_type, use_residual_connection=False)))
             if i == (n - 1) // 2 and spp:
-                convs.append(("spp", SPP(ch_mid, ch_mid, 1, (5, 9, 13), activation_type=activation_type)))
+                convs.append(("spp", PPYoloESPP(ch_mid, ch_mid, 1, (5, 9, 13), activation_type=activation_type)))
             next_ch_in = ch_mid

         self.convs = nn.Sequential(collections.OrderedDict(convs))
@@ -68,7 +68,7 @@ class CSPStage(nn.Module):


 @register_detection_module()
-class CustomCSPPAN(nn.Module):
+class PPYoloECSPPAN(nn.Module):
     @resolve_param("activation", ActivationsTypeFactory())
     def __init__(
         self,
@@ -1,6 +1,7 @@
 from typing import Union, Optional, List

 from torch import Tensor
+
 from super_gradients.common.decorators.factory_decorator import resolve_param
 from super_gradients.common.factories.processing_factory import ProcessingFactory
 from super_gradients.common.registry.registry import register_model
@@ -8,7 +9,7 @@ from super_gradients.common.object_names import Models
 from super_gradients.modules import RepVGGBlock
 from super_gradients.training.models.sg_module import SgModule
 from super_gradients.training.models.detection_models.csp_resnet import CSPResNetBackbone
-from super_gradients.training.models.detection_models.pp_yolo_e.pan import CustomCSPPAN
+from super_gradients.training.models.detection_models.pp_yolo_e.pan import PPYoloECSPPAN
 from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead
 from super_gradients.training.utils import HpmStruct
 from super_gradients.training.models.arch_params_factory import get_arch_params
@@ -26,7 +27,7 @@ class PPYoloE(SgModule):
             arch_params = arch_params.to_dict()

         self.backbone = CSPResNetBackbone(**arch_params["backbone"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
-        self.neck = CustomCSPPAN(**arch_params["neck"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
+        self.neck = PPYoloECSPPAN(**arch_params["neck"], depth_mult=arch_params["depth_mult"], width_mult=arch_params["width_mult"])
         self.head = PPYOLOEHead(**arch_params["head"], width_mult=arch_params["width_mult"], num_classes=arch_params["num_classes"])

         self._class_names: Optional[List[str]] = None
@@ -175,11 +175,12 @@ class PPYOLOEHead(nn.Module):
     @torch.jit.ignore
     def replace_num_classes(self, num_classes: int):
         bias_cls = bias_init_with_prob(0.01)
+        device = self.pred_cls[0].weight.device
         self.pred_cls = nn.ModuleList()
         self.num_classes = num_classes

         for in_c in self.in_channels:
-            predict_layer = nn.Conv2d(in_c, num_classes, 3, padding=1)
+            predict_layer = nn.Conv2d(in_c, num_classes, 3, padding=1, device=device)
             torch.nn.init.constant_(predict_layer.weight, 0.0)
             torch.nn.init.constant_(predict_layer.bias, bias_cls)
             self.pred_cls.append(predict_layer)
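The captured device matters: recreating pred_cls without it would place the new convolutions on CPU even when the rest of the model lives on GPU, breaking the next forward pass. A standalone sketch of the pattern (not SG code; nn.Conv2d's device kwarg needs torch >= 1.9):

    import torch
    from torch import nn

    old = nn.Conv2d(64, 80, 3, padding=1)
    old = old.cuda() if torch.cuda.is_available() else old
    device = old.weight.device                            # capture before discarding the old layer
    new = nn.Conv2d(64, 20, 3, padding=1, device=device)  # new layer lands on the same device
    assert new.weight.device == old.weight.device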
from super_gradients.training.models.detection_models.yolo_nas.dfl_heads import YoloNASDFLHead, NDFLHeads
from super_gradients.training.models.detection_models.yolo_nas.panneck import YoloNASPANNeckWithC2
from super_gradients.training.models.detection_models.yolo_nas.yolo_stages import (
    YoloNASStage,
    YoloNASStem,
    YoloNASDownStage,
    YoloNASUpStage,
    YoloNASBottleneck,
)
from super_gradients.training.models.detection_models.yolo_nas.yolo_nas_variants import YoloNAS_S, YoloNAS_M, YoloNAS_L

__all__ = [
    "YoloNASBottleneck",
    "YoloNASUpStage",
    "YoloNASDownStage",
    "YoloNASStem",
    "YoloNASStage",
    "NDFLHeads",
    "YoloNASDFLHead",
    "YoloNASPANNeckWithC2",
    "YoloNAS_S",
    "YoloNAS_M",
    "YoloNAS_L",
]
import math
from typing import Tuple, Union, List, Callable, Optional

import torch
from omegaconf import DictConfig
from torch import nn, Tensor

import super_gradients.common.factories.detection_modules_factory as det_factory
from super_gradients.common.registry import register_detection_module
from super_gradients.modules import ConvBNReLU
from super_gradients.modules.base_modules import BaseDetectionModule
from super_gradients.module_interfaces import SupportsReplaceNumClasses
from super_gradients.modules.utils import width_multiplier
from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import generate_anchors_for_grid_cell
from super_gradients.training.utils import HpmStruct, torch_version_is_greater_or_equal
from super_gradients.training.utils.bbox_utils import batch_distance2bbox


@register_detection_module()
class YoloNASDFLHead(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(self, in_channels: int, inter_channels: int, width_mult: float, first_conv_group_size: int, num_classes: int, stride: int, reg_max: int):
        """
        Initialize the YoloNASDFLHead
        :param in_channels: Input channels
        :param inter_channels: Intermediate number of channels
        :param width_mult: Width multiplier
        :param first_conv_group_size: Group size
        :param num_classes: Number of detection classes
        :param stride: Output stride for this head
        :param reg_max: Number of bins in the regression head
        """
        super().__init__(in_channels)

        inter_channels = width_multiplier(inter_channels, width_mult, 8)
        if first_conv_group_size == 0:
            groups = 0
        elif first_conv_group_size == -1:
            groups = 1
        else:
            groups = inter_channels // first_conv_group_size

        self.num_classes = num_classes
        self.stem = ConvBNReLU(in_channels, inter_channels, kernel_size=1, stride=1, padding=0, bias=False)

        first_cls_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.cls_convs = nn.Sequential(*first_cls_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        first_reg_conv = [ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, groups=groups, bias=False)] if groups else []
        self.reg_convs = nn.Sequential(*first_reg_conv, ConvBNReLU(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1, bias=False))

        self.cls_pred = nn.Conv2d(inter_channels, self.num_classes, 1, 1, 0)
        self.reg_pred = nn.Conv2d(inter_channels, 4 * (reg_max + 1), 1, 1, 0)

        self.grid = torch.zeros(1)
        self.stride = stride

        self.prior_prob = 1e-2
        self._initialize_biases()

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        self.cls_pred = compute_new_weights_fn(self.cls_pred, num_classes)
        self.num_classes = num_classes

    @property
    def out_channels(self):
        return None

    def forward(self, x):
        x = self.stem(x)

        cls_feat = self.cls_convs(x)
        cls_output = self.cls_pred(cls_feat)

        reg_feat = self.reg_convs(x)
        reg_output = self.reg_pred(reg_feat)

        return reg_output, cls_output

    def _initialize_biases(self):
        prior_bias = -math.log((1 - self.prior_prob) / self.prior_prob)
        torch.nn.init.constant_(self.cls_pred.bias, prior_bias)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        if torch_version_is_greater_or_equal(1, 10):
            # https://github.com/pytorch/pytorch/issues/50276
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)], indexing="ij")
        else:
            yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()


@register_detection_module()
class NDFLHeads(BaseDetectionModule, SupportsReplaceNumClasses):
    def __init__(
        self,
        num_classes: int,
        in_channels: Tuple[int, int, int],
        heads_list: Union[str, HpmStruct, DictConfig],
        grid_cell_scale: float = 5.0,
        grid_cell_offset: float = 0.5,
        reg_max: int = 16,
        eval_size: Optional[Tuple[int, int]] = None,
        width_mult: float = 1.0,
    ):
        """
        Initializes the NDFLHeads module.

        :param num_classes: Number of detection classes
        :param in_channels: Number of channels for each feature map (See width_mult)
        :param grid_cell_scale:
        :param grid_cell_offset:
        :param reg_max: Number of bins in the regression head
        :param eval_size: (rows, cols) Size of the image for evaluation. Setting this value can be beneficial for inference speed,
               since anchors will not be regenerated for each forward call.
        :param width_mult: A scaling factor applied to in_channels.
        """
        super(NDFLHeads, self).__init__(in_channels)
        in_channels = [max(round(c * width_mult), 1) for c in in_channels]

        self.in_channels = tuple(in_channels)
        self.num_classes = num_classes
        self.grid_cell_scale = grid_cell_scale
        self.grid_cell_offset = grid_cell_offset
        self.reg_max = reg_max
        self.eval_size = eval_size

        # Do not apply quantization to this tensor
        proj = torch.linspace(0, self.reg_max, self.reg_max + 1).reshape([1, self.reg_max + 1, 1, 1])
        self.register_buffer("proj_conv", proj, persistent=False)

        self._init_weights()

        factory = det_factory.DetectionModulesFactory()
        heads_list = self._pass_args(heads_list, factory, num_classes, reg_max)

        self.num_heads = len(heads_list)
        fpn_strides: List[int] = []
        for i in range(self.num_heads):
            new_head = factory.get(factory.insert_module_param(heads_list[i], "in_channels", in_channels[i]))
            fpn_strides.append(new_head.stride)
            setattr(self, f"head{i + 1}", new_head)

        self.fpn_strides = tuple(fpn_strides)

    def replace_num_classes(self, num_classes: int, compute_new_weights_fn: Callable[[nn.Module, int], nn.Module]):
        for i in range(self.num_heads):
            head = getattr(self, f"head{i + 1}")
            head.replace_num_classes(num_classes, compute_new_weights_fn)
        self.num_classes = num_classes

    @staticmethod
    def _pass_args(heads_list, factory, num_classes, reg_max):
        for i in range(len(heads_list)):
            heads_list[i] = factory.insert_module_param(heads_list[i], "num_classes", num_classes)
            heads_list[i] = factory.insert_module_param(heads_list[i], "reg_max", reg_max)
        return heads_list

    @torch.jit.ignore
    def cache_anchors(self, input_size: Tuple[int, int]):
        self.eval_size = input_size
        anchor_points, stride_tensor = self._generate_anchors()
        self.anchor_points = anchor_points
        self.stride_tensor = stride_tensor

    @torch.jit.ignore
    def _init_weights(self):
        if self.eval_size:
            anchor_points, stride_tensor = self._generate_anchors()
            self.anchor_points = anchor_points
            self.stride_tensor = stride_tensor

    @torch.jit.ignore
    def forward_train(self, feats: Tuple[Tensor, ...]):
        anchors, anchor_points, num_anchors_list, stride_tensor = generate_anchors_for_grid_cell(
            feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset
        )

        cls_score_list, reg_distri_list = [], []
        for i, feat in enumerate(feats):
            reg_distri, cls_logit = getattr(self, f"head{i + 1}")(feat)
            # cls and reg
            # Note we don't apply sigmoid on class predictions to ensure good numerical stability at loss computation
            cls_score_list.append(torch.permute(cls_logit.flatten(2), [0, 2, 1]))
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

        cls_score_list = torch.cat(cls_score_list, dim=1)
        reg_distri_list = torch.cat(reg_distri_list, dim=1)

        return cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor

    def forward_eval(self, feats: Tuple[Tensor, ...]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, ...]]:
        cls_score_list, reg_distri_list, reg_dist_reduced_list = [], [], []

        for i, feat in enumerate(feats):
            b, _, h, w = feat.shape
            height_mul_width = h * w
            reg_distri, cls_logit = getattr(self, f"head{i + 1}")(feat)
            reg_distri_list.append(torch.permute(reg_distri.flatten(2), [0, 2, 1]))

            reg_dist_reduced = torch.permute(reg_distri.reshape([-1, 4, self.reg_max + 1, height_mul_width]), [0, 2, 3, 1])
            reg_dist_reduced = torch.nn.functional.conv2d(torch.nn.functional.softmax(reg_dist_reduced, dim=1), weight=self.proj_conv).squeeze(1)

            # cls and reg
            cls_score_list.append(cls_logit.reshape([b, self.num_classes, height_mul_width]))
            reg_dist_reduced_list.append(reg_dist_reduced)

        cls_score_list = torch.cat(cls_score_list, dim=-1)  # [B, C, Anchors]
        cls_score_list = torch.permute(cls_score_list, [0, 2, 1])  # [B, Anchors, C]

        reg_distri_list = torch.cat(reg_distri_list, dim=1)  # [B, Anchors, 4 * (self.reg_max + 1)]
        reg_dist_reduced_list = torch.cat(reg_dist_reduced_list, dim=1)  # [B, Anchors, 4]

        # Decode bboxes
        # Note in eval mode, anchor_points_inference is different from anchor_points computed on train
        if self.eval_size:
            anchor_points_inference, stride_tensor = self.anchor_points, self.stride_tensor
        else:
            anchor_points_inference, stride_tensor = self._generate_anchors(feats)

        pred_scores = cls_score_list.sigmoid()
        pred_bboxes = batch_distance2bbox(anchor_points_inference, reg_dist_reduced_list) * stride_tensor  # [B, Anchors, 4]

        decoded_predictions = pred_bboxes, pred_scores

        if torch.jit.is_tracing():
            return decoded_predictions

        anchors, anchor_points, num_anchors_list, _ = generate_anchors_for_grid_cell(feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset)

        raw_predictions = cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor
        return decoded_predictions, raw_predictions

    @property
    def out_channels(self):
        return None

    def forward(self, feats: Tuple[Tensor]):
        if self.training:
            return self.forward_train(feats)
        else:
            return self.forward_eval(feats)

    def _generate_anchors(self, feats=None, dtype=torch.float):
        # just use in eval time
        anchor_points = []
        stride_tensor = []
        for i, stride in enumerate(self.fpn_strides):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = int(self.eval_size[0] / stride)
                w = int(self.eval_size[1] / stride)
            shift_x = torch.arange(end=w) + self.grid_cell_offset
            shift_y = torch.arange(end=h) + self.grid_cell_offset
            if torch_version_is_greater_or_equal(1, 10):
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            else:
                shift_y, shift_x = torch.meshgrid(shift_y, shift_x)

            anchor_point = torch.stack([shift_x, shift_y], dim=-1).to(dtype=dtype)
            anchor_points.append(anchor_point.reshape([-1, 2]))
            stride_tensor.append(torch.full([h * w, 1], stride, dtype=dtype))
        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)
        if feats is not None:
            anchor_points = anchor_points.to(feats[0].device)
            stride_tensor = stride_tensor.to(feats[0].device)
        return anchor_points, stride_tensor
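The eval path implements distribution-focal decoding: each box side is predicted as reg_max + 1 bin logits, softmaxed, then reduced to an expected distance with the linspace projection stored in proj_conv. A standalone sketch of that reduction (shapes illustrative, not SG code):

    import torch

    reg_max = 16
    logits = torch.randn(1, 4, reg_max + 1, 100)  # [B, 4 sides, bins, anchors]
    proj = torch.linspace(0, reg_max, reg_max + 1).reshape(1, reg_max + 1, 1, 1)
    probs = torch.nn.functional.softmax(logits.permute(0, 2, 3, 1), dim=1)  # [B, bins, anchors, 4]
    distances = torch.nn.functional.conv2d(probs, weight=proj).squeeze(1)   # [B, anchors, 4]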
from typing import Union, List, Tuple

from omegaconf import DictConfig
from torch import Tensor

from super_gradients.common.registry import register_detection_module
from super_gradients.modules.detection_modules import BaseDetectionModule
from super_gradients.training.utils.utils import HpmStruct
import super_gradients.common.factories.detection_modules_factory as det_factory


@register_detection_module("YoloNASPANNeckWithC2")
class YoloNASPANNeckWithC2(BaseDetectionModule):
    """
    A PAN (path aggregation network) neck with 4 stages (2 up-sampling and 2 down-sampling stages)
    where the up-sampling stages include a higher resolution skip
    Returns outputs of neck stage 2, stage 3, stage 4
    """

    def __init__(
        self,
        in_channels: List[int],
        neck1: Union[str, HpmStruct, DictConfig],
        neck2: Union[str, HpmStruct, DictConfig],
        neck3: Union[str, HpmStruct, DictConfig],
        neck4: Union[str, HpmStruct, DictConfig],
    ):
        """
        Initialize the PAN neck

        :param in_channels: Input channels of the 4 feature maps from the backbone
        :param neck1: First neck stage config
        :param neck2: Second neck stage config
        :param neck3: Third neck stage config
        :param neck4: Fourth neck stage config
        """
        super().__init__(in_channels)
        c2_out_channels, c3_out_channels, c4_out_channels, c5_out_channels = in_channels

        factory = det_factory.DetectionModulesFactory()
        self.neck1 = factory.get(factory.insert_module_param(neck1, "in_channels", [c5_out_channels, c4_out_channels, c3_out_channels]))
        self.neck2 = factory.get(factory.insert_module_param(neck2, "in_channels", [self.neck1.out_channels[1], c3_out_channels, c2_out_channels]))
        self.neck3 = factory.get(factory.insert_module_param(neck3, "in_channels", [self.neck2.out_channels[1], self.neck2.out_channels[0]]))
        self.neck4 = factory.get(factory.insert_module_param(neck4, "in_channels", [self.neck3.out_channels, self.neck1.out_channels[0]]))

        self._out_channels = [
            self.neck2.out_channels[1],
            self.neck3.out_channels,
            self.neck4.out_channels,
        ]

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor, Tensor]:
        c2, c3, c4, c5 = inputs

        x_n1_inter, x = self.neck1([c5, c4, c3])
        x_n2_inter, p3 = self.neck2([x, c3, c2])
        p4 = self.neck3([p3, x_n2_inter])
        p5 = self.neck4([p4, x_n1_inter])

        return p3, p4, p5
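For orientation, a hedged sketch of the neck's tensor contract (stride layout assumed from the usual C2-C5 backbone convention; channel symbols are illustrative):

    # inputs:  c2 [B, C2, H/4, W/4],  c3 [B, C3, H/8, W/8],
    #          c4 [B, C4, H/16, W/16], c5 [B, C5, H/32, W/32]
    # outputs: p3 [B, O1, H/8, W/8],  p4 [B, O2, H/16, W/16], p5 [B, O3, H/32, W/32]
    # with (O1, O2, O3) == tuple(neck.out_channels)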
import copy
from typing import Union

from omegaconf import DictConfig

from super_gradients.common.object_names import Models
from super_gradients.common.registry import register_model
from super_gradients.training.models.arch_params_factory import get_arch_params
from super_gradients.training.models.detection_models.customizable_detector import CustomizableDetector
from super_gradients.training.utils import HpmStruct, get_param
from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloEPostPredictionCallback


@register_model(Models.YOLO_NAS_S)
class YoloNAS_S(CustomizableDetector):
    def __init__(self, arch_params: Union[HpmStruct, DictConfig], in_channels: int = 3):
        default_arch_params = get_arch_params("yolo_nas_s_arch_params")
        merged_arch_params = HpmStruct(**copy.deepcopy(default_arch_params))
        merged_arch_params.override(**arch_params.to_dict())
        super().__init__(
            backbone=merged_arch_params.backbone,
            neck=merged_arch_params.neck,
            heads=merged_arch_params.heads,
            num_classes=get_param(merged_arch_params, "num_classes", None),
            in_channels=in_channels,
            bn_momentum=get_param(merged_arch_params, "bn_momentum", None),
            bn_eps=get_param(merged_arch_params, "bn_eps", None),
            inplace_act=get_param(merged_arch_params, "inplace_act", None),
        )

    @staticmethod
    def get_post_prediction_callback(conf: float, iou: float) -> PPYoloEPostPredictionCallback:
        return PPYoloEPostPredictionCallback(score_threshold=conf, nms_threshold=iou, nms_top_k=1000, max_predictions=300)

    @property
    def num_classes(self):
        return self.heads.num_classes


@register_model(Models.YOLO_NAS_M)
class YoloNAS_M(CustomizableDetector):
    def __init__(self, arch_params: Union[HpmStruct, DictConfig], in_channels: int = 3):
        default_arch_params = get_arch_params("yolo_nas_m_arch_params")
        merged_arch_params = HpmStruct(**copy.deepcopy(default_arch_params))
        merged_arch_params.override(**arch_params.to_dict())
        super().__init__(
            backbone=merged_arch_params.backbone,
            neck=merged_arch_params.neck,
            heads=merged_arch_params.heads,
            num_classes=get_param(merged_arch_params, "num_classes", None),
            in_channels=in_channels,
            bn_momentum=get_param(merged_arch_params, "bn_momentum", None),
            bn_eps=get_param(merged_arch_params, "bn_eps", None),
            inplace_act=get_param(merged_arch_params, "inplace_act", None),
        )

    @staticmethod
    def get_post_prediction_callback(conf: float, iou: float) -> PPYoloEPostPredictionCallback:
        return PPYoloEPostPredictionCallback(score_threshold=conf, nms_threshold=iou, nms_top_k=1000, max_predictions=300)

    @property
    def num_classes(self):
        return self.heads.num_classes


@register_model(Models.YOLO_NAS_L)
class YoloNAS_L(CustomizableDetector):
    def __init__(self, arch_params: Union[HpmStruct, DictConfig], in_channels: int = 3):
        default_arch_params = get_arch_params("yolo_nas_l_arch_params")
        merged_arch_params = HpmStruct(**copy.deepcopy(default_arch_params))
        merged_arch_params.override(**arch_params.to_dict())
        super().__init__(
            backbone=merged_arch_params.backbone,
            neck=merged_arch_params.neck,
            heads=merged_arch_params.heads,
            num_classes=get_param(merged_arch_params, "num_classes", None),
            in_channels=in_channels,
            bn_momentum=get_param(merged_arch_params, "bn_momentum", None),
            bn_eps=get_param(merged_arch_params, "bn_eps", None),
            inplace_act=get_param(merged_arch_params, "inplace_act", None),
        )

    @staticmethod
    def get_post_prediction_callback(conf: float, iou: float) -> PPYoloEPostPredictionCallback:
        return PPYoloEPostPredictionCallback(score_threshold=conf, nms_threshold=iou, nms_top_k=1000, max_predictions=300)

    @property
    def num_classes(self):
        return self.heads.num_classes
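Because the three variants register themselves under Models.YOLO_NAS_*, they are constructed through the usual model factory. A hedged quickstart (image path is illustrative):

    from super_gradients.training import models

    model = models.get("yolo_nas_s", pretrained_weights="coco")
    model.predict("path/to/image.jpg", conf=0.25).show()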
from functools import partial
from typing import Type, List

import torch
from torch import nn, Tensor

from super_gradients.common.registry import register_detection_module
from super_gradients.modules import Residual, BaseDetectionModule
from super_gradients.common.decorators.factory_decorator import resolve_param
from super_gradients.common.factories.activations_type_factory import ActivationsTypeFactory
from super_gradients.modules import QARepVGGBlock, Conv
from super_gradients.modules.utils import width_multiplier

__all__ = ["YoloNASStage", "YoloNASUpStage", "YoloNASStem", "YoloNASDownStage", "YoloNASBottleneck"]


class YoloNASBottleneck(nn.Module):
    """
    A bottleneck block for YoloNAS. Consists of two consecutive blocks and optional residual connection.
    """

    def __init__(
        self, input_channels: int, output_channels: int, block_type: Type[nn.Module], activation_type: Type[nn.Module], shortcut: bool, use_alpha: bool
    ):
        """
        Initialize the YoloNASBottleneck block

        :param input_channels: Number of input channels
        :param output_channels: Number of output channels
        :param block_type: Type of the convolutional block
        :param activation_type: Activation type for the convolutional block
        :param shortcut: If True, adds the residual connection from input to output.
        :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
        """
        super().__init__()

        self.cv1 = block_type(input_channels, output_channels, activation_type=activation_type)
        self.cv2 = block_type(output_channels, output_channels, activation_type=activation_type)
        self.add = shortcut and input_channels == output_channels
        self.shortcut = Residual() if self.add else None
        if use_alpha:
            self.alpha = torch.nn.Parameter(torch.tensor([1.0]), requires_grad=True)
        else:
            self.alpha = 1.0

    def forward(self, x):
        return self.alpha * self.shortcut(x) + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class SequentialWithIntermediates(nn.Sequential):
    """
    A Sequential module that can return all intermediate values as a list of Tensors
    """

    def __init__(self, output_intermediates: bool, *args):
        super(SequentialWithIntermediates, self).__init__(*args)
        self.output_intermediates = output_intermediates

    def forward(self, input: Tensor) -> List[Tensor]:
        if self.output_intermediates:
            output = [input]
            for module in self:
                output.append(module(output[-1]))
            return output
        # For uniformity, we return a list even if we don't output intermediates
        return [super(SequentialWithIntermediates, self).forward(input)]


class YoloNASCSPLayer(nn.Module):
    """
    Cross-stage layer module for YoloNAS.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_bottlenecks: int,
        block_type: Type[nn.Module],
        activation_type: Type[nn.Module],
        shortcut: bool = True,
        use_alpha: bool = True,
        expansion: float = 0.5,
        hidden_channels: int = None,
        concat_intermediates: bool = False,
    ):
        """
        :param in_channels: Number of input channels.
        :param out_channels: Number of output channels.
        :param num_bottlenecks: Number of bottleneck blocks.
        :param block_type: Bottleneck block type.
        :param activation_type: Activation type for all blocks.
        :param shortcut: If True, adds the residual connection from input to output.
        :param use_alpha: If True, adds the learnable alpha parameter (multiplier for the residual connection).
        :param expansion: If hidden_channels is None, hidden_channels is set to in_channels * expansion.
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates:
        """
        super(YoloNASCSPLayer, self).__init__()
        if hidden_channels is None:
            hidden_channels = int(out_channels * expansion)
        self.conv1 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
        self.conv2 = Conv(in_channels, hidden_channels, 1, stride=1, activation_type=activation_type)
        self.conv3 = Conv(hidden_channels * (2 + concat_intermediates * num_bottlenecks), out_channels, 1, stride=1, activation_type=activation_type)
        module_list = [YoloNASBottleneck(hidden_channels, hidden_channels, block_type, activation_type, shortcut, use_alpha) for _ in range(num_bottlenecks)]
        self.bottlenecks = SequentialWithIntermediates(concat_intermediates, *module_list)

    def forward(self, x: Tensor) -> Tensor:
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        x = torch.cat((*x_1, x_2), dim=1)
        return self.conv3(x)


@register_detection_module()
class YoloNASStem(BaseDetectionModule):
    """
    Stem module for YoloNAS. Consists of a single QARepVGGBlock with stride of two.
    """

    def __init__(self, in_channels: int, out_channels: int):
        """
        Initialize the YoloNASStem module
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        """
        super().__init__(in_channels)
        self._out_channels = out_channels
        self.conv = QARepVGGBlock(in_channels, out_channels, stride=2, use_residual_connection=False)

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, x: Tensor) -> Tensor:
        return self.conv(x)


@register_detection_module()
class YoloNASStage(BaseDetectionModule):
    """
    A single stage module for YoloNAS. It consists of a downsample block (QARepVGGBlock) followed by YoloNASCSPLayer.
    """

    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_blocks: int,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
    ):
        """
        Initialize the YoloNASStage module
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param num_blocks: Number of bottleneck blocks in the YoloNASCSPLayer
        :param activation_type: Activation type for all blocks
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates: If True, concatenates the intermediate values from the YoloNASCSPLayer.
        """
        super().__init__(in_channels)
        self._out_channels = out_channels
        self.downsample = QARepVGGBlock(in_channels, out_channels, stride=2, activation_type=activation_type, use_residual_connection=False)
        self.blocks = YoloNASCSPLayer(
            out_channels,
            out_channels,
            num_blocks,
            QARepVGGBlock,
            activation_type,
            True,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
        )

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, x):
        return self.blocks(self.downsample(x))


@register_detection_module()
class YoloNASUpStage(BaseDetectionModule):
    """
    Upsampling stage for YoloNAS.
    """

    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(
        self,
        in_channels: List[int],
        out_channels: int,
        width_mult: float,
        num_blocks: int,
        depth_mult: float,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
        reduce_channels: bool = False,
    ):
        """
        Initialize the YoloNASUpStage module
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param width_mult: Multiplier for the number of channels in the stage.
        :param num_blocks: Number of bottleneck blocks
        :param depth_mult: Multiplier for the number of blocks in the stage.
        :param activation_type: Activation type for all blocks
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks
        :param concat_intermediates:
        :param reduce_channels:
        """
        super().__init__(in_channels)

        num_inputs = len(in_channels)
        if num_inputs == 2:
            in_channels, skip_in_channels = in_channels
        else:
            in_channels, skip_in_channels1, skip_in_channels2 = in_channels
            skip_in_channels = skip_in_channels1 + out_channels  # skip2 downsample results in out_channels channels
        out_channels = width_multiplier(out_channels, width_mult, 8)
        num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

        if num_inputs == 2:
            self.reduce_skip = Conv(skip_in_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
        else:
            self.reduce_skip1 = Conv(skip_in_channels1, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()
            self.reduce_skip2 = Conv(skip_in_channels2, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

        self.conv = Conv(in_channels, out_channels, 1, 1, activation_type)
        self.upsample = nn.ConvTranspose2d(in_channels=out_channels, out_channels=out_channels, kernel_size=2, stride=2)
        if num_inputs == 3:
            self.downsample = Conv(out_channels if reduce_channels else skip_in_channels2, out_channels, kernel=3, stride=2, activation_type=activation_type)

        self.reduce_after_concat = Conv(num_inputs * out_channels, out_channels, 1, 1, activation_type) if reduce_channels else nn.Identity()

        after_concat_channels = out_channels if reduce_channels else out_channels + skip_in_channels

        self.blocks = YoloNASCSPLayer(
            after_concat_channels,
            out_channels,
            num_blocks,
            QARepVGGBlock,
            activation_type,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
        )

        self._out_channels = [out_channels, out_channels]

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs):
        if len(inputs) == 2:
            x, skip_x = inputs
            skip_x = [self.reduce_skip(skip_x)]
        else:
            x, skip_x1, skip_x2 = inputs
            skip_x1, skip_x2 = self.reduce_skip1(skip_x1), self.reduce_skip2(skip_x2)
            skip_x = [skip_x1, self.downsample(skip_x2)]
        x_inter = self.conv(x)
        x = self.upsample(x_inter)
        x = torch.cat([x, *skip_x], 1)
        x = self.reduce_after_concat(x)
        x = self.blocks(x)
        return x_inter, x


@register_detection_module()
class YoloNASDownStage(BaseDetectionModule):
    @resolve_param("activation_type", ActivationsTypeFactory())
    def __init__(
        self,
        in_channels: List[int],
        out_channels: int,
        width_mult: float,
        num_blocks: int,
        depth_mult: float,
        activation_type: Type[nn.Module],
        hidden_channels: int = None,
        concat_intermediates: bool = False,
    ):
        """
        Initializes a YoloNASDownStage.

        :param in_channels: Number of input channels.
        :param out_channels: Number of output channels.
        :param width_mult: Multiplier for the number of channels in the stage.
        :param num_blocks: Number of blocks in the stage.
        :param depth_mult: Multiplier for the number of blocks in the stage.
        :param activation_type: Type of activation to use inside the blocks.
        :param hidden_channels: If not None, sets the number of hidden channels used inside the bottleneck blocks.
        :param concat_intermediates:
        """
        super().__init__(in_channels)

        in_channels, skip_in_channels = in_channels
        out_channels = width_multiplier(out_channels, width_mult, 8)
        num_blocks = max(round(num_blocks * depth_mult), 1) if num_blocks > 1 else num_blocks

        self.conv = Conv(in_channels, out_channels // 2, 3, 2, activation_type)
        after_concat_channels = out_channels // 2 + skip_in_channels
        self.blocks = YoloNASCSPLayer(
            in_channels=after_concat_channels,
            out_channels=out_channels,
            num_bottlenecks=num_blocks,
            block_type=partial(Conv, kernel=3, stride=1),
            activation_type=activation_type,
            hidden_channels=hidden_channels,
            concat_intermediates=concat_intermediates,
        )

        self._out_channels = out_channels

    @property
    def out_channels(self):
        return self._out_channels

    def forward(self, inputs):
        x, skip_x = inputs
        x = self.conv(x)
        x = torch.cat([x, skip_x], 1)
        x = self.blocks(x)
        return x
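A hedged smoke test for the stage contract (assumes ActivationsTypeFactory resolves the string "relu" to nn.ReLU):

    import torch
    from super_gradients.training.models.detection_models.yolo_nas import YoloNASStage

    stage = YoloNASStage(in_channels=64, out_channels=128, num_blocks=2, activation_type="relu")
    out = stage(torch.randn(1, 64, 64, 64))  # downsample halves H and W
    assert out.shape == (1, 128, 32, 32)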
@@ -52,7 +52,7 @@ class Pipeline(ABC):
     def __init__(self, model: SgModule, image_processor: Union[Processing, List[Processing]], class_names: List[str], device: Optional[str] = None):
         super().__init__()
         self.device = device or next(model.parameters()).device
-        self.model = model.to(device)
+        self.model = model.to(self.device)
         self.class_names = class_names

         if isinstance(image_processor, list):
@@ -265,7 +265,12 @@ class DetectionPipeline(Pipeline):
     def _combine_image_prediction_to_images(
         self, images_predictions: Iterable[ImageDetectionPrediction], n_images: Optional[int] = None
     ) -> ImagesDetectionPrediction:
-        images_predictions = [image_predictions for image_predictions in tqdm(images_predictions, total=n_images, desc="Predicting Images")]
+        if n_images is not None and n_images == 1:
+            # Do not show tqdm progress bar if there is only one image
+            pass
+        else:
+            images_predictions = [image_predictions for image_predictions in tqdm(images_predictions, total=n_images, desc="Predicting Images")]
+
         return ImagesDetectionPrediction(_images_prediction_lst=images_predictions)

     def _combine_image_prediction_to_video(
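The one-line Pipeline fix is worth a second look: with device=None, model.to(device) is a no-op, so the model could stay somewhere other than the device the pipeline resolved. A standalone sketch of the corrected pattern (not SG code):

    import torch
    from torch import nn

    model = nn.Linear(4, 2)
    device = None
    resolved = device or next(model.parameters()).device  # fall back to the model's own device
    model = model.to(resolved)  # previously model.to(device), a no-op when device is None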
@@ -59,6 +59,10 @@ MODEL_URLS = {
     "ppyoloe_m_coco": "https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/coco2017_ppyoloe_m.pth",
     "ppyoloe_l_coco": "https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/coco2017_pp_yoloe_l_best_model_21uffbb8.pth",  # 0.4948
     "ppyoloe_x_coco": "https://deci-pretrained-models.s3.amazonaws.com/ppyolo_e/coco2017_pp_yoloe_x_best_model_z03if91o.pth",  # 0.5115
+    #
+    "yolo_nas_s_coco": "https://deci-pretrained-models.s3.amazonaws.com/yolo_nas/yolo_nas_s_coco2017.pth",
+    "yolo_nas_m_coco": "https://deci-pretrained-models.s3.amazonaws.com/yolo_nas/yolo_nas_m_coco2017.pth",
+    "yolo_nas_l_coco": "https://deci-pretrained-models.s3.amazonaws.com/yolo_nas/yolo_nas_l_coco2017.pth",
 }

 PRETRAINED_NUM_CLASSES = {
@@ -305,8 +305,8 @@ def default_ppyoloe_coco_processing_params() -> dict:
     return params


-def default_deciyolo_coco_processing_params() -> dict:
-    """Processing parameters commonly used for training DeciYolo on COCO dataset.
+def default_yolo_nas_coco_processing_params() -> dict:
+    """Processing parameters commonly used for training YoloNAS on COCO dataset.
     TODO: remove once we load it from the checkpoint
     """

@@ -322,8 +322,8 @@ def default_deciyolo_coco_processing_params() -> dict:
     params = dict(
         class_names=COCO_DETECTION_CLASSES_LIST,
         image_processor=image_processor,
-        iou=0.65,
-        conf=0.5,
+        iou=0.7,
+        conf=0.25,
     )
     return params

@@ -337,6 +337,6 @@ def get_pretrained_processing_params(model_name: str, pretrained_weights: str) -
             return default_yolox_coco_processing_params()
         elif "ppyoloe" in model_name:
             return default_ppyoloe_coco_processing_params()
-        elif "deciyolo" in model_name:
-            return default_deciyolo_coco_processing_params()
+        elif "yolo_nas" in model_name:
+            return default_yolo_nas_coco_processing_params()
     return dict()
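The hunk header confirms the lookup signature, so a hedged sanity check of the renamed path and retuned defaults (run in the same module that defines the function) looks like:

    params = get_pretrained_processing_params(model_name="yolo_nas_s", pretrained_weights="coco")
    assert params["iou"] == 0.7 and params["conf"] == 0.25  # defaults introduced in this hunk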
@@ -291,11 +291,21 @@ def load_pretrained_weights(model: torch.nn.Module, architecture: str, pretraine
     :param pretrained_weights: name for the pretrained weights (i.e imagenet)
     :return: None
     """
+    from super_gradients.common.object_names import Models
+
     model_url_key = architecture + "_" + str(pretrained_weights)
     if model_url_key not in MODEL_URLS.keys():
         raise MissingPretrainedWeightsException(model_url_key)

     url = MODEL_URLS[model_url_key]
+
+    if architecture in {Models.YOLO_NAS_S, Models.YOLO_NAS_M, Models.YOLO_NAS_L}:
+        logger.info(
+            "License Notification: YOLO-NAS pre-trained weights are subjected to the specific license terms and conditions detailed in \n"
+            "https://github.com/Deci-AI/super-gradients/LICENSE.YOLONAS.md. \n"
+            "By downloading the pre-trained weight files you agree to comply with these terms."
+        )
+
     unique_filename = url.split("https://deci-pretrained-models.s3.amazonaws.com/")[1].replace("/", "_").replace(" ", "_")
     map_location = torch.device("cpu")
     pretrained_state_dict = load_state_dict_from_url(url=url, map_location=map_location, file_name=unique_filename)