Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

#587 Feature/sg 521 gpu tests

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-521_gpu_tests
@@ -104,7 +104,6 @@ jobs:
       - store_artifacts:
           path: ~/sg_logs
 
-
   release_candidate:
     parameters:
       py_version:
@@ -180,6 +179,40 @@ jobs:
           tag: $CIRCLE_TAG
           notes: "This GitHub Release was done automatically by CircleCI"
 
+  # GPU recipe tests: creates a fresh virtualenv on the on-premise GPU runner,
+  # installs the package from the current branch, trains three shortened
+  # recipes and finally runs the recipe test suite runner under coverage.
+  recipe_tests:
+    machine: true
+    resource_class: deci-ai/sg-gpu-on-premise
+    parameters:
+      # NOTE(review): sg_existing_env_path is declared but not referenced by any
+      # step of this job - confirm whether it is still needed.
+      sg_existing_env_path:
+        type: string
+        default: "/env/persistent_env"
+      # Name (relative path) of the virtualenv created for this build.
+      sg_new_env_name:
+        type: string
+        default: "${CIRCLE_BUILD_NUM}"
+      # Interpreter used to create the virtualenv.
+      sg_new_env_python_version:
+        type: string
+        default: "python3.8"
+    steps:
+      - checkout
+      # After `activate`, `python` is the virtualenv's interpreter, so every
+      # command below follows sg_new_env_python_version instead of a hard-coded
+      # python3.8 that could resolve outside the virtualenv.
+      # NOTE(review): the first recipe passes `+multi_gpu=DDP +num_gpus=4` while
+      # the other two pass the same overrides without the leading `+` - confirm
+      # this difference is intended for each recipe's config.
+      - run:
+          name: install requirements and run recipe tests
+          command: |
+            << parameters.sg_new_env_python_version >> -m venv << parameters.sg_new_env_name >>
+            source << parameters.sg_new_env_name >>/bin/activate
+            python -m pip install --upgrade setuptools pip wheel
+            python -m pip install -r requirements.txt
+            python -m pip install git+https://github.com/Deci-AI/super-gradients.git@${CIRCLE_BRANCH}
+            python -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116
+            python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test training_hyperparams.max_epochs=100 training_hyperparams.average_best_models=False +multi_gpu=DDP +num_gpus=4
+            python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=yolox_fast_loss training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
+            python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test training_hyperparams.max_epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
+            coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py
+
+      # Only runs when an earlier step failed; deletes the virtualenv directory.
+      - run:
+          name: Remove new environment when failed
+          command: "rm -r << parameters.sg_new_env_name >>"
+          when: on_fail
+
 
 
 workflows:
@@ -199,10 +232,13 @@ workflows:
             - deci-common/persist_version_info
             - login_to_codeartifact_release
           <<: *release_tag_filter
+      - recipe_tests:
+          <<: *release_tag_filter
       - release_version:
           py_version: "3.7"
           requires:
             - "build3.7"
+            - recipe_tests
           <<: *release_tag_filter
       - deci-common/pip_upload_package_from_codeartifact_to_global_pypi:
           package_name: "super-gradients"
@@ -219,6 +255,7 @@ workflows:
       - deci-common/persist_version_info
       - deci-common/codeartifact_login:
           repo_name: "deci-packages"
+
       - build:
           name: "build3.7"
           py_version: "3.7"
@@ -226,6 +263,7 @@ workflows:
           requires:
             - deci-common/persist_version_info
             - deci-common/codeartifact_login
+
       - release_candidate: # happens on merge
           py_version: "3.7"
           requires:
Discard
@@ -32,3 +32,4 @@ wheel>=0.38.0
 # not directly required, pinned by Snyk to avoid a vulnerability
 pygments>=2.7.4
 stringcase>=1.2.0
+numpy<=1.23
Discard
@@ -24,7 +24,6 @@ resume: False
 training_hyperparams:
   resume: ${resume}
 
-
 ckpt_root_dir:
 
 architecture: resnet18_cifar
Discard
@@ -954,7 +954,13 @@ class Trainer:
             training_params = dict()
         self.train_loader = train_loader or self.train_loader
         self.valid_loader = valid_loader or self.valid_loader
-        if len(self.train_loader.dataset) % self.train_loader.batch_size != 0 and not self.train_loader.drop_last:
+
+        if hasattr(self.train_loader, "batch_sampler") and self.train_loader.batch_sampler is not None:
+            batch_size = self.train_loader.batch_sampler.batch_size
+        else:
+            batch_size = self.train_loader.batch_size
+
+        if len(self.train_loader.dataset) % batch_size != 0 and not self.train_loader.drop_last:
             logger.warning("Train dataset size % batch_size != 0 and drop_last=False, this might result in smaller " "last batch.")
         self._set_dataset_params()
 
Discard
@@ -242,7 +242,7 @@ def restart_script_with_ddp(num_gpus: int = None):
     elastic_launch(config=config, entrypoint=sys.executable)(*sys.argv, *EXTRA_ARGS)
 
     # The code below should actually never be reached as the process will be in a loop inside elastic_launch until any subprocess crashes.
-    sys.exit("Main process finished")
+    sys.exit(0)
 
 
 def get_gpu_mem_utilization():
Discard
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
  1. import sys
  2. import unittest
  3. from tests.recipe_training_tests.shortened_recipes_accuracy_test import ShortenedRecipesAccuracyTests
  class CoreUnitTestSuiteRunner:
      """Builds a unittest suite containing the shortened-recipe accuracy tests.

      Attributes set in ``__init__``:
          test_loader: ``unittest.TestLoader`` used to collect the tests.
          recipe_tests_suite: ``unittest.TestSuite`` holding the collected tests.
          test_runner: ``unittest.TextTestRunner`` (verbosity 3) writing to stdout.

      The class only prepares the suite and the runner; it does not call
      ``test_runner.run`` itself.
      """

      def __init__(self):
          self.test_loader = unittest.TestLoader()
          self.recipe_tests_suite = unittest.TestSuite()
          self._add_modules_to_unit_tests_suite()
          self.test_runner = unittest.TextTestRunner(verbosity=3, stream=sys.stdout)

      def _add_modules_to_unit_tests_suite(self):
          """
          _add_modules_to_unit_tests_suite - Adds unit tests to the Unit Tests Test Suite
              :return:
          """
          # ShortenedRecipesAccuracyTests is a TestCase class, not a module.
          # loadTestsFromModule() only looks for TestCase subclasses among the
          # attributes of the object it receives, so passing a class to it
          # produces an empty suite; loadTestsFromTestCase() collects the
          # class's test methods.
          self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromTestCase(ShortenedRecipesAccuracyTests))
  16. if __name__ == "__main__":
  17. unittest.main()
Discard
    Discard
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    1. import unittest
    2. import shutil
    3. from coverage.annotate import os
    4. from super_gradients.common.environment import environment_config
    5. import torch
    6. class ShortenedRecipesAccuracyTests(unittest.TestCase):
    7. @classmethod
    8. def setUp(cls):
    9. cls.experiment_names = ["shortened_cifar10_resnet_accuracy_test", "shortened_coco2017_yolox_n_map_test", "shortened_cityscapes_regseg48_iou_test"]
    10. def test_shortened_cifar10_resnet_accuracy(self):
    11. self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cifar10_resnet_accuracy_test", metric_value=0.9167, delta=0.05))
    12. def test_shortened_coco2017_yolox_n_map(self):
    13. self.assertTrue(self._reached_goal_metric(experiment_name="shortened_coco2017_yolox_n_map_test", metric_value=0.044, delta=0.02))
    14. def test_shortened_cityscapes_regseg48_iou(self):
    15. self.assertTrue(self._reached_goal_metric(experiment_name="shortened_cityscapes_regseg48_iou_test", metric_value=0.263, delta=0.05))
    16. @classmethod
    17. def _reached_goal_metric(cls, experiment_name: str, metric_value: float, delta: float):
    18. ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, experiment_name)
    19. sd = torch.load(os.path.join(ckpt_dir, "ckpt_best.pth"))
    20. metric_val_reached = sd["acc"].cpu().item()
    21. diff = abs(metric_val_reached - metric_value)
    22. print(
    23. "Goal metric value: " + str(metric_value) + ", metric value reached: " + str(metric_val_reached) + ",diff: " + str(diff) + ", delta: " + str(delta)
    24. )
    25. return diff <= delta
    26. @classmethod
    27. def tearDownClass(cls) -> None:
    28. # ERASE ALL THE FOLDERS THAT WERE CREATED DURING THIS TEST
    29. for folder in cls.experiment_names:
    30. ckpt_dir = os.path.join(environment_config.PKG_CHECKPOINTS_DIR, folder)
    31. if os.path.isdir(ckpt_dir):
    32. shutil.rmtree(ckpt_dir)
    33. if __name__ == "__main__":
    34. unittest.main()
    Discard