Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

trainer_test.py 4.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  1. import shutil
  2. import unittest
  3. from super_gradients.common.object_names import Models
  4. from super_gradients.training import models
  5. import super_gradients
  6. import torch
  7. import os
  8. from super_gradients import Trainer
  9. from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader
  10. from super_gradients.training.metrics import Accuracy, Top5
  11. from super_gradients.common.environment.checkpoints_dir_utils import get_checkpoints_dir_path
  class TestTrainer(unittest.TestCase):
      """Smoke tests for ``Trainer``: a plain training run, resuming a run, and checkpoint contents."""

      @classmethod
      def setUp(cls):
          """Initialise super_gradients and store the shared fixtures on the class.

          This is the per-test ``setUp`` hook declared as a classmethod, so the
          attributes below are (re)assigned on the class itself, which is what
          lets ``tearDownClass`` read ``cls.experiment_names`` afterwards.
          """
          super_gradients.init_trainer()

          # Names of the experiments the tests create; their folders are deleted in tearDownClass.
          cls.experiment_names = [
              "test_train",
              "test_save_load",
              "test_load_w",
              "test_load_w2",
              "test_load_w3",
              "test_checkpoint_content",
              "analyze",
          ]

          # Minimal single-epoch recipe shared by all the tests.
          cls.training_params = dict(
              max_epochs=1,
              silent_mode=True,
              lr_decay_factor=0.1,
              initial_lr=0.1,
              lr_updates=[4],
              lr_mode="StepLRScheduler",
              loss="CrossEntropyLoss",
              train_metrics_list=[Accuracy(), Top5()],
              valid_metrics_list=[Accuracy(), Top5()],
              metric_to_watch="Accuracy",
              greater_metric_to_watch_is_better=True,
          )

      @classmethod
      def tearDownClass(cls) -> None:
          """Remove every experiment folder that the tests in this class may have created."""
          for name in cls.experiment_names:
              folder = get_checkpoints_dir_path(experiment_name=name)
              if not os.path.isdir(folder):
                  continue
              # TODO: Occasionally the removal fails because log files are still open (see the
              # TODO: setup_logging() call). They should be closed at the end of training, which is
              # TODO: tricky to achieve because setup_logging() is called outside of the Trainer class.
              # Errors are therefore ignored so that cleanup stays best-effort.
              shutil.rmtree(folder, ignore_errors=True)

      @staticmethod
      def get_classification_trainer(name=""):
          """Return a ``(trainer, model)`` pair: a ``Trainer`` named ``name`` and a 5-class ResNet18."""
          return Trainer(name), models.get(Models.RESNET18, num_classes=5)

      @staticmethod
      def _run_training(trainer, model, params):
          """Train ``model`` with ``params``, using a fresh classification test dataloader for each split."""
          trainer.train(
              model=model,
              training_params=params,
              train_loader=classification_test_dataloader(),
              valid_loader=classification_test_dataloader(),
          )

      def test_train(self):
          """A single training run with the shared parameters completes without raising."""
          trainer, model = self.get_classification_trainer(self.experiment_names[0])
          self._run_training(trainer, model, self.training_params)

      def test_save_load(self):
          """Train once, then train again under the same experiment name with ``resume`` enabled."""
          experiment = self.experiment_names[1]

          trainer, model = self.get_classification_trainer(experiment)
          self._run_training(trainer, model, self.training_params)

          # Second run: same experiment name, resume switched on, one more epoch allowed.
          resumed_params = {**self.training_params, "resume": True, "max_epochs": 2}
          trainer, model = self.get_classification_trainer(experiment)
          self._run_training(trainer, model, resumed_params)

      def test_checkpoint_content(self):
          """Verify that all checkpoints are saved and contain all the expected keys."""
          trainer, model = self.get_classification_trainer(self.experiment_names[5])
          params = {**self.training_params, "save_ckpt_epoch_list": [1]}
          self._run_training(trainer, model, params)

          # Each full checkpoint must expose exactly these keys, in this order.
          expected_keys = ["net", "acc", "epoch", "optimizer_state_dict", "scaler_state_dict"]
          for filename in ("ckpt_best.pth", "ckpt_latest.pth", "ckpt_epoch_1.pth"):
              checkpoint = torch.load(os.path.join(trainer.checkpoints_dir_path, filename))
              self.assertListEqual(expected_keys, list(checkpoint.keys()))

          # After an explicit save, the weights-only checkpoint must hold nothing but the network.
          trainer._save_checkpoint()
          weights_only_path = os.path.join(trainer.checkpoints_dir_path, "ckpt_latest_weights_only.pth")
          weights_only = torch.load(weights_only_path)
          self.assertListEqual(["net"], list(weights_only.keys()))
  if __name__ == "__main__":
      # Run the tests in this module when the file is executed directly.
      unittest.main()
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...