Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

#875 Feature/sg 761 yolo nas

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-761-yolo-nas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
  1. from typing import Optional, Union, Iterable
  2. import torch
  3. from torch.optim import Optimizer
  4. from super_gradients.common.object_names import Optimizers
  5. from super_gradients.common.registry.registry import register_optimizer
  6. """
  7. This implementation is taken from timm's github:
  8. https://github.com/rwightman/pytorch-image-models/blob/master/timm/optim/rmsprop_tf.py
  9. """
  10. """ RMSProp modified to behave like Tensorflow impl
  11. Originally cut & paste from PyTorch RMSProp
  12. https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py
  13. Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE
  14. Modifications Copyright 2020 Ross Wightman
  15. """
  16. @register_optimizer(Optimizers.RMS_PROP_TF)
  17. class RMSpropTF(Optimizer):
  18. """Implements RMSprop algorithm (TensorFlow style epsilon)
  19. NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt
  20. and a few other modifications to closer match Tensorflow for matching hyper-params.
  21. Noteworthy changes include:
  22. 1. Epsilon applied inside square-root
  23. 2. square_avg initialized to ones
  24. 3. LR scaling of update accumulated in momentum buffer
  25. Proposed by G. Hinton in his
  26. `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
  27. The centered version first appears in `Generating Sequences
  28. With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_."""
  29. def __init__(
  30. self,
  31. params: Union[Iterable[torch.Tensor], Iterable[dict]],
  32. lr: float = 1e-2,
  33. alpha: float = 0.9,
  34. eps: float = 1e-10,
  35. weight_decay: float = 0,
  36. momentum: float = 0.0,
  37. centered: bool = False,
  38. decoupled_decay: bool = False,
  39. lr_in_momentum: bool = True,
  40. ):
  41. """RMSprop optimizer that follows the tf's RMSprop characteristics
  42. :param params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
  43. :param lr (float, optional): learning rate
  44. :param momentum (float, optional): momentum factor
  45. :param alpha (float, optional): smoothing (decay) constant
  46. :param eps (float, optional): term added to the denominator to improve numerical stability
  47. :param centered (bool, optional) : if ``True``, compute the centered RMSProp, the gradient is normalized by an
  48. estimation of its variance
  49. :param weight_decay (float, optional): weight decay (L2 penalty)
  50. :param decoupled_decay (bool, optional): decoupled weight decay as per https://arxiv.org/abs/1711.05101
  51. :param lr_in_momentum (bool, optional): learning rate scaling is included in the momentum buffer update as per
  52. defaults in Tensorflow
  53. """
  54. if not 0.0 <= lr:
  55. raise ValueError("Invalid learning rate: {}".format(lr))
  56. if not 0.0 <= eps:
  57. raise ValueError("Invalid epsilon value: {}".format(eps))
  58. if not 0.0 <= momentum:
  59. raise ValueError("Invalid momentum value: {}".format(momentum))
  60. if not 0.0 <= weight_decay:
  61. raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
  62. if not 0.0 <= alpha:
  63. raise ValueError("Invalid alpha value: {}".format(alpha))
  64. defaults = dict(
  65. lr=lr,
  66. momentum=momentum,
  67. alpha=alpha,
  68. eps=eps,
  69. centered=centered,
  70. weight_decay=weight_decay,
  71. decoupled_decay=decoupled_decay,
  72. lr_in_momentum=lr_in_momentum,
  73. )
  74. super(RMSpropTF, self).__init__(params, defaults)
  75. def __setstate__(self, state):
  76. super(RMSpropTF, self).__setstate__(state)
  77. for group in self.param_groups:
  78. group.setdefault("momentum", 0)
  79. group.setdefault("centered", False)
  80. def step(self, closure: Optional[callable] = None) -> torch.Tensor: # noqa: C901
  81. """Performs a single optimization step.
  82. Arguments:
  83. closure (callable, optional): A closure that reevaluates the model
  84. and returns the loss.
  85. """
  86. loss = None
  87. if closure is not None:
  88. loss = closure()
  89. for group in self.param_groups:
  90. for p in group["params"]:
  91. if p.grad is None:
  92. continue
  93. grad = p.grad.data
  94. if grad.is_sparse:
  95. raise RuntimeError("RMSprop does not support sparse gradients")
  96. state = self.state[p]
  97. # State initialization
  98. if len(state) == 0:
  99. state["step"] = 0
  100. state["square_avg"] = torch.ones_like(p.data) # PyTorch inits to zero
  101. if group["momentum"] > 0:
  102. state["momentum_buffer"] = torch.zeros_like(p.data)
  103. if group["centered"]:
  104. state["grad_avg"] = torch.zeros_like(p.data)
  105. square_avg = state["square_avg"]
  106. one_minus_alpha = 1.0 - group["alpha"]
  107. state["step"] += 1
  108. if group["weight_decay"] != 0:
  109. if "decoupled_decay" in group and group["decoupled_decay"]:
  110. p.data.add_(-group["weight_decay"], p.data)
  111. else:
  112. grad = grad.add(group["weight_decay"], p.data)
  113. # Tensorflow order of ops for updating squared avg
  114. square_avg.add_(one_minus_alpha, grad.pow(2) - square_avg)
  115. # square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad) # PyTorch original
  116. if group["centered"]:
  117. grad_avg = state["grad_avg"]
  118. grad_avg.add_(one_minus_alpha, grad - grad_avg)
  119. # grad_avg.mul_(alpha).add_(1 - alpha, grad) # PyTorch original
  120. avg = square_avg.addcmul(-1, grad_avg, grad_avg).add(group["eps"]).sqrt_() # eps moved in sqrt
  121. else:
  122. avg = square_avg.add(group["eps"]).sqrt_() # eps moved in sqrt
  123. if group["momentum"] > 0:
  124. buf = state["momentum_buffer"]
  125. # Tensorflow accumulates the LR scaling in the momentum buffer
  126. if "lr_in_momentum" in group and group["lr_in_momentum"]:
  127. buf.mul_(group["momentum"]).addcdiv_(group["lr"], grad, avg)
  128. p.data.add_(-buf)
  129. else:
  130. # PyTorch scales the param update by LR
  131. buf.mul_(group["momentum"]).addcdiv_(grad, avg)
  132. p.data.add_(-group["lr"], buf)
  133. else:
  134. p.data.addcdiv_(-group["lr"], grad, avg)
  135. return loss
Discard
Tip!

Press p to see the previous file, or n to see the next file.