Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

rmsprop_tf.py 6.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
  1. import torch
  2. from torch.optim import Optimizer
  3. """
  4. This implementation is taken from timm's github:
  5. https://github.com/rwightman/pytorch-image-models/blob/master/timm/optim/rmsprop_tf.py
  6. """
  7. """ RMSProp modified to behave like Tensorflow impl
  8. Originally cut & paste from PyTorch RMSProp
  9. https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py
  10. Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE
  11. Modifications Copyright 2020 Ross Wightman
  12. """
  13. class RMSpropTF(Optimizer):
  14. """Implements RMSprop algorithm (TensorFlow style epsilon)
  15. NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt
  16. and a few other modifications to closer match Tensorflow for matching hyper-params.
  17. Noteworthy changes include:
  18. 1. Epsilon applied inside square-root
  19. 2. square_avg initialized to ones
  20. 3. LR scaling of update accumulated in momentum buffer
  21. Proposed by G. Hinton in his
  22. `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
  23. The centered version first appears in `Generating Sequences
  24. With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_."""
  25. def __init__(self, params, lr=1e-2, alpha=0.9, eps=1e-10, weight_decay=0, momentum=0., centered=False,
  26. decoupled_decay=False, lr_in_momentum=True):
  27. """RMSprop optimizer that follows the tf's RMSprop characteristics
  28. :param params (iterable): iterable of parameters to optimize or dicts defining parameter groups
  29. :param lr (float, optional): learning rate
  30. :param momentum (float, optional): momentum factor
  31. :param alpha (float, optional): smoothing (decay) constant
  32. :param eps (float, optional): term added to the denominator to improve numerical stability
  33. :param centered (bool, optional) : if ``True``, compute the centered RMSProp, the gradient is normalized by an
  34. estimation of its variance
  35. :param weight_decay (float, optional): weight decay (L2 penalty)
  36. :param decoupled_decay (bool, optional): decoupled weight decay as per https://arxiv.org/abs/1711.05101
  37. :param lr_in_momentum (bool, optional): learning rate scaling is included in the momentum buffer update as per
  38. defaults in Tensorflow
  39. """
  40. if not 0.0 <= lr:
  41. raise ValueError("Invalid learning rate: {}".format(lr))
  42. if not 0.0 <= eps:
  43. raise ValueError("Invalid epsilon value: {}".format(eps))
  44. if not 0.0 <= momentum:
  45. raise ValueError("Invalid momentum value: {}".format(momentum))
  46. if not 0.0 <= weight_decay:
  47. raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
  48. if not 0.0 <= alpha:
  49. raise ValueError("Invalid alpha value: {}".format(alpha))
  50. defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay,
  51. decoupled_decay=decoupled_decay, lr_in_momentum=lr_in_momentum)
  52. super(RMSpropTF, self).__init__(params, defaults)
  53. def __setstate__(self, state):
  54. super(RMSpropTF, self).__setstate__(state)
  55. for group in self.param_groups:
  56. group.setdefault('momentum', 0)
  57. group.setdefault('centered', False)
  58. def step(self, closure=None): # noqa: C901
  59. """Performs a single optimization step.
  60. Arguments:
  61. closure (callable, optional): A closure that reevaluates the model
  62. and returns the loss.
  63. """
  64. loss = None
  65. if closure is not None:
  66. loss = closure()
  67. for group in self.param_groups:
  68. for p in group['params']:
  69. if p.grad is None:
  70. continue
  71. grad = p.grad.data
  72. if grad.is_sparse:
  73. raise RuntimeError('RMSprop does not support sparse gradients')
  74. state = self.state[p]
  75. # State initialization
  76. if len(state) == 0:
  77. state['step'] = 0
  78. state['square_avg'] = torch.ones_like(p.data) # PyTorch inits to zero
  79. if group['momentum'] > 0:
  80. state['momentum_buffer'] = torch.zeros_like(p.data)
  81. if group['centered']:
  82. state['grad_avg'] = torch.zeros_like(p.data)
  83. square_avg = state['square_avg']
  84. one_minus_alpha = 1. - group['alpha']
  85. state['step'] += 1
  86. if group['weight_decay'] != 0:
  87. if 'decoupled_decay' in group and group['decoupled_decay']:
  88. p.data.add_(-group['weight_decay'], p.data)
  89. else:
  90. grad = grad.add(group['weight_decay'], p.data)
  91. # Tensorflow order of ops for updating squared avg
  92. square_avg.add_(one_minus_alpha, grad.pow(2) - square_avg)
  93. # square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad) # PyTorch original
  94. if group['centered']:
  95. grad_avg = state['grad_avg']
  96. grad_avg.add_(one_minus_alpha, grad - grad_avg)
  97. # grad_avg.mul_(alpha).add_(1 - alpha, grad) # PyTorch original
  98. avg = square_avg.addcmul(-1, grad_avg, grad_avg).add(group['eps']).sqrt_() # eps moved in sqrt
  99. else:
  100. avg = square_avg.add(group['eps']).sqrt_() # eps moved in sqrt
  101. if group['momentum'] > 0:
  102. buf = state['momentum_buffer']
  103. # Tensorflow accumulates the LR scaling in the momentum buffer
  104. if 'lr_in_momentum' in group and group['lr_in_momentum']:
  105. buf.mul_(group['momentum']).addcdiv_(group['lr'], grad, avg)
  106. p.data.add_(-buf)
  107. else:
  108. # PyTorch scales the param update by LR
  109. buf.mul_(group['momentum']).addcdiv_(grad, avg)
  110. p.data.add_(-group['lr'], buf)
  111. else:
  112. p.data.addcdiv_(-group['lr'], grad, avg)
  113. return loss
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...