nag.py
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from torch.optim.optimizer import Optimizer, required

from . import FairseqOptimizer, register_optimizer


@register_optimizer('nag')
class FairseqNAG(FairseqOptimizer):

    def __init__(self, args, params):
        super().__init__(args, params)
        self._optimizer = NAG(params, **self.optimizer_config)

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            'lr': self.args.lr[0],
            'momentum': self.args.momentum,
            'weight_decay': self.args.weight_decay,
        }


class NAG(Optimizer):
    """Nesterov accelerated gradient, with the parameters kept at the
    look-ahead point and the velocity stored in a per-parameter buffer."""

    def __init__(self, params, lr=required, momentum=0, weight_decay=0):
        defaults = dict(lr=lr, lr_old=lr, momentum=momentum, weight_decay=weight_decay)
        super(NAG, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            lr = group['lr']
            lr_old = group.get('lr_old', lr)
            # Rescaling factor for the momentum buffer when the learning rate
            # has changed since the previous step.
            lr_correct = lr / lr_old

            for p in group['params']:
                if p.grad is None:
                    continue

                d_p = p.grad.data
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    # Lazily initialize the velocity buffer with zeros.
                    param_state['momentum_buffer'] = d_p.clone().zero_()

                buf = param_state['momentum_buffer']

                if weight_decay != 0:
                    # Multiplicative weight decay applied directly to the
                    # parameters rather than added to the gradient.
                    p.data.mul_(1 - lr * weight_decay)

                # Advance the look-ahead parameters: reuse the old velocity
                # (scaled by momentum^2) and take a gradient step scaled by
                # (1 + momentum).
                p.data.add_(momentum * momentum * lr_correct, buf)
                p.data.add_(-(1 + momentum) * lr, d_p)

                # Update the velocity buffer.
                buf.mul_(momentum * lr_correct).add_(-lr, d_p)

            group['lr_old'] = lr

        return loss
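
For context, a minimal sketch of how the NAG class could be driven directly, outside the fairseq training loop (within fairseq itself, the FairseqNAG wrapper is selected by the name it is registered under, i.e. --optimizer nag). This assumes fairseq is installed so the class is importable from fairseq.optim.nag, and a PyTorch version that still accepts the positional-alpha Tensor.add_(scalar, tensor) overload used in step() above (newer releases prefer add_(other, alpha=...)). The model, batch, and hyperparameter values below are toy placeholders.

import torch
import torch.nn as nn
from fairseq.optim.nag import NAG

model = nn.Linear(10, 1)
optimizer = NAG(model.parameters(), lr=0.1, momentum=0.99, weight_decay=0.0)

x = torch.randn(32, 10)
y = torch.randn(32, 1)
criterion = nn.MSELoss()

for _ in range(5):
    optimizer.zero_grad()          # clear gradients from the previous step
    loss = criterion(model(x), y)  # forward pass on the toy batch
    loss.backward()                # populate p.grad for every parameter
    optimizer.step()               # NAG update as defined above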