Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

AdaBelief.py 3.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  1. import numpy as np
  2. from core.leras import nn
  3. from tensorflow.python.ops import control_flow_ops, state_ops
  4. tf = nn.tf
  5. class AdaBelief(nn.OptimizerBase):
  6. def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, lr_dropout=1.0, lr_cos=0, clipnorm=0.0, name=None, **kwargs):
  7. super().__init__(name=name)
  8. if name is None:
  9. raise ValueError('name must be defined.')
  10. self.lr = lr
  11. self.beta_1 = beta_1
  12. self.beta_2 = beta_2
  13. self.lr_dropout = lr_dropout
  14. self.lr_cos = lr_cos
  15. self.clipnorm = clipnorm
  16. with tf.device('/CPU:0') :
  17. with tf.variable_scope(self.name):
  18. self.iterations = tf.Variable(0, dtype=tf.int64, name='iters')
  19. self.ms_dict = {}
  20. self.vs_dict = {}
  21. self.lr_rnds_dict = {}
  22. def get_weights(self):
  23. return [self.iterations] + list(self.ms_dict.values()) + list(self.vs_dict.values())
  24. def initialize_variables(self, trainable_weights, vars_on_cpu=True, lr_dropout_on_cpu=False):
  25. # Initialize here all trainable variables used in training
  26. e = tf.device('/CPU:0') if vars_on_cpu else None
  27. if e: e.__enter__()
  28. with tf.variable_scope(self.name):
  29. ms = { v.name : tf.get_variable ( f'ms_{v.name}'.replace(':','_'), v.shape, dtype=v.dtype, initializer=tf.initializers.constant(0.0), trainable=False) for v in trainable_weights }
  30. vs = { v.name : tf.get_variable ( f'vs_{v.name}'.replace(':','_'), v.shape, dtype=v.dtype, initializer=tf.initializers.constant(0.0), trainable=False) for v in trainable_weights }
  31. self.ms_dict.update (ms)
  32. self.vs_dict.update (vs)
  33. if self.lr_dropout != 1.0:
  34. e = tf.device('/CPU:0') if lr_dropout_on_cpu else None
  35. if e: e.__enter__()
  36. lr_rnds = [ nn.random_binomial( v.shape, p=self.lr_dropout, dtype=v.dtype) for v in trainable_weights ]
  37. if e: e.__exit__(None, None, None)
  38. self.lr_rnds_dict.update ( { v.name : rnd for v,rnd in zip(trainable_weights,lr_rnds) } )
  39. if e: e.__exit__(None, None, None)
  40. def get_update_op(self, grads_vars):
  41. updates = []
  42. if self.clipnorm > 0.0:
  43. norm = tf.sqrt( sum([tf.reduce_sum(tf.square(tf.cast(g, tf.float32))) for g,v in grads_vars]))
  44. updates += [ state_ops.assign_add( self.iterations, 1) ]
  45. for i, (g,v) in enumerate(grads_vars):
  46. if self.clipnorm > 0.0:
  47. g = self.tf_clip_norm(g, self.clipnorm, tf.cast(norm, g.dtype) )
  48. ms = self.ms_dict[ v.name ]
  49. vs = self.vs_dict[ v.name ]
  50. m_t = self.beta_1*ms + (1.0-self.beta_1) * g
  51. v_t = self.beta_2*vs + (1.0-self.beta_2) * tf.square(g-m_t)
  52. lr = tf.constant(self.lr, g.dtype)
  53. if self.lr_cos != 0:
  54. lr *= (tf.cos( tf.cast(self.iterations, g.dtype) * (2*3.1415926535/ float(self.lr_cos) ) ) + 1.0) / 2.0
  55. v_diff = - lr * m_t / (tf.sqrt(v_t) + np.finfo( g.dtype.as_numpy_dtype ).resolution )
  56. if self.lr_dropout != 1.0:
  57. lr_rnd = self.lr_rnds_dict[v.name]
  58. v_diff *= lr_rnd
  59. new_v = v + v_diff
  60. updates.append (state_ops.assign(ms, m_t))
  61. updates.append (state_ops.assign(vs, v_t))
  62. updates.append (state_ops.assign(v, new_v))
  63. return control_flow_ops.group ( *updates, name=self.name+'_updates')
  64. nn.AdaBelief = AdaBelief
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...