1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<!-- Document metadata -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>super_gradients.training.utils.distributed_training_utils — SuperGradients 1.0 documentation</title>
<!-- Stylesheets: syntax highlighting, theme, graphviz -->
<link rel="stylesheet" href="../../../../_static/pygments.css">
<link rel="stylesheet" href="../../../../_static/css/theme.css">
<link rel="stylesheet" href="../../../../_static/graphviz.css">
<!--[if lt IE 9]>
<script src="../../../../_static/js/html5shiv.min.js"></script>
<![endif]-->
<!-- Scripts are loaded synchronously and in this order on purpose: the inline
     script at the end of <body> calls jQuery and SphinxRtdTheme directly, so
     these must not be switched to defer/async without changing that script. -->
<script id="documentation_options" data-url_root="../../../../" src="../../../../_static/documentation_options.js"></script>
<script src="../../../../_static/jquery.js"></script>
<script src="../../../../_static/underscore.js"></script>
<script src="../../../../_static/doctools.js"></script>
<script src="../../../../_static/js/theme.js"></script>
<!-- Related documents -->
<link rel="index" href="../../../../genindex.html" title="Index">
<link rel="search" href="../../../../search.html" title="Search">
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home"> SuperGradients
</a>
<!-- Sidebar search. The text field has no visible <label>, and a placeholder
     is not an accessible name, so the field is named with aria-label. -->
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<!-- Top-level table of contents -->
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption"><span class="caption-text">SuperGradients</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../welcome.html">SuperGradients</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../super_gradients.common.html">Common</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../super_gradients.training.html">Training</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">SuperGradients</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<!-- Breadcrumb trail for the current page. -->
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<!-- The home link is icon-only (no text content), so it is given an explicit
     accessible name for screen readers. -->
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a> »</li>
<li><a href="../../../index.html">Module code</a> »</li>
<li>super_gradients.training.utils.distributed_training_utils</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for super_gradients.training.utils.distributed_training_utils</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">from</span> <span class="nn">torch.cuda.amp</span> <span class="kn">import</span> <span class="n">autocast</span>
<span class="kn">import</span> <span class="nn">torch.nn</span> <span class="k">as</span> <span class="nn">nn</span>
<span class="kn">import</span> <span class="nn">itertools</span>
<div class="viewcode-block" id="distributed_all_reduce_tensor_average"><a class="viewcode-back" href="../../../../super_gradients.training.utils.html#super_gradients.training.utils.distributed_training_utils.distributed_all_reduce_tensor_average">[docs]</a><span class="k">def</span> <span class="nf">distributed_all_reduce_tensor_average</span><span class="p">(</span><span class="n">tensor</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> This method performs a reduce operation on multiple nodes running distributed training</span>
<span class="sd"> It first sums the tensor across all processes and then divides the sum by n</span>
<span class="sd"> :param tensor: The tensor to perform the reduce operation for</span>
<span class="sd"> :param n: Value the summed tensor is divided by (reduce_results_tuple_for_ddp passes the distributed world size)</span>
<span class="sd"> :return: Averaged tensor from all of the nodes</span>
<span class="sd"> """</span>
<span class="n">rt</span> <span class="o">=</span> <span class="n">tensor</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span>
<span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">all_reduce</span><span class="p">(</span><span class="n">rt</span><span class="p">,</span> <span class="n">op</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">ReduceOp</span><span class="o">.</span><span class="n">SUM</span><span class="p">)</span>
<span class="n">rt</span> <span class="o">/=</span> <span class="n">n</span>
<span class="k">return</span> <span class="n">rt</span></div>
<div class="viewcode-block" id="reduce_results_tuple_for_ddp"><a class="viewcode-back" href="../../../../super_gradients.training.utils.html#super_gradients.training.utils.distributed_training_utils.reduce_results_tuple_for_ddp">[docs]</a><span class="k">def</span> <span class="nf">reduce_results_tuple_for_ddp</span><span class="p">(</span><span class="n">validation_results_tuple</span><span class="p">,</span> <span class="n">device</span><span class="p">):</span>
<span class="sd">"""Gather all validation tuples from the various devices and average them"""</span>
<span class="n">validation_results_list</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">validation_results_tuple</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">validation_result</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">validation_results_list</span><span class="p">):</span>
<span class="n">validation_results_list</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">distributed_all_reduce_tensor_average</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="n">validation_result</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">),</span>
<span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">get_world_size</span><span class="p">())</span>
<span class="n">validation_results_tuple</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">validation_results_list</span><span class="p">)</span>
<span class="k">return</span> <span class="n">validation_results_tuple</span></div>
<div class="viewcode-block" id="MultiGPUModeAutocastWrapper"><a class="viewcode-back" href="../../../../super_gradients.training.utils.html#super_gradients.training.utils.distributed_training_utils.MultiGPUModeAutocastWrapper">[docs]</a><span class="k">class</span> <span class="nc">MultiGPUModeAutocastWrapper</span><span class="p">():</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">func</span> <span class="o">=</span> <span class="n">func</span>
<span class="k">def</span> <span class="fm">__call__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="k">with</span> <span class="n">autocast</span><span class="p">():</span>
<span class="n">out</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">func</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">return</span> <span class="n">out</span></div>
<div class="viewcode-block" id="scaled_all_reduce"><a class="viewcode-back" href="../../../../super_gradients.training.utils.html#super_gradients.training.utils.distributed_training_utils.scaled_all_reduce">[docs]</a><span class="k">def</span> <span class="nf">scaled_all_reduce</span><span class="p">(</span><span class="n">tensors</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">num_gpus</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Performs the scaled all_reduce operation on the provided tensors.</span>
<span class="sd"> The input tensors are modified in-place.</span>
<span class="sd"> Currently supports only the sum</span>
<span class="sd"> reduction operator.</span>
<span class="sd"> The reduced values are scaled by the inverse size of the</span>
<span class="sd"> process group (equivalent to num_gpus).</span>
<span class="sd"> """</span>
<span class="c1"># There is no need for reduction in the single-proc case</span>
<span class="k">if</span> <span class="n">num_gpus</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">return</span> <span class="n">tensors</span>
<span class="c1"># Queue the reductions</span>
<span class="n">reductions</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">tensor</span> <span class="ow">in</span> <span class="n">tensors</span><span class="p">:</span>
<span class="n">reduction</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">all_reduce</span><span class="p">(</span><span class="n">tensor</span><span class="p">,</span> <span class="n">async_op</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">reductions</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">reduction</span><span class="p">)</span>
<span class="c1"># Wait for reductions to finish</span>
<span class="k">for</span> <span class="n">reduction</span> <span class="ow">in</span> <span class="n">reductions</span><span class="p">:</span>
<span class="n">reduction</span><span class="o">.</span><span class="n">wait</span><span class="p">()</span>
<span class="c1"># Scale the results</span>
<span class="k">for</span> <span class="n">tensor</span> <span class="ow">in</span> <span class="n">tensors</span><span class="p">:</span>
<span class="n">tensor</span><span class="o">.</span><span class="n">mul_</span><span class="p">(</span><span class="mf">1.0</span> <span class="o">/</span> <span class="n">num_gpus</span><span class="p">)</span>
<span class="k">return</span> <span class="n">tensors</span></div>
<div class="viewcode-block" id="compute_precise_bn_stats"><a class="viewcode-back" href="../../../../super_gradients.training.utils.html#super_gradients.training.utils.distributed_training_utils.compute_precise_bn_stats">[docs]</a><span class="nd">@torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">compute_precise_bn_stats</span><span class="p">(</span><span class="n">model</span><span class="p">:</span> <span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">,</span> <span class="n">loader</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">DataLoader</span><span class="p">,</span> <span class="n">precise_bn_batch_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">num_gpus</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="sd">'''</span>
<span class="sd"> :param model: The model being trained (ie: SgModel.net)</span>
<span class="sd"> :param loader: Training dataloader (ie: SgModel.train_loader)</span>
<span class="sd"> :param precise_bn_batch_size: The effective batch size we want to calculate the batchnorm on. For example, if we are training a model</span>
<span class="sd"> on 8 gpus, with a batch of 128 on each gpu, a good rule of thumb would be to give it 8192</span>
<span class="sd"> (ie: effective_batch_size * num_gpus = batch_per_gpu * num_gpus * num_gpus).</span>
<span class="sd"> If precise_bn_batch_size is not provided in the training_params, the latter heuristic</span>
<span class="sd"> will be taken.</span>
<span class="sd"> :param num_gpus: The number of gpus we are training on</span>
<span class="sd"> '''</span>
<span class="c1"># Compute the number of minibatches to use</span>
<span class="n">num_iter</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">precise_bn_batch_size</span> <span class="o">/</span> <span class="p">(</span><span class="n">loader</span><span class="o">.</span><span class="n">batch_size</span> <span class="o">*</span> <span class="n">num_gpus</span><span class="p">))</span> <span class="k">if</span> <span class="n">precise_bn_batch_size</span> <span class="k">else</span> <span class="n">num_gpus</span>
<span class="n">num_iter</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">num_iter</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">loader</span><span class="p">))</span>
<span class="c1"># Retrieve the BN layers</span>
<span class="n">bns</span> <span class="o">=</span> <span class="p">[</span><span class="n">m</span> <span class="k">for</span> <span class="n">m</span> <span class="ow">in</span> <span class="n">model</span><span class="o">.</span><span class="n">modules</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">m</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">BatchNorm2d</span><span class="p">)]</span>
<span class="c1"># Initialize BN stats storage for computing mean(mean(batch)) and mean(var(batch))</span>
<span class="n">running_means</span> <span class="o">=</span> <span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">zeros_like</span><span class="p">(</span><span class="n">bn</span><span class="o">.</span><span class="n">running_mean</span><span class="p">)</span> <span class="k">for</span> <span class="n">bn</span> <span class="ow">in</span> <span class="n">bns</span><span class="p">]</span>
<span class="n">running_vars</span> <span class="o">=</span> <span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">zeros_like</span><span class="p">(</span><span class="n">bn</span><span class="o">.</span><span class="n">running_var</span><span class="p">)</span> <span class="k">for</span> <span class="n">bn</span> <span class="ow">in</span> <span class="n">bns</span><span class="p">]</span>
<span class="c1"># Remember momentum values</span>
<span class="n">momentums</span> <span class="o">=</span> <span class="p">[</span><span class="n">bn</span><span class="o">.</span><span class="n">momentum</span> <span class="k">for</span> <span class="n">bn</span> <span class="ow">in</span> <span class="n">bns</span><span class="p">]</span>
<span class="c1"># Set momentum to 1.0 to compute BN stats that only reflect the current batch</span>
<span class="k">for</span> <span class="n">bn</span> <span class="ow">in</span> <span class="n">bns</span><span class="p">:</span>
<span class="n">bn</span><span class="o">.</span><span class="n">momentum</span> <span class="o">=</span> <span class="mf">1.0</span>
<span class="c1"># Average the BN stats for each BN layer over the batches</span>
<span class="k">for</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">_labels</span> <span class="ow">in</span> <span class="n">itertools</span><span class="o">.</span><span class="n">islice</span><span class="p">(</span><span class="n">loader</span><span class="p">,</span> <span class="n">num_iter</span><span class="p">):</span>
<span class="n">model</span><span class="p">(</span><span class="n">inputs</span><span class="o">.</span><span class="n">cuda</span><span class="p">())</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">bn</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bns</span><span class="p">):</span>
<span class="n">running_means</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+=</span> <span class="n">bn</span><span class="o">.</span><span class="n">running_mean</span> <span class="o">/</span> <span class="n">num_iter</span>
<span class="n">running_vars</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+=</span> <span class="n">bn</span><span class="o">.</span><span class="n">running_var</span> <span class="o">/</span> <span class="n">num_iter</span>
<span class="c1"># Sync BN stats across GPUs (no reduction if 1 GPU used)</span>
<span class="n">running_means</span> <span class="o">=</span> <span class="n">scaled_all_reduce</span><span class="p">(</span><span class="n">running_means</span><span class="p">,</span> <span class="n">num_gpus</span><span class="o">=</span><span class="n">num_gpus</span><span class="p">)</span>
<span class="n">running_vars</span> <span class="o">=</span> <span class="n">scaled_all_reduce</span><span class="p">(</span><span class="n">running_vars</span><span class="p">,</span> <span class="n">num_gpus</span><span class="o">=</span><span class="n">num_gpus</span><span class="p">)</span>
<span class="c1"># Set BN stats and restore original momentum values</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">bn</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bns</span><span class="p">):</span>
<span class="n">bn</span><span class="o">.</span><span class="n">running_mean</span> <span class="o">=</span> <span class="n">running_means</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="n">bn</span><span class="o">.</span><span class="n">running_var</span> <span class="o">=</span> <span class="n">running_vars</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="n">bn</span><span class="o">.</span><span class="n">momentum</span> <span class="o">=</span> <span class="n">momentums</span><span class="p">[</span><span class="n">i</span><span class="p">]</span></div>
</pre></div>
</div>
</div>
<footer>
<!-- Page footer: copyright notice, then the Sphinx / theme attribution. -->
<hr>
<div role="contentinfo">
<p>© Copyright 2021, SuperGradients team.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a>
using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
// Passing a function to jQuery() defers it until the DOM is ready; the callback
// then calls Navigation.enable on the SphinxRtdTheme global.
// NOTE(review): SphinxRtdTheme is presumably defined by _static/js/theme.js (loaded
// in <head>), and the meaning of the `true` argument is not visible in this file —
// confirm against the theme source before changing either.
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
Tip!
Press p or to see the previous file or,
n or to see the next file