<!DOCTYPE html>
<html>

<head>
    <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
    <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta charset="utf8">
    <base target="_blank">
    <title>The Ultra-Scale Playbook: Training LLMs on GPU Clusters</title>
    <link rel="stylesheet" href="style.css">
</head>

<body>
    <d-front-matter>
        <script id='distill-front-matter' type="text/json">{
    "title": "The Ultra-Scale Playbook: Training LLMs on GPU Clusters",
    "description": "This blog covers everything about scaling LLMs in 2025.",
    "published": "Sept 28, 2024",
    "affiliation": {"name": "HuggingFace"},
    "authors": [
      {
        "author":"Leandro Werra",
        "authorURL":"https://huggingface.co/lvwerra"
      },
      {
        "author":"Thomas Wolf",
        "authorURL":"https://huggingface.co/thomwolf"
      }
    ],
    "katex": {
      "delimiters": [
        {"left": "$$", "right": "$$", "display": false}
      ]
    }
  }
    </script>
    </d-front-matter>
    <d-title>
        <h1 class="l-page" style="text-align: center;">The Ultra-Scale Playbook: Training LLMs on GPU Clusters</h1>
        <div id="title-plot" class="main-plot-container l-screen">
            <figure>
                <img src="assets/images/banner.png" alt="FineWeb">
            </figure>
            <!-- <div id="clusters-plot">
            <img src="assets/images/clusters.png" alt="Clusters">
        </div> -->
        </div>
    </d-title>
    <d-byline></d-byline>
    <d-article>
        <d-contents>
        </d-contents>
        
        <p>Fueled by the scaling laws<d-cite bibtex-key="kaplan2020scalinglaws"></d-cite><d-cite bibtex-key="hoffmann2022chinchilla"></d-cite>, the trend of training ever larger language models on ever vaster amounts of data has been driving progress in AI for the past couple of years. Initially, the development of the largest models happened exclusively behind the closed doors of a handful of research labs, but it has recently opened up with the release of models such as Llama 3.1 405B<d-cite bibtex-key="grattafiori2024llama3herdmodels"></d-cite> and DeepSeek R1<d-cite bibtex-key="deepseekai2024deepseekv3technicalreport"></d-cite>. While these models have <a href="https://huggingface.co/meta-llama">openly shared</a> <a href="https://huggingface.co/deepseek-ai">weights</a> and their training recipes are described in <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">reports</a>, the challenging engineering involved in training at the necessary infrastructure scale is still hidden between the lines of a handful of papers and complex training frameworks. This <s>long blog post</s> open-source book is here to open that black box!</p>

        <aside>Reading time: 7 days. For the best reading experience, we recommend not using a mobile phone.</aside>

        <p>In this book we invite you to follow us into the wonderful world of scaling the training of Large Language Models to tens, hundreds, even thousands of GPUs. It assumes you know the basics of LLM architecture and training but are new to distributed training. This writing can be seen as the second part of a trilogy, following our first blog post on processing data for pre-training, the so-called “<a href="https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1">FineWeb blog post</a>”. Having read both blog posts, you should have almost all the core knowledge needed to deeply understand how LLMs are built nowadays, missing only the final spices, like data mixing and architecture choices, to complete the recipe (stay tuned…).</p>

        <p>Pre-training LLMs from scratch now requires amounts of compute which in almost every case exceed what a single GPU or machine can provide. The clusters used to train these models range from hundreds to thousands of nodes, each usually equipped with 4 to 8 GPUs. To make the best use of such expensive hardware, as well as to train in a reasonable time, a range of distributed training methods have been developed with the goal of ensuring that GPUs are highly utilized at all times. Efficiently scaling LLM training is also no longer confined to pretraining, as fine-tuning larger models on more domain-specific data is becoming standard practice for achieving the best results.</p>

        <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating
            the template on which we based this blog post.</aside>
        
        <p>In this post we’ll cover these scaling methods exhaustively while keeping a single story line so you can see where each technique comes from. We’ll cover data, tensor, pipeline, and context parallelism as well as ZeRO and kernel fusion. The post is built on the following <strong>three foundations</strong>:</p>

        <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn which parts of a language model eat up your memory and when during training this happens. You’ll learn how to solve memory constraints by parallelizing the model and how to increase throughput by scaling up the number of GPUs. As a result, you'll understand how the following widget, which computes the memory breakdown of a transformer model, works: </p>

        <div id="graph"></div>
        <div id="controls">
            <div class="cell column-1">
            <label for="a">Attention Heads (a):</label>
            <input type="range" id="a" name="a" min="1" max="128" value="8">
            <input type="number" id="a_input" value="8" min="1" max="128">
            </div>
            <div class="cell column-2">
            <label for="mixed">Mixed Precision:</label>
            <input type="checkbox" id="mixed" name="mixed" checked>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="b">Micro Batch Size (b):</label>
            <input type="range" id="b" name="b" min="1" max="53248" value="32">
            <input type="number" id="b_input" value="32" min="1" max="53248">
            </div>
            <div class="cell column-2">
            <label for="seq_parallel">Sequence Parallelism:</label>
            <input type="checkbox" id="seq_parallel" name="seq_parallel">
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="h">Hidden Dimension (h):</label>
            <input type="range" id="h" name="h" min="1" max="16384" value="512">
            <input type="number" id="h_input" value="512" min="128" max="16384">
            </div>
            <div class="cell column-2">
            <label for="recomputation">Recomputation:</label>
            <select id="recomputation" name="recomputation">
                <option value="none">None</option>
                <option value="selective">Selective</option>
                <option value="full">Full</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="h_ff">Feedforward Dimension (h_ff):</label>
            <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
            <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
            </div>
            <div class="cell column-2">
            <label for="zero">Zero:</label>
            <select id="zero" name="zero">
                <option value="0">0</option>
                <option value="1">1</option>
                <option value="2">2</option>
                <option value="3">3</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="L">Number of Layers (L):</label>
            <input type="range" id="L" name="L" min="1" max="126" value="12">
            <input type="number" id="L_input" value="12" min="1" max="126">
            </div>
            <div class="cell column-2">
            <label for="ff_activation">FF Activation:</label>
            <select id="ff_activation" name="ff_activation">
                <option value="relu">ReLU</option>
                <option value="gelu">GELU</option>
                <option value="swiglu">SwiGLU</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="s">Sequence Length (s):</label>
            <input type="range" id="s" name="s" min="1" max="128000" value="128">
            <input type="number" id="s_input" value="128" min="64" max="128000">
            </div>
            <div class="cell column-2">
            <label for="presets">Presets:</label>
            <select id="presets" name="presets">
                <option value="Llama 3 Tiny">Llama 3 Tiny</option>
                <option value="Llama 3 8B">Llama 3 8B</option>
                <option value="Llama 3 70B">Llama 3 70B</option>
                <option value="Llama 3 405B">Llama 3 405B</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="v">Vocabulary Size (v):</label>
            <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
            <input type="number" id="v_input" value="30522" min="1000" max="100000">
            </div>
            <div class="cell column-2">
            <label for="tp">Tensor Parallelism (t):</label>
            <input type="range" id="tp" name="tp" min="1" max="16" value="8">
            <input type="number" id="tp_input" value="8" min="1" max="16">
            </div>
            <div class="cell column-1">
            <label for="k">Optimizer Parameters (k):</label>
            <input type="range" id="k" name="k" min="1" max="16" value="8">
            <input type="number" id="k_input" value="8" min="1" max="16">
            </div>
            <div class="cell column-2">
            <label for="dp">Data Parallelism (d):</label>
            <input type="range" id="dp" name="dp" min="1" max="256" value="1">
            <input type="number" id="dp_input" value="1" min="1" max="256">
            </div>
        </div>

        <p>While this widget gives a theoretical breakdown, the following tool can be used to predict actual memory usage:</p>
        
        <p><img alt="image.png" src="assets/images/placeholder.png"/></p>

        <p><strong>Clear code implementations:</strong> theory is one thing, but we discover all kinds of edge cases and important details when we implement something. That’s why we link to implementation references where possible. Depending on the case, we’ll use two code references: the <a href="https://github.com/huggingface/picotron">picotron</a> repository is built for education, so it usually implements concepts in single, self-contained, short files. On the other hand, to look at production-ready code, we’ll refer to the <a href="https://github.com/huggingface/nanotron">nanotron</a> implementations, a production training codebase used at Hugging Face.</p>

        <p><img alt="Picotron implements each key concept in a self-contained way, such that the method can be studied separately and in isolation." src="assets/images/placeholder.png" /></p>

        <p><strong>Real training efficiency benchmarks:</strong> Finally, how to <em>actually</em> scale your LLM training depends on your infrastructure, such as the kind of chips, interconnect, etc., so we can’t give a single unified recipe. What we will give, though, is a way to benchmark several setups, and it is what we have done on our cluster! We ran over 4,100 distributed experiments with up to 512 GPUs to scan many possible distributed training layouts and model sizes. TODO: link to dataset too </p>

        <p><img alt="An overview of the over 4000 experiments across all Llama architectures where each data point corresponds to an experiment launch." src="assets/images/placeholder.png" /></p>

        <p>As you can see, there’s a lot of ground to cover. Before getting into the trenches of distributed training, let’s take a quick high-level look at what we’ll cover in this post.</p>

        <h2>TL;DR</h2>
        <p>This book is very extensive, so we decided to start with a very general overview of how you can think about distributed training. At a high level, the key challenge in scaling LLM training is to make a training step (forward/backward/optimizer step) with a large batch size as fast as possible.</p>
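        <p>To make the terminology concrete, here is a minimal single-GPU sketch of such a training step in PyTorch. The tiny linear model, loss, and hyperparameters are placeholders for illustration, not the actual setup used later in this book:</p>
<pre><code class="language-python">
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Stand-in model and optimizer; a real LLM would be a transformer with a causal LM loss.
model = torch.nn.Linear(1024, 1024).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def train_step(inputs, targets):
    """One training step: forward pass, backward pass, optimizer step."""
    optimizer.zero_grad(set_to_none=True)
    outputs = model(inputs)                                # forward
    loss = torch.nn.functional.mse_loss(outputs, targets)
    loss.backward()                                        # backward: compute gradients
    optimizer.step()                                       # update the weights
    return loss.item()

inputs = torch.randn(32, 1024, device=device)
targets = torch.randn(32, 1024, device=device)
print(train_step(inputs, targets))
</code></pre>
        <p>Everything that follows in this book is about making this loop fast when neither the batch nor the model fits comfortably on one GPU.</p>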
        <p>When scaling up models and input batches, we quickly end up in situations where either our target batch size won't fit in memory, and/or the model itself is too large to fit in a single GPU's memory.</p>
        <p>To solve this scaling issue we’ll need to carefully evaluate different parallelization strategies and find the optimal balance between three main factors:</p>
        <ol>
        <li><strong>Memory Usage</strong><ul>
        <li>Hard limitation - if a training step doesn't fit in memory, training cannot proceed</li>
        <li>Sometimes we can increase compute (e.g. recomputation) or increase communication (e.g. ZeRO) to reduce memory usage</li>
        </ul>
        </li>
        <li><strong>Compute Efficiency</strong><ul>
        <li>Memory transfer can also decrease compute efficiency.</li>
        <li>We want our hardware to spend most time computing, so we need to reduce time spent on data transfers or unoptimized kernels.</li>
        <li>GPUs need sufficient workload (large enough matrices/batch sizes) to maintain high utilization (compute-bound) otherwise they become memory-bound (limited by memory bandwidth).</li>
        </ul>
        </li>
        <li><strong>Communication overhead</strong><ul>
        <li>There are two main types for GPUs: intra-node (NVLink TODO: bandwidth) and inter-node (network TODO: bandwidth)</li>
        <li>Two main attributes: base latency and peak bandwidth. Base latency is a constant overhead that makes us want to perform as few communications as possible, while peak bandwidth controls how fast we can move data between GPUs</li>
        <li>We prioritize using the fastest communication channels (like NVLink) for operations that occur frequently and/or block computation (e.g. tensor parallelism)</li>
        <li>We want to minimize communication overhead as it leaves GPUs idle, so we try to overlap communication with compute as much as possible (see the sketch right after this list)</li>
        </ul>
        </li>
        </ol>
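        <p>As a hedged illustration of that last point (and not the exact mechanism used by the frameworks discussed later), here is a minimal sketch of overlapping a gradient all-reduce with independent computation using PyTorch's asynchronous collectives. It assumes a distributed process group has already been initialized:</p>
<pre><code class="language-python">
import torch
import torch.distributed as dist

# Assumes the script was launched with torchrun and that
# dist.init_process_group(backend="nccl") has already been called.

def overlapped_grad_allreduce(grad_bucket: torch.Tensor,
                              activation: torch.Tensor,
                              weight: torch.Tensor) -> torch.Tensor:
    # Launch the all-reduce asynchronously so it runs while we keep computing.
    work = dist.all_reduce(grad_bucket, op=dist.ReduceOp.SUM, async_op=True)

    # Independent computation (e.g. the backward pass of earlier layers)
    # overlaps with the communication happening in the background.
    out = activation @ weight

    # Block only when the reduced gradients are actually needed.
    work.wait()
    grad_bucket /= dist.get_world_size()   # average the summed gradients
    return out
</code></pre>
        <p>The same idea, hiding communication behind computation, will come back again and again in the data, tensor, and pipeline parallelism chapters.</p>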
        <p>But let’s not get too far ahead of ourselves; we’ll scale progressively. To guide you along the journey, and as a practical reference, we summarized the key concepts in a cheatsheet:</p>
        <p>[TODO: ADD CHEATSHEET]</p>
        <p>Now that we’ve nailed down a few key concepts and terms, let’s get started by revisiting the basic training steps of an LLM!</p>
                
        <h2>First Steps: Training on one GPU</h2>
        
        <h3>Memory usage in Transformers</h3>
        
        <h4>Memory profiling a training step</h4>
        
        <h4>Weights/grads/optimizer states memory</h4>
        
        <h4>Activations memory</h4>
        
        <h3>Activation recomputation</h3>
        
        <h3>Gradient accumulation</h3>
        
        <h2>Data Parallelism</h2>
        
        <h4><strong>First optimization:</strong> Overlap gradient synchronization with backward pass</h4>
        
        <h4><strong>Second optimization:</strong> Bucketing gradients</h4>
        
        <h4><strong>Third optimization:</strong> Interplay with gradient accumulation</h4>
        
        <h3>Revisit global batch size</h3>
        
        <h3>Our journey up to now</h3>
        
        <h3>ZeRO (<strong>Ze</strong>ro <strong>R</strong>edundancy <strong>O</strong>ptimizer)</h3>
        
        <h4>Memory usage revisited</h4>
        
        <h4>ZeRO-1: Partitioning Optimizer States</h4>
        
        <h4>ZeRO-2: Adding <strong>Gradient Partitioning</strong></h4>
        
        <h4>ZeRO-3: Adding <strong>Parameter Partitioning</strong></h4>
        
        <h2>Tensor Parallelism</h2>
        
        <h3>Tensor Parallelism in a Transformer Block</h3>
        
        <h3>Sequence Parallelism</h3>
        
        <h2>Context Parallelism</h2>
        
        <h3>Introducing Context Parallelism</h3>
        
        <h3>Discovering Ring Attention</h3>
        
        <h3>Zig-Zag Ring Attention – A Balanced Compute Implementation</h3>
        
        <h2>Pipeline Parallelism</h2>
        
        <h3>Splitting layers on various nodes - All forward, all backward</h3>
        
        <h3>One-forward-one-backward and Llama 3.1 schemes</h3>
        
        <h3>Interleaving stages</h3>
        
        <h3>Zero Bubble and DualPipe</h3>
        
        <h2>Expert parallelism</h2>
        
        <h2>5D parallelism in a nutshell</h2>
        
        <h2>How to Find the Best Training Configuration</h2>
        
        <h2>Diving in the GPUs – fusing, threading, mixing</h2>
        
        <h4>A primer on GPUs</h4>
        
        <h3>How to improve performance with Kernels?</h3>
        
        <h4>Memory Coalescing</h4>
        
        <h4>Tiling</h4>
        
        <h4>Thread Coarsening</h4>
        
        <h4>Minimizing Control Divergence</h4>
        
        <h3>Flash Attention 1-3</h3>
        
        <h3>Fused Kernels</h3>
        
        <h3>Mixed Precision Training</h3>
        
        <h4>FP16 and BF16 training</h4>
        
        <h4>FP8 pretraining</h4>
        
        <h2>Conclusion</h2>
        
        <h3>What you learned</h3>
        
        <h3>What we learned</h3>
        
        <h3>What’s next?</h3>
        
        <h2>References</h2>
        
        <h3>Landmark LLM Scaling Papers</h3>
        
        <h3>Training Frameworks</h3>
        
        <h3>Debugging</h3>
        
        <h3>Distribution Techniques</h3>
        
        <h3>CUDA Kernels</h3>
        
        <h3>Hardware</h3>
        
        <h3>Others</h3>
        
        <h2>Appendix</h2>

    </d-article>

    <d-appendix>
        <d-bibliography src="bibliography.bib"></d-bibliography>
        <style>
            d-appendix .citation {
                font-size: 11px;
                line-height: 15px;
                border-left: 1px solid rgba(0, 0, 0, 0.1);
                padding-left: 18px;
                border: 1px solid rgba(0, 0, 0, 0.1);
                background: rgba(0, 0, 0, 0.02);
                padding: 10px 18px;
                border-radius: 3px;
                color: rgba(150, 150, 150, 1);
                overflow: hidden;
                margin-top: -12px;
                white-space: pre-wrap;
                word-wrap: break-word;
            }
        </style>

        <h3 id="citation">Citation</h3>
        <p>For attribution in academic contexts, please cite this work as</p>
        <pre
            class="citation short">XXX, et al., "The Ultra-Scale Playbook: Training LLMs on GPU Clusterse", 2025.</pre>
        <p>BibTeX citation</p>
        <pre class="citation long">@misc{TODO,
      title={The Ultra-Scale Playbook: Training LLMs on GPU Clusters},
      author={TODO},
      year={2025},
}</pre>
    </d-appendix>

    <script>
        const article = document.querySelector('d-article');
        const toc = document.querySelector('d-contents');
        if (toc) {
            const headings = article.querySelectorAll('h2, h3, h4');
            let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`;
            let prevLevel = 0;

            for (const el of headings) {
                // should element be included in TOC?
                const isInTitle = el.parentElement.tagName == 'D-TITLE';
                const isException = el.getAttribute('no-toc');
                if (isInTitle || isException) continue;
                el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
                const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';

                const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
                while (prevLevel < level) {
                    ToC += '<ul>'
                    prevLevel++;
                }
                while (prevLevel > level) {
                    ToC += '</ul>'
                    prevLevel--;
                }
                if (level === 0)
                    ToC += '<div>' + link + '</div>';
                else
                    ToC += '<li>' + link + '</li>';
            }

            while (prevLevel > 0) {
                ToC += '</ul>'
                prevLevel--;
            }
            ToC += '</nav>';
            toc.innerHTML = ToC;
            toc.setAttribute('prerendered', 'true');
            const toc_links = document.querySelectorAll('d-contents > nav a');

            window.addEventListener('scroll', (_event) => {
                if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
                    // Then iterate forwards, on the first match highlight it and break
                    find_active: {
                        for (let i = headings.length - 1; i >= 0; i--) {
                            if (headings[i].getBoundingClientRect().top - 50 <= 0) {
                                if (!toc_links[i].classList.contains("active")) {
                                    toc_links.forEach((link, _index) => {
                                        link.classList.remove("active");
                                    });
                                    toc_links[i].classList.add('active');
                                }
                                break find_active;
                            }
                        }
                        toc_links.forEach((link, _index) => {
                            link.classList.remove("active");
                        });
                    }
                }
            });
        }
    </script>

</body>

</html>