thomwolf (HF staff) committed
Commit 926e541 · verified · 1 Parent(s): 787ae8e
dist/assets/images/memorycoalescing.png CHANGED

Git LFS Details

  • SHA256: 088cd848100ab26abbffdcc7c0e8f18a83facd0a8637c460e3ac88d483b04b46
  • Pointer size: 130 Bytes
  • Size of remote file: 94.1 kB

Git LFS Details

  • SHA256: 1094fe9aeb953c743791445ee6d7e73a5a89fa85fe60f4312266d1265e7c591a
  • Pointer size: 130 Bytes
  • Size of remote file: 94.1 kB
dist/index.html CHANGED
@@ -90,108 +90,141 @@
90
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
91
 
92
  <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn which parts of a language model eat up your memory and when during training this happens. You’ll learn how we can address memory constraints by parallelizing the models and increase throughput by scaling up GPUs. As a result, you'll understand how the following widget for computing the memory breakdown of a transformer model works: </p>
 
93
 
94
- <div id="graph"></div>
95
- <div id="controls">
96
- <div class="cell column-1">
97
- <label for="a">Attention Heads (a):</label>
98
- <input type="range" id="a" name="a" min="1" max="128" value="8">
99
- <input type="number" id="a_input" value="8" min="1" max="128">
100
  </div>
101
- <div class="cell column-2">
102
- <label for="mixed">Mixed Precision:</label>
103
- <input type="checkbox" id="mixed" name="mixed" checked>
104
- <span></span> <!-- Empty span to maintain grid alignment -->
105
- </div>
106
- <div class="cell column-1">
107
- <label for="b">Micro Batch Size (b):</label>
108
- <input type="range" id="b" name="b" min="1" max="53248" value="32">
109
- <input type="number" id="b_input" value="32" min="1" max="53248">
110
- </div>
111
- <div class="cell column-2">
112
- <label for="seq_parallel">Sequence Parallelism:</label>
113
- <input type="checkbox" id="seq_parallel" name="seq_parallel">
114
- <span></span> <!-- Empty span to maintain grid alignment -->
115
- </div>
116
- <div class="cell column-1">
117
- <label for="h">Hidden Dimension (h):</label>
118
- <input type="range" id="h" name="h" min="1" max="16384" value="512">
119
- <input type="number" id="h_input" value="512" min="128" max="16384">
120
- </div>
121
- <div class="cell column-2">
122
- <label for="recomputation">Recomputation:</label>
123
- <select id="recomputation" name="recomputation">
124
- <option value="none">None</option>
125
- <option value="selective">Selective</option>
126
- <option value="full">Full</option>
127
- </select>
128
- <span></span> <!-- Empty span to maintain grid alignment -->
129
- </div>
130
- <div class="cell column-1">
131
- <label for="h_ff">Feedforward Dimension (h_ff):</label>
132
- <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
133
- <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
134
- </div>
135
- <div class="cell column-2">
136
- <label for="zero">Zero:</label>
137
- <select id="zero" name="zero">
138
- <option value="0">0</option>
139
- <option value="1">1</option>
140
- <option value="2">2</option>
141
- <option value="3">3</option>
142
- </select>
143
- <span></span> <!-- Empty span to maintain grid alignment -->
144
- </div>
145
- <div class="cell column-1">
146
- <label for="L">Number of Layers (L):</label>
147
- <input type="range" id="L" name="L" min="1" max="126" value="12">
148
- <input type="number" id="L_input" value="12" min="1" max="126">
149
- </div>
150
- <div class="cell column-2">
151
- <label for="ff_activation">FF Activation:</label>
152
- <select id="ff_activation" name="ff_activation">
153
- <option value="relu">ReLU</option>
154
- <option value="gelu">GELU</option>
155
- <option value="swiglu">SwiGLU</option>
156
- </select>
157
- <span></span> <!-- Empty span to maintain grid alignment -->
158
- </div>
159
- <div class="cell column-1">
160
- <label for="s">Sequence Length (s):</label>
161
- <input type="range" id="s" name="s" min="1" max="128000" value="128">
162
- <input type="number" id="s_input" value="128" min="64" max="128000">
163
- </div>
164
- <div class="cell column-2">
165
- <label for="presets">Presets:</label>
166
- <select id="presets" name="presets">
167
- <option value="Llama 3 Tiny">Llama 3 Tiny</option>
168
- <option value="Llama 3 8B">Llama 3 8B</option>
169
- <option value="Llama 3 70B">Llama 3 70B</option>
170
- <option value="Llama 3 405B">Llama 3 405B</option>
171
- </select>
172
- <span></span> <!-- Empty span to maintain grid alignment -->
173
- </div>
174
- <div class="cell column-1">
175
- <label for="v">Vocabulary Size (v):</label>
176
- <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
177
- <input type="number" id="v_input" value="30522" min="1000" max="100000">
178
- </div>
179
- <div class="cell column-2">
180
- <label for="tp">Tensor Parallelism (t):</label>
181
- <input type="range" id="tp" name="tp" min="1" max="16" value="8">
182
- <input type="number" id="tp_input" value="8" min="1" max="16">
183
- </div>
184
- <div class="cell column-1">
185
- <label for="k">Optimizer Parameters (k):</label>
186
- <input type="range" id="k" name="k" min="1" max="16" value="8">
187
- <input type="number" id="k_input" value="8" min="1" max="16">
188
- </div>
189
- <div class="cell column-2">
190
- <label for="dp">Data Parallelism (d):</label>
191
- <input type="range" id="dp" name="dp" min="1" max="256" value="1">
192
- <input type="number" id="dp_input" value="1" min="1" max="256">
193
  </div>
194
  </div>
 
195
 
196
  <p>While this widget gives a theoretical breakdown, the following tool can be used to predict the memory usage:</p>
197
  <ul>
@@ -1724,9 +1757,11 @@
1724
 
1725
  <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and can be combined with both Pipeline Parallelism and ZeRO-3 as it relies on the distributive property of matrix multiplications which allows weights and activations to be sharded and computed independently before being combined.</p>
1726
 
1727
- <div class="large-image-background">
 
1728
  <img alt="TP & SP diagram" src="/assets/images/5d_nutshell_tp_sp.svg" style="width: 1200px; max-width: none;" />
1729
  </div>
 
1730
 
1731
 
1732
  <p>The main reason we don't want to use TP alone for parallelism is that, in practice, TP has the two limitations we discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point, at which communication overhead begins to dominate. Second, unlike ZeRO and PP, which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
@@ -1737,17 +1772,20 @@
1737
 
1738
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. As we saw in the <a target="_self" href="#context_parallelism">CP section</a>, this is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where, even when using full activation recomputation, the memory requirements for attention would be prohibitive on a single GPU.</p>
1739
 
1740
- <div class="large-image-background">
 
1741
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1200px; max-width: none;" />
1742
  </div>
1743
-
1744
 
1745
  <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to the relevant experts during computation. The key communication operation in EP is the <code>all-to-all</code> operation that routes tokens to their assigned experts and gathers the results back. While this operation introduces some communication overhead, it enables scaling model capacity significantly, since each token is only processed during inference (and training) by a much smaller fraction of the total parameters. In terms of distributed training/inference, partitioning experts across GPUs becomes relevant when models scale to a large number of experts.</p>
1746
  <aside>For instance DeepSeek V3 uses 256 experts.</aside>
1747
 
1748
- <div class="large-image-background">
 
1749
  <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1200px; max-width: none;" />
1750
  </div>
 
1751
  <div class="note-box">
1752
  <p class="note-box-title">📝 Note</p>
1753
  <div class="note-box-content">
@@ -1799,15 +1837,19 @@
1799
  <p><strong>Summarizing it all:</strong> Now, what about gathering all the techniques we've seen into a single diagram that combines them? Yes, we're up for the challenge!</p>
1800
  <p>In this summary diagram, you will find illustrated the activations and modules of a single transformer layer (in its MoE variant). We also illustrate the various directions of parallelism and the communication operations we've been discussing in all the previous sections.</p>
1801
 
1802
- <div class="large-image-background">
 
1803
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1200px; max-width: none;"/></p>
1804
  </div>
 
1805
 
1806
  <p>We can also represent side-by-side a <strong>full overview</strong> of the memory savings for each one of these strategies. We'll plot them with different sequence lengths, as well as with selective (top) and full (bottom) recomputation, so you can see how they all play with activations:</p>
1807
 
1808
- <div class="large-image-background">
 
1809
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1200px; max-width: none;"/>
1810
  </div>
 
1811
 
1812
  <p>Let's finish this section with a high-level view of all these techniques, their main underlying ideas, and their major bottlenecks:</p>
1813
 
@@ -1874,7 +1916,7 @@
1874
 
1875
  <p>Clearly, none of these techniques is a silver bullet for magical scaling, and we'll often have to combine them in one way or another. Can we actually come up with a few rules that would help us find a good starting point for choosing among (and combining) them? This will be the topic of our next section.</p>
1876
 
1877
- <h2>How to Find the Best Training Configuration</h2>
1878
 
1879
  <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models, as well as how and why they can be combined. One general question remains: which ones should we choose in the end, and how do we decide on a specific combination?</p>
1880
 
@@ -1958,10 +2000,12 @@
1958
  <p>All the following benchmarks were conducted with a sequence length of 4096 and a global batch size of 1M tokens. We gathered all the top configurations for each model and cluster size and plotted them in the following heatmaps:</p>
1959
  </p>
1960
 
1961
- <div class="large-image-background">
 
1962
  <p><img alt="image.png" src="/assets/images/what_we_learnt_heatmap.svg" /></p>
1963
  </div>
1964
- <div class="figure-legend">
 
1965
  <p>Heatmap visualization showing the optimal training configurations across different model sizes and compute node counts (we have 8 GPUs per node). For each combination, the configuration details include Data Parallelism (DP), Tensor Parallelism (TP), Pipeline Parallelism (PP), Gradient Accumulation Steps (GAS), Micro Batch Size (MBS), and ZeRO optimization stage. The color intensity indicates the Model FLOPs Utilization (MFU), with brighter colors representing higher efficiency.</p>
1966
  </div>
1967
  <p>From this high-level visualization, we can draw several important insights:
@@ -2265,14 +2309,13 @@
2265
 
2266
  <p>However, when profiling this kernel with a tool like <code>ncu</code>, we can see issues, including low memory throughput and uncoalesced memory accesses.</p>
2267
 
2268
- <div class="large-image-background">
2269
- <img width="1200px" alt="image.png" src="/assets/images/memorycoalescing2.png" />
 
 
2270
  </div>
2271
- <div class="large-image-background">
2272
- <img width="1200px" alt="image.png" src="/assets/images/memorycoalescing3.png" />
2273
  </div>
2274
 
2275
-
2276
  <p>The reason for this is that in this kernel, two threads in the same block with Thread IDs <code>(0, 0)</code> and <code>(1, 0)</code> (which will end up in the same warp) will both load from the same column of matrix <code>B</code> but different rows of matrix <code>A</code>. Since matrix elements are stored in row-major order (meaning row elements are in consecutive memory addresses, as shown in the figure below), thread <code>(0, 0)</code> will load <d-math>A_{0,0}</d-math>, and thread <code>(1, 0)</code> will load <d-math>A_{1,0}</d-math> in the first iteration <code>i = 0</code>. These elements are not stored close to each other in memory, and this misalignment will be present at each iteration, thereby preventing memory accesses from being coalesced.</p>
2277
 
2278
  <p><img alt="image.png" src="/assets/images/memorycoalescing4.png" /></p>
@@ -2297,9 +2340,11 @@
2297
 
2298
  <p>When we profile our new kernel, we notice that the warning about uncoalesced memory accesses has disappeared, and <strong>the GPU's memory throughput has increased by approximately 10 times</strong>.</p>
2299
 
2300
- <div class="large-image-background">
 
2301
  <p><img width="1200px" alt="image.png" src="/assets/images/memorycoalescing5.png" /></p>
2302
  </div>
 
2303
 
2304
  <p>We also notice that the execution time of the kernel <strong>decreases by 10x</strong>! Amazing.</p>
2305
  <p>Now let's cover another technique you will often see mentioned in the literature: <strong>tiling</strong>.</p>
@@ -2685,7 +2730,15 @@
2685
  </ul>
2686
 
2687
  <p>We hope this book helps you get started in distributed training and that you will train the next generation of awesome models to the hum of your GPU cluster!</p>
2688
-
2689
  <h3>Acknowledgements</h3>
2690
 
2691
  <p>We thank <a href="https://huggingface.co/eliebak">Elie</a> for conducting thorough reviews and creating the audio components using NotebookLM. Special thanks to <a href="https://huggingface.co/hynky">Hynek</a> for optimizing the frontend performance. We also thank <a href="https://huggingface.co/sbrandeis">Simon</a> for resolving some issues on the hub.</p>
@@ -3395,8 +3448,10 @@
3395
 
3396
  <p>This would print aggregated profiling results sorted by the total CUDA time, and the output would be:</p>
3397
 
3398
- <div class="large-image-background">
3399
- <img alt="image.png" src="/assets/images/a1_kernels.png" style="width: 1200px; max-width: none;" />
 
 
3400
  </div>
3401
 
3402
  <p>You can also inspect the trace, as we previously mentioned, in <code>chrome://tracing/</code>.</p>
@@ -3410,8 +3465,10 @@
3410
 
3411
  <p>After zooming in, you can observe the flow of operations when calling <code>layer_norm</code> in this trace:</p>
3412
 
3413
- <div class="large-image-background">
3414
- <img alt="image.png" src="/assets/images/a1_profile_trace.png" style="width: 1200px; max-width: none;" />
 
 
3415
  </div>
3416
 
3417
  <p>The sequence begins in the CPU (the upper section) with <code>aten::layer_norm</code>, progressing to <code>aten::native_layer_norm</code>, and then transitioning to <code>cudaLaunchKernel</code>. From there, we move on to the GPU, where the <code>vectorized_layer_norm_kernel</code> kernel is called.</p>
@@ -3437,8 +3494,10 @@
3437
 
3438
  <p>and open the file <code>output.ncu-rep</code> with Nsight Compute, you will have a view that looks like this:</p>
3439
 
3440
- <div class="large-image-background">
3441
- <img alt="image.png" src="/assets/images/a1_ncu.png" style="width: 1200px; max-width: none;" />
 
 
3442
  </div>
3443
 
3444
  <p>It comes with clear warnings about compute and memory utilization, along with suggestions on how to better balance compute and memory in the kernel and achieve maximal occupancy.</p>
 
90
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
91
 
92
  <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn which parts of a language model eat up your memory and when during training this happens. You’ll learn how we can address memory constraints by parallelizing the models and increase throughput by scaling up GPUs. As a result, you'll understand how the following widget for computing the memory breakdown of a transformer model works: </p>
93
+ <aside>Note that we're still missing Pipeline Parallelism in this widget. To be added as an exercise for the reader.</aside>
94
 
95
+ <div class="large-image-background-transparent">
96
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
97
+ <div id="graph-all">
98
+ <div class="figure-legend">Memory usage breakdown</div>
99
+ <div id="graph"></div>
 
100
  </div>
101
+ <div id="controls">
102
+ <div class="cell column-1">
103
+ <label for="a">Attention Heads (a):</label>
104
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
105
+ <input type="range" id="a" name="a" min="1" max="128" value="8">
106
+ <input type="number" id="a_input" value="8" min="1" max="128">
107
+ </div>
108
+ </div>
109
+ <div class="cell column-2">
110
+ <label for="mixed">Mixed Precision:</label>
111
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
112
+ <input type="checkbox" id="mixed" name="mixed" checked>
113
+ <span></span> <!-- Empty span to maintain grid alignment -->
114
+ </div>
115
+ </div>
116
+ <div class="cell column-1">
117
+ <label for="b">Micro Batch Size (b):</label>
118
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
119
+ <input type="range" id="b" name="b" min="1" max="53248" value="32">
120
+ <input type="number" id="b_input" value="32" min="1" max="53248">
121
+ </div>
122
+ </div>
123
+ <div class="cell column-2">
124
+ <label for="seq_parallel">Sequence Parallelism:</label>
125
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
126
+ <input type="checkbox" id="seq_parallel" name="seq_parallel">
127
+ <span></span> <!-- Empty span to maintain grid alignment -->
128
+ </div>
129
+ </div>
130
+ <div class="cell column-1">
131
+ <label for="h">Hidden Dimension (h):</label>
132
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
133
+ <input type="range" id="h" name="h" min="1" max="16384" value="512">
134
+ <input type="number" id="h_input" value="512" min="128" max="16384">
135
+ </div>
136
+ </div>
137
+ <div class="cell column-2">
138
+ <label for="recomputation">Recomputation:</label>
139
+ <select id="recomputation" name="recomputation">
140
+ <option value="none">None</option>
141
+ <option value="selective">Selective</option>
142
+ <option value="full">Full</option>
143
+ </select>
144
+ <span></span> <!-- Empty span to maintain grid alignment -->
145
+ </div>
146
+ <div class="cell column-1">
147
+ <label for="h_ff">Feedforward Dimension (h_ff):</label>
148
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
149
+ <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
150
+ <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
151
+ </div>
152
+ </div>
153
+ <div class="cell column-2">
154
+ <label for="zero">Zero:</label>
155
+ <select id="zero" name="zero">
156
+ <option value="0">0</option>
157
+ <option value="1">1</option>
158
+ <option value="2">2</option>
159
+ <option value="3">3</option>
160
+ </select>
161
+ <span></span> <!-- Empty span to maintain grid alignment -->
162
+ </div>
163
+ <div class="cell column-1">
164
+ <label for="L">Number of Layers (L):</label>
165
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
166
+ <input type="range" id="L" name="L" min="1" max="126" value="12">
167
+ <input type="number" id="L_input" value="12" min="1" max="126">
168
+ </div>
169
+ </div>
170
+ <div class="cell column-2">
171
+ <label for="ff_activation">FF Activation:</label>
172
+ <select id="ff_activation" name="ff_activation">
173
+ <option value="relu">ReLU</option>
174
+ <option value="gelu">GELU</option>
175
+ <option value="swiglu">SwiGLU</option>
176
+ </select>
177
+ <span></span> <!-- Empty span to maintain grid alignment -->
178
+ </div>
179
+ <div class="cell column-1">
180
+ <label for="s">Sequence Length (s):</label>
181
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
182
+ <input type="range" id="s" name="s" min="1" max="128000" value="128">
183
+ <input type="number" id="s_input" value="128" min="64" max="128000">
184
+ </div>
185
+ </div>
186
+ <div class="cell column-2">
187
+ <label for="v">Vocabulary Size (v):</label>
188
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
189
+ <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
190
+ <input type="number" id="v_input" value="30522" min="1000" max="100000">
191
+ </div>
192
+ </div>
193
+ <div class="cell column-1">
194
+ <label for="tp">Tensor Parallelism (t):</label>
195
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
196
+ <input type="range" id="tp" name="tp" min="1" max="16" value="8">
197
+ <input type="number" id="tp_input" value="8" min="1" max="16">
198
+ </div>
199
+ </div>
200
+ <div class="cell column-2">
201
+ <label for="k">Optimizer Parameters (k):</label>
202
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
203
+ <input type="range" id="k" name="k" min="1" max="16" value="8">
204
+ <input type="number" id="k_input" value="8" min="1" max="16">
205
+ </div>
206
+ </div>
207
+ <div class="cell column-1">
208
+ <label for="dp">Data Parallelism (d):</label>
209
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
210
+ <input type="range" id="dp" name="dp" min="1" max="256" value="1">
211
+ <input type="number" id="dp_input" value="1" min="1" max="256">
212
+ </div>
213
+ </div>
214
+ <div class="cell column-2">
215
+ <label for="presets">Presets:</label>
216
+ <select id="presets" name="presets">
217
+ <option value="Llama 3 Tiny">Llama 3 Tiny</option>
218
+ <option value="Llama 3 8B">Llama 3 8B</option>
219
+ <option value="Llama 3 70B">Llama 3 70B</option>
220
+ <option value="Llama 3 405B">Llama 3 405B</option>
221
+ </select>
222
+ <span></span> <!-- Empty span to maintain grid alignment -->
223
+ </div>
224
+ </div>
225
  </div>
226
  </div>
227
+ <p>(Don't worry if you have no idea what's happening in this widget. That's why we're here.)</p>
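<p>As a rough illustration of what the widget computes, here is a minimal Python sketch that estimates a memory breakdown from the same inputs as the sliders above. The formulas are deliberately simplified (they ignore the number of attention heads, recomputation, and pipeline parallelism, among other things), and the function name and constants are illustrative rather than the widget's exact implementation:</p>
<d-code block language="python">
# Simplified sketch of a transformer memory-breakdown estimate; the widget's
# exact formulas may differ. All names and constants here are illustrative.

def memory_breakdown_gb(v=30522, h=512, h_ff=2048, L=12, s=128, b=32,
                        k=8, t=1, d=1, zero=0, mixed=True):
    bytes_per_param = 2 if mixed else 4              # bf16/fp16 vs fp32 weights
    # Rough parameter count: embeddings + per-layer attention and MLP weights
    n_params = v * h + L * (4 * h * h + 2 * h * h_ff)
    params = n_params * bytes_per_param / t          # sharded by tensor parallelism
    grads = n_params * bytes_per_param / t
    optim = n_params * k / t                         # k bytes of optimizer state per param
    if zero >= 1: optim /= d                         # ZeRO-1 shards optimizer states over DP
    if zero >= 2: grads /= d                         # ZeRO-2 also shards gradients
    if zero >= 3: params /= d                        # ZeRO-3 also shards parameters
    # Very rough activation estimate (no recomputation), also sharded by TP
    acts = L * s * b * h * 16 * bytes_per_param / t
    gb = 1024 ** 3
    return {name: round(val / gb, 3) for name, val in
            {"parameters": params, "gradients": grads,
             "optimizer states": optim, "activations": acts}.items()}

print(memory_breakdown_gb())
</d-code>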
228
 
229
  <p>While this widget gives a theoretical breakdown, the following tool can be used to predict the memory usage:</p>
230
  <ul>
 
1757
 
1758
  <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and can be combined with both Pipeline Parallelism and ZeRO-3 as it relies on the distributive property of matrix multiplications which allows weights and activations to be sharded and computed independently before being combined.</p>
1759
 
1760
+ <div class="large-image-background-transparent">
1761
+ <div class="boxed-image">
1762
  <img alt="TP & SP diagram" src="/assets/images/5d_nutshell_tp_sp.svg" style="width: 1200px; max-width: none;" />
1763
  </div>
1764
+ </div>
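<p>The distributive property mentioned above is easy to verify in a few lines. The following is a toy, single-process sketch (plain NumPy, no real GPUs) of the column-parallel and row-parallel sharding that TP builds on:</p>
<d-code block language="python">
import numpy as np

X = np.random.randn(4, 8)                 # activations
W = np.random.randn(8, 6)                 # weight matrix

# Column-parallel: each "GPU" holds a column shard of W and computes its slice
W0, W1 = np.split(W, 2, axis=1)
Y_col = np.concatenate([X @ W0, X @ W1], axis=1)   # all-gather along the hidden dim
assert np.allclose(Y_col, X @ W)

# Row-parallel: shard X's columns and W's rows, then sum the partial results
# (the sum is what an all-reduce performs across GPUs)
X0, X1 = np.split(X, 2, axis=1)
V0, V1 = np.split(W, 2, axis=0)
assert np.allclose(X0 @ V0 + X1 @ V1, X @ W)
</d-code>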
1765
 
1766
 
1767
  <p>The main reason we don't want to use TP alone for parallelism is that, in practice, TP has the two limitations we discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point, at which communication overhead begins to dominate. Second, unlike ZeRO and PP, which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
 
1772
 
1773
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. As we saw in the <a target="_self" href="#context_parallelism">CP section</a>, this is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where, even when using full activation recomputation, the memory requirements for attention would be prohibitive on a single GPU.</p>
1774
 
1775
+ <div class="large-image-background-transparent">
1776
+ <div class="boxed-image">
1777
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1200px; max-width: none;" />
1778
  </div>
1779
+ </div>
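<p>To make the ring pattern a bit more concrete, here is a toy, single-process NumPy sketch: the sequence is sharded across ranks, and at each "ring step" a rank attends to the key/value shard it would have just received from its neighbour. Real ring attention additionally uses an online softmax and overlaps these transfers with computation; this sketch only illustrates the communication pattern:</p>
<d-code block language="python">
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

s, dim, world = 8, 4, 4                  # sequence length, head dim, CP degree
Q, K, V = (np.random.randn(s, dim) for _ in range(3))
Qs, Ks, Vs = (np.split(m, world) for m in (Q, K, V))   # shard along the sequence

out = []
for rank in range(world):
    scores = np.zeros((s // world, s))
    for step in range(world):
        src = (rank - step) % world      # KV shard held at this ring step
        cols = slice(src * (s // world), (src + 1) * (s // world))
        scores[:, cols] = Qs[rank] @ Ks[src].T
    out.append(softmax(scores / np.sqrt(dim)) @ V)
out = np.concatenate(out)

# Matches full (non-causal) attention over the unsharded sequence
ref = softmax(Q @ K.T / np.sqrt(dim)) @ V
assert np.allclose(out, ref)
</d-code>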
1780
 
1781
  <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to the relevant experts during computation. The key communication operation in EP is the <code>all-to-all</code> operation that routes tokens to their assigned experts and gathers the results back. While this operation introduces some communication overhead, it enables scaling model capacity significantly, since each token is only processed during inference (and training) by a much smaller fraction of the total parameters. In terms of distributed training/inference, partitioning experts across GPUs becomes relevant when models scale to a large number of experts.</p>
1782
  <aside>For instance DeepSeek V3 uses 256 experts.</aside>
1783
 
1784
+ <div class="large-image-background-transparent">
1785
+ <div class="boxed-image">
1786
  <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1200px; max-width: none;" />
1787
  </div>
1788
+ </div>
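<p>Here is an equally small sketch of the dispatch/combine pattern that the all-to-all implements, again simulated in a single NumPy process. In a real setup the two grouping steps below would each be an all-to-all across EP ranks; the router and expert weights here are random placeholders:</p>
<d-code block language="python">
import numpy as np

tokens = np.random.randn(16, 8)                          # 16 tokens, hidden size 8
n_experts = 4                                            # one expert per EP rank here
assignment = np.random.randint(0, n_experts, size=16)    # top-1 routing decision

# "Dispatch" all-to-all: group tokens by the expert (rank) that will process them
dispatched = [tokens[assignment == e] for e in range(n_experts)]

# Each expert only sees the tokens routed to it
expert_weights = [np.random.randn(8, 8) for _ in range(n_experts)]
processed = [x @ w for x, w in zip(dispatched, expert_weights)]

# "Combine" all-to-all: scatter the results back into the original token order
output = np.empty_like(tokens)
for e in range(n_experts):
    output[assignment == e] = processed[e]
</d-code>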
1789
  <div class="note-box">
1790
  <p class="note-box-title">📝 Note</p>
1791
  <div class="note-box-content">
 
1837
  <p><strong>Summarizing it all:</strong> Now, what about gathering all the techniques we've seen into a single diagram that combines them? Yes, we're up for the challenge!</p>
1838
  <p>In this summary diagram, you will find illustrated the activations and modules of a single transformer layer (in its MoE variant). We also illustrate the various directions of parallelism and the communication operations we've been discussing in all the previous sections.</p>
1839
 
1840
+ <div class="large-image-background-transparent">
1841
+ <div class="boxed-image">
1842
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1200px; max-width: none;"/></p>
1843
  </div>
1844
+ </div>
1845
 
1846
  <p>We can also represent side-by-side a <strong>full overview</strong> of the memory savings for each one of these strategies. We'll plot them with different sequence lengths, as well as with selective (top) and full (bottom) recomputation, so you can see how they all play with activations:</p>
1847
 
1848
+ <div class="large-image-background-transparent">
1849
+ <div class="boxed-image">
1850
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1200px; max-width: none;"/>
1851
  </div>
1852
+ </div>
1853
 
1854
  <p>Let's finish this section with a high-level view of all these techniques, their main underlying ideas, and their major bottlenecks:</p>
1855
 
 
1916
 
1917
  <p>Clearly, none of these techniques is a silver bullet for magical scaling, and we'll often have to combine them in one way or another. Can we actually come up with a few rules that would help us find a good starting point for choosing among (and combining) them? This will be the topic of our next section.</p>
1918
 
1919
+ <h2>Finding the Best Training Configuration</h2>
1920
 
1921
  <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models, as well as how and why they can be combined. One general question remains: which ones should we choose in the end, and how do we decide on a specific combination?</p>
1922
 
 
2000
  <p>All the following benchmarks were conducted with a sequence length of 4096 and a global batch size of 1M tokens. We gathered all the top configurations for each model and cluster size and plotted them in the following heatmaps:</p>
2001
  </p>
2002
 
2003
+ <div class="large-image-background-transparent">
2004
+ <div class="boxed-image">
2005
  <p><img alt="image.png" src="/assets/images/what_we_learnt_heatmap.svg" /></p>
2006
  </div>
2007
+ </div>
2008
+ <div class="figure-legend">
2009
  <p>Heatmap visualization showing the optimal training configurations across different model sizes and compute node counts (we have 8 GPUs per node). For each combination, the configuration details include Data Parallelism (DP), Tensor Parallelism (TP), Pipeline Parallelism (PP), Gradient Accumulation Steps (GAS), Micro Batch Size (MBS), and ZeRO optimization stage. The color intensity indicates the Model FLOPs Utilization (MFU), with brighter colors representing higher efficiency.</p>
2010
  </div>
2011
  <p>From this high-level visualization, we can draw several important insights:
 
2309
 
2310
  <p>However, when profiling this kernel with a tool like <code>ncu</code>, we can see issues, including low memory throughput and uncoalesced memory accesses.</p>
2311
 
2312
+ <div class="large-image-background-transparent">
2313
+ <div class="boxed-image">
2314
+ <img width="1400px" alt="image.png" src="/assets/images/memorycoalescing2.png" />
2315
+ <img width="1400px" alt="image.png" src="/assets/images/memorycoalescing3.png" />
2316
  </div>
 
 
2317
  </div>
2318
 
 
2319
  <p>The reason for this is that in this kernel, two threads in the same block with Thread IDs <code>(0, 0)</code> and <code>(1, 0)</code> (which will end up in the same warp) will both load from the same column of matrix <code>B</code> but different rows of matrix <code>A</code>. Since matrix elements are stored in row-major order (meaning row elements are in consecutive memory addresses, as shown in the figure below), thread <code>(0, 0)</code> will load <d-math>A_{0,0}</d-math>, and thread <code>(1, 0)</code> will load <d-math>A_{1,0}</d-math> in the first iteration <code>i = 0</code>. These elements are not stored close to each other in memory, and this misalignment will be present at each iteration, thereby preventing memory accesses from being coalesced.</p>
2320
 
2321
  <p><img alt="image.png" src="/assets/images/memorycoalescing4.png" /></p>
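<p>A quick way to see this without writing any CUDA is to list the flat (row-major) element indices that the first threads of a warp would touch at <code>i = 0</code>. This small Python sketch is not part of the kernel above; it only models the difference between a strided and a contiguous per-warp access pattern:</p>
<d-code block language="python">
# Toy model of the addresses touched by one warp at iteration i = 0,
# for a row-major M x K matrix A stored as a flat array.
M, K = 128, 128
addr = lambda row, col: row * K + col            # flat element index

warp = range(32)                                 # 32 consecutive threads of a warp

# Naive mapping described above: consecutive threads read different *rows* of A
naive = [addr(row=t, col=0) for t in warp]       # [0, 128, 256, ...]: strided, cannot coalesce

# Coalesced-friendly mapping: consecutive threads read consecutive elements of a row
coalesced = [addr(row=0, col=t) for t in warp]   # [0, 1, 2, ...]: one contiguous segment per warp
</d-code>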
 
2340
 
2341
  <p>When we profile our new kernel, we notice that the warning about uncoalesced memory accesses has disappeared, and <strong>the GPU's memory throughput has increased by approximately 10 times</strong>.</p>
2342
 
2343
+ <div class="large-image-background-transparent">
2344
+ <div class="boxed-image">
2345
  <p><img width="1200px" alt="image.png" src="/assets/images/memorycoalescing5.png" /></p>
2346
  </div>
2347
+ </div>
2348
 
2349
  <p>We also notice that the execution time of the kernel <strong>decreases by 10x</strong>! Amazing.</p>
2350
  <p>Now let's cover another technique you will often see mentioned in the literature: <strong>tiling</strong>.</p>
 
2730
  </ul>
2731
 
2732
  <p>We hope this book helps you get started in distributed training and that you will train the next generation of awesome models to the hum of your GPU cluster!</p>
2733
+
2734
+ <hr>
2735
+
2736
+ <p><strong>One last word</strong> for our first readers. We're so happy with this piece of writing that we've decided to distribute a limited number of physical printed editions of it as a gift to you, our first readers.</p>
2737
+ <p>If you are among the first 50 people to fill in your email address below, we'll contact you later in the year to send you a real physical edition once we've formatted it as a printed copy.</p>
2738
+ <p>We expect the book to be around 100-150 pages and to cover the same content as the blog post, but we may also decide to shorten or lengthen it depending on what makes sense as a printed object.</p>
2739
+ <p>To get your physical copy, please fill in your email address in the following <a target="_blank" href="https://forms.gle/e1GkAShUCtgcwnne8">Google form</a>.</p>
2740
+ <p>Whether you are one of our first readers or coming much later to this blog post, we're very happy to see that you enjoyed this sharing of knowledge. May the force of open-source and open-science always be with you.</p>
2741
+
2742
  <h3>Acknowledgements</h3>
2743
 
2744
  <p>We thank <a href="https://huggingface.co/eliebak">Elie</a> for conducting thorough reviews and creating the audio components using NotebookLM. Special thanks to <a href="https://huggingface.co/hynky">Hynek</a> for optimizing the frontend performance. We also thank <a href="https://huggingface.co/sbrandeis">Simon</a> for resolving some issues on the hub.</p>
 
3448
 
3449
  <p>This would print aggregated profiling results sorted by the total CUDA time, and the output would be:</p>
3450
 
3451
+ <div class="large-image-background-transparent">
3452
+ <div class="boxed-image">
3453
+ <img alt="image.png" src="/assets/images/a1_kernels.png" style="width: 1200px; max-width: none;" />
3454
+ </div>
3455
  </div>
3456
 
3457
  <p>You can also inspect the trace, as we previously mentioned, in <code>chrome://tracing/</code>.</p>
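<p>For reference, a profiling call along the following lines produces this kind of table and trace. This is a hedged sketch rather than the exact code used for the screenshots, and it assumes PyTorch with a CUDA GPU available:</p>
<d-code block language="python">
import torch
from torch.profiler import profile, ProfilerActivity

x = torch.randn(4096, 4096, device="cuda")

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(10):
        y = torch.nn.functional.layer_norm(x, (4096,))
    torch.cuda.synchronize()

# Aggregated results sorted by total CUDA time, as in the table above
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Export a trace that can be opened in chrome://tracing/
prof.export_chrome_trace("trace.json")
</d-code>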
 
3465
 
3466
  <p>After zooming in, you can observe the flow of operations when calling <code>layer_norm</code> in this trace:</p>
3467
 
3468
+ <div class="large-image-background-transparent">
3469
+ <div class="boxed-image">
3470
+ <img alt="image.png" src="/assets/images/a1_profile_trace.png" style="width: 1200px; max-width: none;" />
3471
+ </div>
3472
  </div>
3473
 
3474
  <p>The sequence begins in the CPU (the upper section) with <code>aten::layer_norm</code>, progressing to <code>aten::native_layer_norm</code>, and then transitioning to <code>cudaLaunchKernel</code>. From there, we move on to the GPU, where the <code>vectorized_layer_norm_kernel</code> kernel is called.</p>
 
3494
 
3495
  <p>and open the file <code>output.ncu-rep</code> with Nsight Compute, you will have a view that looks like this:</p>
3496
 
3497
+ <div class="large-image-background-transparent">
3498
+ <div class="boxed-image">
3499
+ <img alt="image.png" src="/assets/images/a1_ncu.png" style="width: 1200px; max-width: none;" />
3500
+ </div>
3501
  </div>
3502
 
3503
  <p>It comes with clear warnings about compute and memory utilization, along with suggestions on how to better balance compute and memory in the kernel and achieve maximal occupancy.</p>
dist/main.bundle.js CHANGED
@@ -4920,8 +4920,8 @@ function updateGraph() {
4920
  }]
4921
  };
4922
  console.log('Data for treemap:', data);
4923
- var width = 700;
4924
- var height = 450;
4925
  var legendHeight = 50;
4926
  var svg = src_select("#graph").select("svg");
4927
  svg.selectAll("*").remove();
@@ -4952,10 +4952,10 @@ function updateGraph() {
4952
 
4953
  // Give distinct colors to the main section containers
4954
  case 'Activation Memory':
4955
- return 'rgb(78, 165, 183)';
4956
  // Orange
4957
  case 'Parameters / Gradients / Optimizer States':
4958
- return 'rgb(232, 137, 171)';
4959
  // Teal Blue
4960
 
4961
  // Parameters / Gradients / Optimizer States branch
 
4920
  }]
4921
  };
4922
  console.log('Data for treemap:', data);
4923
+ var width = 600;
4924
+ var height = 600;
4925
  var legendHeight = 50;
4926
  var svg = src_select("#graph").select("svg");
4927
  svg.selectAll("*").remove();
 
4952
 
4953
  // Give distinct colors to the main section containers
4954
  case 'Activation Memory':
4955
+ return 'rgb(61, 198, 159)';
4956
  // Orange
4957
  case 'Parameters / Gradients / Optimizer States':
4958
+ return 'rgba(232, 137, 170, 0.85)';
4959
  // Teal Blue
4960
 
4961
  // Parameters / Gradients / Optimizer States branch
dist/main.bundle.js.map CHANGED
The diff for this file is too large to render. See raw diff
 
dist/style.css CHANGED
@@ -182,7 +182,7 @@ toggle-icon {
182
  }
183
 
184
  toggle-icon.collapsed {
185
- transform: rotate(-90deg);
186
  }
187
 
188
  .toc-content {
@@ -296,80 +296,6 @@ d-contents nav > ul > li > a:hover {
296
  text-decoration: none;
297
  }
298
 
299
- /* memory */
300
- #controls {
301
- display: grid;
302
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
303
- column-gap: 10px;
304
- margin-bottom: 20px;
305
- max-width: 100%;
306
- @supports (container-type: inline-size) {
307
- container-type: inline-size;
308
- }
309
- }
310
-
311
- #controls .cell {
312
- padding: 1px;
313
- box-sizing: border-box;
314
- }
315
-
316
- #controls .column-1 {
317
- display: flex;
318
- align-items: center;
319
- }
320
-
321
- #controls .column-2 {
322
- display: flex;
323
- align-items: center;
324
- }
325
- @supports (container-type: inline-size) {
326
- @container (max-width: 600px) {
327
- #controls .column-2 {
328
- order: 2;
329
- }
330
- }
331
- }
332
-
333
- #controls label {
334
- text-align: right;
335
- padding-right: 10px;
336
- flex: 0 0 auto;
337
- width: 150px;
338
- line-height: 1.5em;
339
- font-size: 0.8em;
340
- }
341
-
342
- #controls input[type="range"] {
343
- width: 50%;
344
- margin: 0 10px;
345
- }
346
-
347
- #controls input[type="number"] {
348
- flex-shrink: 0;
349
- width: 60px;
350
- height: 24px;
351
- border: 1px solid var(--distill-gray-light);
352
- border-radius: 0.2rem;
353
- }
354
-
355
- #controls select {
356
- width: 100%;
357
- min-height: 28px;
358
- border: 1px solid var(--distill-gray-light);
359
- border-radius: 0.2rem;
360
- }
361
-
362
- #controls .column {
363
- display: contents;
364
- }
365
-
366
- #graph svg {
367
- font-family: sans-serif;
368
- }
369
-
370
- #graph svg rect {
371
- cursor: pointer;
372
- }
373
  .note-box {
374
  background-color: #f6f8fa;
375
  border-left: 4px solid #444444;
@@ -437,6 +363,28 @@ d-code {
437
  justify-content: center; /* This will center your image */
438
  }
439
 
440
  d-article li {
441
  margin-bottom: 0.0em;
442
  }
@@ -452,3 +400,200 @@ d-article ol ol {
452
  d-article hr {
453
  grid-column: text;
454
  }
182
  }
183
 
184
  toggle-icon.collapsed {
185
+ transform: rotate(90deg);
186
  }
187
 
188
  .toc-content {
 
296
  text-decoration: none;
297
  }
298
 
299
  .note-box {
300
  background-color: #f6f8fa;
301
  border-left: 4px solid #444444;
 
363
  justify-content: center; /* This will center your image */
364
  }
365
 
366
+ .large-image-background-transparent {
367
+ /* width: 100vw; */
368
+ padding-top: 10px;
369
+ padding-bottom: 10px;
370
+ /* margin-left: calc(-50vw + 50%); */
371
+ margin-left:-100px;
372
+ margin-right: -100px;
373
+ /* margin-right: calc(-50vw + 50%); */
374
+ /* background: white; */
375
+ height: fit-content; /* This will make it match the image height */
376
+ display: flex;
377
+ justify-content: center; /* This will center your image */
378
+ }
379
+
380
+ .boxed-image {
381
+ padding: 0.5rem;
382
+ background: white;
383
+ border-radius: 12px;
384
+ border: 1px solid #e5e7eb;
385
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
386
+ }
387
+
388
  d-article li {
389
  margin-bottom: 0.0em;
390
  }
 
400
  d-article hr {
401
  grid-column: text;
402
  }
403
+
404
+ /* Memory visualization */
405
+ #graph-all {
406
+ min-width: 500px;
407
+ margin-right: 10px;
408
+ margin-bottom: 2rem;
409
+ padding: 0.5rem;
410
+ background: #f9fafb;
411
+ border-radius: 12px;
412
+ border: 1px solid #e5e7eb;
413
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
414
+ }
415
+
416
+
417
+ /* Main container styles */
418
+ #controls {
419
+ max-width: 1200px;
420
+ /* margin: 2rem auto; */
421
+ margin-bottom: 2rem;
422
+ margin-left: 10px;
423
+ padding: 0.6rem;
424
+ background: #f9fafb;
425
+ border-radius: 12px;
426
+ border: 1px solid #e5e7eb;
427
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
428
+ }
429
+
430
+ /* Grid layout */
431
+ #controls {
432
+ display: grid;
433
+ grid-template-columns: 1fr 1fr;
434
+ /* gap: 2rem; */
435
+ }
436
+
437
+ /* Cell styles */
438
+ .cell {
439
+ margin-bottom: 0.2rem;
440
+ }
441
+
442
+ /* Label styles */
443
+ label {
444
+ display: block;
445
+ /* margin-bottom: 0.5rem; */
446
+ font-size: 0.8rem;
447
+ font-weight: 500;
448
+ color: #374151;
449
+ }
450
+
451
+ /* Input container for range + number combination */
452
+ .input-container {
453
+ display: flex;
454
+ gap: 1rem;
455
+ align-items: center;
456
+ }
457
+
458
+ /* Range input styling */
459
+ input[type="range"] {
460
+ flex: 1;
461
+ height: 6px;
462
+ background: #e5e7eb;
463
+ border-radius: 3px;
464
+ appearance: none;
465
+ outline: none;
466
+ }
467
+
468
+ input[type="range"]::-webkit-slider-thumb {
469
+ appearance: none;
470
+ width: 16px;
471
+ height: 16px;
472
+ background: #3b82f6;
473
+ border-radius: 50%;
474
+ cursor: pointer;
475
+ transition: background 0.15s ease;
476
+ }
477
+
478
+ input[type="range"]::-webkit-slider-thumb:hover {
479
+ background: #2563eb;
480
+ }
481
+
482
+ /* Number input styling */
483
+ input[type="number"] {
484
+ width: 80px;
485
+ padding: 0.5rem;
486
+ border: 1px solid #e5e7eb;
487
+ border-radius: 6px;
488
+ font-size: 0.9rem;
489
+ color: #374151;
490
+ }
491
+
492
+ /* Select styling */
493
+ select {
494
+ width: 100%;
495
+ padding: 0.5rem;
496
+ border: 1px solid #e5e7eb;
497
+ border-radius: 6px;
498
+ background: white;
499
+ font-size: 0.9rem;
500
+ color: #374151;
501
+ cursor: pointer;
502
+ }
503
+
504
+ /* Checkbox styling */
505
+ input[type="checkbox"] {
506
+ width: 1.2rem;
507
+ height: 1.2rem;
508
+ margin-right: 0.5rem;
509
+ border: 2px solid #e5e7eb;
510
+ border-radius: 4px;
511
+ cursor: pointer;
512
+ }
513
+
514
+ /* Column specific styles */
515
+ .column-1 {
516
+ padding-right: 0.5rem;
517
+ }
518
+
519
+ .column-2 {
520
+ padding-left: 0.5rem;
521
+ }
522
+
523
+ /* Checkbox container */
524
+ .checkbox-container {
525
+ display: flex;
526
+ align-items: center;
527
+ margin-bottom: 1rem;
528
+ }
529
+
530
+ /* Memory visualization styles */
531
+ .memory-block {
532
+ background: #fff;
533
+ border-radius: 8px;
534
+ padding: 1rem;
535
+ margin-bottom: 1rem;
536
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
537
+ }
538
+
539
+ .memory-title {
540
+ font-size: 1.1rem;
541
+ font-weight: 500;
542
+ color: #374151;
543
+ margin-bottom: 0.5rem;
544
+ }
545
+
546
+ .memory-value {
547
+ font-size: 1.5rem;
548
+ font-weight: 600;
549
+ color: #3b82f6;
550
+ }
551
+
552
+ /* Responsive adjustments */
553
+ @media (max-width: 768px) {
554
+ #controls {
555
+ grid-template-columns: 1fr;
556
+ padding: 1rem;
557
+ }
558
+
559
+ .column-1, .column-2 {
560
+ padding: 0;
561
+ }
562
+ }
563
+
564
+ /* Hover states and transitions */
565
+ input:hover, select:hover {
566
+ border-color: #3b82f6;
567
+ }
568
+
569
+ input:focus, select:focus {
570
+ border-color: #2563eb;
571
+ outline: none;
572
+ box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.1);
573
+ }
574
+
575
+ /* Add smooth transitions */
576
+ input, select, button {
577
+ transition: all 0.15s ease;
578
+ }
579
+
580
+ /* Preset dropdown special styling */
581
+ select[name="presets"] {
582
+ background-color: #f3f4f6;
583
+ font-weight: 500;
584
+ }
585
+
586
+ /* Memory graph enhancements */
587
+ .activation-memory {
588
+ background: #dbeafe;
589
+ padding: 1rem;
590
+ border-radius: 8px;
591
+ margin-bottom: 1rem;
592
+ }
593
+
594
+ .gradient-memory {
595
+ background: #ede9fe;
596
+ padding: 1rem;
597
+ border-radius: 8px;
598
+ }
599
+
src/index.html CHANGED
@@ -90,108 +90,141 @@
90
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
91
 
92
  <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn which parts of a language model eat up your memory and when during training this happens. You’ll learn how we can address memory constraints by parallelizing the models and increase throughput by scaling up GPUs. As a result, you'll understand how the following widget for computing the memory breakdown of a transformer model works: </p>
 
93
 
94
- <div id="graph"></div>
95
- <div id="controls">
96
- <div class="cell column-1">
97
- <label for="a">Attention Heads (a):</label>
98
- <input type="range" id="a" name="a" min="1" max="128" value="8">
99
- <input type="number" id="a_input" value="8" min="1" max="128">
100
  </div>
101
- <div class="cell column-2">
102
- <label for="mixed">Mixed Precision:</label>
103
- <input type="checkbox" id="mixed" name="mixed" checked>
104
- <span></span> <!-- Empty span to maintain grid alignment -->
105
- </div>
106
- <div class="cell column-1">
107
- <label for="b">Micro Batch Size (b):</label>
108
- <input type="range" id="b" name="b" min="1" max="53248" value="32">
109
- <input type="number" id="b_input" value="32" min="1" max="53248">
110
- </div>
111
- <div class="cell column-2">
112
- <label for="seq_parallel">Sequence Parallelism:</label>
113
- <input type="checkbox" id="seq_parallel" name="seq_parallel">
114
- <span></span> <!-- Empty span to maintain grid alignment -->
115
- </div>
116
- <div class="cell column-1">
117
- <label for="h">Hidden Dimension (h):</label>
118
- <input type="range" id="h" name="h" min="1" max="16384" value="512">
119
- <input type="number" id="h_input" value="512" min="128" max="16384">
120
- </div>
121
- <div class="cell column-2">
122
- <label for="recomputation">Recomputation:</label>
123
- <select id="recomputation" name="recomputation">
124
- <option value="none">None</option>
125
- <option value="selective">Selective</option>
126
- <option value="full">Full</option>
127
- </select>
128
- <span></span> <!-- Empty span to maintain grid alignment -->
129
- </div>
130
- <div class="cell column-1">
131
- <label for="h_ff">Feedforward Dimension (h_ff):</label>
132
- <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
133
- <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
134
- </div>
135
- <div class="cell column-2">
136
- <label for="zero">Zero:</label>
137
- <select id="zero" name="zero">
138
- <option value="0">0</option>
139
- <option value="1">1</option>
140
- <option value="2">2</option>
141
- <option value="3">3</option>
142
- </select>
143
- <span></span> <!-- Empty span to maintain grid alignment -->
144
- </div>
145
- <div class="cell column-1">
146
- <label for="L">Number of Layers (L):</label>
147
- <input type="range" id="L" name="L" min="1" max="126" value="12">
148
- <input type="number" id="L_input" value="12" min="1" max="126">
149
- </div>
150
- <div class="cell column-2">
151
- <label for="ff_activation">FF Activation:</label>
152
- <select id="ff_activation" name="ff_activation">
153
- <option value="relu">ReLU</option>
154
- <option value="gelu">GELU</option>
155
- <option value="swiglu">SwiGLU</option>
156
- </select>
157
- <span></span> <!-- Empty span to maintain grid alignment -->
158
- </div>
159
- <div class="cell column-1">
160
- <label for="s">Sequence Length (s):</label>
161
- <input type="range" id="s" name="s" min="1" max="128000" value="128">
162
- <input type="number" id="s_input" value="128" min="64" max="128000">
163
- </div>
164
- <div class="cell column-2">
165
- <label for="presets">Presets:</label>
166
- <select id="presets" name="presets">
167
- <option value="Llama 3 Tiny">Llama 3 Tiny</option>
168
- <option value="Llama 3 8B">Llama 3 8B</option>
169
- <option value="Llama 3 70B">Llama 3 70B</option>
170
- <option value="Llama 3 405B">Llama 3 405B</option>
171
- </select>
172
- <span></span> <!-- Empty span to maintain grid alignment -->
173
- </div>
174
- <div class="cell column-1">
175
- <label for="v">Vocabulary Size (v):</label>
176
- <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
177
- <input type="number" id="v_input" value="30522" min="1000" max="100000">
178
- </div>
179
- <div class="cell column-2">
180
- <label for="tp">Tensor Parallelism (t):</label>
181
- <input type="range" id="tp" name="tp" min="1" max="16" value="8">
182
- <input type="number" id="tp_input" value="8" min="1" max="16">
183
- </div>
184
- <div class="cell column-1">
185
- <label for="k">Optimizer Parameters (k):</label>
186
- <input type="range" id="k" name="k" min="1" max="16" value="8">
187
- <input type="number" id="k_input" value="8" min="1" max="16">
188
- </div>
189
- <div class="cell column-2">
190
- <label for="dp">Data Parallelism (d):</label>
191
- <input type="range" id="dp" name="dp" min="1" max="256" value="1">
192
- <input type="number" id="dp_input" value="1" min="1" max="256">
193
  </div>
194
  </div>
 
195
 
196
  <p>While this widget gives a theoretical breakdown, the following tool can be used to predict the memory usage:</p>
197
  <ul>
@@ -1724,9 +1757,11 @@
1724
 
1725
  <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and can be combined with both Pipeline Parallelism and ZeRO-3 as it relies on the distributive property of matrix multiplications which allows weights and activations to be sharded and computed independently before being combined.</p>
1726
 
1727
- <div class="large-image-background">
 
1728
  <img alt="TP & SP diagram" src="/assets/images/5d_nutshell_tp_sp.svg" style="width: 1200px; max-width: none;" />
1729
  </div>
 
1730
 
1731
 
1732
  <p>The main reason we don't want to use TP alone for parallelism is that, in practice, TP has the two limitations we discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point, at which communication overhead begins to dominate. Second, unlike ZeRO and PP, which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
@@ -1737,17 +1772,20 @@
1737
 
1738
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. As we saw in the <a target="_self" href="#context_parallelism">CP section</a>, this is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where, even when using full activation recomputation, the memory requirements for attention would be prohibitive on a single GPU.</p>
1739
 
1740
- <div class="large-image-background">
 
1741
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1200px; max-width: none;" />
1742
  </div>
1743
-
1744
 
1745
  <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to the relevant experts during computation. The key communication operation in EP is the <code>all-to-all</code> operation that routes tokens to their assigned experts and gathers the results back. While this operation introduces some communication overhead, it enables scaling model capacity significantly, since each token is only processed during inference (and training) by a much smaller fraction of the total parameters. In terms of distributed training/inference, partitioning experts across GPUs becomes relevant when models scale to a large number of experts.</p>
1746
  <aside>For instance DeepSeek V3 uses 256 experts.</aside>
1747
 
1748
- <div class="large-image-background">
 
1749
  <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1200px; max-width: none;" />
1750
  </div>
 
1751
  <div class="note-box">
1752
  <p class="note-box-title">📝 Note</p>
1753
  <div class="note-box-content">
@@ -1799,15 +1837,19 @@
1799
  <p><strong>Summarizing it all:</strong> Now, what about gathering all the techniques we've seen into a single diagram that combines them? Yes, we're up for the challenge!</p>
1800
  <p>In this summary diagram, you will find illustrated the activations and modules of a single transformer layer (in its MoE variant). We also illustrate the various directions of parallelism and the communication operations we've been discussing in all the previous sections.</p>
1801
 
1802
- <div class="large-image-background">
 
1803
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1200px; max-width: none;"/></p>
1804
  </div>
 
1805
 
1806
  <p>We can also represent side-by-side a <strong>full overview</strong> of the memory savings for each one of these strategies. We'll plot them with different sequence lengths, as well as with selective (top) and full (bottom) recomputation, so you can see how they all play with activations:</p>
1807
 
1808
- <div class="large-image-background">
 
1809
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1200px; max-width: none;"/>
1810
  </div>
 
1811
 
1812
  <p>Let's finish this section with a high-level view of all these techniques, their main underlying ideas, and their major bottlenecks:</p>
1813
 
@@ -1958,10 +2000,12 @@
1958
  <p>All the following benchmarks were conducted with a sequence length of 4096 and a global batch size of 1M tokens. We gathered all the top configurations for each model and cluster size and plotted them in the following heatmaps:</p>
1959
  </p>
1960
 
1961
- <div class="large-image-background">
 
1962
  <p><img alt="image.png" src="/assets/images/what_we_learnt_heatmap.svg" /></p>
1963
  </div>
1964
- <div class="figure-legend">
 
1965
  <p>Heatmap visualization showing the optimal training configurations across different model sizes and compute node counts (we have 8 GPUs per node). For each combination, the configuration details include Data Parallelism (DP), Tensor Parallelism (TP), Pipeline Parallelism (PP), Gradient Accumulation Steps (GAS), Micro Batch Size (MBS), and ZeRO optimization stage. The color intensity indicates the Model FLOPs Utilization (MFU), with brighter colors representing higher efficiency.</p>
1966
  </div>
1967
  <p>From this high-level visualization, we can draw several important insights:
@@ -2265,14 +2309,13 @@
2265
 
2266
  <p>However, when profiling this kernel with a tool like <code>ncu</code>, we can see issues, including low memory throughput and uncoalesced memory accesses.</p>
2267
 
2268
- <div class="large-image-background">
2269
- <img width="1200px" alt="image.png" src="/assets/images/memorycoalescing2.png" />
 
 
2270
  </div>
2271
- <div class="large-image-background">
2272
- <img width="1200px" alt="image.png" src="/assets/images/memorycoalescing3.png" />
2273
  </div>
2274
 
2275
-
2276
  <p>The reason for this is that in this kernel, two threads in the same block with thread IDs <code>(0, 0)</code> and <code>(1, 0)</code> (which will end up in the same warp) will both load from the same column of matrix <code>B</code> but different rows of matrix <code>A</code>. Since matrix elements are stored in row-major order (meaning row elements are in consecutive memory addresses, as shown in the figure below), thread <code>(0, 0)</code> will load <d-math>A_{0,0}</d-math>, and thread <code>(1, 0)</code> will load <d-math>A_{1,0}</d-math> in the first iteration <code>i = 0</code>. These elements are not stored close to each other in memory, and this misalignment will be present at each iteration, thereby preventing memory accesses from being coalesced.</p>
2277
 
2278
  <p><img alt="image.png" src="/assets/images/memorycoalescing4.png" /></p>
@@ -2297,9 +2340,11 @@
2297
 
2298
  <p>When we profile our new kernel, we notice that the warning about uncoalesced memory accesses has disappeared, and <strong>the GPU's memory throughput has increased by approximately 10 times</strong>.</p>
2299
 
2300
- <div class="large-image-background">
 
2301
  <p><img width="1200px" alt="image.png" src="/assets/images/memorycoalescing5.png" /></p>
2302
  </div>
 
2303
 
2304
  <p>We also notice that the execution time of the kernel <strong>decreases by 10x</strong>! Amazing.</p>
2305
  <p>Now let's cover another technique you will often see mentioned in the literature: <strong>tiling</strong>.</p>
@@ -2685,7 +2730,15 @@
2685
  </ul>
2686
 
2687
  <p>We hope this book helps you get started in distributed training and that you will train the next generation of awesome models to the hum of your GPU cluster!</p>
2688
-
2689
  <h3>Acknowledgements</h3>
2690
 
2691
  <p>We thank <a href="https://huggingface.co/eliebak">Elie</a> for conducting thorough reviews and creating the audio components using NotebookLM. Special thanks to <a href="https://huggingface.co/hynky">Hynek</a> for optimizing the frontend performance. We also thank <a href="https://huggingface.co/sbrandeis">Simon</a> for resolving some issues on the hub.</p>
@@ -3395,8 +3448,10 @@
3395
 
3396
  <p>This would print aggregated profiling results sorted by the total CUDA time, and the output would be:</p>
3397
 
3398
- <div class="large-image-background">
3399
- <img alt="image.png" src="/assets/images/a1_kernels.png" style="width: 1200px; max-width: none;" />
 
 
3400
  </div>
3401
 
3402
  <p>You can also inspect the trace, as mentioned previously, at <code>chrome://tracing/</code>.</p>
@@ -3410,8 +3465,10 @@
3410
 
3411
  <p>After zooming in, you can observe the flow of operations when calling <code>layer_norm</code> in this trace:</p>
3412
 
3413
- <div class="large-image-background">
3414
- <img alt="image.png" src="/assets/images/a1_profile_trace.png" style="width: 1200px; max-width: none;" />
 
 
3415
  </div>
3416
 
3417
  <p>The sequence begins in the CPU (the upper section) with <code>aten::layer_norm</code>, progressing to <code>aten::native_layer_norm</code>, and then transitioning to <code>cudaLaunchKernel</code>. From there, we move on to the GPU, where the <code>vectorized_layer_norm_kernel</code> kernel is called.</p>
@@ -3437,8 +3494,10 @@
3437
 
3438
  <p>and open the file <code>output.ncu-rep</code> with Nsight Compute, you will have a view that looks like this:</p>
3439
 
3440
- <div class="large-image-background">
3441
- <img alt="image.png" src="/assets/images/a1_ncu.png" style="width: 1200px; max-width: none;" />
 
 
3442
  </div>
3443
 
3444
  <p>It gives clear warnings about compute and memory utilization, along with suggestions on how to better balance compute and memory in the kernel and achieve maximal occupancy.</p>
 
90
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
91
 
92
  <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what it’s advantages and limits are. You’ll learn about which parts of a language model eat away your memory and when during training it happens. You’ll learn how we can solve memory constraints by parallelizing the models and increase the throughput by scaling up GPUs. As a result you'll understand how the following widget to compute the memory breakdown of a transformer model works: </p>
93
+ <aside>Note that we're still missing Pipeline Parallelism in this widget. It's left as an exercise for the reader.</aside>
94
 
95
+ <div class="large-image-background-transparent">
96
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
97
+ <div id="graph-all">
98
+ <div class="figure-legend">Memory usage breakdown</div>
99
+ <div id="graph"></div>
 
100
  </div>
101
+ <div id="controls">
102
+ <div class="cell column-1">
103
+ <label for="a">Attention Heads (a):</label>
104
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
105
+ <input type="range" id="a" name="a" min="1" max="128" value="8">
106
+ <input type="number" id="a_input" value="8" min="1" max="128">
107
+ </div>
108
+ </div>
109
+ <div class="cell column-2">
110
+ <label for="mixed">Mixed Precision:</label>
111
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
112
+ <input type="checkbox" id="mixed" name="mixed" checked>
113
+ <span></span> <!-- Empty span to maintain grid alignment -->
114
+ </div>
115
+ </div>
116
+ <div class="cell column-1">
117
+ <label for="b">Micro Batch Size (b):</label>
118
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
119
+ <input type="range" id="b" name="b" min="1" max="53248" value="32">
120
+ <input type="number" id="b_input" value="32" min="1" max="53248">
121
+ </div>
122
+ </div>
123
+ <div class="cell column-2">
124
+ <label for="seq_parallel">Sequence Parallelism:</label>
125
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
126
+ <input type="checkbox" id="seq_parallel" name="seq_parallel">
127
+ <span></span> <!-- Empty span to maintain grid alignment -->
128
+ </div>
129
+ </div>
130
+ <div class="cell column-1">
131
+ <label for="h">Hidden Dimension (h):</label>
132
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
133
+ <input type="range" id="h" name="h" min="1" max="16384" value="512">
134
+ <input type="number" id="h_input" value="512" min="128" max="16384">
135
+ </div>
136
+ </div>
137
+ <div class="cell column-2">
138
+ <label for="recomputation">Recomputation:</label>
139
+ <select id="recomputation" name="recomputation">
140
+ <option value="none">None</option>
141
+ <option value="selective">Selective</option>
142
+ <option value="full">Full</option>
143
+ </select>
144
+ <span></span> <!-- Empty span to maintain grid alignment -->
145
+ </div>
146
+ <div class="cell column-1">
147
+ <label for="h_ff">Feedforward Dimension (h_ff):</label>
148
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
149
+ <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
150
+ <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
151
+ </div>
152
+ </div>
153
+ <div class="cell column-2">
154
+ <label for="zero">Zero:</label>
155
+ <select id="zero" name="zero">
156
+ <option value="0">0</option>
157
+ <option value="1">1</option>
158
+ <option value="2">2</option>
159
+ <option value="3">3</option>
160
+ </select>
161
+ <span></span> <!-- Empty span to maintain grid alignment -->
162
+ </div>
163
+ <div class="cell column-1">
164
+ <label for="L">Number of Layers (L):</label>
165
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
166
+ <input type="range" id="L" name="L" min="1" max="126" value="12">
167
+ <input type="number" id="L_input" value="12" min="1" max="126">
168
+ </div>
169
+ </div>
170
+ <div class="cell column-2">
171
+ <label for="ff_activation">FF Activation:</label>
172
+ <select id="ff_activation" name="ff_activation">
173
+ <option value="relu">ReLU</option>
174
+ <option value="gelu">GELU</option>
175
+ <option value="swiglu">SwiGLU</option>
176
+ </select>
177
+ <span></span> <!-- Empty span to maintain grid alignment -->
178
+ </div>
179
+ <div class="cell column-1">
180
+ <label for="s">Sequence Length (s):</label>
181
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
182
+ <input type="range" id="s" name="s" min="1" max="128000" value="128">
183
+ <input type="number" id="s_input" value="128" min="64" max="128000">
184
+ </div>
185
+ </div>
186
+ <div class="cell column-2">
187
+ <label for="v">Vocabulary Size (v):</label>
188
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
189
+ <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
190
+ <input type="number" id="v_input" value="30522" min="1000" max="100000">
191
+ </div>
192
+ </div>
193
+ <div class="cell column-1">
194
+ <label for="tp">Tensor Parallelism (t):</label>
195
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
196
+ <input type="range" id="tp" name="tp" min="1" max="16" value="8">
197
+ <input type="number" id="tp_input" value="8" min="1" max="16">
198
+ </div>
199
+ </div>
200
+ <div class="cell column-2">
201
+ <label for="k">Optimizer Parameters (k):</label>
202
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
203
+ <input type="range" id="k" name="k" min="1" max="16" value="8">
204
+ <input type="number" id="k_input" value="8" min="1" max="16">
205
+ </div>
206
+ </div>
207
+ <div class="cell column-1">
208
+ <label for="dp">Data Parallelism (d):</label>
209
+ <div style="display: grid; grid-template-columns: 1fr 1fr; align-items: center;">
210
+ <input type="range" id="dp" name="dp" min="1" max="256" value="1">
211
+ <input type="number" id="dp_input" value="1" min="1" max="256">
212
+ </div>
213
+ </div>
214
+ <div class="cell column-2">
215
+ <label for="presets">Presets:</label>
216
+ <select id="presets" name="presets">
217
+ <option value="Llama 3 Tiny">Llama 3 Tiny</option>
218
+ <option value="Llama 3 8B">Llama 3 8B</option>
219
+ <option value="Llama 3 70B">Llama 3 70B</option>
220
+ <option value="Llama 3 405B">Llama 3 405B</option>
221
+ </select>
222
+ <span></span> <!-- Empty span to maintain grid alignment -->
223
+ </div>
224
+ </div>
225
  </div>
226
  </div>
227
+ <p>(Don't worry if you have no idea what's happening in this widget. That's why we're here.)</p>
228
 
229
  <p>While this widget gives a theoretical breakdown, the following tool can be used to predict the memory usage:</p>
230
  <ul>
 
1757
 
1758
  <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary to, and can be combined with, both Pipeline Parallelism and ZeRO-3, as it relies on the distributive property of matrix multiplication, which allows weights and activations to be sharded and computed independently before being combined.</p>
1759
 
1760
+ <div class="large-image-background-transparent">
1761
+ <div class="boxed-image">
1762
  <img alt="TP & SP diagram" src="/assets/images/5d_nutshell_tp_sp.svg" style="width: 1200px; max-width: none;" />
1763
  </div>
1764
+ </div>
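  <p>To make this concrete, here is a minimal NumPy sketch (purely illustrative, with toy shapes of our own choosing, not the actual implementation) of a column-sharded linear layer: each "device" holds a slice of the weight columns, computes its partial output independently, and a final gather reproduces the unsharded result exactly:</p>
  <d-code block language="python">
import numpy as np

# Toy dimensions: 4 tokens, hidden size 8, output size 6
x = np.random.randn(4, 8)
W = np.random.randn(8, 6)

# Column-parallel sharding: each "device" owns half of the output columns
W_shard_0, W_shard_1 = np.split(W, 2, axis=1)

# Each device computes its partial output with no communication
y_0 = x @ W_shard_0
y_1 = x @ W_shard_1

# Gathering the shards along the output dimension recovers the full result
y_tp = np.concatenate([y_0, y_1], axis=1)
assert np.allclose(y_tp, x @ W)
  </d-code>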
1765
 
1766
 
1767
  <p>The main reason we don't want to use TP only for parallelism is that, in practice, TP has two limitations we've discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point at which communication overhead begins to dominate. Second, unlike ZeRO and PP which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
 
1772
 
1773
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. As we saw in the <a target="_self" href="#context_parallelism">CP section</a>, this is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where, even when using full activation recomputation, the memory requirements for attention would be prohibitive on a single GPU.</p>
1774
 
1775
+ <div class="large-image-background-transparent">
1776
+ <div class="boxed-image">
1777
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1200px; max-width: none;" />
1778
  </div>
1779
+ </div>
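  <p>To see why attention is the one place where communication is unavoidable, here is a small NumPy simulation (illustrative only; it ignores causal masking, scaling, and the online-softmax trick that real ring attention uses): each rank's query shard must visit every key/value shard before its output matches unsharded attention:</p>
  <d-code block language="python">
import numpy as np

seq, d, num_ranks = 8, 4, 2
Q, K, V = (np.random.randn(seq, d) for _ in range(3))

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

# Each rank owns one contiguous shard of the sequence
Q_shards = np.split(Q, num_ranks)
K_shards = np.split(K, num_ranks)
V_shards = np.split(V, num_ranks)

outputs = []
for rank in range(num_ranks):
    q = Q_shards[rank]
    # MLPs/LayerNorm only need the local shard, but attention must see every
    # K/V shard -- in ring attention these shards arrive one hop at a time
    scores = np.concatenate([q @ K_shards[r].T for r in range(num_ranks)], axis=1)
    outputs.append(softmax(scores) @ np.concatenate(V_shards, axis=0))

assert np.allclose(np.concatenate(outputs), softmax(Q @ K.T) @ V)
  </d-code>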
1780
 
1781
  <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to the relevant experts during computation. The key communication operation in EP is the <code>all-to-all</code> operation, which routes tokens to their assigned experts and gathers the results back. While this operation introduces some communication overhead, it enables scaling model capacity significantly, since each token is only processed during inference (and training) by a much smaller fraction of the total parameters. In terms of distributed training/inference, partitioning experts across GPUs becomes relevant when models scale to a large number of experts.</p>
1782
  <aside>For instance DeepSeek V3 uses 256 experts.</aside>
1783
 
1784
+ <div class="large-image-background-transparent">
1785
+ <div class="boxed-image">
1786
  <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1200px; max-width: none;" />
1787
  </div>
1788
+ </div>
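  <p>As a rough sketch of this dispatch/combine pattern, here is a single-process NumPy simulation (illustrative only, with a random top-1 router of our own making); the two grouping steps below are what the actual <code>all-to-all</code> collectives perform across expert-parallel ranks:</p>
  <d-code block language="python">
import numpy as np

num_experts, num_tokens, hidden = 4, 8, 16
x = np.random.randn(num_tokens, hidden)

# A (random) top-1 router assigns every token to one expert
expert_ids = np.random.randint(0, num_experts, size=num_tokens)

# "Dispatch": group tokens by expert -- on a real cluster this exchange of
# token buffers between GPUs is the first all-to-all
tokens_per_expert = [x[expert_ids == e] for e in range(num_experts)]

# Each expert (here just a linear layer) only processes its own tokens
expert_weights = [np.random.randn(hidden, hidden) for _ in range(num_experts)]
expert_outputs = [t @ W for t, W in zip(tokens_per_expert, expert_weights)]

# "Combine": route the results back to their original positions -- the
# second all-to-all in a real implementation
y = np.empty_like(x)
for e in range(num_experts):
    y[expert_ids == e] = expert_outputs[e]
  </d-code>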
1789
  <div class="note-box">
1790
  <p class="note-box-title">📝 Note</p>
1791
  <div class="note-box-content">
 
1837
  <p><strong>Summarizing it all:</strong> now, how about gathering all the techniques we've seen and combining them in a single diagram? Yes, we're up for the challenge!</p>
1838
  <p>In this summary diagram, you will find illustrated the activations and modules of a single transformer layer (in its MoE variant). We also illustrate the various directions of parallelism and the communication operations we've been discussing in all the previous sections.</p>
1839
 
1840
+ <div class="large-image-background-transparent">
1841
+ <div class="boxed-image">
1842
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1200px; max-width: none;"/></p>
1843
  </div>
1844
+ </div>
1845
 
1846
  <p>We can also show, side by side, a <strong>full overview</strong> of the memory savings for each one of these strategies. We'll plot them at different sequence lengths, with both selective (top) and full (bottom) recomputation, so you can see how they all interact with activations:</p>
1847
 
1848
+ <div class="large-image-background-transparent">
1849
+ <div class="boxed-image">
1850
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1200px; max-width: none;"/>
1851
  </div>
1852
+ </div>
1853
 
1854
  <p>Let's finish this section with a high-level view of all of these techniques, their main underlying ideas and major bottlenecks:</p>
1855
 
 
2000
  <p>All the following benchmarks were conducted with a sequence length of 4096 and a global batch size of 1M tokens. We gathered all the top configurations for each model and cluster size and plotted them in the following heatmaps:</p>
2001
2002
 
2003
+ <div class="large-image-background-transparent">
2004
+ <div class="boxed-image">
2005
  <p><img alt="image.png" src="/assets/images/what_we_learnt_heatmap.svg" /></p>
2006
  </div>
2007
+ </div>
2008
+ <div class="figure-legend">
2009
  <p>Heatmap visualization showing the optimal training configurations across different model sizes and compute node counts (we have 8 GPUs per node). For each combination, the configuration details include Data Parallelism (DP), Tensor Parallelism (TP), Pipeline Parallelism (PP), Gradient Accumulation Steps (GAS), Micro Batch Size (MBS), and ZeRO optimization stage. The color intensity indicates the Model FLOPs Utilization (MFU), with brighter colors representing higher efficiency.</p>
2010
  </div>
2011
  <p>From this high-level visualization, we can draw several important insights:
 
2309
 
2310
  <p>However, when profiling this kernel with a tool like <code>ncu</code>, we can see issues, including low memory throughput and uncoalesced memory accesses.</p>
2311
 
2312
+ <div class="large-image-background-transparent">
2313
+ <div class="boxed-image">
2314
+ <img width="1400px" alt="image.png" src="/assets/images/memorycoalescing2.png" />
2315
+ <img width="1400px" alt="image.png" src="/assets/images/memorycoalescing3.png" />
2316
  </div>
 
 
2317
  </div>
2318
 
 
2319
  <p>The reason for this is that in this kernel, two threads in the same block with thread IDs <code>(0, 0)</code> and <code>(1, 0)</code> (which will end up in the same warp) will both load from the same column of matrix <code>B</code> but different rows of matrix <code>A</code>. Since matrix elements are stored in row-major order (meaning row elements are in consecutive memory addresses, as shown in the figure below), thread <code>(0, 0)</code> will load <d-math>A_{0,0}</d-math>, and thread <code>(1, 0)</code> will load <d-math>A_{1,0}</d-math> in the first iteration <code>i = 0</code>. These elements are not stored close to each other in memory, and this misalignment will be present at each iteration, thereby preventing memory accesses from being coalesced.</p>
2320
 
2321
  <p><img alt="image.png" src="/assets/images/memorycoalescing4.png" /></p>
 
2340
 
2341
  <p>When we profile our new kernel, we notice that the warning about uncoalesced memory accesses has disappeared, and <strong>the GPU's memory throughput has increased by approximately 10 times</strong>.</p>
2342
 
2343
+ <div class="large-image-background-transparent">
2344
+ <div class="boxed-image">
2345
  <p><img width="1200px" alt="image.png" src="/assets/images/memorycoalescing5.png" /></p>
2346
  </div>
2347
+ </div>
2348
 
2349
  <p>We also notice that the execution time of the kernel <strong>decreases by 10x</strong>! Amazing.</p>
2350
  <p>Now let's cover another technique you will often see mentioned in the literature: <strong>tiling</strong>.</p>
 
2730
  </ul>
2731
 
2732
  <p>We hope this book helps you get started in distributed training and that you will train the next generation of awesome models to the hum of your GPU cluster!</p>
2733
+
2734
+ <hr>
2735
+
2736
+ <p><strong>One last word</strong> for our first readers: we're so happy with this piece of writing that we've decided to distribute a limited number of physical printed editions of it as a gift to you.</p>
2737
+ <p>If you are among the first 50 people to fill in your email address below, we'll contact you later in the year to send you a real physical edition once we've formatted it as a printed copy.</p>
2738
+ <p>We expect the book to be around 100-150 pages and to cover the same content as the blog post, but we may also decide to shorten or lengthen it depending on what makes sense as a printed object.</p>
2739
+ <p>To get your physical copy, please fill in your email address in the following <a target="_blank" href="https://forms.gle/e1GkAShUCtgcwnne8">Google form</a>.</p>
2740
+ <p>Whether you are one of our first readers or coming to this blog post much later, we're very happy to see that you enjoyed this sharing of knowledge. May the force of open source and open science always be with you.</p>
2741
+
2742
  <h3>Acknowledgements</h3>
2743
 
2744
  <p>We thank <a href="https://huggingface.co/eliebak">Elie</a> for conducting thorough reviews and creating the audio components using NotebookLM. Special thanks to <a href="https://huggingface.co/hynky">Hynek</a> for optimizing the frontend performance. We also thank <a href="https://huggingface.co/sbrandeis">Simon</a> for resolving some issues on the hub.</p>
 
3448
 
3449
  <p>This would print aggregated profiling results sorted by the total CUDA time, and the output would be:</p>
3450
 
3451
+ <div class="large-image-background-transparent">
3452
+ <div class="boxed-image">
3453
+ <img alt="image.png" src="/assets/images/a1_kernels.png" style="width: 1200px; max-width: none;" />
3454
+ </div>
3455
  </div>
3456
 
3457
  <p>You can also try to inspect the trace as we previously mentioned on <code>chrome://tracing/</code></p>
 
3465
 
3466
  <p>After zooming in, you can observe the flow of operations when calling <code>layer_norm</code> in this trace:</p>
3467
 
3468
+ <div class="large-image-background-transparent">
3469
+ <div class="boxed-image">
3470
+ <img alt="image.png" src="/assets/images/a1_profile_trace.png" style="width: 1200px; max-width: none;" />
3471
+ </div>
3472
  </div>
3473
 
3474
  <p>The sequence begins in the CPU (the upper section) with <code>aten::layer_norm</code>, progressing to <code>aten::native_layer_norm</code>, and then transitioning to <code>cudaLaunchKernel</code>. From there, we move on to the GPU, where the <code>vectorized_layer_norm_kernel</code> kernel is called.</p>
 
3494
 
3495
  <p>and open the file <code>output.ncu-rep</code> with Nsight Compute, you will have a view that looks like this:</p>
3496
 
3497
+ <div class="large-image-background-transparent">
3498
+ <div class="boxed-image">
3499
+ <img alt="image.png" src="/assets/images/a1_ncu.png" style="width: 1200px; max-width: none;" />
3500
+ </div>
3501
  </div>
3502
 
3503
  <p>It gives clear warnings about compute and memory utilization, along with suggestions on how to better balance compute and memory in the kernel and achieve maximal occupancy.</p>
src/memory.js CHANGED
@@ -189,8 +189,8 @@ export function updateGraph() {
189
 
190
  console.log('Data for treemap:', data);
191
 
192
- const width = 700;
193
- const height = 450;
194
  const legendHeight = 50;
195
 
196
  const svg = d3.select("#graph").select("svg");
@@ -225,8 +225,8 @@ export function updateGraph() {
225
  case 'Total': return 'rgb(225, 225, 225)'; // Light Grey
226
 
227
  // Give distinct colors to the main section containers
228
- case 'Activation Memory': return 'rgb(78, 165, 183)'; // Orange
229
- case 'Parameters / Gradients / Optimizer States': return 'rgb(232, 137, 171)'; // Teal Blue
230
 
231
  // Parameters / Gradients / Optimizer States branch
232
  case 'Parameters': return 'rgb(206, 192, 250)'; // Blue
 
189
 
190
  console.log('Data for treemap:', data);
191
 
192
+ const width = 600;
193
+ const height = 600;
194
  const legendHeight = 50;
195
 
196
  const svg = d3.select("#graph").select("svg");
 
225
  case 'Total': return 'rgb(225, 225, 225)'; // Light Grey
226
 
227
  // Give distinct colors to the main section containers
228
+ case 'Activation Memory': return 'rgb(61, 198, 159)'; // Green
229
+ case 'Parameters / Gradients / Optimizer States': return 'rgba(232, 137, 170, 0.85)'; // Pink
230
 
231
  // Parameters / Gradients / Optimizer States branch
232
  case 'Parameters': return 'rgb(206, 192, 250)'; // Blue
src/style.css CHANGED
@@ -182,7 +182,7 @@ toggle-icon {
182
  }
183
 
184
  toggle-icon.collapsed {
185
- transform: rotate(-90deg);
186
  }
187
 
188
  .toc-content {
@@ -296,80 +296,6 @@ d-contents nav > ul > li > a:hover {
296
  text-decoration: none;
297
  }
298
 
299
- /* memory */
300
- #controls {
301
- display: grid;
302
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
303
- column-gap: 10px;
304
- margin-bottom: 20px;
305
- max-width: 100%;
306
- @supports (container-type: inline-size) {
307
- container-type: inline-size;
308
- }
309
- }
310
-
311
- #controls .cell {
312
- padding: 1px;
313
- box-sizing: border-box;
314
- }
315
-
316
- #controls .column-1 {
317
- display: flex;
318
- align-items: center;
319
- }
320
-
321
- #controls .column-2 {
322
- display: flex;
323
- align-items: center;
324
- }
325
- @supports (container-type: inline-size) {
326
- @container (max-width: 600px) {
327
- #controls .column-2 {
328
- order: 2;
329
- }
330
- }
331
- }
332
-
333
- #controls label {
334
- text-align: right;
335
- padding-right: 10px;
336
- flex: 0 0 auto;
337
- width: 150px;
338
- line-height: 1.5em;
339
- font-size: 0.8em;
340
- }
341
-
342
- #controls input[type="range"] {
343
- width: 50%;
344
- margin: 0 10px;
345
- }
346
-
347
- #controls input[type="number"] {
348
- flex-shrink: 0;
349
- width: 60px;
350
- height: 24px;
351
- border: 1px solid var(--distill-gray-light);
352
- border-radius: 0.2rem;
353
- }
354
-
355
- #controls select {
356
- width: 100%;
357
- min-height: 28px;
358
- border: 1px solid var(--distill-gray-light);
359
- border-radius: 0.2rem;
360
- }
361
-
362
- #controls .column {
363
- display: contents;
364
- }
365
-
366
- #graph svg {
367
- font-family: sans-serif;
368
- }
369
-
370
- #graph svg rect {
371
- cursor: pointer;
372
- }
373
  .note-box {
374
  background-color: #f6f8fa;
375
  border-left: 4px solid #444444;
@@ -437,6 +363,28 @@ d-code {
437
  justify-content: center; /* This will center your image */
438
  }
439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  d-article li {
441
  margin-bottom: 0.0em;
442
  }
@@ -452,3 +400,200 @@ d-article ol ol {
452
  d-article hr {
453
  grid-column: text;
454
  }
 
182
  }
183
 
184
  toggle-icon.collapsed {
185
+ transform: rotate(90deg);
186
  }
187
 
188
  .toc-content {
 
296
  text-decoration: none;
297
  }
298
 
299
  .note-box {
300
  background-color: #f6f8fa;
301
  border-left: 4px solid #444444;
 
363
  justify-content: center; /* This will center your image */
364
  }
365
 
366
+ .large-image-background-transparent {
367
+ /* width: 100vw; */
368
+ padding-top: 10px;
369
+ padding-bottom: 10px;
370
+ /* margin-left: calc(-50vw + 50%); */
371
+ margin-left:-100px;
372
+ margin-right: -100px;
373
+ /* margin-right: calc(-50vw + 50%); */
374
+ /* background: white; */
375
+ height: fit-content; /* This will make it match the image height */
376
+ display: flex;
377
+ justify-content: center; /* This will center your image */
378
+ }
379
+
380
+ .boxed-image {
381
+ padding: 0.5rem;
382
+ background: white;
383
+ border-radius: 12px;
384
+ border: 1px solid #e5e7eb;
385
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
386
+ }
387
+
388
  d-article li {
389
  margin-bottom: 0.0em;
390
  }
 
400
  d-article hr {
401
  grid-column: text;
402
  }
403
+
404
+ /* Memory visualization */
405
+ #graph-all {
406
+ min-width: 500px;
407
+ margin-right: 10px;
408
+ margin-bottom: 2rem;
409
+ padding: 0.5rem;
410
+ background: #f9fafb;
411
+ border-radius: 12px;
412
+ border: 1px solid #e5e7eb;
413
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
414
+ }
415
+
416
+
417
+ /* Main container styles */
418
+ #controls {
419
+ max-width: 1200px;
420
+ /* margin: 2rem auto; */
421
+ margin-bottom: 2rem;
422
+ margin-left: 10px;
423
+ padding: 0.6rem;
424
+ background: #f9fafb;
425
+ border-radius: 12px;
426
+ border: 1px solid #e5e7eb;
427
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
428
+ }
429
+
430
+ /* Grid layout */
431
+ #controls {
432
+ display: grid;
433
+ grid-template-columns: 1fr 1fr;
434
+ /* gap: 2rem; */
435
+ }
436
+
437
+ /* Cell styles */
438
+ .cell {
439
+ margin-bottom: 0.2rem;
440
+ }
441
+
442
+ /* Label styles */
443
+ label {
444
+ display: block;
445
+ /* margin-bottom: 0.5rem; */
446
+ font-size: 0.8rem;
447
+ font-weight: 500;
448
+ color: #374151;
449
+ }
450
+
451
+ /* Input container for range + number combination */
452
+ .input-container {
453
+ display: flex;
454
+ gap: 1rem;
455
+ align-items: center;
456
+ }
457
+
458
+ /* Range input styling */
459
+ input[type="range"] {
460
+ flex: 1;
461
+ height: 6px;
462
+ background: #e5e7eb;
463
+ border-radius: 3px;
464
+ appearance: none;
465
+ outline: none;
466
+ }
467
+
468
+ input[type="range"]::-webkit-slider-thumb {
469
+ appearance: none;
470
+ width: 16px;
471
+ height: 16px;
472
+ background: #3b82f6;
473
+ border-radius: 50%;
474
+ cursor: pointer;
475
+ transition: background 0.15s ease;
476
+ }
477
+
478
+ input[type="range"]::-webkit-slider-thumb:hover {
479
+ background: #2563eb;
480
+ }
481
+
482
+ /* Number input styling */
483
+ input[type="number"] {
484
+ width: 80px;
485
+ padding: 0.5rem;
486
+ border: 1px solid #e5e7eb;
487
+ border-radius: 6px;
488
+ font-size: 0.9rem;
489
+ color: #374151;
490
+ }
491
+
492
+ /* Select styling */
493
+ select {
494
+ width: 100%;
495
+ padding: 0.5rem;
496
+ border: 1px solid #e5e7eb;
497
+ border-radius: 6px;
498
+ background: white;
499
+ font-size: 0.9rem;
500
+ color: #374151;
501
+ cursor: pointer;
502
+ }
503
+
504
+ /* Checkbox styling */
505
+ input[type="checkbox"] {
506
+ width: 1.2rem;
507
+ height: 1.2rem;
508
+ margin-right: 0.5rem;
509
+ border: 2px solid #e5e7eb;
510
+ border-radius: 4px;
511
+ cursor: pointer;
512
+ }
513
+
514
+ /* Column specific styles */
515
+ .column-1 {
516
+ padding-right: 0.5rem;
517
+ }
518
+
519
+ .column-2 {
520
+ padding-left: 0.5rem;
521
+ }
522
+
523
+ /* Checkbox container */
524
+ .checkbox-container {
525
+ display: flex;
526
+ align-items: center;
527
+ margin-bottom: 1rem;
528
+ }
529
+
530
+ /* Memory visualization styles */
531
+ .memory-block {
532
+ background: #fff;
533
+ border-radius: 8px;
534
+ padding: 1rem;
535
+ margin-bottom: 1rem;
536
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
537
+ }
538
+
539
+ .memory-title {
540
+ font-size: 1.1rem;
541
+ font-weight: 500;
542
+ color: #374151;
543
+ margin-bottom: 0.5rem;
544
+ }
545
+
546
+ .memory-value {
547
+ font-size: 1.5rem;
548
+ font-weight: 600;
549
+ color: #3b82f6;
550
+ }
551
+
552
+ /* Responsive adjustments */
553
+ @media (max-width: 768px) {
554
+ #controls {
555
+ grid-template-columns: 1fr;
556
+ padding: 1rem;
557
+ }
558
+
559
+ .column-1, .column-2 {
560
+ padding: 0;
561
+ }
562
+ }
563
+
564
+ /* Hover states and transitions */
565
+ input:hover, select:hover {
566
+ border-color: #3b82f6;
567
+ }
568
+
569
+ input:focus, select:focus {
570
+ border-color: #2563eb;
571
+ outline: none;
572
+ box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.1);
573
+ }
574
+
575
+ /* Add smooth transitions */
576
+ input, select, button {
577
+ transition: all 0.15s ease;
578
+ }
579
+
580
+ /* Preset dropdown special styling */
581
+ select[name="presets"] {
582
+ background-color: #f3f4f6;
583
+ font-weight: 500;
584
+ }
585
+
586
+ /* Memory graph enhancements */
587
+ .activation-memory {
588
+ background: #dbeafe;
589
+ padding: 1rem;
590
+ border-radius: 8px;
591
+ margin-bottom: 1rem;
592
+ }
593
+
594
+ .gradient-memory {
595
+ background: #ede9fe;
596
+ padding: 1rem;
597
+ border-radius: 8px;
598
+ }
599
+