thomwolf (HF staff) committed (verified)
Commit 5a7c330 · 1 parent: fca5b64
Files changed (47)
  1. assets/data/benchmarks/memusage_activations.html +2 -0
  2. assets/images/activation_recomputation.js +149 -0
  3. assets/images/activation_recomputation.svg +768 -0
  4. assets/images/first_steps_memory_profile.js +2 -2
  5. assets/images/first_steps_simple_training.js +2 -2
  6. dist/assets/.DS_Store +0 -0
  7. dist/assets/data/benchmarks/memusage_activations.html +2 -0
  8. dist/assets/images/activation_recomputation.js +1 -0
  9. dist/assets/images/activation_recomputation.png +3 -0
  10. dist/assets/images/activation_recomputation.svg +768 -0
  11. dist/assets/images/conclusion_llama3_parallelism.png +3 -0
  12. dist/assets/images/dp_diagram.png +3 -0
  13. dist/assets/images/ep_schema.png +3 -0
  14. dist/assets/images/first_steps_memory_profile.js +1 -1
  15. dist/assets/images/first_steps_simple_training.js +1 -1
  16. dist/assets/images/flashattn.png +3 -0
  17. dist/assets/images/flashattn2.png +3 -0
  18. dist/assets/images/fp8_diagram.png +3 -0
  19. dist/assets/images/fp8_divergence.png +3 -0
  20. dist/assets/images/fused_kernels1.png +3 -0
  21. dist/assets/images/fused_kernels2.png +3 -0
  22. dist/assets/images/gradaccumulation_diag.png +3 -0
  23. dist/assets/images/memorycoalescing.png +3 -0
  24. dist/assets/images/memorycoalescing2.png +3 -0
  25. dist/assets/images/memorycoalescing3.png +3 -0
  26. dist/assets/images/memorycoalescing4.png +3 -0
  27. dist/assets/images/memorycoalescing5.png +3 -0
  28. dist/assets/images/mixedprecision.png +3 -0
  29. dist/assets/images/mixedprecision_2.png +3 -0
  30. dist/assets/images/pp_1f1b_scaling.png +3 -0
  31. dist/assets/images/pp_bubblesize.png +3 -0
  32. dist/assets/images/pp_llama3.1_schedule.png +3 -0
  33. dist/assets/images/pp_zerobubble_compgraph.png +3 -0
  34. dist/assets/images/pp_zerobubble_dualpipe.png +3 -0
  35. dist/assets/images/pp_zerobubble_ppschedule.png +3 -0
  36. dist/assets/images/ring-attention.gif +0 -0
  37. dist/assets/images/threadcoarsening.png +3 -0
  38. dist/assets/images/tiling.png +3 -0
  39. dist/assets/images/tp_diagram.png +3 -0
  40. dist/assets/images/tp_diagram2.png +3 -0
  41. dist/assets/images/tp_diagram3.png +3 -0
  42. dist/assets/images/tp_diagram4.png +3 -0
  43. dist/assets/images/tp_full_diagram.png +3 -0
  44. dist/assets/images/tp_sp_diagram.png +3 -0
  45. dist/assets/images/tp_sp_diagram_zoomed.png +3 -0
  46. dist/index.html +100 -81
  47. src/index.html +16 -7
assets/data/benchmarks/memusage_activations.html ADDED
@@ -0,0 +1,2 @@
+ <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
+ <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="a841d1b3-f0b4-43f7-90f9-bbb31dc90094" class="plotly-graph-div" style="height:400px; width:1200px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("a841d1b3-f0b4-43f7-90f9-bbb31dc90094")) { Plotly.newPlot( "a841d1b3-f0b4-43f7-90f9-bbb31dc90094", [{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[9.25390625,28.5078125,97.015625,354.03125,1348.0625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[46.2578125,142.515625,485.03125,1770.0625,6740.125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer 
states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[145.703125,448.90625,1527.8125,5575.625,21231.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"
ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotati
ondefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"GB memory"},"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-8B","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-70B","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-405B","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"barmode":"stack","width":1200,"height":400,"legend":{"title":{}}}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
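The fragment above is a self-contained Plotly chart: it loads plotly-2.32.0 from the CDN and renders stacked bars of parameter, gradient, optimizer-state, and activation memory (in GB) for Meta-Llama-3.1 8B/70B/405B across sequence lengths 1024 to 16384. The commit view does not show how the article page includes this fragment; as a rough illustration only, a self-contained HTML fragment like this can be embedded with a plain iframe:

<!-- Illustrative embed only; the real index.html markup is not part of this hunk. -->
<iframe src="assets/data/benchmarks/memusage_activations.html"
        style="width:1200px; height:420px; border:none"
        title="Memory usage: parameters, gradients, optimizer states and activations"></iframe>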
assets/images/activation_recomputation.js ADDED
@@ -0,0 +1,149 @@
+
+ // Function to enhance the SVG content by adding styles and data attributes
+ function enhanceSVGContent(originalContent) {
+     const parser = new DOMParser();
+     const doc = parser.parseFromString(originalContent, 'image/svg+xml');
+
+     // Create a style element with hover effects and insert it as the first child of the SVG
+     const styleElement = doc.createElementNS('http://www.w3.org/2000/svg', 'style');
+     styleElement.textContent = `
+     path[data-element-type="layer"] {
+         transition: all 0.3s;
+         cursor: pointer;
+     }
+     path[data-element-type="layer"]:hover {
+         fill: #b197fc !important;
+         transform: translate(0, -2px);
+     }
+
+     path[data-element-type="layer-updated"] {
+         transition: all 0.3s;
+         cursor: pointer;
+     }
+
+     path[data-element-type="layer-updated"]:hover {
+         fill:rgb(103, 56, 244) !important;
+         transform: scale(1.02);
+         transform: translate(0, -2px);
+     }
+
+     path[data-element-type="gradient"] {
+         transition: all 0.3s;
+         cursor: pointer;
+     }
+     path[data-element-type="gradient"]:hover {
+         fill: #f06595 !important;
+         transform: translate(0, -2px);
+     }
+
+     path[data-element-type="forward"] {
+         transition: all 0.3s;
+         cursor: pointer;
+     }
+     path[data-element-type="forward"]:hover {
+         stroke: #0c8599 !important;
+         stroke-width: 4 !important;
+     }
+
+     path[data-element-type="backward"] {
+         transition: all 0.3s;
+         cursor: pointer;
+     }
+     path[data-element-type="backward"]:hover {
+         stroke: #e8590c !important;
+         stroke-width: 4 !important;
+     }
+
+     path[data-element-type="optimization"] {
+         transition: all 0.3s;
+         cursor: pointer;
+     }
+     path[data-element-type="optimization"]:hover {
+         stroke: #087f5b !important;
+         stroke-width: 4 !important;
+     }
+     `;
+     doc.documentElement.insertBefore(styleElement, doc.documentElement.firstChild);
+
+     // Process neural network layers (purple nodes)
+     doc.querySelectorAll('path[fill="#d0bfff"]').forEach((node, index) => {
+         node.setAttribute('data-element-id', `layer-${index}`);
+         node.setAttribute('data-element-type', 'layer');
+     });
+
+     doc.querySelectorAll('path[fill="#9775fa"]').forEach((node, index) => {
+         node.setAttribute('data-element-id', `layer-updated-${index}`);
+         node.setAttribute('data-element-type', 'layer-updated');
+     });
+
+     // Process gradient nodes (pink nodes)
+     doc.querySelectorAll('path[fill="#f783ac"]').forEach((node, index) => {
+         node.setAttribute('data-element-id', `gradient-${index}`);
+         node.setAttribute('data-element-type', 'gradient');
+     });
+
+     // Process arrows by matching stroke colors
+     const arrowTypes = {
+         '#15aabf': 'forward',
+         '#fd7e14': 'backward',
+         '#099268': 'optimization'
+     };
+
+     Object.entries(arrowTypes).forEach(([color, type]) => {
+         doc.querySelectorAll(`path[stroke="${color}"]`).forEach((arrow, index) => {
+             arrow.setAttribute('data-element-id', `${type}-${index}`);
+             arrow.setAttribute('data-element-type', type);
+         });
+     });
+
+     // Make the SVG responsive
+     doc.documentElement.setAttribute('width', '100%');
+     doc.documentElement.setAttribute('height', '100%');
+     doc.documentElement.setAttribute('preserveAspectRatio', 'xMidYMid meet');
+
+     return new XMLSerializer().serializeToString(doc);
+ }
+
+ // Function to load an SVG file via fetch
+ async function loadSVG(url, containerId) {
+     try {
+         const response = await fetch(url);
+         if (!response.ok) {
+             throw new Error(`HTTP error! Status: ${response.status}`);
+         }
+         const svgText = await response.text();
+         const enhancedSVG = enhanceSVGContent(svgText);
+         document.getElementById(containerId).innerHTML = enhancedSVG;
+     } catch (error) {
+         console.error('Error loading SVG:', error);
+         document.getElementById(containerId).innerHTML = '<p>Error loading SVG.</p>';
+     }
+ }
+
+ // Load the SVG file (adjust the path if needed)
+ loadSVG('../assets/images/activation_recomputation.svg', 'svg-activation_recomputation');
+
+ // Set up event listeners to display a description of the hovered element
+ const svgContainer3 = document.getElementById('svg-activation_recomputation');
+
+ svgContainer3.addEventListener('mouseover', function (event) {
+     const target = event.target;
+     if (target.tagName.toLowerCase() === 'path' && target.hasAttribute('data-element-id')) {
+         const elementId = target.getAttribute('data-element-id');
+         const elementType = target.getAttribute('data-element-type');
+         const descriptions = {
+             layer: 'Neural Network Layer',
+             'layer-updated': 'Neural Network Layer (updated)',
+             gradient: 'Gradient Update Layer',
+             forward: 'Forward Pass Connection',
+             backward: 'Backward Pass Connection',
+             optimization: 'Optimization Step'
+         };
+         const description = descriptions[elementType] || elementType;
+         document.getElementById('svg-activation_recomputation-info').textContent = `Hovering over: ${description} (${elementId})`;
+     }
+ });
+
+ svgContainer3.addEventListener('mouseout', function () {
+     document.getElementById('svg-activation_recomputation-info').textContent = 'Hover over the network elements to see their details';
+ });
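For this script to work, the page needs a container element for the enhanced SVG and a matching "-info" element for the hover caption; the IDs below are taken directly from the getElementById calls in the script above. A minimal sketch of the expected markup (the actual lines added to index.html are only partially visible in this commit view):

<div class="svg-container" id="svg-activation_recomputation"> </div>
<div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
<script src="../assets/images/activation_recomputation.js"></script>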
assets/images/activation_recomputation.svg ADDED
assets/images/first_steps_memory_profile.js CHANGED
@@ -151,10 +151,10 @@ svgContainer2.addEventListener('mouseover', function(event) {
      'cache': 'Cache Operation'
    };
    const description = descriptions[elementType] || elementType;
-   document.getElementById('info').textContent = `Hovering over: ${description} (${elementId})`;
+   document.getElementById('svg-first_steps_memory_profile-info').textContent = `Hovering over: ${description} (${elementId})`;
  }
});

svgContainer2.addEventListener('mouseout', function() {
-   document.getElementById('info').textContent = 'Hover over the network elements to see their details';
+   document.getElementById('svg-first_steps_memory_profile-info').textContent = 'Hover over the elements to see their details';
});
assets/images/first_steps_simple_training.js CHANGED
@@ -140,10 +140,10 @@ svgContainer.addEventListener('mouseover', function (event) {
    optimization: 'Optimization Step'
  };
  const description = descriptions[elementType] || elementType;
- document.getElementById('info').textContent = `Hovering over: ${description} (${elementId})`;
+ document.getElementById('svg-first_steps_simple_training-info').textContent = `Hovering over: ${description} (${elementId})`;
  }
});

svgContainer.addEventListener('mouseout', function () {
- document.getElementById('info').textContent = 'Hover over the network elements to see their details';
+ document.getElementById('svg-first_steps_simple_training-info').textContent = 'Hover over the network elements to see their details';
});
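Both edits above replace the shared #info element with a per-figure element, presumably so that hovering over one figure no longer overwrites the caption of another. Each figure therefore needs its own info element next to its container, roughly as in the sketch below (IDs taken from the scripts; the corresponding index.html additions are not fully shown in this commit view):

<div class="info" id="svg-first_steps_simple_training-info">Hover over the network elements to see their details</div>
<div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>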
dist/assets/.DS_Store ADDED
Binary file (6.15 kB).
 
dist/assets/data/benchmarks/memusage_activations.html ADDED
@@ -0,0 +1,2 @@
+ <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
+ <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="a841d1b3-f0b4-43f7-90f9-bbb31dc90094" class="plotly-graph-div" style="height:400px; width:1200px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("a841d1b3-f0b4-43f7-90f9-bbb31dc90094")) { Plotly.newPlot( "a841d1b3-f0b4-43f7-90f9-bbb31dc90094", [{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[9.25390625,28.5078125,97.015625,354.03125,1348.0625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[46.2578125,142.515625,485.03125,1770.0625,6740.125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer 
states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[145.703125,448.90625,1527.8125,5575.625,21231.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"
ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotati
ondefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"GB memory"},"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-8B","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-70B","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-405B","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"barmode":"stack","width":1200,"height":400,"legend":{"title":{}}}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
dist/assets/images/activation_recomputation.js ADDED
@@ -0,0 +1 @@
+ function enhanceSVGContent(t){const e=(new DOMParser).parseFromString(t,"image/svg+xml"),n=e.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n path[data-element-type="layer"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="layer"]:hover {\n fill: #b197fc !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="layer-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n \n path[data-element-type="layer-updated"]:hover {\n fill:rgb(103, 56, 244) !important;\n transform: scale(1.02);\n transform: translate(0, -2px);\n }\n\n path[data-element-type="gradient"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="gradient"]:hover {\n fill: #f06595 !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="forward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="forward"]:hover {\n stroke: #0c8599 !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="backward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="backward"]:hover {\n stroke: #e8590c !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="optimization"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="optimization"]:hover {\n stroke: #087f5b !important;\n stroke-width: 4 !important;\n }\n',e.documentElement.insertBefore(n,e.documentElement.firstChild),e.querySelectorAll('path[fill="#d0bfff"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-${e}`),t.setAttribute("data-element-type","layer")})),e.querySelectorAll('path[fill="#9775fa"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-updated-${e}`),t.setAttribute("data-element-type","layer-updated")})),e.querySelectorAll('path[fill="#f783ac"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`gradient-${e}`),t.setAttribute("data-element-type","gradient")})),Object.entries({"#15aabf":"forward","#fd7e14":"backward","#099268":"optimization"}).forEach((([t,n])=>{e.querySelectorAll(`path[stroke="${t}"]`).forEach(((t,e)=>{t.setAttribute("data-element-id",`${n}-${e}`),t.setAttribute("data-element-type",n)}))})),e.documentElement.setAttribute("width","100%"),e.documentElement.setAttribute("height","100%"),e.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(e)}async function loadSVG(t,e){try{const n=await fetch(t);if(!n.ok)throw new Error(`HTTP error! 
Status: ${n.status}`);const a=enhanceSVGContent(await n.text());document.getElementById(e).innerHTML=a}catch(t){console.error("Error loading SVG:",t),document.getElementById(e).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/activation_recomputation.svg","svg-activation_recomputation");const svgContainer3=document.getElementById("svg-activation_recomputation");svgContainer3.addEventListener("mouseover",(function(t){const e=t.target;if("path"===e.tagName.toLowerCase()&&e.hasAttribute("data-element-id")){const t=e.getAttribute("data-element-id"),n=e.getAttribute("data-element-type"),a={layer:"Neural Network Layer","layer-updated":"Neural Network Layer (updated)",gradient:"Gradient Update Layer",forward:"Forward Pass Connection",backward:"Backward Pass Connection",optimization:"Optimization Step"}[n]||n;document.getElementById("svg-activation_recomputation-info").textContent=`Hovering over: ${a} (${t})`}})),svgContainer3.addEventListener("mouseout",(function(){document.getElementById("svg-activation_recomputation-info").textContent="Hover over the network elements to see their details"}));
dist/assets/images/activation_recomputation.png ADDED

Git LFS Details

  • SHA256: 322496303f8133466e128f152e8cb2248bc2a0d5665a57b7894d80048612e64f
  • Pointer size: 130 Bytes
  • Size of remote file: 74.5 kB
dist/assets/images/activation_recomputation.svg ADDED
dist/assets/images/conclusion_llama3_parallelism.png ADDED

Git LFS Details

  • SHA256: e7282f28522dc436f176a5ceca8891397d9bb8b8522f9b020bdf20e31258d324
  • Pointer size: 131 Bytes
  • Size of remote file: 348 kB
dist/assets/images/dp_diagram.png ADDED

Git LFS Details

  • SHA256: 70ad6657c4dd1dc1e2f4ad132206a7c4c8682e44a8277f638753532de9aa7f71
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
dist/assets/images/ep_schema.png ADDED

Git LFS Details

  • SHA256: 63bf8bb1bbe2ff46b4da5cf874df9880532c077995880565e7306cd26a9053b0
  • Pointer size: 131 Bytes
  • Size of remote file: 137 kB
dist/assets/images/first_steps_memory_profile.js CHANGED
@@ -1 +1 @@
- function enhanceSVGContent2(e){const t=(new DOMParser).parseFromString(e,"image/svg+xml"),n=t.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n /* Memory Block (free memory) */\n path[data-element-type="memory-block"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block"]:hover {\n fill: #a5d6a7 !important; /* slightly darker than original */\n transform: translate(0, -2px);\n }\n\n /* Memory Block (updated) */\n path[data-element-type="memory-block-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block-updated"]:hover {\n fill: #81c784 !important;\n transform: scale(1.02) translate(0, -2px);\n }\n\n /* Stack Segment */\n path[data-element-type="stack"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="stack"]:hover {\n fill: #ffd54f !important;\n transform: translate(0, -2px);\n }\n\n /* Read Operation Arrow */\n path[data-element-type="read"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="read"]:hover {\n stroke: #1e88e5 !important;\n stroke-width: 4 !important;\n }\n\n /* Write Operation Arrow */\n path[data-element-type="write"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="write"]:hover {\n stroke: #d32f2f !important;\n stroke-width: 4 !important;\n }\n\n /* Cache Operation Arrow */\n path[data-element-type="cache"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="cache"]:hover {\n stroke: #fbc02d !important;\n stroke-width: 4 !important;\n }\n ',t.documentElement.insertBefore(n,t.documentElement.firstChild),t.querySelectorAll('path[fill="#c8e6c9"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-${t}`),e.setAttribute("data-element-type","memory-block")})),t.querySelectorAll('path[fill="#a5d6a7"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-updated-${t}`),e.setAttribute("data-element-type","memory-block-updated")})),t.querySelectorAll('path[fill="#ffe082"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`stack-${t}`),e.setAttribute("data-element-type","stack")})),Object.entries({"#42a5f5":"read","#ef5350":"write","#ffca28":"cache"}).forEach((([e,n])=>{t.querySelectorAll(`path[stroke="${e}"]`).forEach(((e,t)=>{e.setAttribute("data-element-id",`${n}-${t}`),e.setAttribute("data-element-type",n)}))})),t.documentElement.setAttribute("width","100%"),t.documentElement.setAttribute("height","100%"),t.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(t)}async function loadSVG(e,t){try{console.log("Loading SVG from:",e);const n=await fetch(e);if(!n.ok)throw new Error(`HTTP error! 
Status: ${n.status}`);const r=enhanceSVGContent2(await n.text());document.getElementById(t).innerHTML=r}catch(e){console.error("Error loading SVG:",e),document.getElementById(t).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_memory_profile.svg","svg-first_steps_memory_profile");const svgContainer2=document.getElementById("svg-first_steps_memory_profile");svgContainer2.addEventListener("mouseover",(function(e){const t=e.target;if("path"===t.tagName.toLowerCase()&&t.hasAttribute("data-element-id")){const e=t.getAttribute("data-element-id"),n=t.getAttribute("data-element-type"),r={"memory-block":"Memory Block","memory-block-updated":"Memory Block (updated)",stack:"Stack Segment",read:"Memory Read",write:"Memory Write",cache:"Cache Operation"}[n]||n;document.getElementById("info").textContent=`Hovering over: ${r} (${e})`}})),svgContainer2.addEventListener("mouseout",(function(){document.getElementById("info").textContent="Hover over the network elements to see their details"}));
 
+ function enhanceSVGContent2(e){const t=(new DOMParser).parseFromString(e,"image/svg+xml"),n=t.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n /* Memory Block (free memory) */\n path[data-element-type="memory-block"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block"]:hover {\n fill: #a5d6a7 !important; /* slightly darker than original */\n transform: translate(0, -2px);\n }\n\n /* Memory Block (updated) */\n path[data-element-type="memory-block-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block-updated"]:hover {\n fill: #81c784 !important;\n transform: scale(1.02) translate(0, -2px);\n }\n\n /* Stack Segment */\n path[data-element-type="stack"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="stack"]:hover {\n fill: #ffd54f !important;\n transform: translate(0, -2px);\n }\n\n /* Read Operation Arrow */\n path[data-element-type="read"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="read"]:hover {\n stroke: #1e88e5 !important;\n stroke-width: 4 !important;\n }\n\n /* Write Operation Arrow */\n path[data-element-type="write"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="write"]:hover {\n stroke: #d32f2f !important;\n stroke-width: 4 !important;\n }\n\n /* Cache Operation Arrow */\n path[data-element-type="cache"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="cache"]:hover {\n stroke: #fbc02d !important;\n stroke-width: 4 !important;\n }\n ',t.documentElement.insertBefore(n,t.documentElement.firstChild),t.querySelectorAll('path[fill="#c8e6c9"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-${t}`),e.setAttribute("data-element-type","memory-block")})),t.querySelectorAll('path[fill="#a5d6a7"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-updated-${t}`),e.setAttribute("data-element-type","memory-block-updated")})),t.querySelectorAll('path[fill="#ffe082"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`stack-${t}`),e.setAttribute("data-element-type","stack")})),Object.entries({"#42a5f5":"read","#ef5350":"write","#ffca28":"cache"}).forEach((([e,n])=>{t.querySelectorAll(`path[stroke="${e}"]`).forEach(((e,t)=>{e.setAttribute("data-element-id",`${n}-${t}`),e.setAttribute("data-element-type",n)}))})),t.documentElement.setAttribute("width","100%"),t.documentElement.setAttribute("height","100%"),t.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(t)}async function loadSVG(e,t){try{console.log("Loading SVG from:",e);const n=await fetch(e);if(!n.ok)throw new Error(`HTTP error! 
Status: ${n.status}`);const r=enhanceSVGContent2(await n.text());document.getElementById(t).innerHTML=r}catch(e){console.error("Error loading SVG:",e),document.getElementById(t).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_memory_profile.svg","svg-first_steps_memory_profile");const svgContainer2=document.getElementById("svg-first_steps_memory_profile");svgContainer2.addEventListener("mouseover",(function(e){const t=e.target;if("path"===t.tagName.toLowerCase()&&t.hasAttribute("data-element-id")){const e=t.getAttribute("data-element-id"),n=t.getAttribute("data-element-type"),r={"memory-block":"Memory Block","memory-block-updated":"Memory Block (updated)",stack:"Stack Segment",read:"Memory Read",write:"Memory Write",cache:"Cache Operation"}[n]||n;document.getElementById("svg-first_steps_memory_profile-info").textContent=`Hovering over: ${r} (${e})`}})),svgContainer2.addEventListener("mouseout",(function(){document.getElementById("svg-first_steps_memory_profile-info").textContent="Hover over the elements to see their details"}));
dist/assets/images/first_steps_simple_training.js CHANGED
@@ -1 +1 @@
- function enhanceSVGContent(t){const e=(new DOMParser).parseFromString(t,"image/svg+xml"),n=e.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n path[data-element-type="layer"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="layer"]:hover {\n fill: #b197fc !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="layer-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n \n path[data-element-type="layer-updated"]:hover {\n fill:rgb(103, 56, 244) !important;\n transform: scale(1.02);\n transform: translate(0, -2px);\n }\n\n path[data-element-type="gradient"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="gradient"]:hover {\n fill: #f06595 !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="forward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="forward"]:hover {\n stroke: #0c8599 !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="backward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="backward"]:hover {\n stroke: #e8590c !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="optimization"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="optimization"]:hover {\n stroke: #087f5b !important;\n stroke-width: 4 !important;\n }\n',e.documentElement.insertBefore(n,e.documentElement.firstChild),e.querySelectorAll('path[fill="#d0bfff"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-${e}`),t.setAttribute("data-element-type","layer")})),e.querySelectorAll('path[fill="#9775fa"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-updated-${e}`),t.setAttribute("data-element-type","layer-updated")})),e.querySelectorAll('path[fill="#f783ac"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`gradient-${e}`),t.setAttribute("data-element-type","gradient")})),Object.entries({"#15aabf":"forward","#fd7e14":"backward","#099268":"optimization"}).forEach((([t,n])=>{e.querySelectorAll(`path[stroke="${t}"]`).forEach(((t,e)=>{t.setAttribute("data-element-id",`${n}-${e}`),t.setAttribute("data-element-type",n)}))})),e.documentElement.setAttribute("width","100%"),e.documentElement.setAttribute("height","100%"),e.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(e)}async function loadSVG(t,e){try{const n=await fetch(t);if(!n.ok)throw new Error(`HTTP error! 
Status: ${n.status}`);const a=enhanceSVGContent(await n.text());document.getElementById(e).innerHTML=a}catch(t){console.error("Error loading SVG:",t),document.getElementById(e).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_simple_training.svg","svg-first_steps_simple_training");const svgContainer=document.getElementById("svg-first_steps_simple_training");svgContainer.addEventListener("mouseover",(function(t){const e=t.target;if("path"===e.tagName.toLowerCase()&&e.hasAttribute("data-element-id")){const t=e.getAttribute("data-element-id"),n=e.getAttribute("data-element-type"),a={layer:"Neural Network Layer","layer-updated":"Neural Network Layer (updated)",gradient:"Gradient Update Layer",forward:"Forward Pass Connection",backward:"Backward Pass Connection",optimization:"Optimization Step"}[n]||n;document.getElementById("info").textContent=`Hovering over: ${a} (${t})`}})),svgContainer.addEventListener("mouseout",(function(){document.getElementById("info").textContent="Hover over the network elements to see their details"}));
 
+ function enhanceSVGContent(t){const e=(new DOMParser).parseFromString(t,"image/svg+xml"),n=e.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n path[data-element-type="layer"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="layer"]:hover {\n fill: #b197fc !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="layer-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n \n path[data-element-type="layer-updated"]:hover {\n fill:rgb(103, 56, 244) !important;\n transform: scale(1.02);\n transform: translate(0, -2px);\n }\n\n path[data-element-type="gradient"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="gradient"]:hover {\n fill: #f06595 !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="forward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="forward"]:hover {\n stroke: #0c8599 !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="backward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="backward"]:hover {\n stroke: #e8590c !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="optimization"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="optimization"]:hover {\n stroke: #087f5b !important;\n stroke-width: 4 !important;\n }\n',e.documentElement.insertBefore(n,e.documentElement.firstChild),e.querySelectorAll('path[fill="#d0bfff"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-${e}`),t.setAttribute("data-element-type","layer")})),e.querySelectorAll('path[fill="#9775fa"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-updated-${e}`),t.setAttribute("data-element-type","layer-updated")})),e.querySelectorAll('path[fill="#f783ac"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`gradient-${e}`),t.setAttribute("data-element-type","gradient")})),Object.entries({"#15aabf":"forward","#fd7e14":"backward","#099268":"optimization"}).forEach((([t,n])=>{e.querySelectorAll(`path[stroke="${t}"]`).forEach(((t,e)=>{t.setAttribute("data-element-id",`${n}-${e}`),t.setAttribute("data-element-type",n)}))})),e.documentElement.setAttribute("width","100%"),e.documentElement.setAttribute("height","100%"),e.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(e)}async function loadSVG(t,e){try{const n=await fetch(t);if(!n.ok)throw new Error(`HTTP error! 
Status: ${n.status}`);const a=enhanceSVGContent(await n.text());document.getElementById(e).innerHTML=a}catch(t){console.error("Error loading SVG:",t),document.getElementById(e).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_simple_training.svg","svg-first_steps_simple_training");const svgContainer=document.getElementById("svg-first_steps_simple_training");svgContainer.addEventListener("mouseover",(function(t){const e=t.target;if("path"===e.tagName.toLowerCase()&&e.hasAttribute("data-element-id")){const t=e.getAttribute("data-element-id"),n=e.getAttribute("data-element-type"),a={layer:"Neural Network Layer","layer-updated":"Neural Network Layer (updated)",gradient:"Gradient Update Layer",forward:"Forward Pass Connection",backward:"Backward Pass Connection",optimization:"Optimization Step"}[n]||n;document.getElementById("svg-first_steps_simple_training-info").textContent=`Hovering over: ${a} (${t})`}})),svgContainer.addEventListener("mouseout",(function(){document.getElementById("svg-first_steps_simple_training-info").textContent="Hover over the network elements to see their details"}));
dist/assets/images/flashattn.png ADDED

Git LFS Details

  • SHA256: 2ca3528348a2cc037d31521c11ad44cf7078653a7f453483e346508ba139ab4d
  • Pointer size: 130 Bytes
  • Size of remote file: 98.3 kB
dist/assets/images/flashattn2.png ADDED

Git LFS Details

  • SHA256: 4312d0a3b349219f2215887926555a08261507e92b992a93337659fd7aff1157
  • Pointer size: 131 Bytes
  • Size of remote file: 396 kB
dist/assets/images/fp8_diagram.png ADDED

Git LFS Details

  • SHA256: 2517479bff358569b4410ffe302d63ff530fa2883722603012b6e14a18fefd75
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
dist/assets/images/fp8_divergence.png ADDED

Git LFS Details

  • SHA256: 81e8495d96c8e40fbd36ee1030fc5325adabd5c2541b4a5c6041b07320ef76c6
  • Pointer size: 131 Bytes
  • Size of remote file: 241 kB
dist/assets/images/fused_kernels1.png ADDED

Git LFS Details

  • SHA256: 51c0e08c1d245d4bf529a97990eb85b15f31c9dea10f9bfdb18de6969957c20d
  • Pointer size: 131 Bytes
  • Size of remote file: 141 kB
dist/assets/images/fused_kernels2.png ADDED

Git LFS Details

  • SHA256: 949e208e9f7e140395c303aac6684ded8d120e7cd788c20e5fd395c2df5b5b91
  • Pointer size: 130 Bytes
  • Size of remote file: 73 kB
dist/assets/images/gradaccumulation_diag.png ADDED

Git LFS Details

  • SHA256: 0a7acb4c1e4832272beb247588f2a154a703d5b6f468b5e0b7dcffbcda41bbdc
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB
dist/assets/images/memorycoalescing.png ADDED

Git LFS Details

  • SHA256: 96ed02089819123c2ec48b178d4b673cc4f628f4c903f02ad98c5588cf3e1931
  • Pointer size: 131 Bytes
  • Size of remote file: 155 kB
dist/assets/images/memorycoalescing2.png ADDED

Git LFS Details

  • SHA256: c1708d3f4588768350a78a5b38d7e2a968fb6115d1be8bc0f02f7f81dc6e767c
  • Pointer size: 130 Bytes
  • Size of remote file: 36.4 kB
dist/assets/images/memorycoalescing3.png ADDED

Git LFS Details

  • SHA256: 2fa6b2066aaac9a5dad1a96489414478c680a53bc39ebea704931c466af8d343
  • Pointer size: 130 Bytes
  • Size of remote file: 56.8 kB
dist/assets/images/memorycoalescing4.png ADDED

Git LFS Details

  • SHA256: 62621f72d70635d79b48c7815127cd31da119292981bea58ab20a6b578d3aff3
  • Pointer size: 130 Bytes
  • Size of remote file: 59.3 kB
dist/assets/images/memorycoalescing5.png ADDED

Git LFS Details

  • SHA256: c33982566e567cc075f544aae349bb37dca63b6ce16e2d7e753293826e4a06dd
  • Pointer size: 130 Bytes
  • Size of remote file: 36.4 kB
dist/assets/images/mixedprecision.png ADDED

Git LFS Details

  • SHA256: 8891a3a71a819f217c5b2bfa39d68c794eca480f24bfbb74b618aebde2971fc8
  • Pointer size: 131 Bytes
  • Size of remote file: 241 kB
dist/assets/images/mixedprecision_2.png ADDED

Git LFS Details

  • SHA256: d4cac7b16899d1c36f4936ddcf6751ce96391831397199735a3ef64b6daa0a07
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
dist/assets/images/pp_1f1b_scaling.png ADDED

Git LFS Details

  • SHA256: 5191c89bcffed1ead467742eb4cec4c89c53f22e8c7391d115ca06ece15cf21c
  • Pointer size: 131 Bytes
  • Size of remote file: 224 kB
dist/assets/images/pp_bubblesize.png ADDED

Git LFS Details

  • SHA256: 784528719df2d3cbb4765802463b1ab14e1b20d80a593b27ea328086fb67a5cb
  • Pointer size: 131 Bytes
  • Size of remote file: 178 kB
dist/assets/images/pp_llama3.1_schedule.png ADDED

Git LFS Details

  • SHA256: a055afbdb270c6c319e41aa2b7d4e2893c55da18fd6102764a43ce7935d224e2
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
dist/assets/images/pp_zerobubble_compgraph.png ADDED

Git LFS Details

  • SHA256: 58b04bbae5360ee205560670c196df82964b6ddb552f962b86889727d292ff08
  • Pointer size: 130 Bytes
  • Size of remote file: 47.6 kB
dist/assets/images/pp_zerobubble_dualpipe.png ADDED

Git LFS Details

  • SHA256: e3d4c7070550b4a76f1c39577edb9c62b467817558721a652cdc9f6e4bdcba1f
  • Pointer size: 131 Bytes
  • Size of remote file: 206 kB
dist/assets/images/pp_zerobubble_ppschedule.png ADDED

Git LFS Details

  • SHA256: 12f18a861d558fa68b8aefdcdecc8b63326ec4b56350e9a1536d45c2cc1238ef
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
dist/assets/images/ring-attention.gif ADDED
dist/assets/images/threadcoarsening.png ADDED

Git LFS Details

  • SHA256: 007f2426210d2328df00dcd1122b056831b47a2c604f6512ff08172fcb943621
  • Pointer size: 130 Bytes
  • Size of remote file: 38.6 kB
dist/assets/images/tiling.png ADDED

Git LFS Details

  • SHA256: 8889c4317e8a78a16404af34cfb5153fc10cfefca6b25dc0a9eb7561abf012c3
  • Pointer size: 130 Bytes
  • Size of remote file: 21.6 kB
dist/assets/images/tp_diagram.png ADDED

Git LFS Details

  • SHA256: fb5ae9993740f216bfc4f8481536739c0e85853ef798fe1940f4e6b3bee0683d
  • Pointer size: 130 Bytes
  • Size of remote file: 43.3 kB
dist/assets/images/tp_diagram2.png ADDED

Git LFS Details

  • SHA256: f075304c019e12be1ac0ef8afa9241c03bc466f568dca0c66e20b1391a471bca
  • Pointer size: 131 Bytes
  • Size of remote file: 486 kB
dist/assets/images/tp_diagram3.png ADDED

Git LFS Details

  • SHA256: beff9be457b6363c370d9831f42155ae9674240d2588eac6270f62aeb58f0a70
  • Pointer size: 131 Bytes
  • Size of remote file: 486 kB
dist/assets/images/tp_diagram4.png ADDED

Git LFS Details

  • SHA256: 6885bbcb5ba13aad0a111b69eacf6e679a0ed5dd688cc0ac0b58b21318fce852
  • Pointer size: 131 Bytes
  • Size of remote file: 211 kB
dist/assets/images/tp_full_diagram.png ADDED

Git LFS Details

  • SHA256: 01f9fd3bc4b0a97b167d6ce2d47b0a207492441592d1e2a51fab1e8bfad9962e
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
dist/assets/images/tp_sp_diagram.png ADDED

Git LFS Details

  • SHA256: d2463f346a6a3e16d447329a94eb8e9120d38effacf1230637ca25cd35d4c250
  • Pointer size: 131 Bytes
  • Size of remote file: 201 kB
dist/assets/images/tp_sp_diagram_zoomed.png ADDED

Git LFS Details

  • SHA256: f86131810347fba77e74d4972ad0115e8bdfab4d42448b8f07e1d79c3d6eef6a
  • Pointer size: 131 Bytes
  • Size of remote file: 385 kB
dist/index.html CHANGED
@@ -234,7 +234,7 @@
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
- <div class="info" id="info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
@@ -297,8 +297,9 @@
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
300
- <!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
301
- <script src="../assets/images/first_steps_memory_profile.js"></script>-->
 
302
 
303
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
304
 
@@ -353,7 +354,7 @@
353
  <div class="note-box">
354
  <p class="note-box-title">📝 Note</p>
355
  <p class="note-box-content">
356
- Some librarie store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
357
 
358
  </p>
359
  </div>
@@ -415,7 +416,14 @@
415
 
416
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
417
 
418
- <p><img alt="llama-memory-bars-no-recomp.png" src="/assets/images/placeholder.png" /></p>
 
 
 
 
 
 
 
419
 
420
  <p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
421
 
@@ -429,8 +437,9 @@
429
 
430
<p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically only store activations at a few key points along the model architecture, discard the rest of the activations and recompute them on the fly during the backward pass from the nearest saved activations, essentially performing a sub-part of the forward pass again to trade off memory for compute. It generally looks like this:</p>
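<p>As a hedged, minimal illustration of this idea (not the exact code used in any particular framework), PyTorch's built-in <code>torch.utils.checkpoint</code> can be used to keep only the input of each block and recompute everything inside the block during the backward pass:</p>

<d-code block language="python">
import torch
from torch.utils.checkpoint import checkpoint

class Block(torch.nn.Module):
    """A toy transformer-ish block; sizes are illustrative only."""
    def __init__(self, dim=1024):
        super().__init__()
        self.norm = torch.nn.LayerNorm(dim)
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(dim, 4 * dim), torch.nn.GELU(), torch.nn.Linear(4 * dim, dim)
        )

    def forward(self, x):
        return x + self.ff(self.norm(x))

blocks = torch.nn.ModuleList(Block() for _ in range(4))
x = torch.randn(2, 128, 1024, requires_grad=True)

# Only the input of each block is kept; the hidden states inside the block are
# discarded after the forward pass and recomputed on the fly during backward.
for block in blocks:
    x = checkpoint(block, x, use_reentrant=False)
x.sum().backward()
</d-code>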
431
 
432
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
433
-
 
434
  <p>There are several strategies to select key activations to store:</p>
435
 
436
  <ul>
@@ -489,7 +498,7 @@
489
 
490
  <p>Gradient accumulation allows us to effectively increase our batch size up to infinity (and beyond!) while the memory footprint stays constant. Gradient accumulation is also compatible with activation recomputation for further memory reduction. One drawback however, is that gradient accumulation requires multiple consecutive forward/backward passes per optimization step thereby increasing the compute overhead and slowing down training. No free lunch! </p>
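<p>As a quick, hedged sketch (the tiny model and random data below are placeholders), a gradient accumulation loop simply delays the optimizer step and scales the loss so that the accumulated gradient matches the gradient of the full global batch:</p>

<d-code block language="python">
import torch

model = torch.nn.Linear(512, 512)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
accum_steps = 4   # effective batch size = micro-batch size * accum_steps
loader = [(torch.randn(8, 512), torch.randn(8, 512)) for _ in range(16)]  # dummy micro-batches

for step, (x, y) in enumerate(loader):
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / accum_steps).backward()        # gradients accumulate in .grad across micro-batches
    if (step + 1) % accum_steps == 0:      # one optimizer step per accum_steps micro-batches
        optimizer.step()
        optimizer.zero_grad()
</d-code>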
491
 
492
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
493
 
494
  <aside>Using gradient accumulation means we need to keep buffers where we accumulate gradients which persist throughout a training step. Whereas without gradient accumulation, in the backward gradients are computed while freeing the activations memory, which means a lower peak memory.</aside>
495
 
@@ -508,13 +517,13 @@
508
 
509
  <p>Using a different micro batch for each GPU means we’ll have different gradients in each GPU, so to keep the model instances in sync across different GPUs, the gradients from the model instances are averaged using an operation called “all-reduce”, which happens during the backward pass, before the optimizer step.</p>
510
 
511
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
512
 
513
  <aside>If you are not familiar with distributed communications patterns like broadcast, gather or all-reduce we put together a small crash course in the Appendix [TODO Link].</aside>
514
 
515
<p>This involves our first “distributed communication” primitive: <strong><em>all-reduce</em></strong>, which handles the synchronization and communication between GPU instances and nodes.</p>
516
 
517
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
518
 
519
<p>A naive DP implementation would just wait for the backward pass to finish so that we have all gradients, then trigger an all-reduce over all DP ranks to sync these gradients. But such sequential steps of computation followed by communication are <strong>A BIG NO!</strong> because we don’t want our GPUs to stay idle while communication is happening.</p>
520
 
@@ -540,7 +549,7 @@
540
  if p.requires_grad is True:
541
  p.register_post_accumulate_grad_hook(hook)</d-code>
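<p>For completeness, here is a hedged sketch of what such a <code>hook</code> could look like: it fires as soon as a parameter's gradient has been fully accumulated and launches a non-blocking all-reduce, so communication overlaps with the rest of the backward pass (process-group initialization is assumed to have happened elsewhere):</p>

<d-code block language="python">
import torch
import torch.distributed as dist

handles = []

def hook(param: torch.Tensor):
    # Non-blocking all-reduce on this parameter's gradient; with the NCCL backend
    # ReduceOp.AVG averages directly (otherwise use SUM and divide by the world size).
    handles.append(dist.all_reduce(param.grad, op=dist.ReduceOp.AVG, async_op=True))

def wait_for_grad_sync():
    # Call this after loss.backward() and before optimizer.step().
    for handle in handles:
        handle.wait()
    handles.clear()
</d-code>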
542
 
543
- <p><img alt="image.png" src="/assets/images/placeholder.png"/></p>
544
 
545
<p>Overlapping computation and communication reduces the time spent waiting for gradient synchronization across the entire model. Gradient synchronization can occur (at least partially) in parallel with the backward pass, significantly speeding up data parallelism. Here's a full implementation of naive DP with synchronization overlap:</p>
546
 
@@ -574,7 +583,7 @@
574
  </div>
575
  </details>
576
 
577
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
578
 
579
  <h4><strong>Third optimization: </strong>Interplay with gradient accumulation</h4>
580
 
@@ -634,7 +643,7 @@
634
 
635
  <p>While data parallelism cleverly overlaps the all-reduce gradient synchronization with backward computation to save time, this benefit starts to break down at large scales. As we add more and more GPUs (hundreds or thousands), the overhead of coordinating between them grows significantly. The end result? We get less and less efficient returns from each additional GPU we add to the system:</p>
636
 
637
- <p><img alt="image.png" src="/assets/images/placeholder.png"/></p>
638
 
639
  <p>As expected, we can also see that the memory usage per GPU is not affected by adding more DP ranks for training.</p>
640
 
@@ -642,7 +651,7 @@
642
 
643
<p>The keen reader has probably already noted, however, that this assumes that we can fit at least one input sample forward pass (mbs=1) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated:</p>
644
 
645
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
646
 
647
  <aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
648
 
@@ -688,7 +697,7 @@
688
 
689
  <p>The idea of ZeRO is to shard these objects across the DP ranks, each node only storing a slice of the items which are reconstructed when and if needed, thereby dividing memory usage by the data parallel degree <d-math>N_d</d-math>:</p>
690
 
691
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
692
  <p>Memory consumption of DP and three stages of Zero-DP. <d-math>\Psi</d-math> denotes number of parameters, <d-math>k</d-math> denotes the memory multiplier of optimizer states (<d-math>k=12</d-math> for Adam), and <d-math>N_d</d-math> denotes DP degree.</p>
693
 
694
 
@@ -714,11 +723,11 @@
714
 
715
  <p>See the figure below for all the necessary steps in one forward/backward pass cycle:</p>
716
 
717
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
718
 
719
  <p>So in practice, compared to vanilla DP, Zero-1 adds an all-gather over all parameters after the optimizer step as we can see below:</p>
720
 
721
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
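<p>Put differently, one ZeRO-1 training step could be sketched as follows (a hedged sketch, not nanotron's or DeepSpeed's actual implementation; it assumes flat bf16 parameter/gradient buffers whose size is divisible by the DP world size, and an optimizer built over this rank's fp32 master shard only):</p>

<d-code block language="python">
import torch
import torch.distributed as dist

def zero1_step(flat_params_bf16, flat_grads_bf16, my_fp32_shard, optimizer):
    world = dist.get_world_size()
    grad_shards = list(flat_grads_bf16.chunk(world))
    my_grad = torch.empty_like(grad_shards[0])

    # 1) reduce-scatter: each rank ends up with the averaged gradient of its own shard only
    dist.reduce_scatter(my_grad, grad_shards, op=dist.ReduceOp.AVG)

    # 2) local optimizer step on the fp32 master shard this rank owns
    my_fp32_shard.grad = my_grad.float()
    optimizer.step()
    optimizer.zero_grad()

    # 3) all-gather the updated bf16 shards back into the full parameter buffer
    dist.all_gather_into_tensor(flat_params_bf16, my_fp32_shard.detach().to(torch.bfloat16))
</d-code>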
722
 
723
  <p>If you've been following along, you'll recall from vanilla DP that we can overlap the all-reduce gradient communication with the backward pass computation. In ZeRO-1, we can also investigate how to efficiently overlap the newly added all-gather of bf16 parameters. There are two main strategies for this:</p>
724
 
@@ -742,13 +751,13 @@
742
 
743
  <aside>In case of FP32 gradient accumulation, we only need to keep <d-math>\frac{1}{N_d}</d-math> fp32_grads where we accumulate the bf16 grads coming from the reduce-scatter. And in the optimizer step we use the <d-math>\frac{1}{N_d}</d-math> fp32_grads.</aside>
744
 
745
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
746
 
747
<p>It’s easy to see now that sharding the gradients leads to <d-math>2\Psi + \frac{2\Psi+k\Psi}{N_d}</d-math> and as <d-math>N_d</d-math> is increased we can save up to 8x memory over the baseline. In terms of communication the same process applies as for ZeRO-1, with the only difference that we communicate and release on the fly. In total, ZeRO-2 is thus also equivalent to vanilla DP training w.r.t. communication.</p>
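<p>To get a feel for these formulas, here is a small, hedged helper that plugs numbers into the expressions above (bf16 parameters and gradients, <d-math>k=12</d-math> for Adam with fp32 master weights; the exact constants depend on the training setup):</p>

<d-code block language="python">
def per_gpu_memory_gb(psi, n_d, zero_stage, k=12):
    """Rough per-GPU memory (in GB) for parameters + gradients + optimizer states."""
    if zero_stage == 0:                                   # vanilla DP: 2*psi + 2*psi + k*psi
        num_bytes = (2 + 2 + k) * psi
    elif zero_stage == 1:                                 # shard optimizer states
        num_bytes = (2 + 2) * psi + k * psi / n_d
    elif zero_stage == 2:                                 # shard optimizer states + gradients
        num_bytes = 2 * psi + (2 + k) * psi / n_d
    else:                                                 # ZeRO-3: shard everything
        num_bytes = (2 + 2 + k) * psi / n_d
    return num_bytes / 1e9

for stage in range(4):
    print(f"ZeRO-{stage}: {per_gpu_memory_gb(70e9, n_d=64, zero_stage=stage):.1f} GB per GPU")
</d-code>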
748
 
749
<p>In terms of communication ZeRO-2 is similar to ZeRO-1: they both require a reduce-scatter for the gradients and an all-gather over all parameters.</p>
750
 
751
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
752
 
753
  <aside>Note: You might notice that there is no real overhead of using ZeRO-2 over ZeRO-1 and indeed ZeRO-2 is usually the best option.</aside>
754
 
@@ -767,13 +776,15 @@
767
 
768
  <p>So how do we do a forward or backward pass in practice if all parts of the model are distributed? Quite simply we gather them on-demand when we need them. In the forward pass this looks as follows:</p>
769
 
770
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
771
 
772
- <p>So as we perform the forward pass and sequentially go through the layers we retrieve the necessary parameters on demand and immediately flush them from memory when we dont need them anymore. The backward pass works the same way just inverted in flow and we produce the gradient shards: </p>
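<p>A hedged sketch of this gather-and-release pattern for a single linear layer (plain tensors and a flat weight shard are used for clarity; a real implementation hooks this into the module system and prefetches the next layer's shards):</p>

<d-code block language="python">
import torch
import torch.distributed as dist
import torch.nn.functional as F

def zero3_linear(x, weight_shard, out_features, in_features):
    # Each rank permanently stores only a 1/N_d slice of the flat weight.
    world = dist.get_world_size()
    full_flat = torch.empty(weight_shard.numel() * world,
                            dtype=weight_shard.dtype, device=weight_shard.device)
    dist.all_gather_into_tensor(full_flat, weight_shard)     # materialize the full weight on demand
    out = F.linear(x, full_flat.view(out_features, in_features))
    del full_flat                                             # release it right after use
    return out
</d-code>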
773
 
774
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
775
 
776
-
 
 
777
 
778
<p>During the forward pass we do all-gather operations for the parameters when we need them, so a <d-math>\Psi</d-math> communication tax. Since we discard the parameters immediately after using them in the forward pass, we need one more all-gather during the backward pass as well, incurring another <d-math>\Psi</d-math> in communication tax. Finally we need the same <strong><em>reduce-scatter</em></strong> as in ZeRO-2 for the gradients, which also costs <d-math>\Psi</d-math> in communication, and we arrive at a total communication cost of <d-math>3\Psi</d-math>, compared to <d-math>2\Psi</d-math> for ZeRO-2.</p>
779
 
@@ -788,7 +799,7 @@
788
 
789
<p>However, there is a limit here: DP only works if a layer of the model fits in a single GPU, and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! Recall from the activation memory discussion that it scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train only with short sequence lengths.</p>
790
 
791
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
792
 
793
  <p>Now that we've efficiently used the DP axis to reduce memory through efficient communication patterns, let's explore a new, orthogonal axis of parallelism - Tensor Parallelism. Unlike ZeRO3 that relies on heavy parameter communication, TP manages to shard parameters, gradients, optimizer states AND activations across devices without requiring any model parameter movement between GPUs. What! How is this even possible?! Let's explore this seemingly magical approach together! 🙂</p>
794
 
@@ -814,13 +825,13 @@
814
 
815
  <p>In practice a small example of the operation looks like this:</p>
816
 
817
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
818
 
819
<p>Let’s see how we can parallelise this operation! In tensor parallelism, tensors will be split into N shards along a particular dimension and distributed across N GPUs. Matrices can be split either along the column or the row dimension, leading to column and row parallelism. One thing we’ll see in the following is that choosing row or column sharding will require different communication primitives.</p>
820
 
821
  <p>Our first option is to use column-wise sharding (also called <strong><em>column-linear</em></strong>): We'll copy the complete input matrices to each worker, requiring an operation called <strong><em>broadcast</em></strong>, and split the weight matrix into columns. The inputs are then multiplied with the partial weight matrices, and the results are finally combined using an <strong><em>all-gather</em></strong> operation.</p>
822
 
823
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
824
 
825
<p>Here's the code implementation of column-wise tensor parallelism:</p>
826
 
@@ -837,7 +848,7 @@
837
 
838
  <p>We see here our fourth distributed primitive: <strong><em>scatter</em></strong>!</p>
839
 
840
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
841
 
842
  <p>Here's the implementation for row-wise tensor parallelism:</p>
843
 
@@ -858,7 +869,7 @@
858
 
859
<p>The Feedforward part can be parallelized by having a “Column linear” followed by a “Row Linear”, which amounts to a broadcast to copy the input and an all-reduce in the forward pass. Note that the broadcast isn’t needed in actual training where we can make sure inputs are already synced across TP ranks.</p>
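<p>As a hedged sketch of that column-linear followed by row-linear pattern (the shard shapes are assumptions chosen to make the shapes line up; real implementations also handle biases, the backward pass and initialization):</p>

<d-code block language="python">
import torch
import torch.distributed as dist
import torch.nn.functional as F

def tp_mlp_forward(x, w_up_shard, w_down_shard, tp_group=None):
    # w_up_shard:   [4*hidden // tp, hidden]  -> columns of the up-projection (column-linear)
    # w_down_shard: [hidden, 4*hidden // tp]  -> rows of the down-projection (row-linear)
    h = F.gelu(F.linear(x, w_up_shard))          # local shard of the intermediate activation, no comms
    partial = F.linear(h, w_down_shard)          # local partial output of the full hidden size
    dist.all_reduce(partial, op=dist.ReduceOp.SUM, group=tp_group)   # single all-reduce combines shards
    return partial
</d-code>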
860
 
861
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
862
 
863
  <p>Now that we’ve found the most efficient schema for the Feedforward part of the transformer, let’s take a look at the multi-head attention block (MHA).</p>
864
 
@@ -866,17 +877,17 @@
866
 
867
<p>It's also worth noting that the tensor parallelism degree should not exceed the number of Q/K/V heads because we need intact heads per TP rank. And in case we’re using GQA, the TP degree should be below the number of K/V heads, otherwise it requires additional comms to keep them in sync. For instance, LLaMA-3 8B has 8 Key/Value heads, so the tensor parallelism degree should be less than or equal to 8, otherwise if TP=16 for example, we need to duplicate each K/V head and make sure they stay in sync.</p>
868
 
869
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
870
 
871
<p>Finally note that there is a tradeoff in terms of communication, as we’ve added several distributed communication primitives directly in the computation path of our model. Unlike ZeRO, where we could prefetch, it can be harder to make these communications fully overlap with computation.</p>
872
 
873
- <p><img alt="Forward pass in Tensor Parallelism" src="/assets/images/placeholder.png" /></p>
874
 
875
  <p>Looking at the timeline of operations in tensor-parallel MLP (same applies for Attention), we can better understand the tradeoffs involved. In the forward of each decoder layer, we hit a synchronization point with the AllReduce operation that cannot be overlapped with computation. This <em>exposed communication</em> overhead is necessary to combine partial results across tensor-parallel ranks before the final LayerNorm can be applied. </p>
876
 
877
  <p>Tensor parallelism does help reduce activation memory for the matrix multiplications since the intermediate activations are sharded across GPUs. However, we still need to gather the full activations for operations like LayerNorm, which means we're not getting the full memory benefits we could. Additionally, it introduces significant communication requirements that heavily depend on the network infrastructure. The inability to hide this particular AllReduce behind computation means it directly adds to the critical path of forward propagation.</p>
878
 
879
- <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/placeholder.png" /></p>
880
 
881
  <p>Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training.</p>
882
 
@@ -884,7 +895,7 @@
884
 
885
  <p>However, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
886
 
887
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
888
 
889
  <p>As we can see, increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU. While tensor parallelism does help reduce activation memory in attention and feedforward layers by sharding the matrix multiplications across GPUs, we don't get the full memory benefits we could. This is because operations like layer normalization and dropout still require gathering the full activations on each GPU, partially negating the memory savings. We can do better by finding ways to parallelize these remaining operations as well.</p>
890
 
@@ -924,7 +935,7 @@
924
 
925
  <p><img alt=" in forward: f = no-op ; f* = all-reduce ; g = all-gather ; g* = reduce-scatter
926
  in backward: f = all-reduce ; f* = no-op ; g = reduce-scatter ; g* = all-gather
927
- SP region needs full hidden_dim" src="/assets/images/placeholder.png" /></p>
928
 
929
<p>in forward: f = no-op ; f* = all-reduce ; g = all-gather ; g* = reduce-scatter<br>in backward: f = all-reduce ; f* = no-op ; g = reduce-scatter ; g* = all-gather<br>SP region needs full hidden_dim</p>
930
 
@@ -945,7 +956,7 @@
945
 
946
  <p>For sequence parallelism (SP), we use different operations labeled "g" and "g*". Specifically, we avoid using all-reduce in the SP region since that would require gathering the full activations and increase our peak memory usage, defeating the purpose of SP.</p>
947
 
948
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
949
 
950
  <p>So what is actually happening here? As a famous LLM would say, let’s take it step-by-step:</p>
951
 
@@ -1033,13 +1044,13 @@
1033
 
1034
  <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
1035
 
1036
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1037
 
1038
<p>Does that mean that SP incurs more communication than TP? Well, yes and no. In the forward of a vanilla TP we had two all-reduce per transformer block, and in SP we have two all-gather and two reduce-scatter per transformer block. So SP does twice the number of communication operations as TP. But since an all-reduce operation can be broken down into an all-gather + a reduce-scatter (see [TODO: Appendix link]) they’re actually equivalent in terms of communication. The same reasoning applies for the backward pass, as we just use the conjugate of each operation (no-op ↔ all-reduce and all-gather ↔ reduce-scatter).</p>
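<p>The decomposition is easy to see in code; a hedged sketch with <code>torch.distributed</code> (assuming the tensor size is divisible by the world size):</p>

<d-code block language="python">
import torch
import torch.distributed as dist

def all_reduce_decomposed(x):
    # Equivalent to dist.all_reduce(x): first each rank reduces its own 1/N slice,
    # then the reduced slices are re-assembled on every rank.
    world = dist.get_world_size()
    shard = torch.empty_like(x.chunk(world)[0])
    dist.reduce_scatter_tensor(shard, x)
    dist.all_gather_into_tensor(x, shard)
    return x
</d-code>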
1039
 
1040
<p>If you’ve been paying close attention, you’ll notice that we’re talking about 4 comms ops in each layer (2 for Attention and 2 for MLP). This is how the MLP profiling looks when using Tensor + Sequence Parallelism:</p>
1041
 
1042
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1043
 
1044
  <p>Besides the fact that TP requires communications in each layer, it also can’t easily be overlapped with compute, which makes throughput heavily dependent on the communication bandwidth. This is why TP is usually done only within a node (TP≤8).</p>
1045
 
@@ -1048,7 +1059,7 @@
1048
 
1049
  <p>As you might expect, this communication overhead becomes increasingly problematic as we scale up tensor parallelism. To illustrate this, let’s check throughput as we scale TP with SP for a 3B model:</p>
1050
 
1051
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1052
  <p>Impact of combined Tensor and Sequence Parallelism (TP/SP) on a 3B model’s performance and memory utilization with 4096 seqlen: when scaling both TP and SP together, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees reduce per-GPU throughput, they enable processing of significantly larger batch sizes by reducing the activation memory.</p>
1053
 
1054
  <p>Let’s summarize our observations:</p>
@@ -1078,7 +1089,7 @@
1078
 
1079
  <p>Even if we use full recomputation of the activations, which comes at a heavy compute overhead (30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length:</p>
1080
 
1081
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1082
 
1083
<p>Can we apply similar ideas to our sequence parallelism approach but inside the modules where we already apply Tensor Parallelism, thereby also reducing the effect of sequence length? Yes, it’s time to talk about Context Parallelism, which you will find quite intuitive after all we’ve already covered.</p>
1084
 
@@ -1086,7 +1097,7 @@
1086
 
1087
<p>The idea of Context Parallelism is quite simple; just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we did previously with Tensor + Sequence Parallelism.</p>
1088
 
1089
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1090
 
1091
  <p>Splitting the sequence doesn't affect most modules like MLP and LayerNorm, where each token is processed independently. It also doesn’t require expensive communication like TP, as only the inputs are split and not the weight matrices. Just like data parallelism, after computing the gradients, an all-reduce operation is initiated to synchronize the gradients across the context parallelism group.</p>
1092
 
@@ -1117,13 +1128,13 @@
1117
 
1118
  <p>The whole process with 4 GPUs is shown in the following animation:</p>
1119
 
1120
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1121
 
1122
  <p>With this animation, it’s also immediately clear why the authors chose to call this approach Ring Attention.</p>
1123
 
1124
<p>There is one big problem though, which is that a naive implementation of Ring Attention leads to a strong imbalance between GPUs, stemming from the shape of the causal attention matrix. Let’s take a closer look at what is happening in the SoftMax computation by considering the attention score matrix with the causal attention mask:</p>
1125
 
1126
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1127
 
1128
<p>The SoftMax is computed row-wise, which means a GPU can compute a row as soon as it has received all of that row’s tokens. We see that GPU1 can compute its rows immediately, as it starts with tokens 1-4 and doesn’t need to receive any information from any other GPU. However, GPU2 will need to wait for the second round to also receive tokens 1-4 and thus have all the values for tokens 1-8. Also, GPU1 seems to perform much less work than all the other GPUs.</p>
1129
 
@@ -1133,14 +1144,14 @@
1133
 
1134
<p>We need a better way to distribute the input sequences. This can be achieved by assigning the tokens to the GPUs not purely sequentially but by mixing the ordering a bit, such that we have a good mix of early and late tokens on each GPU. This approach is called Zig-Zag attention<d-cite bibtex-key="attention brandon2023fasterring"></d-cite>, and in this new arrangement the attention mask shows an even distribution of computation: if you count the number of colored squares, you’ll see that the computation is now balanced across all GPUs.</p>
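<p>A hedged sketch of what such a zig-zag assignment could look like: the sequence is cut into <code>2 * cp_size</code> chunks and each rank receives one early and one late chunk, balancing the causal-attention work:</p>

<d-code block language="python">
import torch

def zigzag_shard(tokens, cp_rank, cp_size):
    # Pair chunk i with chunk (2*cp_size - 1 - i) so every rank gets early and late tokens.
    chunks = tokens.chunk(2 * cp_size, dim=0)
    return torch.cat([chunks[cp_rank], chunks[2 * cp_size - 1 - cp_rank]], dim=0)

positions = torch.arange(16)          # token positions 0..15
for rank in range(4):                 # cp_size = 4 GPUs
    print(rank, zigzag_shard(positions, rank, 4).tolist())
# rank 0 -> [0, 1, 14, 15], rank 1 -> [2, 3, 12, 13], rank 2 -> [4, 5, 10, 11], rank 3 -> [6, 7, 8, 9]
</d-code>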
1135
 
1136
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1137
 
1138
  <p>At the same time we’ll also see that in order to complete all rows, each GPU will need information from all the other GPUs.</p>
1139
 
1140
<p>We have two general ways to overlap computation and communication: either by performing a general all-gather, regrouping all the KV on each GPU at the same time (in a ZeRO-3 type of way), or by gathering them one-by-one from each GPU as needed:</p>
1141
 
1142
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1143
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1144
 
1145
  <p>The key difference between these two implementations lies in their communication patterns and memory usage:</p>
1146
 
@@ -1150,7 +1161,6 @@
1150
  <li>All GPUs simultaneously gather the complete key/value pairs from all other GPUs</li>
1151
  <li>Requires more temporary memory as each GPU needs to store the full KV pairs at once</li>
1152
  <li>Communication happens in one step but with larger memory overhead</li>
1153
- <li>Used in MegatronLM's implementation of context parallelism</li>
1154
  </ul>
1155
 
1156
  <p><strong>2. All-to-All (Ring) Implementation:</strong></p>
@@ -1159,7 +1169,6 @@
1159
  <li>GPUs exchange KV pairs in a ring-like pattern, one chunk at a time</li>
1160
  <li>More memory efficient as each GPU only needs to store one additional chunk temporarily</li>
1161
  <li>Communication is spread out and overlapped with computation, though with some additional base latency overhead from multiple communication steps</li>
1162
- <li>Used in DeepSpeed's implementation of context parallelism</li>
1163
  </ul>
1164
 
1165
  <p>The All-to-All approach generally offers better memory efficiency at the cost of slightly more complex communication patterns, while the AllGather approach is simpler but requires more temporary memory during the attention computation.</p>
@@ -1170,12 +1179,12 @@
1170
 
1171
<p>In the TP section we saw that if we try to scale Tensor Parallelism past the number of GPUs per single node (typically 4 or 8) we hit a lower-bandwidth network called the “inter-node connection”, which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we perform it across several nodes:</p>
1172
 
1173
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1174
  <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
1175
 
1176
<p>Sequence and context parallelism can help for long sequences but don’t help much if sequence length is not the root cause of our memory issues but rather the size of the model itself. For large models (70B+), the size of the weights alone can already push past the limits of the 4-8 GPUs on a single node. We can solve this issue by summoning the fourth (and last) parallelism dimension: “pipeline parallelism”.</p>
1177
 
1178
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1179
 
1180
<p>Pipeline Parallelism is conceptually very simple – we’ll simply spread the layers of our model across GPUs – but the devil lies in implementing it efficiently. Let’s dive into it!</p>
1181
 
@@ -1189,7 +1198,7 @@
1189
 
1190
  <p>Indeed reader! The main challenge in pipeline parallelism will be how to efficiently circumvent the sequential nature of PP to keep our GPU busy at all times and avoid having one GPU computing while the others are waiting. Here is how our GPU utilization is looking when doing a naive and simple forward and backward pass through the model where the numbers indicate the model layers:</p>
1191
 
1192
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1193
  <p>An example of Pipeline parallelism for a model with 16 layers distributed across 4 GPUs. The numbers correspond to the layer IDs.</p>
1194
 
1195
<p>The remaining idle time is indicated in grey and usually called the “bubble”, and the sight of it probably breaks your heart after we spent so much time optimizing throughput.</p>
@@ -1208,7 +1217,7 @@
1208
 
1209
<p>Let’s take a first tool out of our toolbox and think about splitting our batch into smaller bite-sized portions which can be processed in parallel, or almost in parallel, like we did before for data parallelism. Now when the second GPU is busy processing micro-batch 1, the first GPU can already start processing micro-batch 2. Here is a schedule using 8 micro-batches:</p>
1210
 
1211
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1212
 
1213
<aside>Before, the numbers in the diagram indicated layers, but in all pipeline parallel plots from now on, including this one, they indicate a microbatch. You can think of each square here as containing several layers, as seen in the previous figure.</aside>
1214
 
@@ -1241,11 +1250,12 @@
1241
 
1242
<p>This schedule is called <strong><em>one-forward-one-backward (1F1B)</em></strong> as the middle/steady state involves alternately performing one forward and one backward pass. The general idea is to start performing the backward pass as soon as possible. The schedule looks like this:</p>
1243
 
1244
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1245
 
1246
<p>The bubble still has the same size so our training efficiency is not significantly improved. However we only need to store activations for <d-math>p</d-math> micro-batches instead of <d-math>m</d-math>, which significantly reduces the activation memory explosion we had in the AFAB schedule. As a consequence we can add more microbatches, which will then actually reduce the bubble.</p>
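<p>To make the ordering concrete, here is a small, hedged sketch that prints, for each pipeline rank, the order in which micro-batches are processed under 1F1B (F = forward, B = backward; warm-up forwards, then the alternating steady state, then cool-down backwards):</p>

<d-code block language="python">
def one_f_one_b_schedule(rank, pp_size, num_microbatches):
    warmup = min(pp_size - rank - 1, num_microbatches)     # later ranks need fewer warm-up forwards
    ops = [("F", mb) for mb in range(warmup)]
    for mb in range(warmup, num_microbatches):             # steady state: one forward, one backward
        ops.append(("F", mb))
        ops.append(("B", mb - warmup))
    ops += [("B", mb) for mb in range(num_microbatches - warmup, num_microbatches)]  # cool-down
    return ops

for rank in range(4):
    print(rank, one_f_one_b_schedule(rank, pp_size=4, num_microbatches=6))
</d-code>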
1247
 
1248
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
 
1249
 
1250
  <p>A major complexity of this setup, visible on the above graph is how forward and backward passes are not cleanly consecutive anymore but performed in parallel across devices. This means we will have to schedule the switch from forward to backward passes independently on each device instead of in a simple and common central training loop as usual.</p>
1251
 
@@ -1276,7 +1286,7 @@
1276
 
1277
  <p>This can be seen in general as a kind of “looping pipeline” where a micro-batch will move in circles from one GPU to the next as it goes through the forward pass through the model.</p>
1278
 
1279
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1280
 
1281
  <p>As a consequence we see additional communications happening as the model goes several times through each GPU for the same computation that previously just took one pass. However, each forward and backward pass is divided by a factor of <d-math>v</d-math>, where <d-math>v</d-math> is the number of stages or model chunks per GPUs as we are able to better interleave forward and backward passes. </p>
1282
 
@@ -1291,14 +1301,14 @@
1291
 
1292
  <p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
1293
 
1294
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1295
 
1296
 
1297
<p>Scheduling also becomes more complex here, as we need to decide on a given GPU whether we prioritize earlier micro-batches, meaning that we close their forward and backward loops as fast as possible (so-called “depth-first”, i.e. prioritizing getting batches out of the model as fast as possible), or whether we prioritize first completing the forward passes of all microbatches in the queue before moving on to backward passes (so-called “breadth-first”, i.e. prioritizing filling in the pipeline as much as possible). This is explained in detail in the "Breadth-First Pipeline" paper<d-cite bibtex-key="lamypoirier2023breadthfirstpipelineparallelism"></d-cite>.</p>
1298
 
1299
<p>You now have all the elements to understand the pipeline parallelism approach in Llama 3.1, which uses a one-forward-one-backward setup with interleaved stages and a priority setting tuneable between depth-first and breadth-first.</p>
1300
 
1301
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1302
 
1303
<p>However, we haven’t reached the end of possible pipeline schedules and recently some methods have been proposed to reduce the bubble to virtually zero! Piqued your curiosity? Let’s have a look!</p>
1304
 
@@ -1307,14 +1317,15 @@
1307
<p>There are even more sophisticated ways to reduce the bubble and get close to a “zero bubble” regime. The secret here is to split the operations involved at an even finer-grained level in order to interleave them in the most efficient way. For instance, the pipeline implementation approach in DeepSeek V3/R1, called DualPipe, reaches close to a zero-bubble regime.</p>
1308
 
1309
<p>Let’s very quickly see how this can work by briefly detailing the ZeroBubble<d-cite bibtex-key="qi2023zerobubblepipelineparallelism"></d-cite> work, which is a precursor to DualPipe. The base observation of ZeroBubble is that the backward pass through a matrix multiplication actually involves two separate operations: the backward for the inputs (B) and the backward for the weights (W):</p>
1310
-
1311
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
 
1312
 
1313
<p>While the output of B, the backward pass for the input, is necessary for performing the backward pass of the lower layers, the backward pass of the weights, W, is not necessary for the rest of the backward pass and generally only needs to be performed before the optimizer step. This means W can be flexibly scheduled anywhere after the corresponding B of the same stage. This allows for strategic placement of W to fill the pipeline bubbles. The ZB-H2 schedule on the top right is an example of a (theoretical) schedule with zero bubble taking advantage of this fine-grained decomposition.</p>
1314
 
1315
<p>DeepSeek’s DualPipe, introduced with V3, proposes an extension of this decomposed approach to the case of two streams propagating from both ends of the PP ranks, interleaved to minimize idle time in the GPUs even further, as displayed in the following scheduling graph:</p>
1316
 
1317
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1318
 
1319
<p>The ZeroBubble and DualPipe schedules are a bit too complex for us to give code snippets here, but you should by now have a general idea of the concepts involved. In practice, optimizing these schedules requires careful measurement of the time of each operation, followed by a scheduling algorithm able to find the optimal allocation of time given the constraints. See for instance the ZeroBubble paper<d-cite bibtex-key="qi2023zerobubblepipelineparallelism"></d-cite> for a discussion of the heuristics and algorithms used to perform such scheduling.</p>
1320
 
@@ -1325,7 +1336,7 @@
1325
 
1326
  <p>Mixture-of-expert models have gained some traction with models such as Mixtral<d-cite bibtex-key="jiang2024mixtralexperts"></d-cite> or more recently DeepSeek-V3/R1! The basic idea is that instead of having a single feedforward module per layer we can have several and route tokens through different ones depending on their context:</p>
1327
 
1328
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1329
  <p>Source: A Survey on Mixture of Experts<d-cite bibtex-key="cai2024surveymixtureexperts"></d-cite> </p>
1330
 
1331
  <p>This design makes it very easy to add a new parallelism paradigm: Expert parallelism (EP). Since the feedforward layers are fully independent we can simply put each expert’s feedforward layer on a different worker. Compared to TP it’s much more lightweight, since we don’t need to split the matrix multiplication, we just need to route the hidden states of a token to the right expert. There are several tricks to make EP work in practice, closely tied to model design. For instance, DeepSeek-V3 enforces a constraint in the router, ensuring that each token is sent to at most M nodes (in their case, 4) to reduce communication overhead.</p>
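<p>A hedged sketch of the routing step itself (a plain top-k softmax router over made-up sizes; the dispatch to expert-holding ranks, typically an all-to-all, as well as load-balancing losses and capacity limits are left out):</p>

<d-code block language="python">
import torch
import torch.nn.functional as F

def route(hidden, router_weight, k=2):
    # hidden: [num_tokens, dim], router_weight: [num_experts, dim]
    logits = F.linear(hidden, router_weight)                        # [num_tokens, num_experts]
    gate_probs, expert_ids = torch.topk(logits.softmax(dim=-1), k, dim=-1)
    return gate_probs, expert_ids        # which experts each token goes to, and with what weight

hidden = torch.randn(8, 64)              # 8 tokens, hidden size 64 (illustrative)
router_weight = torch.randn(16, 64)      # 16 experts (illustrative)
probs, ids = route(hidden, router_weight)
print(ids)                               # expert indices each token's hidden state is sent to
</d-code>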
@@ -1458,7 +1469,7 @@
1458
 
1459
  <p>And to have an idea of the memory benefits of each parallelism:</p>
1460
 
1461
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1462
 
1463
  <h2>How to Find the Best Training Configuration</h2>
1464
 
@@ -1617,12 +1628,12 @@
1617
 
1618
  <p>On the compute side, GPUs consist of an array of compute units called <strong>Streaming Multiprocessors</strong> (SM). Each SM contains and controls a set of streaming processors, also known as cores. For example, an Nvidia H100 GPU has 132 SMs with 128 cores per SM, resulting in a total of 16,896 cores (see <a href="https://resources.nvidia.com/en-us-tensor-core">docs for tensor cores</a> for details), each capable of handling multiple threads simultaneously.</p>
1619
 
1620
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1621
  <p>TODO: Original figure from https://blog.codingconfessions.com/p/gpu-computing.</p>
1622
 
1623
<p>The memory side is also highly hierarchical with several layers of cache and memory: <strong>Registers</strong> are the smallest units and are private to the threads during execution, <strong>Shared Memory</strong> and <strong>L1 cache</strong> are shared between the threads running on a single SM, higher up is the <strong>L2 cache</strong> shared by all SMs, and finally there is the <strong>Global Memory</strong> which is the largest memory on the GPU (the advertised 80 GB for an H100 for instance) but also the slowest to access and query.</p>
1624
 
1625
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1626
  <p>TODO: Original figure from https://www.youtube.com/watch?v=ZQKMZIP3Fzg</p>
1627
 
1628
<p>The goal of the GPU will be to run as many workloads as possible, in parallel, on the GPU cores, by taking advantage of this hierarchical organization of compute/memory.</p>
@@ -1775,16 +1786,17 @@
1775
 
1776
  <p>Here’s an excellent visualization of the kernel from this <a href="https://siboehm.com/articles/22/CUDA-MMM">fantastic blogpost</a>: </p>
1777
 
1778
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1779
 
1780
  <p>However, when profiling this kernel with a tool like <code>ncu</code>, we can see issues, including low memory throughput and uncoalesced memory accesses.</p>
1781
 
1782
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
 
1783
 
1784
 
1785
  <p>The reason for this is that in this kernel, two threads in the same block with Thread IDs <code>(0, 0)</code> and <code>(1, 0)</code> (which will end up in the same warp) will both load from the same column of matrix <code>B</code> but different rows of matrix <code>A</code>. Since matrix elements are stored in row-major order (meaning each row's elements are in consecutive memory addresses, as shown in the figure below), in the first iteration with <code>i = 0</code>, thread <code>(0, 0)</code> will load <d-math>A_{0,0}</d-math>, and thread <code>(1, 0)</code> will load <d-math>A_{1,0}</d-math>. These elements are not stored close to each other in memory, and this misalignment repeats across all iterations along the shared dimension, preventing memory accesses from being coalesced.</p>
1786
 
1787
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1788
 
1789
 
1790
<p>To improve our kernel we can change the way the coordinates x and y are calculated, as follows:</p>
@@ -1806,7 +1818,7 @@
1806
 
1807
  <p>When we profile our new kernel, we notice that the warning about uncoalesced memory accesses has disappeared, and <strong>the GPU's memory throughput has increased by approximately 10 times</strong>.</p>
1808
 
1809
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1810
 
1811
 
1812
  <p>We also notice that the execution time of the kernel <strong>decreases by 10x</strong> !</p>
@@ -1822,7 +1834,7 @@
1822
 
1823
  <p>In the tiling approach, each iteration involves all threads within a block cooperatively loading two tiles—one from matrix A and another from matrix B —into shared memory. Specifically, threads load a tile of matrix A (of size <code>BLOCK_SIZE_M</code> by <code>BLOCK_SIZE_K</code>) and a tile of matrix B (of size <code>BLOCK_SIZE_K</code> by <code>BLOCK_SIZE_N</code>). Once the tiles are in shared memory, the threads perform matrix multiplication on these tiles, enabling efficient computation since all necessary data is quickly accessible. The results of the tile multiplication are stored in an accumulation matrix that holds intermediate results. After each iteration, the results from the current tile multiplication are added to this accumulation matrix, continuing until all tiles from both matrices have been processed.</p>
1824
 
1825
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1826
  <p>From https://cnugteren.github.io/tutorial/pages/page4.html</p>
1827
 
1828
<p>The important parts to understand the implementation are below (for simplicity we consider a square-shaped tile):</p>
@@ -1867,7 +1879,7 @@
1867
 
1868
  <p>The tiling technique has significantly improved the performance of our kernel. However, when analyzing the warp states which quantify how many cycles were spent in each state, we observe the following:</p>
1869
 
1870
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1871
 
1872
 
1873
  <p>The meaning of the states can be found in the <a href="https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference">Profiling Guide</a>, specifically in the <strong>Warp Stall Reasons</strong> section. There we can read that:</p>
@@ -1889,11 +1901,16 @@
1889
<p>In several places now we’ve mentioned how GPU and CPU operations can be asynchronous. In particular, the host code on the CPU can schedule workloads on the GPU in a non-blocking way.</p>
1890
 
1891
<p>Non-blocking can be useful for overlapping communication and computation, as we saw at several points along this blog post, but can be extended to the more general idea of trying to avoid at all costs going back and forth between host and GPU kernel commands. This is beautifully illustrated by <a href="https://horace.io/brrr_intro.html">Horace He</a> in these diagrams:</p>
1892
-
1893
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1894
- <p>A sequence of kernels requiring back and forth between global memory and compute units</p>
1895
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1896
- <p>Instead of sending our triangle back to global memory just to read it back again, we instead just do all of our operations in one go.</p>
 
 
 
 
 
1897
 
1898
  <p>How can we avoid this back and forth? Well the best way is to make our GPU as autonomous as possible. This is achieved by packing as many successive compute operations together in a single kernel for the GPU to run, called a “Fused Kernel”.</p>
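<p>From Python, one (hedged) way to obtain such fused elementwise kernels without writing CUDA by hand is to let a compiler fuse the chain of operations, e.g. with <code>torch.compile</code>; the toy function below is only illustrative:</p>

<d-code block language="python">
import torch

def bias_gelu_dropout(x, bias, p=0.1):
    # Run eagerly, each of these elementwise ops reads and writes global memory;
    # fused, the intermediates never leave the GPU's registers/shared memory.
    x = x + bias
    x = torch.nn.functional.gelu(x)
    return torch.nn.functional.dropout(x, p=p)

fused = torch.compile(bias_gelu_dropout)   # the compiler can emit a single fused kernel for the chain
x, bias = torch.randn(4096, 4096), torch.randn(4096)
out = fused(x, bias)
</d-code>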
1899
 
@@ -1910,13 +1927,13 @@
1910
 
1911
<p>A basic implementation of the attention mechanism involves a lot of transfers between memory and workers. It requires materializing the S and P matrices in HBM, which means that the results need to be sent to HBM and then back to SRAM for the next computations:</p>
1912
 
1913
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1914
 
1915
  <p>Since bandwidth is much lower in HBM this introduces a severe bottleneck in the attention computation. Can we do better? Tri Dao says yes!</p>
1916
 
1917
<p>The key element is to compute the S matrices in small pieces which can fit in the smaller shared memory of the SM. But we can do even better and avoid materializing the very large S matrix altogether, in favor of keeping only the necessary statistics for computing the normalization factor of the softmax. So we can compute part of <d-math>O</d-math> directly in one computation in SRAM rather than moving intermediate results back and forth. In this case, not only do we make use of the shared memory, but we also remove the memory bottleneck resulting from materializing one of the largest activation matrices in the model (at long context length), the attention matrix.</p>
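<p>To see that the math indeed works without ever materializing the full attention matrix, here is a purely illustrative, hedged PyTorch sketch of the streaming-softmax idea (single head, no masking; the real thing is of course a fused GPU kernel, not Python loops):</p>

<d-code block language="python">
import torch

def streaming_attention(q, k, v, block=128):
    # Keep only a running max and normalizer per query while iterating over K/V blocks.
    scale = q.shape[-1] ** -0.5
    out = torch.zeros_like(q)
    running_max = torch.full(q.shape[:-1], float("-inf"))
    running_sum = torch.zeros(q.shape[:-1])
    for start in range(0, k.shape[0], block):
        kb, vb = k[start:start + block], v[start:start + block]
        s = (q @ kb.T) * scale                              # scores for this block only
        new_max = torch.maximum(running_max, s.max(dim=-1).values)
        correction = torch.exp(running_max - new_max)       # rescale previously accumulated results
        p = torch.exp(s - new_max[:, None])
        out = out * correction[:, None] + p @ vb
        running_sum = running_sum * correction + p.sum(dim=-1)
        running_max = new_max
    return out / running_sum[:, None]

q, k, v = (torch.randn(256, 64) for _ in range(3))
ref = torch.softmax((q @ k.T) * 64 ** -0.5, dim=-1) @ v
print((streaming_attention(q, k, v) - ref).abs().max())    # tiny difference, float rounding only
</d-code>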
1918
 
1919
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1920
  <p>From the FLASH-ATTENTION paper<d-cite bibtex-key="dao2022flashattention"></d-cite></p>
1921
 
1922
  <p>The idea of flash attention resolves so many bottlenecks in model training that it has quickly become the default way to perform attention in all transformers:</p>
@@ -2002,14 +2019,14 @@
2002
 
2003
<p>Reducing the total number of bits comes at a price (no free lunch here either), but we have some control over how to pay: we can sacrifice bits on either the mantissa or the exponent. For this reason there also exist two float8 formats, named according to their exponent and mantissa bits, so we can flexibly choose the most appropriate format. We can look at the possible range of numbers for each format:</p>
2004
 
2005
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
2006
 
2007
 
2008
<p>We can see that float32 spans 80 orders of magnitude and float16 sacrifices a lot of range while bfloat16 maintains the full range. The two float8 formats reduce the range even further: e5m2 can maintain the float16 range while e4m3 has an even smaller range.</p>
2009
 
2010
<p>How come some formats are able to maintain the range and others are not? Let’s investigate the resolution by plotting 10,000 points between 1 and 2. Each point will be rounded to the nearest representable number in each format:</p>
2011
 
2012
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
2013
 
2014
<p>We can see here that bfloat16 maintained the range of float32, unlike float16, but did this at the cost of sacrificing more precision. In the case of float8 the situation is even more dire, as e4m3 can represent only 7 and e5m2 only 3 numbers on the interval 1-2.</p>
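<p>This little experiment is easy to reproduce; a hedged sketch (the float8 dtypes require a fairly recent PyTorch build, and the exact counts depend on whether the interval endpoints are included):</p>

<d-code block language="python">
import torch

points = torch.linspace(1, 2, 10_000)
for dtype in (torch.float16, torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2):
    # Cast to the reduced-precision format and count how many distinct values survive.
    distinct = points.to(dtype).float().unique().numel()
    print(dtype, distinct)
</d-code>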
2015
 
@@ -2047,7 +2064,7 @@
2047
 
2048
<p>The first successful very-large-scale training with FP8 mixed precision was publicly reported with DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward passes. Similar to BF16 mixed precision training, some aggregations and master weights are kept in higher precision while the operations themselves are performed in FP8.</p>
2049
 
2050
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
2051
 
2052
<p>In order to switch from high precision (e.g. FP32 or BF16) to lower precision (e.g. FP16 or FP8) with smaller range, we need to normalize the range of values by computing the absolute maximum. DeepSeek-V3 also introduces a quantization scheme, where the ranges are normalized per tile: 1x128 for inputs/activations and 128x128 for weights and scale elements. This makes the normalization less susceptible to outliers. There are a number of additional tricks they deploy to also reduce the memory and communication footprint, which you can follow in section 3.3 of the DeepSeek-V3 technical report<d-cite bibtex-key="deepseekai2024deepseekv3technicalreport"></d-cite>.</p>
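<p>A hedged sketch of what such per-tile absmax scaling could look like for a weight matrix (128x128 tiles, e4m3 storage; dimensions are assumed to be divisible by the tile size, and this is not DeepSeek's actual code):</p>

<d-code block language="python">
import torch

def quantize_per_tile(w, tile=128, fp8_max=448.0):        # 448 is the largest e4m3 value
    rows, cols = w.shape
    w_fp8 = torch.empty(rows, cols, dtype=torch.float8_e4m3fn)
    scales = torch.empty(rows // tile, cols // tile)
    for i in range(0, rows, tile):
        for j in range(0, cols, tile):
            block = w[i:i + tile, j:j + tile]
            scale = block.abs().max() / fp8_max            # one scale per tile, robust to outliers
            scales[i // tile, j // tile] = scale
            w_fp8[i:i + tile, j:j + tile] = (block / scale).to(torch.float8_e4m3fn)
    return w_fp8, scales                                   # dequantize tile-wise: w_fp8.float() * scale

w_fp8, scales = quantize_per_tile(torch.randn(256, 256))
</d-code>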
2053
 
@@ -2141,7 +2158,7 @@
2141
 
2142
  <p>Congratulations! You've completed quite a journey - from understanding how to train a simple model on a single GPU, all the way to mastering the complex techniques used to efficiently train massive language models like Llama-405B and DeepSeek-V3. By now, you should feel confident interpreting advanced parallelism diagrams like the one below, which would have seemed daunting when you first started.</p>
2143
 
2144
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
2145
 
2146
  <p>In distributed training, many concepts sound easy enough when you first hear them, like “Pipeline parallelism just distributes layers on different GPUs”, but we also worked through all the challenging details when implementing those methods. </p>
2147
 
@@ -2191,12 +2208,14 @@
2191
 
2192
  <p>First, let's examine this heatmap visualization:</p>
2193
 
2194
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
2195
  <p>Heatmap visualization showing the optimal training configurations across different model sizes and compute node counts. For each combination, the configuration details include Data Parallelism (DP), Tensor Parallelism (TP), Pipeline Parallelism (PP), Gradient Accumulation Steps (GAS), Micro Batch Size (MBS), and ZeRO optimization stage. The color intensity indicates the Model FLOPs Utilization (MFU), with brighter colors representing higher efficiency.</p>
2196
 
2197
  <p>To complement this, let's look at the relationships between different parameters:</p>
2198
 
2199
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
 
 
2200
  <p>Parallel coordinates plot showing the relationship between different model parallelism configurations (Data Parallel degree, Tensor Parallel degree, Pipeline Parallel degree), training hyperparameters (gradient accumulation steps, micro batch size), ZeRO stage and the resulting Model FLOPs Utilization (MFU). Each line represents a different training configuration, with colors indicating the MFU value - warmer colors show higher efficiency.</p>
2201
 
2202
  <p>From these visualizations, we can draw several important insights:
 
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
+ <div class="info" id="svg-first_steps_simple_training-info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
 
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
300
+ <div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
301
+ <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
302
+ <script src="../assets/images/first_steps_memory_profile.js"></script>
303
 
304
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
305
 
 
354
  <div class="note-box">
355
  <p class="note-box-title">📝 Note</p>
356
  <p class="note-box-content">
357
+ Some libraries store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
358
 
359
  </p>
360
  </div>
 
416
 
417
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
418
 
419
+ <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
420
+ <script>
421
+ window.addEventListener('load', function() {
422
+ const frame = document.getElementById('plotFrame2');
423
+ frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
424
+ frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
425
+ });
426
+ </script>
427
 
428
  <p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
429
 
 
437
 
438
  <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute them on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), so that we can use them during the backward pass to compute gradients. When we use recomputation we typically only store activations at a few key points along the model architecture, discard the rest of the activations and recompute them on the fly during the backward pass from the nearest saved activations, basically re-running a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
439
 
440
+ <div class="svg-container" id="svg-activation_recomputation"> </div>
441
+ <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
442
+ <script src="../assets/images/activation_recomputation.js"></script>
443
  <p>There are several strategies to select key activations to store:</p>
444
 
445
  <ul>
 
498
 
499
  <p>Gradient accumulation allows us to effectively increase our batch size up to infinity (and beyond!) while the memory footprint stays constant. Gradient accumulation is also compatible with activation recomputation for further memory reduction. One drawback however, is that gradient accumulation requires multiple consecutive forward/backward passes per optimization step thereby increasing the compute overhead and slowing down training. No free lunch! </p>
500
 
501
+ <p><img alt="image.png" src="/assets/images/gradaccumulation_diag.png" /></p>
502
 
503
  <aside>Using gradient accumulation means we need to keep buffers where we accumulate gradients, and these persist throughout a training step. Whereas without gradient accumulation, gradients are computed during the backward pass while the activation memory is freed, which means a lower peak memory.</aside>
504
 
 
517
 
518
  <p>Using a different micro batch for each GPU means we’ll have different gradients in each GPU, so to keep the model instances in sync across different GPUs, the gradients from the model instances are averaged using an operation called “all-reduce”, which happens during the backward pass, before the optimizer step.</p>
519
 
520
+ <p><img alt="image.png" src="/assets/images/dp_diagram.png" /></p>
521
 
522
  <aside>If you are not familiar with distributed communications patterns like broadcast, gather or all-reduce we put together a small crash course in the Appendix [TODO Link].</aside>
523
 
524
  <p>This involves our first “distributed communication” primitive: <em><strong>all-reduce</strong></em>, which handles the synchronization and communication between GPU instances and nodes.</p>
525
 
526
+ <p><img alt="image.png" src="/assets/images/dp_overlap1.svg" /></p>
527
 
528
  <p>A naive DP implementation would just wait for the backward pass to finish so that we have all gradients, and then trigger an all-reduce over all DP ranks to sync these gradients. But such a sequential pattern of computation followed by communication is <strong>A BIG NO!</strong> Because we don’t want our GPUs to stay idle while communication is happening.</p>
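  <p>For reference, here is a minimal sketch of this naive synchronization, assuming <code>torch.distributed</code> has already been initialized and that the function is called once after the full backward pass has finished (which, for the reason above, is not how you would want to implement it in practice):</p>
  <d-code block language="python">
import torch.distributed as dist

def naive_sync_gradients(model, dp_group=None):
    # Naive DP: block after the backward pass until every gradient has been
    # summed across data-parallel ranks, then average them.
    world_size = dist.get_world_size(dp_group)
    for p in model.parameters():
        if p.grad is not None:
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM, group=dp_group)
            p.grad /= world_size
  </d-code>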
529
 
 
549
  if p.requires_grad is True:
550
  p.register_post_accumulate_grad_hook(hook)</d-code>
551
 
552
+ <p><img alt="image.png" src="/assets/images/dp_overlap2.svg"/></p>
553
 
554
  <p>Overlapping computation and communication reduces the time spent waiting for gradient synchronization across the entire model. Gradient synchronization can occur (at least partially) in parallel with the backward pass, significantly speeding up data parallelism. Here's a full implementation of naive DP with synchronization overlap:</p>
555
 
 
583
  </div>
584
  </details>
585
 
586
+ <p><img alt="dp_overlap3.svg" src="/assets/images/dp_overlap3.svg" /></p>
587
 
588
  <h4><strong>Third optimization: </strong>Interplay with gradient accumulation</h4>
589
 
 
643
 
644
  <p>While data parallelism cleverly overlaps the all-reduce gradient synchronization with backward computation to save time, this benefit starts to break down at large scales. As we add more and more GPUs (hundreds or thousands), the overhead of coordinating between them grows significantly. The end result? We get less and less efficient returns from each additional GPU we add to the system:</p>
645
 
646
+ <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p>
647
 
648
  <p>As expected, we can also see that the memory usage per GPU is not affected by adding more DP ranks for training.</p>
649
 
 
651
 
652
  <p>The keen reader has probably already noted, however, that this assumes we can fit the forward pass of at least one input sample (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation enabled: </p>
653
 
654
+ <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p>
655
 
656
  <aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
657
 
 
697
 
698
  <p>The idea of ZeRO is to shard these objects across the DP ranks, each node only storing a slice of the items which are reconstructed when and if needed, thereby dividing memory usage by the data parallel degree <d-math>N_d</d-math>:</p>
699
 
700
+ <p><img alt="zero_memory.svg" src="/assets/images/zero_memory.svg" /></p>
701
  <p>Memory consumption of DP and three stages of Zero-DP. <d-math>\Psi</d-math> denotes number of parameters, <d-math>k</d-math> denotes the memory multiplier of optimizer states (<d-math>k=12</d-math> for Adam), and <d-math>N_d</d-math> denotes DP degree.</p>
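  <p>We can turn this accounting into a quick calculator. The sketch below assumes bf16 parameters and gradients (2 bytes each) and <d-math>k=12</d-math> bytes of optimizer state per parameter, matching the figure above; the example model size and DP degree are illustrative only:</p>
  <d-code block language="python">
def per_gpu_memory_gb(psi, n_d, k=12):
    # psi: number of parameters, n_d: DP degree, k: bytes of optimizer state per parameter
    stages = {
        "vanilla DP": (2 + 2 + k) * psi,
        "ZeRO-1": 2 * psi + 2 * psi + k * psi / n_d,
        "ZeRO-2": 2 * psi + (2 * psi + k * psi) / n_d,
        "ZeRO-3": (2 * psi + 2 * psi + k * psi) / n_d,
    }
    return {name: round(nbytes / 1e9, 1) for name, nbytes in stages.items()}

# e.g. an 8B-parameter model sharded over 64 DP ranks:
print(per_gpu_memory_gb(psi=8e9, n_d=64))  # roughly 128, 33.5, 17.8 and 2 GB respectively
  </d-code>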
702
 
703
 
 
723
 
724
  <p>See the figure below for all the necessary steps in one forward/backward pass cycle:</p>
725
 
726
+ <p><img alt="dp_zero1.gif" src="/assets/images/dp_zero1.gif" /></p>
727
 
728
  <p>So in practice, compared to vanilla DP, Zero-1 adds an all-gather over all parameters after the optimizer step as we can see below:</p>
729
 
730
+ <p><img alt="dp_zero1_overlap.svg" src="/assets/images/dp_zero1_overlap.svg" /></p>
731
 
732
  <p>If you've been following along, you'll recall from vanilla DP that we can overlap the all-reduce gradient communication with the backward pass computation. In ZeRO-1, we can also investigate how to efficiently overlap the newly added all-gather of bf16 parameters. There are two main strategies for this:</p>
733
 
 
751
 
752
  <aside>In case of FP32 gradient accumulation, we only need to keep <d-math>\frac{1}{N_d}</d-math> fp32_grads where we accumulate the bf16 grads coming from the reduce-scatter. And in the optimizer step we use the <d-math>\frac{1}{N_d}</d-math> fp32_grads.</aside>
753
 
754
+ <p><img alt="dp_zero2.gif" src="/assets/images/dp_zero2.gif" /></p>
755
 
756
  <p>It’s easy to see now that sharding the gradients leads to <d-math>2\Psi + \frac{2\Psi+k\Psi}{N_d}</d-math> and, as <d-math>N_d</d-math> is increased, we can save up to 8x memory over the baseline. In terms of communication the same process applies as for ZeRO-1, with the only difference that we communicate and release on the fly. In total, ZeRO-2 is thus also equivalent to vanilla DP training w.r.t. communication.</p>
757
 
758
  <p>In terms of communication, ZeRO-2 is similar to ZeRO-1: both require a reduce-scatter for the gradients and an all-gather over all parameters.</p>
759
 
760
+ <p><img alt="dp_zero2_overlap.svg" src="/assets/images/dp_zero2_overlap.svg" /></p>
761
 
762
  <aside>Note: You might notice that there is no real overhead of using ZeRO-2 over ZeRO-1 and indeed ZeRO-2 is usually the best option.</aside>
763
 
 
776
 
777
  <p>So how do we do a forward or backward pass in practice if all parts of the model are distributed? Quite simply we gather them on-demand when we need them. In the forward pass this looks as follows:</p>
778
 
779
+ <p><img alt="dp_zero3_fwd.svg" src="/assets/images/dp_zero3_fwd.svg" /></p>
780
 
781
+ <p>So as we perform the forward pass and sequentially go through the layers we retrieve the necessary parameters on demand and immediately flush them from memory when we don't need them anymore. The backward pass works the same way just inverted in flow and we produce the gradient shards: </p>
782
 
783
+ <p><img alt="dp_zero3_bwd.svg" src="/assets/images/dp_zero3_bwd.svg" /></p>
784
 
785
+ <p>The other issue is that we need to do these all-gathers continuously throughout the forward and backward step, which amounts to <d-math>2\cdot \text{num\_layers} -1</d-math> additional all-gathers in <strong>a training step</strong> compared to Zero-2, each of which comes with a small <strong>base latency</strong> overhead, as we can see in the following figure:</p>
786
+
787
+ <p><img alt="dp_zero3_overlap.svg" src="/assets/images/dp_zero3_overlap.svg" /></p>
788
 
789
  <p>During the forward pass we do all-gather operations for the parameters when we need them, so a <d-math>\Psi</d-math> communication tax. Since we discard the parameters immediately after we needed them in the forward pass we need one more all-gather during the backward pass as well incurring another <d-math>\Psi</d-math> in communication tax. Finally we need the same <strong><em>reduce-scatter</em></strong> as in ZeRO-2 for the gradients which costs also <d-math>\Psi</d-math> in communication and we arrive at a total communication cost of <d-math>3\Psi</d-math>, compared to <d-math>2\Psi</d-math> for Zero-2.</p>
790
 
 
799
 
800
  <p>However, there is a limit here: DP only works if a layer of the model fits in a single GPU, and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! Recall from the activation memory discussion that it scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train only with a short sequence length. </p>
801
 
802
+ <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p>
803
 
804
  <p>Now that we've efficiently used the DP axis to reduce memory through efficient communication patterns, let's explore a new, orthogonal axis of parallelism - Tensor Parallelism. Unlike ZeRO3 that relies on heavy parameter communication, TP manages to shard parameters, gradients, optimizer states AND activations across devices without requiring any model parameter movement between GPUs. What! How is this even possible?! Let's explore this seemingly magical approach together! 🙂</p>
805
 
 
825
 
826
  <p>In practice a small example of the operation looks like this:</p>
827
 
828
+ <p><img alt="image.png" src="/assets/images/tp_diagram.png" /></p>
829
 
830
  <p>Let’s see how we can parallelise this operation! In tensor parallelism, tensors will be split into N shards along a particular dimension and distributed across N GPUs. Matrices can be split along either the column or the row dimension, leading to column and row parallelism. One thing we’ll see in the following is that choosing row or column sharding will require different communication primitives.</p>
831
 
832
  <p>Our first option is to use column-wise sharding (also called <strong><em>column-linear</em></strong>): We'll copy the complete input matrices to each worker, requiring an operation called <strong><em>broadcast</em></strong>, and split the weight matrix into columns. The inputs are then multiplied with the partial weight matrices, and the results are finally combined using an <strong><em>all-gather</em></strong> operation.</p>
833
 
834
+ <p><img alt="image.png" src="/assets/images/tp_diagram2.png" /></p>
835
 
836
  <p>Here's the code implementation of column wise tensor parallelism:</p>
837
 
 
848
 
849
  <p>We see here our fourth distributed primitive: <strong><em>scatter</em></strong>!</p>
850
 
851
+ <p><img alt="image.png" src="/assets/images/tp_diagram3.png" /></p>
852
 
853
  <p>Here's the implementation for row-wise tensor parallelism:</p>
854
 
 
869
 
870
  <p>The Feedforward part can be parallelized by having a “Column linear” followed by a “Row Linear” which amounts to a broadcast to copy the input and an all-reduce in forward. Note that the broadcast isn’t needed in actual training where we can make sure inputs are already synced across TP ranks.</p>
871
 
872
+ <p><img alt="image.png" src="/assets/images/tp_diagram4.png" /></p>
873
 
874
  <p>Now that we’ve found the most efficient schema for the Feedforward part of the transformer, let’s take a look at the multi-head attention block (MHA).</p>
875
 
 
877
 
878
  <p>It's also worth noting that the tensor parallelism degree should not exceed the number of Q/K/V heads because we need intact heads per TP rank. And in case we’re using GQA, the TP degree should stay below the number of K/V heads, otherwise it requires additional comms to keep them in sync. For instance, LLaMA-3 8B has 8 Key/Value heads, so the tensor parallelism degree should be less than or equal to 8; if TP=16 for example, we need to duplicate each K/V head and make sure they stay in sync.</p>
879
 
880
+ <p><img alt="image.png" src="/assets/images/tp_full_diagram.png" /></p>
881
 
882
  <p>Finally note that there is a tradeoff in terms of communication, as we’ve added several distributed communication primitives directly in the computation path of our model. Unlike ZeRO, where we could prefetch, it can be harder to make these communications fully overlap with computation. </p>
883
 
884
+ <p><img alt="Forward pass in Tensor Parallelism" src="/assets/images/tp_overlap.svg" /></p>
885
 
886
  <p>Looking at the timeline of operations in tensor-parallel MLP (same applies for Attention), we can better understand the tradeoffs involved. In the forward of each decoder layer, we hit a synchronization point with the AllReduce operation that cannot be overlapped with computation. This <em>exposed communication</em> overhead is necessary to combine partial results across tensor-parallel ranks before the final LayerNorm can be applied. </p>
887
 
888
  <p>Tensor parallelism does help reduce activation memory for the matrix multiplications since the intermediate activations are sharded across GPUs. However, we still need to gather the full activations for operations like LayerNorm, which means we're not getting the full memory benefits we could. Additionally, it introduces significant communication requirements that heavily depend on the network infrastructure. The inability to hide this particular AllReduce behind computation means it directly adds to the critical path of forward propagation.</p>
889
 
890
+ <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p>
891
 
892
  <p>Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training.</p>
893
 
 
895
 
896
  <p>However, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
897
 
898
+ <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p>
899
 
900
  <p>As we can see, increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU. While tensor parallelism does help reduce activation memory in attention and feedforward layers by sharding the matrix multiplications across GPUs, we don't get the full memory benefits we could. This is because operations like layer normalization and dropout still require gathering the full activations on each GPU, partially negating the memory savings. We can do better by finding ways to parallelize these remaining operations as well.</p>
901
 
 
935
 
936
  <p><img alt=" in forward: f = no-op ; f* = all-reduce ; g = all-gather ; g* = reduce-scatter
937
  in backward: f = all-reduce ; f* = no-op ; g = reduce-scatter ; g* = all-gather
938
+ SP region needs full hidden_dim" src="/assets/images/tp_sp_diagram.png" /></p>
939
 
940
  <p>in forward: f = no-op ; f* = all-reduce ; g = all-gather ; g* = reduce-scatter<br />in backward: f = all-reduce ; f* = no-op ; g = reduce-scatter ; g* = all-gather<br />SP region needs full hidden_dim</p>
941
 
 
956
 
957
  <p>For sequence parallelism (SP), we use different operations labeled "g" and "g*". Specifically, we avoid using all-reduce in the SP region since that would require gathering the full activations and increase our peak memory usage, defeating the purpose of SP.</p>
958
 
959
+ <p><img alt="image.png" src="/assets/images/tp_sp_diagram_zoomed.png" /></p>
960
 
961
  <p>So what is actually happening here? As a famous LLM would say, let’s take it step-by-step:</p>
962
 
 
1044
 
1045
  <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
1046
 
1047
+ <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p>
1048
 
1049
  <p>Does that mean that SP incurs more communication than TP? Well, yes and no. In the forward of a vanilla TP we had two all-reduce per transformer block, and in SP we have two all-gather and two reduce-scatter per transformer block. So SP does twice the number of communication operations as TP. But since an all-reduce operation can be broken down into a reduce-scatter + all-gather (see [TODO: Appendix link]) they’re actually equivalent in terms of communication. Same reasoning for backward as we just use the conjugate of each operation (no-op ↔ allreduce and allgather ↔ reducescatter).</p>
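  <p>If the equivalence isn't obvious, here is a tiny single-process simulation of the decomposition, with plain tensors standing in for ranks and no actual communication:</p>
  <d-code block language="python">
import torch

# Simulate 4 ranks: all-reduce gives every rank the elementwise sum of all tensors.
tensors_per_rank = [torch.randn(4, 8) for _ in range(4)]
allreduce_result = sum(tensors_per_rank)

# reduce-scatter: rank r ends up with only the r-th slice of the summed tensor...
reduced_slices = [sum(t.chunk(4, dim=0)[r] for t in tensors_per_rank) for r in range(4)]
# ...and all-gather then reassembles the full summed tensor on every rank.
reassembled = torch.cat(reduced_slices, dim=0)

assert torch.allclose(allreduce_result, reassembled)
  </d-code>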
1050
 
1051
  <p>If you’ve been paying close attention, you’ll notice that we’re talking about 4 comms ops in each layer (2 for Attention and 2 for MLP). This is how the MLP profiling looks like when using Tensor + Sequence Parallelism:</p>
1052
 
1053
+ <p><img alt="tp_sp_overlap.svg" src="/assets/images/tp_sp_overlap.svg" /></p>
1054
 
1055
  <p>Besides the fact that TP requires communications in each layer, it also can’t easily be overlapped with compute, which makes throughput heavily dependent on the communication bandwidth. This is why TP is usually done only within a node (TP≤8).</p>
1056
 
 
1059
 
1060
  <p>As you might expect, this communication overhead becomes increasingly problematic as we scale up tensor parallelism. To illustrate this, let’s check throughput as we scale TP with SP for a 3B model:</p>
1061
 
1062
+ <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p>
1063
  <p>Impact of combined Tensor and Sequence Parallelism (TP/SP) on a 3B model’s performance and memory utilization with 4096 seqlen: when scaling both TP and SP together, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees reduce per-GPU throughput, they enable processing of significantly larger batch sizes by reducing the activation memory.</p>
1064
 
1065
  <p>Let’s summarize our observations:</p>
 
1089
 
1090
  <p>Even if we use full recomputation of the activations, which comes at a heavy compute overhead (30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length:</p>
1091
 
1092
+ <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p>
1093
 
1094
  <p>Can we apply similar ideas to our sequence parallelism approach, but inside the modules where we already apply Tensor Parallelism, thereby also reducing the effect of sequence length? Yes, it’s time to talk about Context Parallelism, which you will find quite intuitive after all we’ve already covered.</p>
1095
 
 
1097
 
1098
  <p>The idea of Context Parallelism is quite simple: just like Sequence Parallelism, we’ll split the input along the sequence dimension, but we now apply this splitting across the full model, instead of only in the sequence-parallel regions as we did previously with Tensor + Sequence Parallelism.</p>
1099
 
1100
+ <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
1101
 
1102
  <p>Splitting the sequence doesn't affect most modules like MLP and LayerNorm, where each token is processed independently. It also doesn’t require expensive communication like TP, as only the inputs are split and not the weight matrices. Just like data parallelism, after computing the gradients, an all-reduce operation is initiated to synchronize the gradients across the context parallelism group.</p>
1103
 
 
1128
 
1129
  <p>The whole process with 4 GPUs is shown in the following animation:</p>
1130
 
1131
+ <p><img alt="ring-attention.gif" src="/assets/images/ring-attention.gif" /></p>
1132
 
1133
  <p>With this animation, it’s also immediately clear why the authors chose to call this approach Ring Attention.</p>
1134
 
1135
  <p>There is one big problem though, which is that a naive implementation of Ring Attention leads to a strong imbalance between GPUs, stemming from the shape of the causal attention matrix. Let’s take a closer look at what is happening in the SoftMax computation by considering the attention score matrix with the causal attention mask:</p>
1136
 
1137
+ <p><img alt="cp_attnmask.svg" src="/assets/images/cp_attnmask.svg" /></p>
1138
 
1139
  <p>The SoftMax is computed row-wise, which means whenever a GPU has received all the tokens of a row it can be computed. We see that GPU1 can immediately compute it as it starts with tokens 1-4 and GPU1 actually doesn’t need to receive any information from any other GPUs. However, GPU2 will need to wait for the second round to also receive 1-4 and thus have all values for tokens 1-8. Also, GPU1 seems to perform much less work than all the other GPUs.</p>
1140
 
 
1144
 
1145
  <p>We need a better way to distribute the input sequences. This can be achieved by assigning the tokens not purely sequentially to the GPUs but by mixing the ordering a bit, such that we have a good mix of early and late tokens on each GPU. This approach is called Zig-Zag attention<d-cite bibtex-key="brandon2023fasterring"></d-cite> and in this new arrangement, the attention mask will show a different distribution of computation, and if you count the number of colored squares, you’ll see that the computation is now balanced across all GPUs.</p>
1146
 
1147
+ <p><img alt="cp_zigzagmask.svg" src="/assets/images/cp_zigzagmask.svg" /></p>
1148
 
1149
  <p>At the same time we’ll also see that in order to complete all rows, each GPU will need information from all the other GPUs.</p>
1150
 
1151
  <p>We have two general ways to overlap computation and communication: either we perform a general all-gather, regrouping all the KV on each GPU at the same time (in a ZeRO-3 type of way), or we gather them one-by-one from each GPU as needed:</p>
1152
 
1153
+ <p><img alt="cp_overlap_allgather.svg" src="/assets/images/cp_overlap_allgather.svg" /></p>
1154
+ <p><img alt="cp_overlap_all2all.svg" src="/assets/images/cp_overlap_all2all.svg" /></p>
1155
 
1156
  <p>The key difference between these two implementations lies in their communication patterns and memory usage:</p>
1157
 
 
1161
  <li>All GPUs simultaneously gather the complete key/value pairs from all other GPUs</li>
1162
  <li>Requires more temporary memory as each GPU needs to store the full KV pairs at once</li>
1163
  <li>Communication happens in one step but with larger memory overhead</li>
 
1164
  </ul>
1165
 
1166
  <p><strong>2. All-to-All (Ring) Implementation:</strong></p>
 
1169
  <li>GPUs exchange KV pairs in a ring-like pattern, one chunk at a time</li>
1170
  <li>More memory efficient as each GPU only needs to store one additional chunk temporarily</li>
1171
  <li>Communication is spread out and overlapped with computation, though with some additional base latency overhead from multiple communication steps</li>
 
1172
  </ul>
1173
 
1174
  <p>The All-to-All approach generally offers better memory efficiency at the cost of slightly more complex communication patterns, while the AllGather approach is simpler but requires more temporary memory during the attention computation.</p>
 
1179
 
1180
  <p>In the TP section we saw that if we try to scale Tensor Parallelism past the number of GPUs per single node (typically 4 or 8) we hit a lower-bandwidth network called the “inter-node connection”, which can quite strongly impair our performance. We can see this clearly on, e.g., the all-reduce operation when we perform it across several nodes:</p>
1181
 
1182
+ <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p>
1183
  <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
1184
 
1185
  <p>Sequence and context parallelism can help for long sequences but don’t help much if sequence length is not the root cause of our memory issues but rather the size of the model itself. For large models (70B+), the size of the weights alone can already push past the limits of the 4-8 GPUs on a single node. We can solve this issue by summoning the fourth (and last) parallelism dimension: “pipeline parallelism”.</p>
1186
 
1187
+ <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p>
1188
 
1189
  <p>Pipeline Parallelism is conceptually very simple – we’ll simply spread the layers of our model across GPUs – but the devil lies in implementing it efficiently. Let’s dive into it!</p>
1190
 
 
1198
 
1199
  <p>Indeed reader! The main challenge in pipeline parallelism will be how to efficiently circumvent the sequential nature of PP to keep our GPUs busy at all times and avoid having one GPU computing while the others are waiting. Here is how our GPU utilization looks when doing a naive and simple forward and backward pass through the model, where the numbers indicate the model layers:</p>
1200
 
1201
+ <p><img alt="image.png" src="/assets/images/pp_afab.svg" /></p>
1202
  <p>An example of Pipeline parallelism for a model with 16 layers distributed across 4 GPUs. The numbers correspond to the layer IDs.</p>
1203
 
1204
  <p>The remaining idle time is indicated in grey and usually called the “bubble”, and the sight of it probably breaks your heart after we spent so much time optimizing throughput.</p>
 
1217
 
1218
  <p>Let’s take a first tool out of our toolbox and think about splitting our batch into smaller bite-sized portions which can be processed in parallel (or almost), like we did before in data parallelism for instance. Now when the second GPU is busy processing micro-batch 1, the first GPU can already start processing micro-batch 2. Here is a schedule using 8 micro-batches:</p>
1219
 
1220
+ <p><img alt="pp_afab2.svg" src="/assets/images/pp_afab2.svg" /></p>
1221
 
1222
  <aside>Previously the numbers in the diagram indicated layers, but from now on (including in this figure) the numbers in all pipeline-parallel plots indicate microbatches. You can think of each square here as containing several layers, as seen in the previous figure. </aside>
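  <p>As a quick back-of-the-envelope check, the fraction of time lost to the bubble shrinks as we add micro-batches. The sketch below uses the common approximation that the idle time per device is <d-math>(p-1)(t_f+t_b)</d-math> while the ideal compute time is <d-math>m(t_f+t_b)</d-math>; this is a simplification that assumes equal-cost micro-batch slots.</p>
  <d-code block language="python">
def relative_bubble(p, m):
    # idle time (p - 1) * (t_f + t_b) divided by the ideal compute time m * (t_f + t_b)
    return (p - 1) / m

for m in (4, 8, 32, 128):
    print(f"p=8 stages, m={m:>3} micro-batches -> bubble / compute ~= {relative_bubble(8, m):.2f}")
  </d-code>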
1223
 
 
1250
 
1251
  <p>This schedule is called <strong><em>one-forward-one-backward (1F1B)</em></strong> as the middle/steady state involves alternatively performing one forward and one backward pass. The general idea is to start performing the backward pass as soon as possible. The schedule looks like this:</p>
1252
 
1253
+ <p><img alt="image.png" src="/assets/images/pp_1f1b.svg" /></p>
1254
 
1255
  <p>The bubble still has the same size so our training efficiency is not significantly improved. However we only need to store activations for <d-math>p</d-math> micro-batches instead of <d-math>m</d-math>, which considerably reduces the activation memory explosion we had in the AFAB schedule. As a consequence we can add more microbatches, which will then actually reduce the bubble.</p>
1256
 
1257
+ <!-- TODO: @Nouamane add this figure -->
1258
+ <p><img alt="image.png" src="/assets/images/pp_1f1b_scaling.png" /></p>
1259
 
1260
  <p>A major complexity of this setup, visible on the above graph is how forward and backward passes are not cleanly consecutive anymore but performed in parallel across devices. This means we will have to schedule the switch from forward to backward passes independently on each device instead of in a simple and common central training loop as usual.</p>
1261
 
 
1286
 
1287
  <p>This can be seen in general as a kind of “looping pipeline” where a micro-batch will move in circles from one GPU to the next as it goes through the forward pass through the model.</p>
1288
 
1289
+ <p><img alt="pp_1f1b_interleaved.svg" src="/assets/images/pp_1f1b_interleaved.svg" /></p>
1290
 
1291
  <p>As a consequence we see additional communications happening as the model goes several times through each GPU for the same computation that previously just took one pass. However, each forward and backward pass is divided by a factor of <d-math>v</d-math>, where <d-math>v</d-math> is the number of stages or model chunks per GPUs as we are able to better interleave forward and backward passes. </p>
1292
 
 
1301
 
1302
  <p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
1303
 
1304
+ <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p>
1305
 
1306
 
1307
  <p>Scheduling also becomes more complex here, as we need to decide on each GPU whether, at a given moment, we prioritize earlier micro-batches, meaning that we close their forward and backward loops as fast as possible (so-called “depth-first”, i.e. prioritizing getting batches out of the model as fast as possible), or we prioritize first completing the forward passes of all microbatches in the queue before going over to backward passes (so-called “breadth-first”, i.e. prioritizing filling in the pipeline as much as possible). This is explained in detail in the "Breadth-First Pipeline" paper<d-cite bibtex-key="lamypoirier2023breadthfirstpipelineparallelism"></d-cite>.</p>
1308
 
1309
  <p>You now have all the elements to understand the pipeline parallelism approach in Llama 3.1, which is using a one-forward-one-backward setup with interleaved stages and a priority setting tuneable between depth-first and breadth-first.</p>
1310
 
1311
+ <p><img alt="pp_llama3.1_schedule.png" src="/assets/images/pp_llama3.1_schedule.png" /></p>
1312
 
1313
  <p>However, we haven’t reached the end of possible pipeline schedules and recently some methods have been proposed to reduce the bubble to virtually zero! Piqued your curiosity? Let’s have a look!</p>
1314
 
 
1317
  <p>There are even more sophisticated ways to reduce the bubble and reach close to a “zero bubble” regime. The secret here is to split the operations involved at an even finer granularity in order to interleave them in the most efficient way. For instance, the pipeline implementation approach in DeepSeek V3/R1, called DualPipe, reaches close to a zero-bubble regime.</p>
1318
 
1319
  <p>Let’s very quickly see how this can work by briefly detailing the ZeroBubble<d-cite bibtex-key="qi2023zerobubblepipelineparallelism"></d-cite> work, which is a precursor to DualPipe. The base observation of ZeroBubble is that the backward pass through a matrix multiplication actually involves two separate operations: the backward for the inputs (B) and the backward for the weights (W):</p>
1320
+
1321
+ <p><img alt="image.png" src="/assets/images/pp_zerobubble_compgraph.png" /></p>
1322
+ <p><img alt="image.png" src="/assets/images/pp_zerobubble_ppschedule.png" /></p>
1323
 
1324
  <p>While the output of B, the backward pass for the input, is necessary for performing the backward pass of the lower layers, the backward pass of the weights, W, is not necessary for the rest of the backward pass and generally only needs to be performed before the optimizer step. This means W can be flexibly scheduled anywhere after the corresponding B of the same stage. This allows for strategic placement of W to fill the pipeline bubbles. The ZB-H2 schedule on the top right is an example of a (theoretical) schedule with zero bubble, taking advantage of this fine-grained decomposition.</p>
1325
 
1326
  <p>DeepSeek’s DualPipe, introduced with V3, extends this decomposed approach to the case of two streams propagating from both ends of the PP ranks, interleaved to minimize idle time on the GPUs even further, as displayed in the following scheduling graph:</p>
1327
 
1328
+ <p><img alt="image.png" src="/assets/images/pp_zerobubble_dualpipe.png" /></p>
1329
 
1330
  <p>The ZeroBubble and DualPipe schedules are a bit too complex for us to give code snippets here, but you should start to have a general idea of the concepts involved. In practice, optimizing these schedules requires careful measurement of the time for each operation, followed by a scheduling algorithm able to find the best allocation of time given the constraints. See for instance the ZeroBubble paper<d-cite bibtex-key="qi2023zerobubblepipelineparallelism"></d-cite> for a discussion of the heuristics and algorithms used to perform such scheduling.</p>
1331
 
 
1336
 
1337
  <p>Mixture-of-expert models have gained some traction with models such as Mixtral<d-cite bibtex-key="jiang2024mixtralexperts"></d-cite> or more recently DeepSeek-V3/R1! The basic idea is that instead of having a single feedforward module per layer we can have several and route tokens through different ones depending on their context:</p>
1338
 
1339
+ <p><img alt="ep_schema.png" src="/assets/images/ep_schema.png" /></p>
1340
  <p>Source: A Survey on Mixture of Experts<d-cite bibtex-key="cai2024surveymixtureexperts"></d-cite> </p>
1341
 
1342
  <p>This design makes it very easy to add a new parallelism paradigm: Expert parallelism (EP). Since the feedforward layers are fully independent we can simply put each expert’s feedforward layer on a different worker. Compared to TP it’s much more lightweight, since we don’t need to split the matrix multiplication, we just need to route the hidden states of a token to the right expert. There are several tricks to make EP work in practice, closely tied to model design. For instance, DeepSeek-V3 enforces a constraint in the router, ensuring that each token is sent to at most M nodes (in their case, 4) to reduce communication overhead.</p>
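  <p>To illustrate the routing idea, here is a toy top-1 MoE layer running on a single process, without any expert-parallel communication; with expert parallelism, the tokens selected for a given expert would instead be shipped to the rank hosting that expert:</p>
  <d-code block language="python">
import torch
import torch.nn as nn

class ToyMoELayer(nn.Module):
    """Minimal top-1 mixture-of-experts layer (single process, no expert parallelism)."""
    def __init__(self, hidden=64, num_experts=4):
        super().__init__()
        self.router = nn.Linear(hidden, num_experts)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(), nn.Linear(4 * hidden, hidden))
            for _ in range(num_experts)
        ])

    def forward(self, x):                      # x: (tokens, hidden)
        expert_idx = self.router(x).argmax(dim=-1)
        out = torch.zeros_like(x)
        for e, expert in enumerate(self.experts):
            mask = expert_idx == e             # with expert parallelism, these tokens would be
            if mask.any():                     # sent to the rank hosting expert e
                out[mask] = expert(x[mask])
        return out

print(ToyMoELayer()(torch.randn(10, 64)).shape)
  </d-code>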
 
1469
 
1470
  <p>And to have an idea of the memory benefits of each parallelism:</p>
1471
 
1472
+ <p><img alt="image.png" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" /></p>
1473
 
1474
  <h2>How to Find the Best Training Configuration</h2>
1475
 
 
1628
 
1629
  <p>On the compute side, GPUs consist of an array of compute units called <strong>Streaming Multiprocessors</strong> (SM). Each SM contains and controls a set of streaming processors, also known as cores. For example, an Nvidia H100 GPU has 132 SMs with 128 cores per SM, resulting in a total of 16,896 cores (see <a href="https://resources.nvidia.com/en-us-tensor-core">docs for tensor cores</a> for details), each capable of handling multiple threads simultaneously.</p>
1630
 
1631
+ <p><img alt="image.png" src="/assets/images/diving_primergpu.svg" /></p>
1632
  <p>TODO: Original figure from https://blog.codingconfessions.com/p/gpu-computing.</p>
1633
 
1634
  <p>The memory side is also highly hierarchical with several layers of cache and memory: <strong>Registers</strong> are the smallest units and are private to the threads during execution, <strong>Shared Memory</strong> and the <strong>L1 cache</strong> are shared between the threads running on a single SM, higher up is the <strong>L2 cache</strong> shared by all SMs, and finally there is the <strong>Global Memory</strong>, which is the largest memory on the GPU (the advertised 80 GB for an H100 for instance) but also the slowest to access and query.</p>
1635
 
1636
+ <p><img alt="image.png" src="/assets/images/diving_primergpu2.svg" /></p>
1637
  <p>TODO: Original figure from https://www.youtube.com/watch?v=ZQKMZIP3Fzg</p>
1638
 
1639
  <p>The goal of the GPU is to run as many workloads as possible, in parallel, on the GPU cores, by taking advantage of this hierarchical organization of compute and memory.</p>
 
1786
 
1787
  <p>Here’s an excellent visualization of the kernel from this <a href="https://siboehm.com/articles/22/CUDA-MMM">fantastic blogpost</a>: </p>
1788
 
1789
+ <p><img alt="image.png" src="/assets/images/memorycoalescing.png" /></p>
1790
 
1791
  <p>However, when profiling this kernel with a tool like <code>ncu</code>, we can see issues, including low memory throughput and uncoalesced memory accesses.</p>
1792
 
1793
+ <p><img alt="image.png" src="/assets/images/memorycoalescing2.png" /></p>
1794
+ <p><img alt="image.png" src="/assets/images/memorycoalescing3.png" /></p>
1795
 
1796
 
1797
  <p>The reason for this is that in this kernel, two threads in the same block with Thread IDs <code>(0, 0)</code> and <code>(1, 0)</code> (which will end up in the same warp) will both load from the same column of matrix <code>B</code> but different rows of matrix <code>A</code>. Since matrix elements are stored in row-major order (meaning each row's elements are in consecutive memory addresses, as shown in the figure below), in the first iteration with <code>i = 0</code>, thread <code>(0, 0)</code> will load <d-math>A_{0,0}</d-math>, and thread <code>(1, 0)</code> will load <d-math>A_{1,0}</d-math>. These elements are not stored close to each other in memory, and this misalignment repeats across all iterations along the shared dimension, preventing memory accesses from being coalesced.</p>
1798
 
1799
+ <p><img alt="image.png" src="/assets/images/memorycoalescing4.png" /></p>
1800
 
1801
 
1802
  <p>To improve our kernel we can change the way the coordinates x and y are calculated, as follows: </p>
 
1818
 
1819
  <p>When we profile our new kernel, we notice that the warning about uncoalesced memory accesses has disappeared, and <strong>the GPU's memory throughput has increased by approximately 10 times</strong>.</p>
1820
 
1821
+ <p><img alt="image.png" src="/assets/images/memorycoalescing5.png" /></p>
1822
 
1823
 
1824
  <p>We also notice that the execution time of the kernel <strong>decreases by 10x</strong> !</p>
 
1834
 
1835
  <p>In the tiling approach, each iteration involves all threads within a block cooperatively loading two tiles—one from matrix A and another from matrix B —into shared memory. Specifically, threads load a tile of matrix A (of size <code>BLOCK_SIZE_M</code> by <code>BLOCK_SIZE_K</code>) and a tile of matrix B (of size <code>BLOCK_SIZE_K</code> by <code>BLOCK_SIZE_N</code>). Once the tiles are in shared memory, the threads perform matrix multiplication on these tiles, enabling efficient computation since all necessary data is quickly accessible. The results of the tile multiplication are stored in an accumulation matrix that holds intermediate results. After each iteration, the results from the current tile multiplication are added to this accumulation matrix, continuing until all tiles from both matrices have been processed.</p>
1836
 
1837
+ <p><img alt="image.png" src="/assets/images/tiling.png" /></p>
1838
  <p>From https://cnugteren.github.io/tutorial/pages/page4.html</p>
1839
 
1840
  <p>The important parts to understand the implementation are below (for simplicity we consider a square shaped tile) : </p>
 
1879
 
1880
  <p>The tiling technique has significantly improved the performance of our kernel. However, when analyzing the warp states which quantify how many cycles were spent in each state, we observe the following:</p>
1881
 
1882
+ <p><img alt="image.png" src="/assets/images/threadcoarsening.png" /></p>
1883
 
1884
 
1885
  <p>The meaning of the states can be found in the <a href="https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference">Profiling Guide</a>, specifically in the <strong>Warp Stall Reasons</strong> section. There we can read that:</p>
 
1901
  <p>In several places now we’ve mentioned how GPU and CPU operation can be asynchronous. In particular, the host code on the CPU can schedule workload on the GPU in a non-blocking way.</p>
1902
 
1903
  <p>Non-blocking can be useful for overlapping communication and computation, as we saw at several points along this blog post, but it can be extended to the more general idea of trying to avoid, at all costs, going back and forth between host and GPU kernel commands. This is beautifully illustrated by <a href="https://horace.io/brrr_intro.html">Horace He</a> in these diagrams:</p>
1904
+ <div style="display: flex; gap: 20px; align-items: flex-start;">
1905
+ <div style="width: 50%;">
1906
+ <img alt="image.png" src="/assets/images/fused_kernels1.png" style="width: 100%;" />
1907
+ <p>A sequence of kernels requiring back and forth between global memory and compute units</p>
1908
+ </div>
1909
+ <div style="width: 50%;">
1910
+ <img alt="image.png" src="/assets/images/fused_kernels2.png" style="width: 100%;" />
1911
+ <p>Instead of sending our triangle back to global memory just to read it back again, we instead just do all of our operations in one go.</p>
1912
+ </div>
1913
+ </div>
1914
 
1915
  <p>How can we avoid this back and forth? Well the best way is to make our GPU as autonomous as possible. This is achieved by packing as many successive compute operations together in a single kernel for the GPU to run, called a “Fused Kernel”.</p>
1916
 
 
1927
 
1928
  <p>A basic implementation of the attention mechanism involves a lot of transfers between memory and workers. It requires materializing the S and P matrices in HBM, which means that the results need to be sent to HBM and then back to SRAM for the next computations:</p>
1929
 
1930
+ <p><img alt="image.png" src="/assets/images/flashattn.png" /></p>
1931
 
1932
  <p>Since bandwidth is much lower in HBM this introduces a severe bottleneck in the attention computation. Can we do better? Tri Dao says yes!</p>
1933
 
1934
  <p>The key element is to compute the S matrix in small pieces which can fit in the smaller shared memory of the SM. But we can do even better and avoid materializing the very large S matrix altogether, in favor of keeping only the necessary statistics for computing the normalization factor of the softmax. So we can compute part of <d-math>O</d-math> directly in one computation in SRAM rather than moving intermediate results back and forth. In this case, not only do we make use of the shared memory, but we also remove the memory bottleneck resulting from materializing one of the largest activation matrices in the model (at long context length), the attention matrix.</p>
1935
 
1936
+ <p><img alt="image.png" src="/assets/images/flashattn2.png" /></p>
1937
  <p>From the FLASH-ATTENTION paper<d-cite bibtex-key="dao2022flashattention"></d-cite></p>
1938
 
1939
  <p>The idea of flash attention resolves so many bottlenecks in model training that it has quickly become the default way to perform attention in all transformers:</p>
 
2019
 
2020
  <p>Reducing the total number of bits comes at a price (no free lunch here either), but we have some control over how to pay: we can sacrifice bits on either the mantissa or the exponent. For this reason there also exist two float8 formats, named according to their exponent and mantissa bits, letting us flexibly choose the most appropriate format. We can look at the possible range of numbers for each format:</p>
2021
 
2022
+ <p><img alt="image.png" src="/assets/images/mixedprecision.png" /></p>
2023
 
2024
 
2025
  <p>We can see that float32 spans 80 orders of magnitude and float16 sacrifices a lot of range while bfloat16 maintains the full range. The two float8 formats reduce the range even further, where e5m2 can maintain the float16 range and e4m3 has an even smaller range.</p>
2026
 
2027
  <p>How come some formats are able to maintain the range and others not? Let’s investigate their resolution by plotting 10,000 points between 1 and 2. Each point will be rounded to the nearest representable number in each format:</p>
2028
 
2029
+ <p><img alt="image.png" src="/assets/images/mixedprecision_2.png" /></p>
2030
 
2031
  <p>We can see here that bfloat16 maintains the range of float32 over float16, but does this at the cost of sacrificing more precision. In the case of float8 the situation is even more dire, as e4m3 can represent 7 and e5m2 only 3 numbers on the interval 1-2.</p>
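  <p>You can reproduce these counts with a few lines of PyTorch, assuming a recent version that exposes the float8 dtypes: the round-trip through each dtype snaps every point to the nearest representable value. Note that the counts below include the endpoints 1 and 2 themselves.</p>
  <d-code block language="python">
import torch

xs = torch.linspace(1, 2, 10_000, dtype=torch.float32)
for dtype in (torch.float16, torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2):
    # cast down and back up: every point is rounded to the nearest representable value
    distinct = xs.to(dtype).to(torch.float32).unique().numel()
    print(f"{str(dtype):>22}: {distinct} distinct values on [1, 2]")
  </d-code>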
2032
 
 
2064
 
2065
  <p>The first successful very-large-scale training with FP8 mixed precision was publicly reported with DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward passes. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
2066
 
2067
+ <p><img alt="image.png" src="/assets/images/fp8_diagram.png" /></p>
2068
 
2069
  <p>In order to switch from high precision (e.g. FP32 or BF16) to lower precision (e.g. FP16 or FP8) with a smaller range, we need to normalize the range of values by computing the absolute maximum. DeepSeek-V3 also introduces a quantization scheme, where the ranges are normalized per tile: 1x128 for inputs/activations and 128x128 for weights and scale elements. This makes the normalization less susceptible to outliers. There are a number of additional tricks they deploy to also reduce the memory and communication footprint, which you can follow in section 3.3 of the DeepSeek-V3 technical report<d-cite bibtex-key="deepseekai2024deepseekv3technicalreport"></d-cite>. </p>
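  <p>A simplified sketch of such absmax-based, tile-wise quantization (using 128×128 tiles for a weight matrix and PyTorch's <code>float8_e4m3fn</code> dtype) is shown below; this only illustrates the idea and is not DeepSeek-V3's actual kernel.</p>
  <d-code block language="python">
import torch

def quantize_tilewise_e4m3(w: torch.Tensor, tile: int = 128):
    # Per-tile absmax scaling to float8 (e4m3); assumes w's dimensions are multiples of `tile`.
    f8_max = torch.finfo(torch.float8_e4m3fn).max
    out = torch.empty_like(w, dtype=torch.float8_e4m3fn)
    scales = torch.empty(w.shape[0] // tile, w.shape[1] // tile)
    for i in range(0, w.shape[0], tile):
        for j in range(0, w.shape[1], tile):
            block = w[i:i + tile, j:j + tile]
            scale = block.abs().amax() / f8_max            # normalize the block's range
            scales[i // tile, j // tile] = scale
            out[i:i + tile, j:j + tile] = (block / scale).to(torch.float8_e4m3fn)
    # the higher-precision scales are kept alongside the fp8 tensor for dequantization
    return out, scales

w_fp8, w_scales = quantize_tilewise_e4m3(torch.randn(256, 256))
  </d-code>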
2070
 
 
2158
 
2159
  <p>Congratulations! You've completed quite a journey - from understanding how to train a simple model on a single GPU, all the way to mastering the complex techniques used to efficiently train massive language models like Llama-405B and DeepSeek-V3. By now, you should feel confident interpreting advanced parallelism diagrams like the one below, which would have seemed daunting when you first started.</p>
2160
 
2161
+ <p><img alt="image.png" src="/assets/images/conclusion_llama3_parallelism.png" /></p>
2162
 
2163
  <p>In distributed training, many concepts sound easy enough when you first hear them, like “Pipeline parallelism just distributes layers on different GPUs”, but we also worked through all the challenging details when implementing those methods. </p>
2164
 
 
2208
 
2209
  <p>First, let's examine this heatmap visualization:</p>
2210
 
2211
+ <p><img alt="image.png" src="/assets/images/what_we_learnt_heatmap.svg" /></p>
2212
  <p>Heatmap visualization showing the optimal training configurations across different model sizes and compute node counts. For each combination, the configuration details include Data Parallelism (DP), Tensor Parallelism (TP), Pipeline Parallelism (PP), Gradient Accumulation Steps (GAS), Micro Batch Size (MBS), and ZeRO optimization stage. The color intensity indicates the Model FLOPs Utilization (MFU), with brighter colors representing higher efficiency.</p>
2213
 
2214
  <p>To complement this, let's look at the relationships between different parameters:</p>
2215
 
2216
+ <!-- <p><img alt="image.png" src="/assets/images/what_we_learnt_parallel_coordinates.html" /></p> -->
2217
+ <iframe id="plotFrame" src="/assets/images/what_we_learnt_parallel_coordinates.html" height="540" width="1000" scrolling="no" frameborder="0"></iframe>
2218
+
2219
  <p>Parallel coordinates plot showing the relationship between different model parallelism configurations (Data Parallel degree, Tensor Parallel degree, Pipeline Parallel degree), training hyperparameters (gradient accumulation steps, micro batch size), ZeRO stage and the resulting Model FLOPs Utilization (MFU). Each line represents a different training configuration, with colors indicating the MFU value - warmer colors show higher efficiency.</p>
2220
 
2221
  <p>From these visualizations, we can draw several important insights:
src/index.html CHANGED
@@ -234,7 +234,7 @@
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
- <div class="info" id="info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
@@ -297,8 +297,9 @@
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
300
- <!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
301
- <script src="../assets/images/first_steps_memory_profile.js"></script>-->
 
302
 
303
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
304
 
@@ -353,7 +354,7 @@
353
  <div class="note-box">
354
  <p class="note-box-title">📝 Note</p>
355
  <p class="note-box-content">
356
- Some librarie store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
357
 
358
  </p>
359
  </div>
@@ -415,7 +416,14 @@
415
 
416
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
417
 
418
- <p><img alt="memusage_activations.svg" src="/assets/images/memusage_activations.svg" /></p>
 
 
 
 
 
 
 
419
 
420
  <p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
421
 
@@ -429,8 +437,9 @@
429
 
430
  <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade of memory for compute. It generally looks like this:</p>
431
 
432
- <p><img alt="image.png" src="/assets/images/activation_recomputation.png" /></p>
433
-
 
434
  <p>There are several strategies to select key activations to store:</p>
435
 
436
  <ul>
 
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
+ <div class="info" id="svg-first_steps_simple_training-info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
 
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
300
+ <div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
301
+ <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
302
+ <script src="../assets/images/first_steps_memory_profile.js"></script>
303
 
304
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
305
 
 
354
  <div class="note-box">
355
  <p class="note-box-title">📝 Note</p>
356
  <p class="note-box-content">
357
+ Some libraries store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
358
 
359
  </p>
360
  </div>
 
416
 
417
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
418
 
419
+ <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
420
+ <script>
421
+ window.addEventListener('load', function() {
422
+ const frame = document.getElementById('plotFrame2');
423
+ frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
424
+ frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
425
+ });
426
+ </script>
427
 
428
  <p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
429
 
 
437
 
438
  <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade of memory for compute. It generally looks like this:</p>
439
 
440
+ <div class="svg-container" id="svg-activation_recomputation"> </div>
441
+ <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
442
+ <script src="../assets/images/activation_recomputation.js"></script>
443
  <p>There are several strategies to select key activations to store:</p>
444
 
445
  <ul>