thomwolf (HF staff) committed · verified
Commit e7323a7 · 1 parent: 44ecf71
dist/assets/data/fragments/cp_8Bmemoryusage.html ADDED
@@ -0,0 +1 @@
 
 
+ [Plotly fragment: "Memory Usage for 8B Model". Stacked bars of Model Parameters, Gradients, Optimizer States, and Activations ("Memory Usage (GB)" axis, 0-150) versus sequence length (1,024 to 131,072) in three panels: "No Parallelism", "TP=2 CP=1", and "TP=2 CP=4", each with a dashed 80 GB reference line.]
dist/assets/data/fragments/dp_ourjourney_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
+ [Plotly fragment: "Memory Usage vs Sequence Length for Different Model Sizes". Stacked bars of parameters, gradients, optimizer states, and activations ("GB memory" axis, 0-150) versus sequence length (1,024 to 16,384) in three panels: "1B model", "8B model", and "70B model", each with a dashed 80 GB reference line.]
dist/assets/data/fragments/dp_scaling.html ADDED
@@ -0,0 +1 @@
 
 
+ [Plotly fragment: two panels, "Throughput Scaling with Data Parallelism" and "Memory Usage Scaling with Data Parallelism". The left panel shows throughput (tokens/sec/GPU) versus data parallelism degree (8 to 256) falling from about 40,150 to 15,700, with step-to-step drops annotated as -6.3%, -6.0%, -12.0%, -15.0%, and -40.6%; the right panel shows memory usage flat at 36.66 GB across all DP degrees.]
dist/assets/data/fragments/memory-profile.html CHANGED
The diff for this file is too large to render. See raw diff
 
dist/assets/data/fragments/memory-recomputation.html CHANGED
@@ -1 +1 @@
- [Previous Plotly fragment: "Memory Usage with Recomputation". Stacked bars of parameters, gradients, optimizer, and activations ("Memory (GB)" axis) versus sequence length (1,024 to 8,192), with dropdown selectors for model size (1B, 3B, 8B, 70B, 405B) and recomputation strategy (None, selective, full), rendered with the default Plotly template.]
 
+ [Updated Plotly fragment: the same "Memory Usage with Recomputation" figure with identical data traces, regenerated with a white-background template.]
"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]],"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]},"colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"white","showlakes":true,"showland":true,"subunitcolor":"#C8D4E3"},"hoverlabel":{"align":"left"},"hovermode":"closest","mapbox":{"style":"light"},"paper_bgcolor":"white","plot_bgcolor":"white","polar":{"angularaxis":{"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":""},"bgcolor":"white","radialaxis":{"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":""}},"scene":{"xaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"},"yaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"},"zaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"ternary":{"aaxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""},"baxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""},"bgcolor":"white","caxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""}},"title":{"x":0.05},"xaxis":{"automargin":true,"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":"","title":{"standoff":15},"zerolinecolor":"#EBF0F8","zerolinewidth":2},"yaxis":{"automargin":true,"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":"","title":{"standoff":15},"zerolinecolor":"#EBF0F8","zerolinewidth":2}}},"title":{"text":"Memory Usage with 
Recomputation","x":0.5,"y":0.95},"margin":{"r":80},"legend":{"y":0.95,"x":1.02,"xanchor":"left","yanchor":"top"},"updatemenus":[{"active":0,"buttons":[{"args":[{"visible":[true,true,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,203.1547058105469]}],"label":"1B","method":"update"},{"args":[{"visible":[false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,314.4336639404297]}],"label":"3B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,504.2233764648438]}],"label":"8B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true,false,false,false,false]},{"yaxis.range":[0,3021.530541992188]}],"label":"70B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true]},{"yaxis.range":[0,12823.0716796875]}],"label":"405B","method":"update"}],"direction":"down","showactive":true,"x":1.035,"xanchor":"left","y":0.6,"yanchor":"top"},{"active":0,"buttons":[{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[3.564453125,12.12890625,44.2578125,168.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[6.0732421875,18.708984375,63.66796875,232.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[9.25390625,28.5078125,97.015625,354.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[46.2578125,142.515625,485.03125,1770.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[145.703125,448.90625,1527.8125,5575.625]]}],"label":"None","method":"restyle"},{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[1.064453125,2.12890625,4.2578125,8.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[2.7919921875,5.583984375,11.16796875,22.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[4.25390625,8.5078125,1
7.015625,34.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[21.2578125,42.515625,85.03125,170.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[66.953125,133.90625,267.8125,535.625]]}],"label":"selective","method":"restyle"},{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[0.064453125,0.12890625,0.2578125,0.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[0.1669921875,0.333984375,0.66796875,1.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[0.25390625,0.5078125,1.015625,2.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[1.2578125,2.515625,5.03125,10.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[3.953125,7.90625,15.8125,31.625]]}],"label":"full","method":"restyle"}],"direction":"down","showactive":true,"x":1.035,"xanchor":"left","y":0.4,"yanchor":"top"}],"barmode":"stack","xaxis":{"title":{"text":"Sequence Length"}},"yaxis":{"title":{"text":"Memory (GB)"},"range":[0,203.1547058105469]},"width":850,"height":500,"annotations":[{"showarrow":false,"text":"Model Size:","x":1.035,"xanchor":"left","xref":"paper","y":0.6,"yanchor":"bottom","yref":"paper"},{"showarrow":false,"text":"Recomputation:","x":1.035,"xanchor":"left","xref":"paper","y":0.4,"yanchor":"bottom","yref":"paper"}]}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
dist/assets/data/fragments/memusage_activations.html ADDED
@@ -0,0 +1 @@
 
 
+ <div> <div id="121faba8-d8ec-447e-9ab1-9a8fa34c1f63" class="plotly-graph-div" style="height:400px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("121faba8-d8ec-447e-9ab1-9a8fa34c1f63")) { Plotly.newPlot( "121faba8-d8ec-447e-9ab1-9a8fa34c1f63", [{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[9.25390625,28.5078125,97.015625,354.03125,1348.0625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[46.2578125,142.515625,485.03125,1770.0625,6740.125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[145.703125,448.90625,1527.8125,5575.625,21231.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""
}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"GB 
memory"},"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-8B","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-70B","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-405B","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"barmode":"stack","width":1000,"height":400,"legend":{"title":{}}}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/pp_bubblesize.html ADDED
@@ -0,0 +1 @@
 
 
+ <div> <div id="3997052d-7390-4347-abf3-6c8b08e53c31" class="plotly-graph-div" style="height:650px; width:900px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("3997052d-7390-4347-abf3-6c8b08e53c31")) { Plotly.newPlot( "3997052d-7390-4347-abf3-6c8b08e53c31", [{"marker":{"color":"#4ea5b7"},"orientation":"h","text":["0.03","0.05","0.05","0.11","0.11","0.11","0.22","0.22","0.22","0.22","0.44","0.44","0.44","0.44","0.88","0.88","0.88","0.88","1.75","1.75","1.75","3.50","3.50","7.00"],"textposition":"outside","x":[0.02734375,0.0546875,0.0546875,0.109375,0.109375,0.109375,0.21875,0.21875,0.21875,0.21875,0.4375,0.4375,0.4375,0.4375,0.875,0.875,0.875,0.875,1.75,1.75,1.75,3.5,3.5,7.0],"y":["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],"type":"bar"}], {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern
":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"whit
e","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"yaxis":{"tickmode":"array","tickvals":["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],"ticktext":["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],"title":{"text":"PP configuration"}},"margin":{"l":150,"r":100,"t":100,"b":100},"title":{"text":"Bubble size for PP=8"},"xaxis":{"title":{"text":"Bubble size"}},"width":900,"height":650}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/pp_comm_bandwidth.html ADDED
@@ -0,0 +1 @@
 
 
+ <div> <div id="91842954-4418-43b7-bc03-81a893ff71fe" class="plotly-graph-div" style="height:410px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("91842954-4418-43b7-bc03-81a893ff71fe")) { Plotly.newPlot( "91842954-4418-43b7-bc03-81a893ff71fe", [{"fill":"toself","fillcolor":"rgba(78,165,183,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[461.538,436.13,219.09099999999998,192.73399999999992,188.42249999999999,194.227,177.35999999999999,12.493500000000001,24.965,31.34,41.587,44.509,302.33500000000004,422.288],"type":"scatter"},{"line":{"color":"#4ea5b7","width":3},"marker":{"size":10,"symbol":"circle"},"mode":"lines+markers+text","name":"AllReduce","text":["436.0","361.7","160.1","99.6","84.7","64.9","32.9"],"textposition":"bottom center","x":[0,1,2,3,4,5,6],"y":[435.9668115942029,361.74920529801324,160.13950738916256,99.56427561837455,84.74052884615384,64.92543661971831,32.937847222222224],"type":"scatter"},{"fill":"toself","fillcolor":"rgba(232,137,171,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[264.93,226.26999999999998,229.40999999999997,178.47899999999998,126.6575,77.026,44.4165,6.1535,12.314,24.525000000000002,47.147,40.757000000000005,147.97500000000002,239.55200000000002],"type":"scatter"},{"line":{"color":"#e889ab","width":3},"marker":{"size":10,"symbol":"square"},"mode":"lines+markers","name":"AllGather","x":[0,1,2,3,4,5,6],"y":[249.84884057971013,184.61324503311258,118.96753694581281,68.99752650176679,54.972283653846155,27.969183098591547,11.038298611111111],"type":"scatter"},{"fill":"toself","fillcolor":"rgba(206,192,250,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[264.64599999999996,226.37,215.492,177.54299999999998,126.4825,77.289,45.1295,6.1005,12.39,24.544999999999998,46.802,41.176,146.1,240.804],"type":"scatter"},{"line":{"color":"#cec0fa","width":3},"marker":{"size":10,"symbol":"triangle-up"},"mode":"lines+markers","name":"ReduceScatter","x":[0,1,2,3,4,5,6],"y":[249.72898550724636,181.5535761589404,115.7576354679803,68.55106007067138,54.524230769230776,27.944281690140844,11.069652777777778],"type":"scatter"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""
}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"title":{"text":"Number of Nodes"},"tickmode":"array","tickvals":[0,1,2,3,4,5,6],"ticktext":["1","2","4","8","16","32","64"]},"yaxis":{"title":{"text":"Bandwidth 
(GB\u002fs)"},"range":[0,480.0],"gridcolor":"rgba(0,0,0,0.1)"},"legend":{"x":0.85,"y":1,"bgcolor":"rgba(255,255,255,0.5)"},"margin":{"l":80,"r":80,"t":80,"b":80},"title":{"text":"Communication Bandwidth by Number of Nodes (size=256MB)"},"width":1000,"height":410}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/pp_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
+ <div> <div id="8db330ae-52ab-4193-8ee5-dfa289fe4609" class="plotly-graph-div" style="height:410px; width:800px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("8db330ae-52ab-4193-8ee5-dfa289fe4609")) { Plotly.newPlot( "8db330ae-52ab-4193-8ee5-dfa289fe4609", [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[14.95703125,14.95703125,14.95703125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[14.95703125,14.95703125,14.95703125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[59.828125,59.828125,59.828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[4.25,17.0,68.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[2.603515625,2.603515625,2.603515625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[2.603515625,2.603515625,2.603515625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[10.4140625,10.4140625,10.4140625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[4.25,17.0,68.0],"type":"bar","xaxis":"x2","yaxis":"y2"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""
}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.45],"title":{"text":"Sequence 
Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray","title":{"text":"Memory Usage (GB)"}},"xaxis2":{"anchor":"y2","domain":[0.55,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No Parallelism","x":0.225,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"PP=8","x":0.775,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"}],"title":{"text":"Memory Usage for 8B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":800,"height":410}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/tp_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id="eed45cdb-8f9e-4c54-b31f-85062d3362e8" class="plotly-graph-div" style="height:400px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("eed45cdb-8f9e-4c54-b31f-85062d3362e8")) { Plotly.newPlot( "eed45cdb-8f9e-4c54-b31f-85062d3362e8", [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[526.0,526.0,526.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[21.25,85.0,340.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[65.75,65.75,65.75],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[8.125,32.5,130.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[32.875,32.875,32.875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[7.1875,28.75,115.0],"type":"bar","xaxis":"x3","yaxis":"y3"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""
}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence 
Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"title":{"text":"Memory Usage (GB)"},"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No Parallelism (TP-1)","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=8","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=16","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 70B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":400}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/tp_scaling.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id="8de1ceac-2d60-4368-8691-415db492ed15" class="plotly-graph-div" style="height:400px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("8de1ceac-2d60-4368-8691-415db492ed15")) { Plotly.newPlot( "8de1ceac-2d60-4368-8691-415db492ed15", [{"marker":{"color":"#4ea5b7"},"name":"Tokens\u002fsec\u002fGPU","width":0.7,"x":["2","4","8","16","32"],"y":[13923.18,12420.76,10903.32,6245.6,2146.44],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[12420.76],"marker":{"color":"#e889ab"},"name":"Performance Drop","showlegend":true,"width":0.0875,"x":["4"],"y":[1502.42],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[10903.32],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["8"],"y":[1517.4400000000005],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[6245.6],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["16"],"y":[4657.719999999999],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[2146.44],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["32"],"y":[4099.16],"type":"bar","xaxis":"x","yaxis":"y"},{"marker":{"color":"#cec0fa"},"name":"Max Batch Size","text":["3","8","12","16","20"],"textposition":"inside","width":0.7,"x":["2","4","8","16","32"],"y":[3,8,12,16,20],"type":"bar","xaxis":"x2","yaxis":"y2"}], {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201
a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"]
,[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.45],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"Tokens\u002fsec\u002fGPU"},"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.55,1.0],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"title":{"text":"Maximum Batch Size"},"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Throughput Scaling with TP (3B Model)","x":0.225,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Maximum Batch Size per TP Value","x":0.775,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-10.8%","x":1,"xanchor":"center","xref":"x","xshift":30,"y":13171.970000000001,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-12.2%","x":2,"xanchor":"center","xref":"x","xshift":30,"y":11662.04,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-42.7%","x":3,"xanchor":"center","xref":"x","xshift":30,"y":8574.46,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-65.6%","x":4,"xanchor":"center","xref":"x","xshift":30,"y":4196.02,"yanchor":"middle","yref":"y"}],"legend":{"x":0.55,"y":1.0},"width":1000,"height":400,"barmode":"stack"}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/tp_sp_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id="16242941-9fab-40ed-996e-ddd93a5a627c" class="plotly-graph-div" style="height:410px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("16242941-9fab-40ed-996e-ddd93a5a627c")) { Plotly.newPlot( "16242941-9fab-40ed-996e-ddd93a5a627c", [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[526.0,526.0,526.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[65.75,65.75,65.75],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[32.875,32.875,32.875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[21.25,85.0,340.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[2.65625,10.625,42.5],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[1.328125,5.3125,21.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""
}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence 
Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray","title":{"text":"Memory Usage (GB)"}},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No Parallelism","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=8 (with SP)","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=16 (with SP)","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 70B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":410}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/tp_sp_scaling.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id="e996eac5-dceb-42af-9cc6-d21f247621f5" class="plotly-graph-div" style="height:400px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("e996eac5-dceb-42af-9cc6-d21f247621f5")) { Plotly.newPlot( "e996eac5-dceb-42af-9cc6-d21f247621f5", [{"marker":{"color":"#4ea5b7"},"name":"Tokens\u002fsec\u002fGPU","width":0.7,"x":["2","4","8","16","32"],"y":[14167.25,13460.16,10888.53,6159.3,3609.73],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[13460.16],"marker":{"color":"#e889ab"},"name":"Performance Drop","showlegend":true,"width":0.0875,"x":["4"],"y":[707.0900000000001],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[10888.53],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["8"],"y":[2571.629999999999],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[6159.3],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["16"],"y":[4729.2300000000005],"type":"bar","xaxis":"x","yaxis":"y"},{"base":[3609.73],"marker":{"color":"#e889ab"},"showlegend":false,"width":0.0875,"x":["32"],"y":[2549.57],"type":"bar","xaxis":"x","yaxis":"y"},{"marker":{"color":"#cec0fa"},"name":"Max Batch Size","text":["4","10","20","40","100"],"textposition":"inside","width":0.7,"x":["2","4","8","16","32"],"y":[4,10,20,40,100],"type":"bar","xaxis":"x2","yaxis":"y2"}], {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.222222222
2222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[
0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.45],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"Tokens\u002fsec\u002fGPU"},"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.55,1.0],"title":{"text":"Tensor Parallelism (TP)"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"title":{"text":"Maximum Batch Size"},"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Throughput Scaling with TP\u002fSP (3B Model)","x":0.225,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Maximum Batch Size per TP Value","x":0.775,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-5.0%","x":1,"xanchor":"center","xref":"x","xshift":30,"y":13813.705,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-19.1%","x":2,"xanchor":"center","xref":"x","xshift":30,"y":12174.345000000001,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-43.4%","x":3,"xanchor":"center","xref":"x","xshift":30,"y":8523.915,"yanchor":"middle","yref":"y"},{"font":{"color":"#e889ab"},"showarrow":false,"text":"-41.4%","x":4,"xanchor":"center","xref":"x","xshift":30,"y":4884.515,"yanchor":"middle","yref":"y"}],"legend":{"x":0.55,"y":1.0},"width":1000,"height":400,"barmode":"stack"}, {"responsive": true} ) }; </script> </div>
dist/assets/data/fragments/zero3_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id="be6d1423-335a-45b6-a848-dc4d0d70706f" class="plotly-graph-div" style="height:400px; width:1000px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("be6d1423-335a-45b6-a848-dc4d0d70706f")) { Plotly.newPlot( "be6d1423-335a-45b6-a848-dc4d0d70706f", [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[15.0,15.0,15.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[15.0,15.0,15.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[60.0,60.0,60.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[4.25,17.0,68.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[15.0,15.0,15.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[15.0,15.0,15.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[7.5,7.5,7.5],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[4.25,17.0,68.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[15.0,15.0,15.0],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[1.875,1.875,1.875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[7.5,7.5,7.5],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[4.25,17.0,68.0],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[1.875,1.875,1.875],"type":"bar","xaxis":"x4","yaxis":"y4"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[1.875,1.875,1.875],"type":"bar","xaxis":"x4","yaxis":"y4"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[7.5,7.5,7.5],"type":"bar","xaxis":"x4","yaxis":"y4"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[4.25,17.0,68.0],"type":"bar","xaxis":"x4","yaxis":"y4"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""
}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2125],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"Memory Usage 
(GB)"},"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.2625,0.475],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.525,0.7375],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"showgrid":true,"gridcolor":"LightGray"},"xaxis4":{"anchor":"y4","domain":[0.7875,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis4":{"anchor":"x4","domain":[0.0,1.0],"matches":"y","showticklabels":false,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"DP=8","x":0.10625,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"DP=8 Zero-1","x":0.36875,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"DP=8 Zero-2","x":0.6312500000000001,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"DP=8 Zero-3","x":0.89375,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x4 domain","y0":80,"y1":80,"yref":"y4"}],"title":{"text":"Memory Usage for 8B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":400}, {"responsive": true} ) }; </script> </div>
dist/index.html CHANGED
@@ -1580,21 +1580,28 @@
1580
 
1581
 <p>Congratulations, reader! You have now seen all 5 parallelism strategies you can use to scale model training:</p>
1582
  <ol>
1583
- <li>Data Parallelism – along the batch dimension (including ZeRO)</li>
1584
- <li>Tensor Parallelism - along the hidden dimension</li>
1585
- <li>Sequence and Context Parallelism - along the sequence dimension</li>
1586
- <li>Pipeline Parallelism - along the model layers</li>
1587
- <li>Expert Parallelism - along the model experts</li>
1588
  </ol>
1589
 
1590
- <p>At this stage, one thing you are probably curious about is how all these parallelism strategies (and ZeRO) compare to and interact with each other. In a nutshell, which ones should we use, and how should we combine them?</p>
 
 
 
 
 
1591
 
1592
- <p>Let’s take a look at their similarities and interplay. We'll start by putting Pipeline Parallelism and ZeRO-3 side by side, as they have interesting similarities and differences.</p>
 
 
1593
 
1594
- <p><strong>Pipeline parallelism vs. ZeRO-3 -</strong> Both are ways to partition the model weights over several GPUs and to perform communication/computation along the model depth axis (for example, in ZeRO-3 we prefetch the weights of the next layer while computing the current one). This means that in both cases full layers are computed on each device, as opposed to TP, where the layers themselves are sharded for the computation.</p>
1595
  <aside>In the following we say “a layer” to simplify what should be in general called “a set of layer” (as the basis sharding unit of the model).</aside>
1596
 
1597
- <p>However, there are a few major differences between the two:</p>
1598
 
1599
  <div class="l-body">
1600
  <table>
@@ -1635,50 +1642,50 @@
1635
  </table>
1636
  </div>
1637
 
1638
- <p>As you can see, ZeRO-3 and PP sove the same challenge through quite different approaches, whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory for each micro-batch in PP to minimize as much as possible the communication overhead.</p>
1639
 
1640
- <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them don't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1!</p>
1641
 
1642
- <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and interoperable with both Pipeline Parallelism and ZeRO-3, because it relies on the distributive property of matrix multiplication that allows weights and activations to be sharded and computed independently before being combined.</p>
1643
 
1644
  <img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" style="width: 1000px; max-width: none;" />
1645
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1646
 
1647
 
1648
- <p>In practice TP has two important limitations we've discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point at which communication overhead begins to dominate. Second, unlike ZeRO and PP which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
1649
-
1650
- <p>When combining parallelism strategies, TP will typically be kept for high-speed intra-node communications while ZeRO-3 or PP can use parallelism groups spanning lower speed inter-node communications, since their communication patterns are more amenable to scaling. The main consideration is organizing the GPU groups efficiently for each parallelism dimension to maximize throughput and minimize communication overhead, while being mindful of TP's scaling limitations.</p>
1651
 
 
1652
 
1653
- <p><strong>Context Parallelism</strong> and <strong>Expert Parallelism</strong> also help us sharding activations, and can be seen as complimentary to TP The former handles long sequences while the latter enables distributed Mixture of Experts training.</p>
1654
 
1655
- <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. This is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where even with full activation recomputation the memory requirements for attention would be prohibitive on a single GPU.</p>
1656
 
1657
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1000px; max-width: none;" />
1658
 
1659
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1660
 
1661
 
1662
- <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to relevant experts during computation. The key communication pattern in EP is the all-to-all operation needed to route tokens to their assigned experts and gather the results back. While this introduces some communication overhead, it enables scaling model capacity significantly since each token only needs to compute through a fraction of the total parameters. This partitioning of experts across GPUs becomes essential when working with models that have a large number of experts, like DeepSeek which uses 256 experts.</p>
 
1663
 
1664
- <p>It's worth noting the scope of impact for these different parallelism strategies:</p>
 
 
 
 
 
 
 
 
 
1665
 
1666
  <ul>
1667
- <li>Tensor Parallelism (with Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1668
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1669
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
 
1670
  </ul>
1671
 
1672
- <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1000px; max-width: none;" />
1673
-
1674
- <div class="note-box">
1675
- <p class="note-box-title">📝 Note</p>
1676
- <div class="note-box-content">
1677
- <p>This similarity between EP and DP in terms of input handling is why some implementations consider Expert Parallelism to be a subgroup of Data Parallelism, with the key difference being that EP uses specialized expert routing rather than having all GPUs process inputs through identical model copies.</p>
1678
- </div>
1679
- </div>
1680
-
1681
-
1682
  <table>
1683
  <thead>
1684
  <tr>
@@ -1711,25 +1718,22 @@
1711
  </tbody>
1712
  </table>
1713
 
1714
- <p>Which leads us to this beautiful diagram to summarize all what weve seen:</p>
 
1715
 
1716
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1000px; max-width: none;"/></p>
1717
 
1718
- <p>And to have an idea of the memory benefits of each parallelism:</p>
1719
 
1720
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1000px; max-width: none;"/>
1721
 
1722
- <h2>How to Find the Best Training Configuration</h2>
1723
-
1724
- <p>We’ve now covered all the parallelism techniques that are actually used to distribute and training larger models. There remain a general question: which ones should we choose and which ones are best combined? We touched a little bit on this at the end of the last section but in this section we will walk through the decision process step by step.</p>
1725
-
1726
- <p>First let's have a quick look at each parallel strategy and how it helps and at what cost it comes:</p>
1727
 
1728
  <table>
1729
  <thead>
1730
  <tr>
1731
  <th><strong>Method</strong></th>
1732
- <th><strong>Memory savings</strong></th>
1733
  <th><strong>Parallel/sharding dimension</strong></th>
1734
  <th><strong>Disadvantage</strong></th>
1735
  </tr>
@@ -1737,37 +1741,19 @@
1737
  <tbody>
1738
  <tr>
1739
  <td>DP</td>
1740
- <td>None (replicates everything)</td>
1741
  <td>Batch</td>
1742
  <td>Limited by max batch size</td>
1743
  </tr>
1744
- <tr>
1745
- <td>ZeRO-1</td>
1746
- <td>Optimizer states</td>
1747
- <td>Batch</td>
1748
- <td>Params communication overhead</td>
1749
- </tr>
1750
- <tr>
1751
- <td>ZeRO-2</td>
1752
- <td>Optimizer states and gradients</td>
1753
- <td>Batch</td>
1754
- <td>Params communication overhead</td>
1755
- </tr>
1756
- <tr>
1757
- <td>ZeRO-3</td>
1758
- <td>Optimizer states, gradients, and model parameters</td>
1759
- <td>Batch and Model Params</td>
1760
- <td>Params communication overhead</td>
1761
- </tr>
1762
  <tr>
1763
  <td>PP</td>
1764
- <td>Model</td>
1765
  <td>Model layers</td>
1766
  <td>Idle bubble and complex schedules</td>
1767
  </tr>
1768
  <tr>
1769
  <td>TP/SP</td>
1770
- <td>Model and activations</td>
1771
  <td>Hidden dimension / Sequence length</td>
1772
  <td>Requires high bandwidth communication</td>
1773
  </tr>
@@ -1775,86 +1761,111 @@
1775
  <td>CP</td>
1776
  <td>Activations</td>
1777
  <td>Sequence length</td>
1778
- <td>Communication overhead in attention</td>
1779
  </tr>
1780
  <tr>
1781
  <td>EP</td>
1782
  <td>Experts parameters</td>
1783
  <td>Expert dimension</td>
1784
- <td>Requires MoE layers, routing overhead</td>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1785
  </tr>
1786
  </tbody>
1787
  </table>
1788
 
1789
- <p>Clearly, there is no free lunch for any of those methods but we can actually come up with a few rules that help finding a good starting point. To find the definitive optimal setup you'll have to run a few experiments in any case.</p>
 
 
 
 
 
 
1790
 
1791
  <h3>Step 1: Fitting a Training Step in Memory</h3>
1792
 
1793
- <p>First, we need to figure out how we can fit a single model instance on GPUs. There are two general cases.</p>
1794
 
1795
  <p>GPU-rich case 🤑 - when you have plenty of GPUs available:</p>
1796
  <ul>
1797
- <li>For models under 10B parameters, you can use either Tensor Parallelism or Data Parallelism with ZeRO-3 and Full Recompute across 8 GPUs</li>
1798
  <li>For models between 10B-100B parameters requiring more than 8 GPUs, you have several options:</li>
1799
  <ul>
1800
- <li>Tensor Parallelism (TP=8) combined with Pipeline Parallelism</li>
1801
- <li>Tensor Parallelism (TP=8) with Data Parallelism (ZeRO-3)</li>
1802
- <li>Pure Data Parallelism with ZeRO-3</li>
1803
  </ul>
1804
- <li>At 512+ GPU scale, pure Data Parallelism becomes inefficient - better to combine DP with either Tensor or Pipeline Parallelism</li>
1805
- <li>At 1024+ GPU scale, the recommended setup is TP=8 with Data Parallelism (ZeRO-2) and Pipeline Parallelism</li>
1806
  </ul>
1807
 
1808
  <p>Special considerations:</p>
1809
  <ul>
1810
- <li>For very long sequences, add Context Parallelism (CP) across nodes</li>
1811
- <li>For Mixture of Experts architectures, use Expert Parallelism (EP) across nodes</li>
1812
  </ul>
1813
 
1814
- <p>GPU-poor case 😭 - when running out of GPU resources:</p>
1815
  <ul>
1816
- <li>Enable full activation recomputation to trade compute for memory</li>
1817
- <li>Use gradient accumulation to process larger batches with limited memory
1818
  </li>
1819
  </ul>
1820
 
1821
- <p>Now that we have a single model instance training, we need to make sure we have the right batch size.</p>
1822
 
1823
  <h3>Step 2: Achieving Target Global Batch Size </h3>
1824
 
1825
- <p>Depending on how we setup in step one in terms of micro batch size and DP, our current batch size might be too small or big. </p>
1826
 
1827
- <p>To increase global batch size:</p>
1828
  <ul>
1829
- <li>Scale up Data Parallelism or gradient accumulation steps</li>
1830
- <li>For long sequences, leverage Context Parallelism</li>
1831
  </ul>
1832
 
1833
- <p>To decrease global batch size:</p>
1834
  <ul>
1835
- <li>Reduce Data Parallelism in favor of other parallelization strategies</li>
1836
- <li>For long sequences, reduce Context Parallelism</li>
1837
  </ul>
1838
 
1839
- <p>Ok, now we have the model running in the configuration we want, but is it the fastest way? Let's optimize throughput next.</p>
1840
 
1841
  <h3>Step 3: Optimizing Training Throughput</h3>
1842
 
1843
  <p>So we want to make sure the training is running as fast as possible so all our precious GPUs are well utilized at all times. As long as memory and communication aren't bottlenecks we can try the following:</p>
1844
 
1845
  <ul>
1846
- <li>Scale up Tensor Parallelism within node to reduce other parallelism requirements</li>
1847
- <li>Increase Data Parallelism with ZeRO-3 while maintaining target batch size</li>
1848
- <li>When Data Parallelism communication becomes a bottleneck, transition to Pipeline Parallelism</li>
1849
- <li>Try scaling up different parallelisms, and fitting max micro batch size (mbs) to find optimal balance between max GBS, model size, compute, and communication.</li>
 
1850
  </ul>
1851
 
1852
- <p>We can roughly summarize the journey to the best configuration in the following diagram:</p>
1853
 
1854
  <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
 
1855
 
1856
-
1857
- <p>This concludes our very deep dive into the distribution methods of 5D parallelism. However, besides scaling our model efficiently across GPUs there is another way to improve model throughput and memory management. </p>
1858
 
1859
  <p>Time to turn the lights off and activate CUDA mode! </p>
1860
 
 
1580
 
1581
  <p>Congratulation reader, you have now seen all 5 parallelism strategies you can use to scale model training: </p>
1582
  <ol>
1583
+ <li>Data Parallelism (DP) – along the batch dimension</li>
1584
+ <li>Tensor Parallelism (TP) - along the hidden dimension</li>
1585
+ <li>Sequence and Context Parallelism (SP/CP) - along the sequence dimension</li>
1586
+ <li>Pipeline Parallelism (PP) - along the model layers</li>
1587
+ <li>Expert Parallelism (EP) - along the model experts</li>
1588
  </ol>
1589
 
1590
+ <p>As well as the 3 ZeRO strategies which can be combined with Data Parallelism for memory reduction: </p>
1591
+ <ol>
1592
+ <li>ZeRO-1 – sharding optimizer states among the DP replicas</li>
1593
+ <li>ZeRO-2 – sharding optimizer states and gradients among the DP replicas</li>
1594
+ <li>ZeRO-3 – sharding optimizer states, gradients and parameters among the DP replicas</li>
1595
+ </ol>
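+ <p>To make these stages a bit more tangible, here is a rough back-of-the-envelope sketch of the per-GPU memory taken by model states under each ZeRO stage, assuming bf16 parameters and gradients with fp32 Adam moments; the byte counts and the 8B/DP=8 example are assumptions you should adapt to your own setup:</p>
+ <pre><code class="language-python">
+ GIB = 2**30
+
+ def model_state_memory_gib(n_params, dp, zero_stage,
+                            bytes_param=2, bytes_grad=2, bytes_opt=8):
+     """Per-GPU memory (GiB) for parameters, gradients and optimizer states."""
+     params = n_params * bytes_param
+     grads = n_params * bytes_grad
+     opt = n_params * bytes_opt          # e.g. fp32 momentum + variance for Adam
+     if zero_stage >= 1: opt /= dp       # ZeRO-1: shard optimizer states
+     if zero_stage >= 2: grads /= dp     # ZeRO-2: also shard gradients
+     if zero_stage >= 3: params /= dp    # ZeRO-3: also shard parameters
+     return (params + grads + opt) / GIB
+
+ for stage in (0, 1, 2, 3):
+     print(f"ZeRO-{stage}: {model_state_memory_gib(8e9, dp=8, zero_stage=stage):.1f} GiB/GPU")
+ </code></pre>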
1596
 
1597
+ <p>At this stage, one aspect you are probably curious about is how all these parallelism and ZeRO strategies compare to, and interact with, each other. In other words, which ones should we use and combine efficiently, and which ones should we rather keep separate?</p>
1598
+
1599
+ <p>Let’s take a look at the similarities and interplay. We'll start by comparing Pipeline Parallelism and ZeRO-3 side by side, as they have some very close similarities but also important differences.</p>
1600
 
1601
+ <p><strong>Pipeline Parallelism vs. ZeRO-3 -</strong> Both PP and ZeRO-3 are ways to partition the model weights over several GPUs and perform communication/computation along the model depth axis (for example in ZeRO-3, we prefetch the next layer while computing). This means in both cases full layer operations are computed on each device, as opposed to TP or EP for instance, in which computation is performed on sub-layer units.</p>
1602
  <aside>In the following we say “a layer” to simplify what should be in general called “a set of layer” (as the basis sharding unit of the model).</aside>
1603
 
1604
+ <p>However, there are a few major differences between the PP and ZeRO-3 approaches:</p>
1605
 
1606
  <div class="l-body">
1607
  <table>
 
1642
  </table>
1643
  </div>
1644
 
1645
+ <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches, and the choice between the two will depend on whether you decide to focus communication on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize unnecessary communication overhead as much as possible.</p>
1646
 
1647
+ <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1648
 
1649
+ <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and can be combined with both Pipeline Parallelism and ZeRO-3 as it relies on the distributive property of matrix multiplications, which allows weights and activations to be sharded and computed independently before being combined.</p>
1650
 
1651
  <img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" style="width: 1000px; max-width: none;" />
1652
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
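+ <p>As a minimal NumPy sketch of this distributive property, here is a toy column-parallel linear layer: each hypothetical TP rank holds one column shard of the weight matrix and computes its local matmul independently, and concatenating the shards (an all-gather in a real implementation) recovers the full output:</p>
+ <pre><code class="language-python">
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(4, 512))          # activations (batch * seq, hidden)
+ W = rng.normal(size=(512, 2048))       # full weight matrix
+ tp = 4
+ W_shards = np.split(W, tp, axis=1)     # one column shard per TP rank
+
+ Y_shards = [X @ w for w in W_shards]   # independent local matmul on each rank
+ Y = np.concatenate(Y_shards, axis=1)   # "all-gather" of the output shards
+
+ assert np.allclose(Y, X @ W)           # same result as the unsharded computation
+ </code></pre>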
1653
 
1654
 
1655
+ <p>The main reason we don't want to rely on TP alone for parallelism is that, in practice, TP has two limitations we've discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point at which communication overhead begins to dominate. Second, unlike ZeRO and PP, which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
 
 
1656
 
1657
+ <p>As a consequence, when combining parallelism strategies, TP will typically be kept for high-speed intra-node communications, while ZeRO-3 or PP can be used for parallelism groups spanning lower-speed inter-node communications, as their communication patterns require less bandwidth (for PP) or can be more easily overlapped with computation (for ZeRO-3). The main consideration when combining these techniques is to organize the GPUs efficiently into groups for each parallelism dimension to maximize throughput and minimize communication overhead, while being mindful of TP's scaling limitations. For instance, the groups of GPUs communicating for TP should be kept inside nodes.</p>
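+ <p>As a small illustration (with hypothetical sizes) of this grouping, here is how ranks are often laid out so that TP groups stay inside a node while DP/ZeRO groups span nodes; real frameworks build the corresponding process groups through their own APIs:</p>
+ <pre><code class="language-python">
+ # 4 nodes of 8 GPUs each; TP kept intra-node, DP/ZeRO-3 spanning nodes.
+ world_size, gpus_per_node, tp = 32, 8, 8
+ dp = world_size // tp
+
+ # ranks [0..7] share a node and form one TP group, ranks [8..15] the next, ...
+ tp_groups = [list(range(start, start + tp)) for start in range(0, world_size, tp)]
+
+ # DP/ZeRO groups take one rank per node, e.g. [0, 8, 16, 24] -> inter-node traffic
+ dp_groups = [list(range(offset, world_size, tp)) for offset in range(tp)]
+
+ assert all(len(g) == tp for g in tp_groups) and all(len(g) == dp for g in dp_groups)
+ </code></pre>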
1658
 
1659
+ <p><strong>Context Parallelism</strong> and <strong>Expert Parallelism</strong> also help us shard activations, and can be seen as complementary to TP. The first one handles long sequences while the second enables distributed Mixture of Experts training, and they can be combined together without any particular issue.</p>
1660
 
1661
+ <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. As we saw in the <a target="_self" href="#context_parallelism">CP section</a>, this is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where, even when using full activation recomputation, the memory requirements for attention would be prohibitive on a single GPU.</p>
1662
 
1663
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1000px; max-width: none;" />
1664
 
1665
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
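+ <p>To give a feel for the ring pattern, here is a toy single-process NumPy simulation of ring attention: each simulated rank keeps its query chunk fixed while key/value chunks rotate around the ring, and partial results are merged with an online softmax. This is only a sketch of the idea (no masking, no real communication):</p>
+ <pre><code class="language-python">
+ import numpy as np
+
+ def ring_attention_sim(Q, K, V, n_ranks):
+     S, d = Q.shape
+     c = S // n_ranks
+     Qs = [Q[i*c:(i+1)*c] for i in range(n_ranks)]
+     KV = [(K[i*c:(i+1)*c], V[i*c:(i+1)*c]) for i in range(n_ranks)]
+     outs = []
+     for r in range(n_ranks):
+         q = Qs[r] / np.sqrt(d)
+         m = np.full((c, 1), -np.inf)   # running max of the scores
+         l = np.zeros((c, 1))           # running softmax normalizer
+         o = np.zeros((c, d))           # running un-normalized output
+         for step in range(n_ranks):
+             k, v = KV[(r - step) % n_ranks]   # chunk "received" at this ring step
+             s = q @ k.T
+             m_new = np.maximum(m, s.max(axis=-1, keepdims=True))
+             p, scale = np.exp(s - m_new), np.exp(m - m_new)
+             l = l * scale + p.sum(axis=-1, keepdims=True)
+             o = o * scale + p @ v
+             m = m_new
+         outs.append(o / l)
+     return np.concatenate(outs)
+
+ # sanity check against vanilla full attention
+ rng = np.random.default_rng(0)
+ Q, K, V = rng.normal(size=(3, 16, 8))
+ scores = (Q / np.sqrt(8)) @ K.T
+ w = np.exp(scores - scores.max(axis=-1, keepdims=True))
+ w /= w.sum(axis=-1, keepdims=True)
+ assert np.allclose(ring_attention_sim(Q, K, V, n_ranks=4), w @ V)
+ </code></pre>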
1666
 
1667
 
1668
+ <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to relevant experts during computation. The key communication operation in EP is the `all-to-all` operation that routes tokens to their assigned experts and gathers the results back. While this operation introduces some communication overhead, it enables scaling model capacity significantly since each token is only processed during inference (and training) by a much smaller fraction of the total parameters. In terms of distributed training/inference, partitioning experts across GPUs becomes relevant when models scale to a large number of experts.</p>
1669
+ <aside>For instance, DeepSeek-V3 uses 256 experts.</aside>
1670
 
1671
+ <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1000px; max-width: none;" />
1672
+
1673
+ <div class="note-box">
1674
+ <p class="note-box-title">📝 Note</p>
1675
+ <div class="note-box-content">
1676
+ <p>This similarity between EP and DP in terms of input handling is why some implementations consider Expert Parallelism to be a subgroup of Data Parallelism, with the key difference being that EP uses specialized expert routing rather than having all GPUs process inputs through identical model copies.</p>
1677
+ </div>
1678
+ </div>
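+ <p>The dispatch/combine pattern behind these `all-to-all` operations can be sketched with a toy single-process simulation (hypothetical sizes, one expert per simulated rank): tokens are sent to the rank hosting their assigned expert, processed there, and routed back to their original slot:</p>
+ <pre><code class="language-python">
+ import numpy as np
+
+ n_ranks, d = 4, 16
+ rng = np.random.default_rng(0)
+ tokens = [rng.normal(size=(8, d)) for _ in range(n_ranks)]                 # local tokens per rank
+ assignments = [rng.integers(0, n_ranks, size=8) for _ in range(n_ranks)]   # expert id per token
+ experts = [lambda x, w=rng.normal(size=(d, d)): x @ w for _ in range(n_ranks)]
+
+ # 1) dispatch ("all-to-all"): send each token to the rank hosting its expert
+ inbox = [[] for _ in range(n_ranks)]
+ origin = [[] for _ in range(n_ranks)]        # remember where each token came from
+ for src in range(n_ranks):
+     for i, dst in enumerate(assignments[src]):
+         inbox[dst].append(tokens[src][i])
+         origin[dst].append((src, i))
+
+ # 2) local expert computation on the received tokens
+ processed = [experts[r](np.stack(inbox[r])) if inbox[r] else np.empty((0, d))
+              for r in range(n_ranks)]
+
+ # 3) combine (reverse "all-to-all"): send results back to the original rank/slot
+ outputs = [np.zeros_like(t) for t in tokens]
+ for r in range(n_ranks):
+     for row, (src, i) in zip(processed[r], origin[r]):
+         outputs[src][i] = row
+ </code></pre>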
1679
+
1680
+ <p><strong>Scope and focus</strong> Let's also quickly summarize the sub-parts of the model where some of these different parallelism strategies have the most impact:</p>
1681
 
1682
  <ul>
1683
+ <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1684
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1685
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1686
+ <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component, with the exception that modules and layers need to be balanced in Pipeline Parallelism; the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1687
  </ul>
1688
 
 
 
 
 
 
 
 
 
 
 
1689
  <table>
1690
  <thead>
1691
  <tr>
 
1718
  </tbody>
1719
  </table>
1720
 
1721
+ <p><strong>Summarizing it all–</strong> Now, what about gathering all the techniques we've seen into a single diagram combining them all? Yes, we're up for the challenge!</p>
1722
+ <p>In this summary diagram, you will find illustrated the activations and modules for a single transformer layer – in its MoE variant. We also illustrate the various directions of parallelism and the communication operations we've been discussing in all the previous sections.</p>
1723
 
1724
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1000px; max-width: none;"/></p>
1725
 
1726
+ <p>We can also represent side-by-side a <strong>full overview</strong> of the memory savings for each one of these strategies. We'll plot them with different sequence lengths, as well as with selective (top) and full (bottom) recomputation, so you can see how they all play with activations:</p>
1727
 
1728
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1000px; max-width: none;"/>
1729
 
1730
+ <p>Let's finish this section with a high-level view of all of these techniques, their main underlying ideas, and their major bottlenecks:</p>
 
 
 
 
1731
 
1732
  <table>
1733
  <thead>
1734
  <tr>
1735
  <th><strong>Method</strong></th>
1736
+ <th><strong>Memory savings apply specifically to</strong></th>
1737
  <th><strong>Parallel/sharding dimension</strong></th>
1738
  <th><strong>Disadvantage</strong></th>
1739
  </tr>
 
1741
  <tbody>
1742
  <tr>
1743
  <td>DP</td>
1744
+ <td>Activations (reduce local batch size)</td>
1745
  <td>Batch</td>
1746
  <td>Limited by max batch size</td>
1747
  </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1748
  <tr>
1749
  <td>PP</td>
1750
+ <td>Model parameters</td>
1751
  <td>Model layers</td>
1752
  <td>Idle bubble and complex schedules</td>
1753
  </tr>
1754
  <tr>
1755
  <td>TP/SP</td>
1756
+ <td>Model parameters and activations</td>
1757
  <td>Hidden dimension / Sequence length</td>
1758
  <td>Requires high bandwidth communication</td>
1759
  </tr>
 
1761
  <td>CP</td>
1762
  <td>Activations</td>
1763
  <td>Sequence length</td>
1764
+ <td>Adds communication overhead in attention modules</td>
1765
  </tr>
1766
  <tr>
1767
  <td>EP</td>
1768
  <td>Experts parameters</td>
1769
  <td>Expert dimension</td>
1770
+ <td>Requires MoE layers, adds routing communication overhead</td>
1771
+ </tr>
1772
+ <tr>
1773
+ <td>ZeRO-1</td>
1774
+ <td>Optimizer states</td>
1775
+ <td>Sharded among DP replicas</td>
1776
+ <td>Params communication overhead</td>
1777
+ </tr>
1778
+ <tr>
1779
+ <td>ZeRO-2</td>
1780
+ <td>Optimizer states and gradients</td>
1781
+ <td>Sharded among DP replicas</td>
1782
+ <td>Params communication overhead</td>
1783
+ </tr>
1784
+ <tr>
1785
+ <td>ZeRO-3</td>
1786
+ <td>Optimizer states, gradients, and model parameters</td>
1787
+ <td>Sharded among DP replicas</td>
1788
+ <td>Params communication overhead</td>
1789
  </tr>
1790
  </tbody>
1791
  </table>
1792
 
1793
+ <p>Clearly, none of these techniques is a silver bullet for magical scaling, and we'll often have to combine them in one way or another. Can we actually come up with a few rules that help us find a good starting point for selecting and combining them? This will be the topic of our next section.</p>
1794
+
1795
+ <h2>How to Find the Best Training Configuration</h2>
1796
+
1797
+ <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models, as well as how and why they can be combined together. There remains a general question: which ones should we choose in the end, and how should we decide on a specific combination?</p>
1798
+
1799
+ <p>We touched on this a little bit in the previous section, but let's now walk in detail through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties: network bandwidth, GPUs per node, memory per GPU, etc.</p>
1800
 
1801
  <h3>Step 1: Fitting a Training Step in Memory</h3>
1802
 
1803
+ <p>First, we need to figure out how we can fit a full model instance on our GPUs (we focus on a single instance for now - even though we may use DP for ZeRO). There are two general cases.</p>
1804
 
1805
  <p>GPU-rich case 🤑 - when you have plenty of GPUs available:</p>
1806
  <ul>
1807
+ <li>For models under 10B parameters, you can use a single parallelism technique, e.g. Tensor Parallelism or ZeRO-3/DP with Full Recompute across 8 GPUs</li>
1808
  <li>For models between 10B-100B parameters requiring more than 8 GPUs, you have several options:</li>
1809
  <ul>
1810
+ <li>Combining Tensor Parallelism (TP=8) with Pipeline Parallelism</li>
1811
+ <li>Combining Tensor Parallelism (TP=8) with Data Parallelism (ZeRO-3)</li>
1812
+ <li>Using ZeRO-3 alone (i.e. pure Data Parallelism)</li>
1813
  </ul>
1814
+ <li>At 512+ GPU scale, pure Data Parallelism/ZeRO-3 will start to become inefficient due to communication cost - it can then be better to combine DP with either Tensor or Pipeline Parallelism</li>
1815
+ <li>At 1024+ GPU scale, a recommended setup can be Tensor Parallelism TP=8 with Data Parallelism (ZeRO-2) and Pipeline Parallelism</li>
1816
  </ul>
1817
 
1818
  <p>Special considerations:</p>
1819
  <ul>
1820
+ <li>For very long sequences, you will probably want to add Context Parallelism (CP) across nodes.</li>
1821
+ <li>For Mixture of Experts architectures, you will benefit from using Expert Parallelism (EP) across nodes.</li>
1822
  </ul>
1823
 
1824
+ <p>GPU-poor case 😭 - when you might be low on GPU resources:</p>
1825
  <ul>
1826
+ <li>You can enable full activation recomputation to trade some compute for memory (and train a bit slower).</li>
1827
+ <li>You can increase gradient accumulation to process larger batches with limited memory.
1828
  </li>
1829
  </ul>
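+ <p>As a rough illustration only, the Step 1 rules above can be condensed into a small helper; the thresholds and suggested combinations are indicative and should always be validated experimentally on your own cluster:</p>
+ <pre><code class="language-python">
+ def suggest_parallelism(n_params_b, n_gpus, long_context=False, moe=False):
+     """Indicative sketch of the Step 1 heuristics; not a definitive recipe."""
+     if n_gpus >= 1024:
+         plan = "TP=8 + PP + DP (ZeRO-2)"
+     elif n_gpus >= 512:
+         plan = "TP=8 + DP (ZeRO-3), adding PP if DP communication dominates"
+     elif n_params_b >= 10:
+         plan = "TP=8 + PP, TP=8 + DP (ZeRO-3), or pure ZeRO-3"
+     else:
+         plan = "a single technique: TP or ZeRO-3/DP with full recompute on 8 GPUs"
+     if long_context:
+         plan += "; add Context Parallelism across nodes"
+     if moe:
+         plan += "; add Expert Parallelism across nodes"
+     return plan
+
+ print(suggest_parallelism(n_params_b=70, n_gpus=1024, long_context=True))
+ </code></pre>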
1830
 
1831
+ <p>Now that we have a first model instance training, we need to make sure we have the right batch size.</p>
1832
 
1833
  <h3>Step 2: Achieving Target Global Batch Size </h3>
1834
 
1835
+ <p>Depending on where step 1 left us in terms of micro batch size and DP, our current batch size might be too small or too big. It's now time to hit our target batch size.</p>
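+ <p>The global batch size is simply the product of the knobs we can now play with; a quick sanity check (counted in samples, multiply by the sequence length to get tokens):</p>
+ <pre><code class="language-python">
+ def global_batch_size(micro_batch_size, grad_accum_steps, dp_degree):
+     # gbs = mbs * gradient accumulation steps * data-parallel degree
+     return micro_batch_size * grad_accum_steps * dp_degree
+
+ assert global_batch_size(micro_batch_size=2, grad_accum_steps=4, dp_degree=64) == 512
+ </code></pre>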
1836
 
1837
+ <p>To increase our current global batch size:</p>
1838
  <ul>
1839
+ <li>We can scale up Data Parallelism or gradient accumulation steps</li>
1840
+ <li>For long sequences, we can leverage Context Parallelism</li>
1841
  </ul>
1842
 
1843
+ <p>To decrease our current global batch size:</p>
1844
  <ul>
1845
+ <li>We can reduce Data Parallelism in favor of other parallelization strategies</li>
1846
+ <li>For long sequences, we can reduce Context Parallelism</li>
1847
  </ul>
1848
 
1849
+ <p>Ok, now we have the model running in the general configuration we want in terms of model size and batch size, but are we training it the fastest way? Let's now start to optimize throughput as much as possible.</p>
1850
 
1851
  <h3>Step 3: Optimizing Training Throughput</h3>
1852
 
1853
  <p>So we want to make sure the training is running as fast as possible so all our precious GPUs are well utilized at all times. As long as memory and communication aren't bottlenecks we can try the following:</p>
1854
 
1855
  <ul>
1856
+ <li>Scale up Tensor Parallelism (using the fast intra-node bandwidth) until we reach a degree close to the node size, so that we can reduce other parallelism</li>
1857
+ <li>Increase Data Parallelism with ZeRO-3 while keeping target batch size</li>
1858
+ <li>When Data Parallelism communication starts to become a bottleneck, transition to using Pipeline Parallelism</li>
1859
+ <li>Try scaling up different parallelisms one by one</li>
1860
+ <li>Experiment with several micro-batch sizes (mbs) to aim for an optimal balance between max GBS, model size, compute, and communication.</li>
1861
  </ul>
1862
 
1863
+ <!-- <p>We can roughly summarize the journey to the best configuration in the following diagram:</p>
1864
 
1865
  <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1866
+ -->
1867
 
1868
+ <p>This concludes our very deep dive into 5D parallelism. However, besides scaling our model efficiently across GPUs, there is another way to improve model throughput and memory management. It involves a better understanding of how GPUs operate at a low level and is part of the knowledge necessary to take maximal advantage of large GPU clusters.</p>
 
1869
 
1870
  <p>Time to turn the lights off and activate CUDA mode! </p>
1871
 
src/index.html CHANGED
@@ -1580,21 +1580,28 @@
1580
 
1581
  <p>Congratulation reader, you have now seen all 5 parallelism strategies you can use to scale model training: </p>
1582
  <ol>
1583
- <li>Data Parallelism – along the batch dimension (including ZeRO)</li>
1584
- <li>Tensor Parallelism - along the hidden dimension</li>
1585
- <li>Sequence and Context Parallelism - along the sequence dimension</li>
1586
- <li>Pipeline Parallelism - along the model layers</li>
1587
- <li>Expert Parallelism - along the model experts</li>
1588
  </ol>
1589
 
1590
- <p>At this stage, one aspect you are probably curious about is how all these parallelism strategies (and ZeRO) compare to each other and how they interact with each other? In a nutshell, which one should we use and combine?</p>
 
 
 
 
 
1591
 
1592
- <p>Let’s take a look at the similarities and interplay. We'll start by bringing Pipeline parallelism are ZeRO-3 side-by-side as they have interesting similarities and differences.</p>
 
 
1593
 
1594
- <p><strong>Pipeline parallelism vs. ZeRO-3 -</strong> Both are ways to partition the model weights over several GPUs and perform communication/computation along the model depth axis (for example in ZeRO-3, we prefetch the next layer while computing). This means in both cases the full layers are computed on device, as opposed to TP, where the layers are sharded for the computation.</p>
1595
  <aside>In the following we say “a layer” to simplify what should be in general called “a set of layer” (as the basis sharding unit of the model).</aside>
1596
 
1597
- <p>However, there are a few major differences between the two:</p>
1598
 
1599
  <div class="l-body">
1600
  <table>
@@ -1635,50 +1642,50 @@
1635
  </table>
1636
  </div>
1637
 
1638
- <p>As you can see, ZeRO-3 and PP sove the same challenge through quite different approaches, whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory for each micro-batch in PP to minimize as much as possible the communication overhead.</p>
1639
 
1640
- <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them don't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1!</p>
1641
 
1642
- <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and interoperable with both Pipeline Parallelism and ZeRO-3, because it relies on the distributive property of matrix multiplication that allows weights and activations to be sharded and computed independently before being combined.</p>
1643
 
1644
  <img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" style="width: 1000px; max-width: none;" />
1645
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1646
 
1647
 
1648
- <p>In practice TP has two important limitations we've discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point at which communication overhead begins to dominate. Second, unlike ZeRO and PP which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
1649
-
1650
- <p>When combining parallelism strategies, TP will typically be kept for high-speed intra-node communications while ZeRO-3 or PP can use parallelism groups spanning lower speed inter-node communications, since their communication patterns are more amenable to scaling. The main consideration is organizing the GPU groups efficiently for each parallelism dimension to maximize throughput and minimize communication overhead, while being mindful of TP's scaling limitations.</p>
1651
 
 
1652
 
1653
- <p><strong>Context Parallelism</strong> and <strong>Expert Parallelism</strong> also help us sharding activations, and can be seen as complimentary to TP The former handles long sequences while the latter enables distributed Mixture of Experts training.</p>
1654
 
1655
- <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. This is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where even with full activation recomputation the memory requirements for attention would be prohibitive on a single GPU.</p>
1656
 
1657
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1000px; max-width: none;" />
1658
 
1659
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1660
 
1661
 
1662
- <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to relevant experts during computation. The key communication pattern in EP is the all-to-all operation needed to route tokens to their assigned experts and gather the results back. While this introduces some communication overhead, it enables scaling model capacity significantly since each token only needs to compute through a fraction of the total parameters. This partitioning of experts across GPUs becomes essential when working with models that have a large number of experts, like DeepSeek which uses 256 experts.</p>
 
1663
 
1664
- <p>It's worth noting the scope of impact for these different parallelism strategies:</p>
 
 
 
 
 
 
 
 
 
1665
 
1666
  <ul>
1667
- <li>Tensor Parallelism (with Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1668
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1669
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
 
1670
  </ul>
1671
 
1672
- <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1000px; max-width: none;" />
1673
-
1674
- <div class="note-box">
1675
- <p class="note-box-title">📝 Note</p>
1676
- <div class="note-box-content">
1677
- <p>This similarity between EP and DP in terms of input handling is why some implementations consider Expert Parallelism to be a subgroup of Data Parallelism, with the key difference being that EP uses specialized expert routing rather than having all GPUs process inputs through identical model copies.</p>
1678
- </div>
1679
- </div>
1680
-
1681
-
1682
  <table>
1683
  <thead>
1684
  <tr>
@@ -1711,25 +1718,22 @@
1711
  </tbody>
1712
  </table>
1713
 
1714
- <p>Which leads us to this beautiful diagram to summarize all what weve seen:</p>
 
1715
 
1716
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1000px; max-width: none;"/></p>
1717
 
1718
- <p>And to have an idea of the memory benefits of each parallelism:</p>
1719
 
1720
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1000px; max-width: none;"/>
1721
 
1722
- <h2>How to Find the Best Training Configuration</h2>
1723
-
1724
- <p>We’ve now covered all the parallelism techniques that are actually used to distribute and training larger models. There remain a general question: which ones should we choose and which ones are best combined? We touched a little bit on this at the end of the last section but in this section we will walk through the decision process step by step.</p>
1725
-
1726
- <p>First let's have a quick look at each parallel strategy and how it helps and at what cost it comes:</p>
1727
 
1728
  <table>
1729
  <thead>
1730
  <tr>
1731
  <th><strong>Method</strong></th>
1732
- <th><strong>Memory savings</strong></th>
1733
  <th><strong>Parallel/sharding dimension</strong></th>
1734
  <th><strong>Disadvantage</strong></th>
1735
  </tr>
@@ -1737,37 +1741,19 @@
1737
  <tbody>
1738
  <tr>
1739
  <td>DP</td>
1740
- <td>None (replicates everything)</td>
1741
  <td>Batch</td>
1742
  <td>Limited by max batch size</td>
1743
  </tr>
1744
- <tr>
1745
- <td>ZeRO-1</td>
1746
- <td>Optimizer states</td>
1747
- <td>Batch</td>
1748
- <td>Params communication overhead</td>
1749
- </tr>
1750
- <tr>
1751
- <td>ZeRO-2</td>
1752
- <td>Optimizer states and gradients</td>
1753
- <td>Batch</td>
1754
- <td>Params communication overhead</td>
1755
- </tr>
1756
- <tr>
1757
- <td>ZeRO-3</td>
1758
- <td>Optimizer states, gradients, and model parameters</td>
1759
- <td>Batch and Model Params</td>
1760
- <td>Params communication overhead</td>
1761
- </tr>
1762
  <tr>
1763
  <td>PP</td>
1764
- <td>Model</td>
1765
  <td>Model layers</td>
1766
  <td>Idle bubble and complex schedules</td>
1767
  </tr>
1768
  <tr>
1769
  <td>TP/SP</td>
1770
- <td>Model and activations</td>
1771
  <td>Hidden dimension / Sequence length</td>
1772
  <td>Requires high bandwidth communication</td>
1773
  </tr>
@@ -1775,86 +1761,111 @@
1775
  <td>CP</td>
1776
  <td>Activations</td>
1777
  <td>Sequence length</td>
1778
- <td>Communication overhead in attention</td>
1779
  </tr>
1780
  <tr>
1781
  <td>EP</td>
1782
  <td>Experts parameters</td>
1783
  <td>Expert dimension</td>
1784
- <td>Requires MoE layers, routing overhead</td>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1785
  </tr>
1786
  </tbody>
1787
  </table>
1788
 
1789
- <p>Clearly, there is no free lunch for any of those methods but we can actually come up with a few rules that help finding a good starting point. To find the definitive optimal setup you'll have to run a few experiments in any case.</p>
 
 
 
 
 
 
1790
 
1791
  <h3>Step 1: Fitting a Training Step in Memory</h3>
1792
 
1793
- <p>First, we need to figure out how we can fit a single model instance on GPUs. There are two general cases.</p>
1794
 
1795
  <p>GPU-rich case 🤑 - when you have plenty of GPUs available:</p>
1796
  <ul>
1797
- <li>For models under 10B parameters, you can use either Tensor Parallelism or Data Parallelism with ZeRO-3 and Full Recompute across 8 GPUs</li>
1798
  <li>For models between 10B-100B parameters requiring more than 8 GPUs, you have several options:</li>
1799
  <ul>
1800
- <li>Tensor Parallelism (TP=8) combined with Pipeline Parallelism</li>
1801
- <li>Tensor Parallelism (TP=8) with Data Parallelism (ZeRO-3)</li>
1802
- <li>Pure Data Parallelism with ZeRO-3</li>
1803
  </ul>
1804
- <li>At 512+ GPU scale, pure Data Parallelism becomes inefficient - better to combine DP with either Tensor or Pipeline Parallelism</li>
1805
- <li>At 1024+ GPU scale, the recommended setup is TP=8 with Data Parallelism (ZeRO-2) and Pipeline Parallelism</li>
1806
  </ul>
1807
 
1808
  <p>Special considerations:</p>
1809
  <ul>
1810
- <li>For very long sequences, add Context Parallelism (CP) across nodes</li>
1811
- <li>For Mixture of Experts architectures, use Expert Parallelism (EP) across nodes</li>
1812
  </ul>
1813
 
1814
- <p>GPU-poor case 😭 - when running out of GPU resources:</p>
1815
  <ul>
1816
- <li>Enable full activation recomputation to trade compute for memory</li>
1817
- <li>Use gradient accumulation to process larger batches with limited memory
1818
  </li>
1819
  </ul>
1820
 
1821
- <p>Now that we have a single model instance training, we need to make sure we have the right batch size.</p>
1822
 
1823
  <h3>Step 2: Achieving Target Global Batch Size </h3>
1824
 
1825
- <p>Depending on how we setup in step one in terms of micro batch size and DP, our current batch size might be too small or big. </p>
1826
 
1827
- <p>To increase global batch size:</p>
1828
  <ul>
1829
- <li>Scale up Data Parallelism or gradient accumulation steps</li>
1830
- <li>For long sequences, leverage Context Parallelism</li>
1831
  </ul>
1832
 
1833
- <p>To decrease global batch size:</p>
1834
  <ul>
1835
- <li>Reduce Data Parallelism in favor of other parallelization strategies</li>
1836
- <li>For long sequences, reduce Context Parallelism</li>
1837
  </ul>
1838
 
1839
- <p>Ok, now we have the model running in the configuration we want, but is it the fastest way? Let's optimize throughput next.</p>
1840
 
1841
  <h3>Step 3: Optimizing Training Throughput</h3>
1842
 
1843
  <p>So we want to make sure the training is running as fast as possible so all our precious GPUs are well utilized at all times. As long as memory and communication aren't bottlenecks we can try the following:</p>
1844
 
1845
  <ul>
1846
- <li>Scale up Tensor Parallelism within node to reduce other parallelism requirements</li>
1847
- <li>Increase Data Parallelism with ZeRO-3 while maintaining target batch size</li>
1848
- <li>When Data Parallelism communication becomes a bottleneck, transition to Pipeline Parallelism</li>
1849
- <li>Try scaling up different parallelisms, and fitting max micro batch size (mbs) to find optimal balance between max GBS, model size, compute, and communication.</li>
 
1850
  </ul>
1851
 
1852
- <p>We can roughly summarize the journey to the best configuration in the following diagram:</p>
1853
 
1854
  <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
 
1855
 
1856
-
1857
- <p>This concludes our very deep dive into the distribution methods of 5D parallelism. However, besides scaling our model efficiently across GPUs there is another way to improve model throughput and memory management. </p>
1858
 
1859
  <p>Time to turn the lights off and activate CUDA mode! </p>
1860
 
 
1580
 
1581
  <p>Congratulation reader, you have now seen all 5 parallelism strategies you can use to scale model training: </p>
1582
  <ol>
1583
+ <li>Data Parallelism (DP) – along the batch dimension</li>
1584
+ <li>Tensor Parallelism (TP) - along the hidden dimension</li>
1585
+ <li>Sequence and Context Parallelism (SP/CP) - along the sequence dimension</li>
1586
+ <li>Pipeline Parallelism (PP) - along the model layers</li>
1587
+ <li>Expert Parallelism (EP) - along the model experts</li>
1588
  </ol>
1589
 
1590
+ <p>As well as the 3 ZeRO strategies which can be combined with Data Parallelism for memory reduction: </p>
1591
+ <ol>
1592
+ <li>ZeRO-1 – sharding optimizer states among the DP replicas</li>
1593
+ <li>ZeRO-2 – sharding optimizer states and gradients among the DP replicas</li>
1594
+ <li>ZeRO-3 – sharding optimizer states, gradients and parameters among the DP replicas</li>
1595
+ </ol>
1596
 
1597
+ <p>At this stage, one aspect you are probably curious about is how all these parallelism and ZeRO strategies compare to, and interact with, each other. In other words, which ones should we use and combine efficiently, and which ones should we rather keep separate?</p>
1598
+
1599
+ <p>Let’s take a look at the similarities and interplay. We'll start by comparing Pipeline Parallelism and ZeRO-3 side by side, as they have some very close similarities but also important differences.</p>
1600
 
1601
+ <p><strong>Pipeline Parallelism vs. ZeRO-3 -</strong> Both PP and ZeRO-3 are ways to partition the model weights over several GPUs and perform communication/computation along the model depth axis (for example in ZeRO-3, we prefetch the next layer while computing). This means in both cases full layer operations are computed on each device, as opposed to TP or EP for instance, in which computation is performed on sub-layer units.</p>
1602
  <aside>In the following we say “a layer” to simplify what should be in general called “a set of layer” (as the basis sharding unit of the model).</aside>
1603
 
1604
+ <p>However, there are a few major differences between the PP and ZeRO-3 approaches:</p>
1605
 
1606
  <div class="l-body">
1607
  <table>
 
1642
  </table>
1643
  </div>
1644
 
1645
+ <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches, and the choice between the two will depend on whether you decide to focus communication on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize unnecessary communication overhead as much as possible.</p>
1646
 
1647
+ <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1648
 
1649
+ <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and can be combined with both Pipeline Parallelism and ZeRO-3 as it relies on the distributive property of matrix multiplications, which allows weights and activations to be sharded and computed independently before being combined.</p>
1650
 
1651
  <img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" style="width: 1000px; max-width: none;" />
1652
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1653
 
1654
 
1655
+ <p>The main reason we don't want to rely on TP alone for parallelism is that, in practice, TP has two limitations we've discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point at which communication overhead begins to dominate. Second, unlike ZeRO and PP, which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
 
 
1656
 
1657
+ <p>As a consequence, when combining parallelism strategies, TP will typically be kept for high-speed intra-node communications, while ZeRO-3 or PP can be used for parallelism groups spanning lower-speed inter-node communications, as their communication patterns require less bandwidth (for PP) or can be more easily overlapped with computation (for ZeRO-3). The main consideration when combining these techniques is to organize the GPUs efficiently into groups for each parallelism dimension to maximize throughput and minimize communication overhead, while being mindful of TP's scaling limitations. For instance, the groups of GPUs communicating for TP should be kept inside nodes.</p>
1658
 
1659
+ <p><strong>Context Parallelism</strong> and <strong>Expert Parallelism</strong> also help us shard activations, and can be seen as complementary to TP. The first one handles long sequences while the second enables distributed Mixture of Experts training, and they can be combined together without any particular issue.</p>
1660
 
1661
+ <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. As we saw in the <a target="_self" href="#context_parallelism">CP section</a>, this is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where, even when using full activation recomputation, the memory requirements for attention would be prohibitive on a single GPU.</p>
1662
 
1663
  <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1000px; max-width: none;" />
1664
 
1665
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1666
 
1667
 
1668
+ <p><strong>Expert Parallelism (EP)</strong> specifically targets the challenge of training Mixture of Experts (MoE) models by sharding specialized "experts" across GPUs and dynamically routing tokens to relevant experts during computation. The key communication operation in EP is the `all-to-all` operation that routes tokens to their assigned experts and gathers the results back. While this operation introduces some communication overhead, it enables scaling model capacity significantly since each token is only processed during inference (and training) by a much smaller fraction of the total parameters. In terms of distributed training/inference, partitioning experts across GPUs becomes relevant when models scale to a large number of experts.</p>
1669
+ <aside>For instance, DeepSeek-V3 uses 256 experts.</aside>
1670
 
1671
+ <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1000px; max-width: none;" />
1672
+
1673
+ <div class="note-box">
1674
+ <p class="note-box-title">📝 Note</p>
1675
+ <div class="note-box-content">
1676
+ <p>This similarity between EP and DP in terms of input handling is why some implementations consider Expert Parallelism to be a subgroup of Data Parallelism, with the key difference being that EP uses specialized expert routing rather than having all GPUs process inputs through identical model copies.</p>
1677
+ </div>
1678
+ </div>
1679
+
1680
+ <p><strong>Scope and focus</strong> Let's also quickly summarize the sub-parts of the model where some of these different parallelism strategies have the most impact:</p>
1681
 
1682
  <ul>
1683
+ <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1684
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1685
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1686
+ <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component, with the exception that modules and layers need to be balanced in Pipeline Parallelism; the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1687
  </ul>
1688
 
 
 
 
 
 
 
 
 
 
 
1689
  <table>
1690
  <thead>
1691
  <tr>
 
1718
  </tbody>
1719
  </table>
1720
 
1721
+ <p><strong>Summarizing it all–</strong> Now, what about gathering all the techniques we've seen into a single diagram combining them all? Yes, we're up for the challenge!</p>
1722
+ <p>In this summary diagram, you will find illustrated the activations and modules for a single transformer layer – in its MoE variant. We also illustrate the various directions of parallelism and the communication operations we've been discussing in all the previous sections.</p>
1723
 
1724
  <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1000px; max-width: none;"/></p>
1725
 
1726
+ <p>We can also represent side-by-side a <strong>full overview</strong> of the memory savings for each one of these strategies. We'll plot them with different sequence lengths, as well as with selective (top) and full (bottom) recomputation, so you can see how they all play with activations:</p>
1727
 
1728
  <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1000px; max-width: none;"/>
1729
 
1730
+ <p>Let's finish this section with a high-level view of all of these techniques, their main underlying ideas, and their major bottlenecks:</p>
 
 
 
 
1731
 
1732
  <table>
1733
  <thead>
1734
  <tr>
1735
  <th><strong>Method</strong></th>
1736
+ <th><strong>Memory savings apply specifically to</strong></th>
1737
  <th><strong>Parallel/sharding dimension</strong></th>
1738
  <th><strong>Disadvantage</strong></th>
1739
  </tr>
 
1741
  <tbody>
1742
  <tr>
1743
  <td>DP</td>
1744
+ <td>Activations (by reducing the local batch size)</td>
1745
  <td>Batch</td>
1746
  <td>Limited by max batch size</td>
1747
  </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1748
  <tr>
1749
  <td>PP</td>
1750
+ <td>Model parameters</td>
1751
  <td>Model layers</td>
1752
  <td>Idle bubble and complex schedules</td>
1753
  </tr>
1754
  <tr>
1755
  <td>TP/SP</td>
1756
+ <td>Model parameters and activations</td>
1757
  <td>Hidden dimension / Sequence length</td>
1758
  <td>Requires high bandwidth communication</td>
1759
  </tr>
 
1761
  <td>CP</td>
1762
  <td>Activations</td>
1763
  <td>Sequence length</td>
1764
+ <td>Adds communication overhead in attention modules</td>
1765
  </tr>
1766
  <tr>
1767
  <td>EP</td>
1768
  <td>Experts parameters</td>
1769
  <td>Expert dimension</td>
1770
+ <td>Requires MoE layers, adds routing communication overhead</td>
1771
+ </tr>
1772
+ <tr>
1773
+ <td>ZeRO-1</td>
1774
+ <td>Optimizer states</td>
1775
+ <td>Sharded among DP replicas</td>
1776
+ <td>Params communication overhead</td>
1777
+ </tr>
1778
+ <tr>
1779
+ <td>ZeRO-2</td>
1780
+ <td>Optimizer states and gradients</td>
1781
+ <td>Sharded among DP replicas</td>
1782
+ <td>Params communication overhead</td>
1783
+ </tr>
1784
+ <tr>
1785
+ <td>ZeRO-3</td>
1786
+ <td>Optimizer states, gradients, and model parameters</td>
1787
+ <td>Sharded among DP replicas</td>
1788
+ <td>Params communication overhead</td>
1789
  </tr>
1790
  </tbody>
1791
  </table>
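+ <p>To make the idea of combining these dimensions concrete, here is a minimal sketch (plain Python, with hypothetical field names) of a 5D parallel configuration. The only hard constraint it encodes is that the degrees must multiply to the number of GPUs; the concrete numbers below are purely illustrative, and we assume the common arrangement in which Expert Parallelism reuses the Data Parallel ranks:</p>
<pre><code class="language-python">
from dataclasses import dataclass

@dataclass
class ParallelConfig:
    # Hypothetical names for the degree of each parallelism dimension
    dp: int = 1   # Data Parallelism (optionally with ZeRO sharding inside it)
    tp: int = 1   # Tensor Parallelism (+ Sequence Parallelism on the same ranks)
    pp: int = 1   # Pipeline Parallelism
    cp: int = 1   # Context Parallelism
    ep: int = 1   # Expert Parallelism (only meaningful for MoE models)

    def world_size(self) -> int:
        # Assumption: EP is laid out inside the DP group, so it does not
        # multiply the GPU count; it only needs to divide the DP degree.
        return self.dp * self.tp * self.pp * self.cp

cfg = ParallelConfig(dp=16, tp=8, pp=4, cp=2, ep=8)
assert cfg.world_size() == 1024
assert cfg.dp % cfg.ep == 0, "EP degree must divide the DP degree in this sketch"
</code></pre>
+ <p>Whether EP shares ranks with DP (as assumed here) or forms its own dimension depends on the framework; the important point is that every added dimension has to subdivide the same fixed pool of GPUs.</p>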
1792
 
1793
+ <p>Clearly, none of these techniques is a silver bullet for magical scaling, and we'll often have to combine them in one way or another. Can we come up with a few rules that help us find a good starting point for selecting and combining them? This will be the topic of our next section.</p>
1794
+
1795
+ <h2>How to Find the Best Training Configuration</h2>
1796
+
1797
+ <p>We've now covered all the parallelism techniques that are actually used to distribute and train larger models, as well as how and why they can be combined. There remains a general question: which ones should we choose in the end, and how do we decide on a specific combination?</p>
1798
+
1799
+ <p>We touched on this a little in the previous section, but let's now walk in detail through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties: network bandwidth, GPUs per node, memory per GPU, etc.</p>
1800
 
1801
  <h3>Step 1: Fitting a Training Step in Memory</h3>
1802
 
1803
+ <p>First, we need to figure out how we can fit a full model instance on our GPUs (we focus on a single instance for now, even though we may use DP for ZeRO). There are two general cases; a rough memory estimate, like the one sketched after the lists below, can help you decide which one you're in.</p>
1804
 
1805
  <p>GPU-rich case 🤑 - when you have plenty of GPUs available:</p>
1806
  <ul>
1807
+ <li>For models under 10B parameters, you can use a single parallelism technique, e.g. Tensor Parallelism or ZeRO-3/DP with Full Recompute across 8 GPUs</li>
1808
  <li>For models between 10B-100B parameters requiring more than 8 GPUs, you have several options:</li>
1809
  <ul>
1810
+ <li>Combining Tensor Parallelism (TP=8) with Pipeline Parallelism</li>
1811
+ <li>Combining Tensor Parallelism (TP=8) with Data Parallelism (ZeRO-3)</li>
1812
+ <li>Using only ZeRO-3 (i.e. only pure Data Parallelism) </li>
1813
  </ul>
1814
+ <li>At 512+ GPU scale, pure Data Parallelism/ZeRO-3 will start to become inefficient due to communication cost - it can then be better to combine DP with either Tensor or Pipeline Parallelism</li>
1815
+ <li>At 1024+ GPU scale, a recommended setup can be Tensor Parallelism TP=8 with Data Parallelism (ZeRO-2) and Pipeline Parallelism</li>
1816
  </ul>
1817
 
1818
  <p>Special considerations:</p>
1819
  <ul>
1820
+ <li>For very long sequences, you will probably want to add Context Parallelism (CP) across nodes.</li>
1821
+ <li>For Mixture of Experts architectures, you will benefit from using Expert Parallelism (EP) across nodes.</li>
1822
  </ul>
1823
 
1824
+ <p>GPU-poor case 😭 - when you might be low on GPU resources:</p>
1825
  <ul>
1826
+ <li>You can enable full activation recomputation to trade some compute for memory (and train a bit slower).</li>
1827
+ <li>You can increase gradient accumulation to process larger batches with limited memory.
1828
  </li>
1829
  </ul>
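+ <p>To get a rough feeling for which case you are in, a back-of-envelope estimate of the per-GPU memory taken by parameters, gradients, and optimizer states already tells you a lot (activations come on top and depend on sequence length and the recomputation strategy). The sketch below assumes a typical mixed-precision Adam setup with roughly 16 bytes per parameter in total; it is a simplification, not an exact accounting:</p>
<pre><code class="language-python">
def states_memory_gb(n_params_billion, tp=1, pp=1, dp=1, zero_stage=0):
    """Rough per-GPU memory (in GB) for parameters + gradients + optimizer states.

    Assumes bf16 parameters and gradients (2 bytes each) and about 12 bytes per
    parameter of optimizer state (fp32 master weights plus Adam moments).
    """
    n = n_params_billion * 1e9 / (tp * pp)  # parameters held by this rank before ZeRO
    params, grads, optim = 2 * n, 2 * n, 12 * n
    if zero_stage >= 1:
        optim /= dp   # ZeRO-1 shards optimizer states across DP ranks
    if zero_stage >= 2:
        grads /= dp   # ZeRO-2 also shards gradients
    if zero_stage >= 3:
        params /= dp  # ZeRO-3 also shards parameters
    return (params + grads + optim) / 1e9

print(states_memory_gb(8))                        # ~128 GB: too much for one 80 GB GPU
print(states_memory_gb(8, tp=8))                  # ~16 GB per GPU with TP=8
print(states_memory_gb(70, dp=64, zero_stage=3))  # ~17.5 GB per GPU with ZeRO-3 over 64 GPUs
</code></pre>
+ <p>Keep in mind that this ignores activations entirely, which is precisely why full recomputation and gradient accumulation are the first levers to pull when memory is tight.</p>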
1830
 
1831
+ <p>Now that we have a first model instance training, we need to make sure we have the right batch size.</p>
1832
 
1833
  <h3>Step 2: Achieving Target Global Batch Size </h3>
1834
 
1835
+ <p>Depending on where step 1 left us in terms of micro-batch size and DP, our current batch size might be too small or too big. It's now time to hit our target batch size (the quick calculation after the lists below shows how the knobs combine).</p>
1836
 
1837
+ <p>To increase our current global batch size:</p>
1838
  <ul>
1839
+ <li>We can scale up Data Parallelism or gradient accumulation steps</li>
1840
+ <li>For long sequences, we can leverage Context Parallelism</li>
1841
  </ul>
1842
 
1843
+ <p>To decrease our current global batch size:</p>
1844
  <ul>
1845
+ <li>We can reduce Data Parallelism in favor of other parallelization strategies</li>
1846
+ <li>For long sequences, we can reduce Context Parallelism</li>
1847
  </ul>
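+ <p>As a quick reference for how these knobs combine, the global batch size is simply the product of the micro-batch size, the number of gradient accumulation steps, and the DP degree (multiply by the sequence length to express it in tokens). A tiny, hypothetical helper makes this explicit:</p>
<pre><code class="language-python">
def global_batch_size(mbs, grad_accum, dp, seq_len=1):
    """Global batch size; pass seq_len to express it in tokens instead of samples."""
    return mbs * grad_accum * dp * seq_len

print(global_batch_size(2, 8, 64))                # 1024 samples per optimizer step
print(global_batch_size(2, 8, 64, seq_len=4096))  # ~4.2M tokens per optimizer step
</code></pre>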
1848
 
1849
+ <p>Ok, now we have the model running in the general configuration we want in terms of model size and batch size, but are we training it as fast as we can? Let's now optimize throughput as much as possible.</p>
1850
 
1851
  <h3>Step 3: Optimizing Training Throughput</h3>
1852
 
1853
  <p>So we want to make sure the training is running as fast as possible, so that all our precious GPUs are well utilized at all times. As long as memory and communication aren't bottlenecks, we can try the following (a small sweep sketch follows this list):</p>
1854
 
1855
  <ul>
1856
+ <li>Scale up Tensor Parallelism (using the fast intra-node bandwidth) until we reach a degree close to the node size, so that we can reduce other parallelism</li>
1857
+ <li>Increase Data Parallelism with ZeRO-3 while keeping target batch size</li>
1858
+ <li>When Data Parallelism communication starts to become a bottleneck, transition to using Pipeline Parallelism</li>
1859
+ <li>Try scaling up different parallelisms one by one</li>
1860
+ <li>Experiment with several micro-batch sizes (mbs) to aim for an optimal balance between max GBS, model size, compute, and communication.</li>
1861
  </ul>
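+ <p>In practice this last step is empirical: we enumerate candidate configurations that satisfy the memory and batch-size constraints and benchmark each of them for a few steps. The loop below is only a sketch of that process; <code>fits_in_memory</code> and <code>measure_tokens_per_sec</code> are hypothetical callbacks you would implement for your own cluster and framework:</p>
<pre><code class="language-python">
import itertools

def sweep(world_size, target_gbs, fits_in_memory, measure_tokens_per_sec,
          tp_candidates=(1, 2, 4, 8), pp_candidates=(1, 2, 4), mbs_candidates=(1, 2, 4)):
    """Return the highest-throughput configuration among valid candidates."""
    best = None
    for tp, pp, mbs in itertools.product(tp_candidates, pp_candidates, mbs_candidates):
        if world_size % (tp * pp) != 0:
            continue                      # degrees must tile the cluster exactly
        dp = world_size // (tp * pp)
        if target_gbs % (mbs * dp) != 0:
            continue                      # gradient accumulation steps must be an integer
        grad_accum = target_gbs // (mbs * dp)
        cfg = dict(tp=tp, pp=pp, dp=dp, mbs=mbs, grad_accum=grad_accum)
        if not fits_in_memory(cfg):
            continue
        throughput = measure_tokens_per_sec(cfg)  # run a few warmed-up steps
        if best is None or throughput > best[0]:
            best = (throughput, cfg)
    return best
</code></pre>
+ <p>Real sweeps would also vary the ZeRO stage, CP and EP degrees, and the activation recomputation strategy, but the structure stays the same: filter by hard constraints first, then measure.</p>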
1862
 
1863
+ <!-- <p>We can roughly summarize the journey to the best configuration in the following diagram:</p>
1864
 
1865
  <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1866
+ -->
1867
 
1868
+ <p>This concludes our very deep dive into 5D parallelism. However, besides scaling our model efficiently across GPUs, there is another way to improve model throughput and memory management. It involves a better understanding of how GPUs operate at a low level and is necessary knowledge for taking maximal advantage of large GPU clusters.</p>
 
1869
 
1870
  <p>Time to turn the lights off and activate CUDA mode! </p>
1871