lvwerra HF staff committed on
Commit
bf2ffca
·
verified ·
1 Parent(s): 815f1b5
assets/images/ultra-cheatsheet.svg CHANGED
dist/assets/images/ultra-cheatsheet.svg CHANGED
dist/distill.bundle.js CHANGED
@@ -2146,7 +2146,7 @@ function _arrayWithHoles(r) { if (Array.isArray(r)) return r; }
2146
  function bylineTemplate(frontMatter) {
2147
  return "\n <div class=\"byline grid\">\n <div>\n <h3>Authors</h3>\n <div>\n ".concat(frontMatter.authors.map(function (author, i) {
2148
  return "\n <span class=\"author\">\n ".concat(author.personalURL ? "\n <a class=\"name\" href=\"".concat(author.personalURL, "\">").concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</a>" : "\n <span class=\"name\">".concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</span>", "\n </span>\n ");
2149
- }).join(''), "\n </div>\n </div>\n <div >\n <h3>Affiliation</h3>\n <div><a href=\"https://huggingface.co/\">HuggingFace</a>\n </div>\n </div>\n <div >\n <h3>Published</h3>\n <div>Feb 19, 2025</div>\n </div>\n </div>\n");
2150
  }
2151
  var Byline = /*#__PURE__*/function (_HTMLElement4) {
2152
  function Byline() {
 
2146
  function bylineTemplate(frontMatter) {
2147
  return "\n <div class=\"byline grid\">\n <div>\n <h3>Authors</h3>\n <div>\n ".concat(frontMatter.authors.map(function (author, i) {
2148
  return "\n <span class=\"author\">\n ".concat(author.personalURL ? "\n <a class=\"name\" href=\"".concat(author.personalURL, "\">").concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</a>" : "\n <span class=\"name\">".concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</span>", "\n </span>\n ");
2149
+ }).join(''), "\n </div>\n </div>\n <div >\n <h3>Affiliation</h3>\n <div><a href=\"https://huggingface.co/\">Hugging Face</a>\n </div>\n </div>\n <div >\n <h3>Published</h3>\n <div>Feb 19, 2025</div>\n </div>\n </div>\n");
2150
  }
2151
  var Byline = /*#__PURE__*/function (_HTMLElement4) {
2152
  function Byline() {
dist/distill.bundle.js.map CHANGED
The diff for this file is too large to render. See raw diff
 
dist/fragments/banner.html CHANGED
The diff for this file is too large to render. See raw diff
 
dist/fragments/benchmarks_interactive.html CHANGED
The diff for this file is too large to render. See raw diff
 
dist/index.html CHANGED
The diff for this file is too large to render. See raw diff
 
src/distill.js CHANGED
@@ -2097,7 +2097,7 @@ d-appendix > distill-appendix {
2097
  </div>
2098
  <div >
2099
  <h3>Affiliation</h3>
2100
- <div><a href="https://huggingface.co/">HuggingFace</a>
2101
  </div>
2102
  </div>
2103
  <div >
 
2097
  </div>
2098
  <div >
2099
  <h3>Affiliation</h3>
2100
+ <div><a href="https://huggingface.co/">Hugging Face</a>
2101
  </div>
2102
  </div>
2103
  <div >
src/fragments/banner.html CHANGED
The diff for this file is too large to render. See raw diff
 
src/fragments/benchmarks_interactive.html CHANGED
The diff for this file is too large to render. See raw diff
 
src/index.html CHANGED
@@ -1381,7 +1381,7 @@
1381
 
1382
  <ol>
1383
  <li>Send “current keys and values” to the next machine except during the last time step in a non-blocking manner so we can start the following step before this step is finished.</li>
1384
- <li>Locally compute the attention score on the “current keys and values” it already has, which typically involves performing <d-math>Softmax(\frac{QK^T}{\sqrt{d}}) * V</d-math>d-math>.</li>
1385
  <li>Wait to receive keys and values from the previous GPU and then circle back to step 1, where “current keys and values” are now the key/values just received from the previous GPU.</li>
1386
  </ol>
1387
 
 
1381
 
1382
  <ol>
1383
  <li>Send “current keys and values” to the next machine except during the last time step in a non-blocking manner so we can start the following step before this step is finished.</li>
1384
+ <li>Locally compute the attention score on the “current keys and values” it already has, which typically involves performing <d-math>Softmax(\frac{QK^T}{\sqrt{d}}) * V</d-math>.</li>
1385
  <li>Wait to receive keys and values from the previous GPU and then circle back to step 1, where “current keys and values” are now the key/values just received from the previous GPU.</li>
1386
  </ol>
1387