# Metric List

## Automatic metrics for multiple choice tasks

These metrics use the log-likelihood of the different possible targets.

- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_single_token`)
- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`)
- `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored
- `loglikelihood_f1`: Corpus-level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`)
- `mcc`: Matthews correlation coefficient (a measure of agreement between statistical distributions)
- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`)
- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`)
- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`)
- `target_perplexity`: Perplexity of the different choices available.
- `acc_golds_likelihood`: A bit different - it checks whether the average logprob of a single target is above or below 0.5
- `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets
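To make the difference between the two accuracy variants concrete, here is a minimal sketch with hypothetical helper names (not lighteval's actual implementation) of `loglikelihood_acc` versus `loglikelihood_acc_norm`:

```python
# Minimal sketch of loglikelihood accuracy vs. its length-normalized variant.
# `choice_logprobs` is the summed logprob the model assigns to each choice
# continuation; `choice_lengths` is each continuation's length in tokens.

def loglikelihood_acc(choice_logprobs, gold_index):
    # Pick the choice with the highest total logprob.
    pred = max(range(len(choice_logprobs)), key=lambda i: choice_logprobs[i])
    return float(pred == gold_index)

def loglikelihood_acc_norm(choice_logprobs, choice_lengths, gold_index):
    # Normalize by sequence length so longer choices are not penalized
    # simply for having more tokens to assign probability to.
    normed = [lp / n for lp, n in zip(choice_logprobs, choice_lengths)]
    pred = max(range(len(normed)), key=lambda i: normed[i])
    return float(pred == gold_index)

# Example: gold is choice 1; the unnormalized metric prefers the shorter choice 0.
print(loglikelihood_acc([-2.0, -2.5], gold_index=1))                # 0.0
print(loglikelihood_acc_norm([-2.0, -2.5], [1, 5], gold_index=1))   # 1.0
```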
All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall_at_2_single_token` and `mrr_single_token`). When each multichoice option spans only one token (e.g. "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using the single token version of these metrics divides the time spent by the number of choices. Single token evals also include:

- `multi_f1_numeric`: computes the F1 score of all possible choices and averages it.

## Automatic metrics for perplexity and language modeling

These metrics use the log-likelihood of the prompt.

- `word_perplexity`: Perplexity (log probability of the input) weighted by the number of words in the sequence.
- `byte_perplexity`: Perplexity (log probability of the input) weighted by the number of bytes in the sequence.
- `bits_per_byte`: Average number of bits per byte according to model probabilities.
- `log_prob`: Predicted output's average log probability (input's log prob for language modeling).
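These three quantities differ only in how the summed log-likelihood of the sequence is normalized. A minimal sketch, assuming natural-log probabilities (lighteval's internals may differ):

```python
import math

# Minimal sketch: the perplexity-style metrics differ only in the normalizer
# applied to the summed (natural-log) log-likelihood of the sequence.

def word_perplexity(total_logprob: float, num_words: int) -> float:
    # Perplexity normalized by word count.
    return math.exp(-total_logprob / num_words)

def byte_perplexity(total_logprob: float, num_bytes: int) -> float:
    # Perplexity normalized by byte count.
    return math.exp(-total_logprob / num_bytes)

def bits_per_byte(total_logprob: float, num_bytes: int) -> float:
    # Convert nats to bits, then average over bytes.
    return -total_logprob / (num_bytes * math.log(2))

text = "hello world"
total_logprob = -12.3  # example value; in practice, summed from model logits
print(word_perplexity(total_logprob, len(text.split())))
print(byte_perplexity(total_logprob, len(text.encode("utf-8"))))
print(bits_per_byte(total_logprob, len(text.encode("utf-8"))))
```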
## Automatic metrics for generative tasks

These metrics need the model to generate an output. They are therefore slower.

- Base:
  - `perfect_exact_match`: Fraction of instances where the prediction matches the gold exactly.
  - `exact_match`: Fraction of instances where the prediction matches the gold with the exception of border whitespace (= after a `strip` has been applied to both).
  - `quasi_exact_match`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences.
  - `prefix_exact_match`: Fraction of instances where the beginning of the prediction matches the gold with the exception of border whitespace (= after a `strip` has been applied to both).
  - `prefix_quasi_exact_match`: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...)
  - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
  - `f1_score_quasi`: Average F1 score in terms of word overlap between the model output and gold, with both normalized first
  - `f1_score`: Average F1 score in terms of word overlap between the model output and gold, without normalization
  - `f1_score_macro`: Corpus-level macro F1 score
  - `f1_score_micro`: Corpus-level micro F1 score
  - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction (see the sketch after this list).
- Summarization:
  - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
  - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
  - `rouge2`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
  - `rougeL`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
  - `rougeLsum`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap, computed at summary level.
  - `rouge_t5` (BigBench): Corpus-level ROUGE score for all available ROUGE metrics
  - `faithfulness`: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
  - `extractiveness`: Reports the following, based on [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/):
    - `summarization_coverage`: Extent to which the model-generated summaries are extractive fragments from the source document,
    - `summarization_density`: Extent to which the model-generated summaries are extractive summaries based on the source document,
    - `summarization_compression`: Extent to which the model-generated summaries are compressed relative to the source document.
  - `bert_score`: Reports the average BERTScore precision, recall, and F1 score [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and gold summary.
- Translation:
  - `bleu`: Corpus-level BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - uses the sacrebleu implementation.
  - `bleu_1`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap - uses the nltk implementation.
  - `bleu_4`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap - uses the nltk implementation.
  - `chrf`: Character n-gram match F-score.
  - `ter`: Translation edit/error rate.
- Copyright:
  - `copyright`: Reports:
    - `longest_common_prefix_length`: average length of the longest common prefix between model generation and reference,
    - `edit_distance`: average Levenshtein edit distance between model generation and reference,
    - `edit_similarity`: average Levenshtein edit similarity (normalized by length of the longer sequence) between model generation and reference.
- Math:
  - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where LaTeX symbols, units, etc. are removed)
  - `maj_at_4_math`: Majority choice evaluation, using the math normalization for the predictions and gold
  - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where LaTeX symbols, units, etc. are removed)
  - `maj_at_8_gsm8k`: Majority choice evaluation, using the gsm8k normalization for the predictions and gold
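As a concrete illustration of the `maj_at_n` family referenced above, here is a minimal sketch; the real metrics also apply task-specific normalization to each generation first (as in `maj_at_4_math` and `maj_at_8_gsm8k`):

```python
from collections import Counter

# Minimal sketch of maj_at_n majority voting: sample n generations from the
# model and take the most frequent answer as the final prediction.

def maj_at_n(generations: list[str], gold: str) -> float:
    # Counter.most_common(1) returns [(answer, count)] for the top answer.
    prediction, _count = Counter(g.strip() for g in generations).most_common(1)[0]
    return float(prediction == gold.strip())

# Example: 5 samples, 3 of which agree on the correct answer.
print(maj_at_n(["42", "41", "42", "43", "42"], gold="42"))  # 1.0
```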
## LLM-as-Judge

- `llm_judge_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API
- `llm_judge_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the HuggingFace API
- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API. It is used for multi-turn tasks like MT-Bench.
- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the HuggingFace API. It is used for multi-turn tasks like MT-Bench.
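These judge metrics all follow the same shape: send the question and the evaluated model's answer to a stronger model with a grading prompt, then parse a score out of its reply. A minimal, provider-agnostic sketch; the prompt wording, score scale, and `call_judge` client below are hypothetical, not lighteval's actual prompts:

```python
import re

# Hypothetical judge client: wire this to any chat-completion API
# (OpenAI, HuggingFace Inference, a local server, ...).
def call_judge(prompt: str) -> str:
    raise NotImplementedError("connect this to your judge model's API")

JUDGE_TEMPLATE = (
    "Rate the following answer to the question on a scale of 1 to 10.\n"
    "Question: {question}\nAnswer: {answer}\n"
    "Reply with 'Rating: [[<score>]]'."
)

def llm_judge_score(question: str, answer: str) -> float:
    reply = call_judge(JUDGE_TEMPLATE.format(question=question, answer=answer))
    # Parse the bracketed score; fall back to the lowest score if missing.
    match = re.search(r"\[\[(\d+(?:\.\d+)?)\]\]", reply)
    return float(match.group(1)) if match else 1.0
```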
modeling","local":"automatic-metrics-for-perplexity-and-language-modeling","sections":[],"depth":2},{"title":"Automatic metrics for generative tasks","local":"automatic-metrics-for-generative-tasks","sections":[],"depth":2},{"title":"LLM-as-Judge","local":"llm-as-judge","sections":[],"depth":2}],"depth":1}';function we(X){return he(()=&gt;{new URLSearchParams(window.location.search).get("fw")}),[]}class $e extends ue{constructor(c){super(),ge(this,c,we,be,de,{})}}export{$e as component};