# Metric List

## Automatic metrics for multiple choice tasks

These metrics use the log-likelihood of the different possible targets.

- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_single_token`)
- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`)
- `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored
- `loglikelihood_f1`: Corpus-level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`)
- `mcc`: Matthews correlation coefficient (a measure of agreement between statistical distributions)
- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`)
- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`)
- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`)
- `target_perplexity`: Perplexity of the different choices available.
- `acc_golds_likelihood`: A bit different - it checks whether the average logprob of a single target is above or below 0.5
- `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets
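To make the difference between the two accuracy variants concrete, here is a minimal sketch with hypothetical helper names (not lighteval's actual implementation) of `loglikelihood_acc` versus `loglikelihood_acc_norm`:

```python
# Minimal sketch of loglikelihood accuracy vs. its length-normalized variant.
# `choice_logprobs` is the summed logprob the model assigns to each choice
# continuation; `choice_lengths` is each continuation's length in tokens.

def loglikelihood_acc(choice_logprobs, gold_index):
    # Pick the choice with the highest total logprob.
    pred = max(range(len(choice_logprobs)), key=lambda i: choice_logprobs[i])
    return float(pred == gold_index)

def loglikelihood_acc_norm(choice_logprobs, choice_lengths, gold_index):
    # Normalize by sequence length so longer choices are not penalized
    # simply for having more tokens to assign probability to.
    normed = [lp / n for lp, n in zip(choice_logprobs, choice_lengths)]
    pred = max(range(len(normed)), key=lambda i: normed[i])
    return float(pred == gold_index)

# Example: gold is choice 1; the unnormalized metric prefers the shorter choice 0.
print(loglikelihood_acc([-2.0, -2.5], gold_index=1))                # 0.0
print(loglikelihood_acc_norm([-2.0, -2.5], [1, 5], gold_index=1))   # 1.0
```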
All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall_at_2_single_token` and `mrr_single_token`). When each multichoice option spans only one token (e.g. "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using the single token version of these metrics divides the time spent by the number of choices. Single token evals also include:

- `multi_f1_numeric`: computes the F1 score of all possible choices and averages it.

## Automatic metrics for perplexity and language modeling

These metrics use the log-likelihood of the prompt.

- `word_perplexity`: Perplexity (log probability of the input) weighted by the number of words in the sequence.
- `byte_perplexity`: Perplexity (log probability of the input) weighted by the number of bytes in the sequence.
- `bits_per_byte`: Average number of bits per byte according to model probabilities.
- `log_prob`: Predicted output's average log probability (input's log prob for language modeling).
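These three quantities differ only in how the summed log-likelihood of the sequence is normalized. A minimal sketch, assuming natural-log probabilities (lighteval's internals may differ):

```python
import math

# Minimal sketch: the perplexity-style metrics differ only in the normalizer
# applied to the summed (natural-log) log-likelihood of the sequence.

def word_perplexity(total_logprob: float, num_words: int) -> float:
    # Perplexity normalized by word count.
    return math.exp(-total_logprob / num_words)

def byte_perplexity(total_logprob: float, num_bytes: int) -> float:
    # Perplexity normalized by byte count.
    return math.exp(-total_logprob / num_bytes)

def bits_per_byte(total_logprob: float, num_bytes: int) -> float:
    # Convert nats to bits, then average over bytes.
    return -total_logprob / (num_bytes * math.log(2))

text = "hello world"
total_logprob = -12.3  # example value; in practice, summed from model logits
print(word_perplexity(total_logprob, len(text.split())))
print(byte_perplexity(total_logprob, len(text.encode("utf-8"))))
print(bits_per_byte(total_logprob, len(text.encode("utf-8"))))
```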
## Automatic metrics for generative tasks

These metrics need the model to generate an output. They are therefore slower.

- Base:
  - `perfect_exact_match`: Fraction of instances where the prediction matches the gold exactly.
  - `exact_match`: Fraction of instances where the prediction matches the gold with the exception of border whitespace (= after a `strip` has been applied to both).
  - `quasi_exact_match`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences.
  - `prefix_exact_match`: Fraction of instances where the beginning of the prediction matches the gold with the exception of border whitespace (= after a `strip` has been applied to both).
  - `prefix_quasi_exact_match`: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...)
  - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
  - `f1_score_quasi`: Average F1 score in terms of word overlap between the model output and gold, with both normalized first
  - `f1_score`: Average F1 score in terms of word overlap between the model output and gold, without normalization
  - `f1_score_macro`: Corpus-level macro F1 score
  - `f1_score_micro`: Corpus-level micro F1 score
  - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction (see the sketch after this list).
- Summarization:
  - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
  - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
  - `rouge2`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
  - `rougeL`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
  - `rougeLsum`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap, computed at summary level.
  - `rouge_t5` (BigBench): Corpus-level ROUGE score for all available ROUGE metrics
  - `faithfulness`: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
  - `extractiveness`: Reports the following, based on [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/):
    - `summarization_coverage`: Extent to which the model-generated summaries are extractive fragments from the source document,
    - `summarization_density`: Extent to which the model-generated summaries are extractive summaries based on the source document,
    - `summarization_compression`: Extent to which the model-generated summaries are compressed relative to the source document.
  - `bert_score`: Reports the average BERTScore precision, recall, and F1 score [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and gold summary.
- Translation:
  - `bleu`: Corpus-level BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - uses the sacrebleu implementation.
  - `bleu_1`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap - uses the nltk implementation.
  - `bleu_4`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap - uses the nltk implementation.
  - `chrf`: Character n-gram match F-score.
  - `ter`: Translation edit/error rate.
- Copyright:
  - `copyright`: Reports:
    - `longest_common_prefix_length`: average length of the longest common prefix between model generation and reference,
    - `edit_distance`: average Levenshtein edit distance between model generation and reference,
    - `edit_similarity`: average Levenshtein edit similarity (normalized by length of the longer sequence) between model generation and reference.
- Math:
  - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where LaTeX symbols, units, etc. are removed)
  - `maj_at_4_math`: Majority choice evaluation, using the math normalization for the predictions and gold
  - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where LaTeX symbols, units, etc. are removed)
  - `maj_at_8_gsm8k`: Majority choice evaluation, using the gsm8k normalization for the predictions and gold
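As a concrete illustration of the `maj_at_n` family referenced above, here is a minimal sketch; the real metrics also apply task-specific normalization to each generation first (as in `maj_at_4_math` and `maj_at_8_gsm8k`):

```python
from collections import Counter

# Minimal sketch of maj_at_n majority voting: sample n generations from the
# model and take the most frequent answer as the final prediction.

def maj_at_n(generations: list[str], gold: str) -> float:
    # Counter.most_common(1) returns [(answer, count)] for the top answer.
    prediction, _count = Counter(g.strip() for g in generations).most_common(1)[0]
    return float(prediction == gold.strip())

# Example: 5 samples, 3 of which agree on the correct answer.
print(maj_at_n(["42", "41", "42", "43", "42"], gold="42"))  # 1.0
```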
## LLM-as-Judge

- `llm_judge_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API
- `llm_judge_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the HuggingFace API
- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API. It is used for multi-turn tasks like MT-Bench.
- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the HuggingFace API. It is used for multi-turn tasks like MT-Bench.
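These judge metrics all follow the same shape: send the question and the evaluated model's answer to a stronger model with a grading prompt, then parse a score out of its reply. A minimal, provider-agnostic sketch; the prompt wording, score scale, and `call_judge` client below are hypothetical, not lighteval's actual prompts:

```python
import re

# Hypothetical judge client: wire this to any chat-completion API
# (OpenAI, HuggingFace Inference, a local server, ...).
def call_judge(prompt: str) -> str:
    raise NotImplementedError("connect this to your judge model's API")

JUDGE_TEMPLATE = (
    "Rate the following answer to the question on a scale of 1 to 10.\n"
    "Question: {question}\nAnswer: {answer}\n"
    "Reply with 'Rating: [[<score>]]'."
)

def llm_judge_score(question: str, answer: str) -> float:
    reply = call_judge(JUDGE_TEMPLATE.format(question=question, answer=answer))
    # Parse the bracketed score; fall back to the lowest score if missing.
    match = re.search(r"\[\[(\d+(?:\.\d+)?)\]\]", reply)
    return float(match.group(1)) if match else 1.0
```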
modeling","local":"automatic-metrics-for-perplexity-and-language-modeling","sections":[],"depth":2},{"title":"Automatic metrics for generative tasks","local":"automatic-metrics-for-generative-tasks","sections":[],"depth":2},{"title":"LLM-as-Judge","local":"llm-as-judge","sections":[],"depth":2}],"depth":1}';function we(X){return he(()=&gt;{new URLSearchParams(window.location.search).get("fw")}),[]}class $e extends ue{constructor(c){super(),ge(this,c,we,be,de,{})}}export{$e as component};