Spaces:
Runtime error
Runtime error
| scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value | |
| Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 | |
| Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 | |
| Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 | |
| Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 | |
| Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 | |
| Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 | |
| Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.47280542884465016,0.10506382347888965 | |
| Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.40006613209931935,0.17023995462900499 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8153742483272113,0.0057021327615243405 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 | |
| OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6182840223353117,0.0340492747686748 | |
| OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347 | |
| OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 | |
| OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 | |
| Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 | |
| Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111 | |
| Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 | |
| Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.036369648372665396,0.9007802600472398 | |
| Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 | |
| Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 | |
| Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.836501912571304,0.004136737098676645 | |
| MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7637626158259734,0.008839740160738534 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5455447255899809,0.0614649096074132 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7412493166611012,0.011966745157436277 | |
| AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762 | |
| HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476 | |
| HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 | |
| HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 | |
| HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762 | |
| HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 | |
| HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 | |
| HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 | |
| HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 | |
| HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.32732683535398854,0.2618277009271762 | |
| HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762 | |
| HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476 | |
| HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.47280542884465016,0.10506382347888965 | |
| HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762 | |
| HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.6182840223353117,0.0340492747686748 | |
| HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499 | |
| HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499 | |
| HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499 | |
| HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.6182840223353117,0.0340492747686748 | |
| HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5455447255899809,0.0614649096074132 | |
| HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.47280542884465016,0.10506382347888965 | |
| HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 | |
| HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.4999999999999999,0.10868055555555556 | |
| HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762 | |
| HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5455447255899809,0.0614649096074132 | |
| OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 | |
| OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 | |
| OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111 | |
| OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965 | |
| OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111 | |
| OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6182840223353117,0.0340492747686748 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4447495899966607,0.1315867602811863 | |
| Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 | |
| Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 | |
| Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 | |
| HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 | |
| HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 | |
| HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748 | |
| BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6182840223353117,0.0340492747686748 | |
| BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 | |
| BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7412493166611012,0.011966745157436277 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111 | |
| hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 | |
| hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 | |
| hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 | |
| hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111 | |
| hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 | |
| aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 | |
| aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 | |
| aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.47280542884465016,0.10506382347888965 | |
| aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.40006613209931935,0.17023995462900499 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.8153742483272113,0.0057021327615243405 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 | |
| aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,0,0.0,1.0 | |
| aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,1,0.036369648372665396,0.9007802600472398 | |
| aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 | |
| aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,2,0.836501912571304,0.004136737098676645 | |
| aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.5455447255899809,0.0614649096074132 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.7412493166611012,0.011966745157436277 | |
| aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.32732683535398854,0.2618277009271762 | |
| aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,0,-0.47280542884465016,0.10506382347888965 | |
| aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,2,-0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499 | |
| aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499 | |
| aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499 | |
| aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,2,-0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,1,-0.5455447255899809,0.0614649096074132 | |
| aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,2,-0.47280542884465016,0.10506382347888965 | |
| aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,0,-0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,2,-0.5455447255899809,0.0614649096074132 | |
| aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 | |
| aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965 | |
| aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,0,0.4447495899966607,0.1315867602811863 | |
| aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.0,1.0 | |
| aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 | |
| aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 | |
| aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.7412493166611012,0.011966745157436277 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0 | |
| aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0 | |
| aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 | |
| aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |
| aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111 | |
| aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 | |