craffel (HF Staff) committed
Commit 02ca801 · verified · 1 Parent(s): af2129b

Upload metrics.eval.cd_hq.jsonl with huggingface_hub
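For context, an upload like this one is typically made with the `huggingface_hub` Python client. A minimal sketch, assuming the file sits in the working directory; the `repo_id` and `repo_type` below are placeholders, since the target repository is not shown on this page:

```python
from huggingface_hub import HfApi

# Minimal sketch of pushing an evaluation-metrics JSONL file to the Hub.
# repo_id and repo_type are placeholders, not taken from this commit page.
api = HfApi()
api.upload_file(
    path_or_fileobj="metrics.eval.cd_hq.jsonl",  # local file to upload
    path_in_repo="metrics.eval.cd_hq.jsonl",     # destination path in the repo
    repo_id="org-name/repo-name",                # placeholder repository id
    repo_type="model",                           # assumption: a model repo
    commit_message="Upload metrics.eval.cd_hq.jsonl with huggingface_hub",
)
```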

Files changed (1)
  1. metrics.eval.cd_hq.jsonl +2 -0
metrics.eval.cd_hq.jsonl ADDED
@@ -0,0 +1,2 @@
+ {"created_at": "2025-05-08T23:34:26.557449", "global_step": 2000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4257679180887372, "acc_stderr,none": 0.0144494642788688, "acc_norm,none": 0.4658703071672355, "acc_norm_stderr,none": 0.014577311315231099}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7289562289562289, "acc_stderr,none": 0.0091209197417606, "acc_norm,none": 0.7146464646464646, "acc_norm_stderr,none": 0.00926628058499775}, "boolq": {"alias": "boolq", "acc,none": 0.7318042813455657, "acc_stderr,none": 0.007748469592030345}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2989352989352989, "acc_stderr,none": 0.01310653028279809}, "copa": {"alias": "copa", "acc,none": 0.8, "acc_stderr,none": 0.040201512610368445}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4737104162517427, "acc_stderr,none": 0.004982879340691398, "acc_norm,none": 0.6406094403505278, "acc_norm_stderr,none": 0.004788412062375702}, "mmlu": {"acc,none": 0.34959407491810285, "acc_stderr,none": 0.003986813763121511, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.32539851222104144, "acc_stderr,none": 0.0067554212334649385, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235172}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.37575757575757573, "acc_stderr,none": 0.037818873532059816}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.44607843137254904, "acc_stderr,none": 0.03488845451304974}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5063291139240507, "acc_stderr,none": 0.03254462010767859}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.04481137755942469}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04668408033024931}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.0251310002336479}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2212290502793296, "acc_stderr,none": 0.013882164598887288}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.36977491961414793, "acc_stderr,none": 0.027417996705631005}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.39197530864197533, "acc_stderr,none": 0.02716368603827124}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30834419817470665, "acc_stderr,none": 0.011794833789715334}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3742690058479532, "acc_stderr,none": 0.037116011853894806}, "mmlu_other": {"acc,none": 0.38461538461538464, "acc_stderr,none": 0.0087012710587326, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3849056603773585, "acc_stderr,none": 0.02994649856769995}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3815028901734104, "acc_stderr,none": 0.03703851193099521}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_human_aging": {"alias": " - human_aging", 
"acc,none": 0.45739910313901344, "acc_stderr,none": 0.03343577705583065}, "mmlu_management": {"alias": " - management", "acc,none": 0.4563106796116505, "acc_stderr,none": 0.049318019942204146}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4358974358974359, "acc_stderr,none": 0.03248577511578401}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620332}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3895274584929757, "acc_stderr,none": 0.017438082556264594}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.027684181883302898}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.30141843971631205, "acc_stderr,none": 0.027374128882631146}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3602941176470588, "acc_stderr,none": 0.029163128570670736}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.037400593820293204}, "mmlu_social_sciences": {"acc,none": 0.3737406564835879, "acc_stderr,none": 0.008670878696396921, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.0433913832257986}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.37823834196891193, "acc_stderr,none": 0.03499807276193339}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33076923076923076, "acc_stderr,none": 0.023854795680971128}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3403361344537815, "acc_stderr,none": 0.030778057422931673}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.42752293577981654, "acc_stderr,none": 0.02121091020430043}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.366412213740458, "acc_stderr,none": 0.04225875451969638}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.019431775677037313}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.04494290866252088}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.30612244897959184, "acc_stderr,none": 0.029504896454595947}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.527363184079602, "acc_stderr,none": 0.035302355173346824}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.3276244846178243, "acc_stderr,none": 0.008277320689787858, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.375, "acc_stderr,none": 0.039397364351956274}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4236111111111111, "acc_stderr,none": 0.04132125019723369}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.33, "acc_stderr,none": 
0.04725815626252606}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993177}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.45, "acc_stderr,none": 0.05}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534443}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2751322751322751, "acc_stderr,none": 0.023000086859068642}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.43870967741935485, "acc_stderr,none": 0.028229497320317216}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678241}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.024388430433987657}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.03350991604696042}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "mmlu_pro": {"exact_match,custom-extract": 0.18201462765957446, "exact_match_stderr,custom-extract": 0.0034771449900885703, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3291492329149233, "exact_match_stderr,custom-extract": 0.017561146780265928}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16983523447401774, "exact_match_stderr,custom-extract": 0.013376205653007208}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1068904593639576, "exact_match_stderr,custom-extract": 0.009187355756744656}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22195121951219512, "exact_match_stderr,custom-extract": 0.020548045890068298}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2476303317535545, "exact_match_stderr,custom-extract": 0.014866330095923884}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.12899896800825594, "exact_match_stderr,custom-extract": 0.010773697418009065}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22860635696821516, "exact_match_stderr,custom-extract": 0.014691669532004209}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1968503937007874, "exact_match_stderr,custom-extract": 0.020397388648694077}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1444141689373297, "exact_match_stderr,custom-extract": 0.010598401112152015}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14507772020725387, 
"exact_match_stderr,custom-extract": 0.009585103230059955}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1634199134199134, "exact_match_stderr,custom-extract": 0.01217041531796006}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16633266533066132, "exact_match_stderr,custom-extract": 0.01668670139852614}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15473441108545036, "exact_match_stderr,custom-extract": 0.010038127358043917}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2781954887218045, "exact_match_stderr,custom-extract": 0.015872877950292862}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.01996610354027947, "acc_norm,none": 0.386, "acc_norm_stderr,none": 0.02179352921928117}, "piqa": {"alias": "piqa", "acc,none": 0.720892274211099, "acc_stderr,none": 0.010465657948498228, "acc_norm,none": 0.7230685527747551, "acc_norm_stderr,none": 0.01044049996933454}, "race": {"alias": "race", "acc,none": 0.36650717703349284, "acc_stderr,none": 0.014912890943719231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45138178096212894, "acc_stderr,none": 0.01126045668162444}, "winogrande": {"alias": "winogrande", "acc,none": 0.6503551696921863, "acc_stderr,none": 0.013402073680850503}}
+ {"created_at": "2025-05-09T01:28:41.634251", "global_step": 4000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4206484641638225, "acc_stderr,none": 0.014426211252508396, "acc_norm,none": 0.45307167235494883, "acc_norm_stderr,none": 0.01454689205200563}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7196969696969697, "acc_stderr,none": 0.009216306864088033, "acc_norm,none": 0.7079124579124579, "acc_norm_stderr,none": 0.00933070561656907}, "boolq": {"alias": "boolq", "acc,none": 0.7685015290519878, "acc_stderr,none": 0.007377156064425054}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3202293202293202, "acc_stderr,none": 0.013357704926272657}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4737104162517427, "acc_stderr,none": 0.0049828793406913995, "acc_norm,none": 0.6374228241386178, "acc_norm_stderr,none": 0.004797616754372304}, "mmlu": {"acc,none": 0.36227033186155816, "acc_stderr,none": 0.004005752094084227, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3324123273113709, "acc_stderr,none": 0.0067679612722847775, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.03395490020856111}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.37575757575757573, "acc_stderr,none": 0.03781887353205982}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.45588235294117646, "acc_stderr,none": 0.03495624522015473}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5527426160337553, "acc_stderr,none": 0.03236564251614192}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4297520661157025, "acc_stderr,none": 0.04519082021319773}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.047500773411999854}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3128834355828221, "acc_stderr,none": 0.036429145782924055}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.02519018132760841}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.22905027932960895, "acc_stderr,none": 0.014054314935614553}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.40192926045016075, "acc_stderr,none": 0.02784647600593048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.02723741509459248}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3135593220338983, "acc_stderr,none": 0.011849234291459329}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03615507630310935}, "mmlu_other": {"acc,none": 0.3994206630189894, "acc_stderr,none": 0.008747914827807462, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.4377358490566038, "acc_stderr,none": 0.030533338430467512}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3872832369942196, "acc_stderr,none": 0.03714325906302065}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - 
human_aging", "acc,none": 0.4484304932735426, "acc_stderr,none": 0.033378837362550984}, "mmlu_management": {"alias": " - management", "acc,none": 0.4563106796116505, "acc_stderr,none": 0.049318019942204146}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.46153846153846156, "acc_stderr,none": 0.032659033811861936}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.384418901660281, "acc_stderr,none": 0.01739568874281962}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.39869281045751637, "acc_stderr,none": 0.02803609227389177}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.02746470844202214}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.4227941176470588, "acc_stderr,none": 0.030008562845003476}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.39759036144578314, "acc_stderr,none": 0.038099730845402184}, "mmlu_social_sciences": {"acc,none": 0.40201494962625933, "acc_stderr,none": 0.008753318520333115, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35353535353535354, "acc_stderr,none": 0.03406086723547153}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.42487046632124353, "acc_stderr,none": 0.035674713352125395}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3564102564102564, "acc_stderr,none": 0.0242831405294673}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3697478991596639, "acc_stderr,none": 0.031357095996135904}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.5119266055045871, "acc_stderr,none": 0.021431223617362227}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3709150326797386, "acc_stderr,none": 0.01954210156485412}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.38181818181818183, "acc_stderr,none": 0.046534298079135075}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3183673469387755, "acc_stderr,none": 0.02982253379398207}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5373134328358209, "acc_stderr,none": 0.03525675167467974}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.33143038376149697, "acc_stderr,none": 0.008305453682363425, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3925925925925926, "acc_stderr,none": 0.04218506215368879}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3618421052631579, "acc_stderr,none": 0.03910525752849724}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4513888888888889, "acc_stderr,none": 0.04161402398403279}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, 
"acc_stderr,none": 0.046882617226215034}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929775}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.43, "acc_stderr,none": 0.049756985195624284}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.37872340425531914, "acc_stderr,none": 0.03170995606040655}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277696}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633345}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.44516129032258067, "acc_stderr,none": 0.028272410186214906}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655078}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969654}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.375, "acc_stderr,none": 0.033016908987210894}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.1815159574468085, "exact_match_stderr,custom-extract": 0.0034691711332705353, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3333333333333333, "exact_match_stderr,custom-extract": 0.017617214086056418}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1596958174904943, "exact_match_stderr,custom-extract": 0.013049741978046024}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10247349823321555, "exact_match_stderr,custom-extract": 0.009017748507579058}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.20243902439024392, "exact_match_stderr,custom-extract": 0.019868606646141387}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2381516587677725, "exact_match_stderr,custom-extract": 0.014670579907447287}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14654282765737875, "exact_match_stderr,custom-extract": 0.011366728093938227}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2567237163814181, "exact_match_stderr,custom-extract": 0.015282595032542022}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.2125984251968504, "exact_match_stderr,custom-extract": 0.02098873976311752}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12806539509536785, "exact_match_stderr,custom-extract": 0.010075381773702268}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 
0.1391561806069578, "exact_match_stderr,custom-extract": 0.009419905559243678}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17532467532467533, "exact_match_stderr,custom-extract": 0.01251590249750937}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16432865731462926, "exact_match_stderr,custom-extract": 0.016605797464661214}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.14857582755966128, "exact_match_stderr,custom-extract": 0.009872103972550979}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2756892230576441, "exact_match_stderr,custom-extract": 0.01582862563189321}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.019966103540279462, "acc_norm,none": 0.386, "acc_norm_stderr,none": 0.021793529219281165}, "piqa": {"alias": "piqa", "acc,none": 0.719804134929271, "acc_stderr,none": 0.010478122015577086, "acc_norm,none": 0.720892274211099, "acc_norm_stderr,none": 0.01046565794849823}, "race": {"alias": "race", "acc,none": 0.3712918660287081, "acc_stderr,none": 0.014953126515089411}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45752302968270214, "acc_stderr,none": 0.011273168825920714}, "winogrande": {"alias": "winogrande", "acc,none": 0.6464088397790055, "acc_stderr,none": 0.013436541262599954}}
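Each added line is a self-contained JSON record keyed by evaluation task. A minimal sketch of reading the file back and printing a couple of headline numbers per checkpoint; the key names (`global_step`, `mmlu` → `acc,none`, `hellaswag` → `acc_norm,none`) come from the records above, while the local file path is an assumption:

```python
import json

# Minimal sketch: parse the JSONL metrics shown in this diff and print a few
# headline numbers per logged checkpoint. Key names follow the records above;
# the local file path is an assumption.
with open("metrics.eval.cd_hq.jsonl") as f:
    for line in f:
        record = json.loads(line)
        step = record["global_step"]
        mmlu_acc = record["mmlu"]["acc,none"]
        hellaswag_acc_norm = record["hellaswag"]["acc_norm,none"]
        print(f"step {step}: mmlu acc={mmlu_acc:.4f}, "
              f"hellaswag acc_norm={hellaswag_acc_norm:.4f}")
```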