Commit
·
3335d76
1
Parent(s):
d51220d
Merge
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- 2b855b11bc4seed2/evaluation/generation/merged.csv +53 -0
- 2b855b11bc4seed2/evaluation/generation/merged.json +1 -0
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0.csv +21 -0
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +0 -87
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1.csv +21 -0
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json +0 -87
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2.csv +21 -0
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +0 -87
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3.csv +21 -0
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json +0 -87
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4.csv +21 -0
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +0 -87
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5.csv +21 -0
- 2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json +0 -87
- 2b855b11bc4seed3/evaluation/generation/merged.csv +39 -0
- 2b855b11bc4seed3/evaluation/generation/merged.json +1 -0
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0.csv +21 -0
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json +0 -87
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1.csv +21 -0
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +0 -87
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2.csv +21 -0
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json +0 -87
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3.csv +21 -0
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +0 -87
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4.csv +21 -0
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +0 -87
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5.csv +21 -0
- 2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +0 -87
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0.csv +21 -0
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +0 -87
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1.csv +21 -0
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +0 -87
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2.csv +21 -0
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +0 -87
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3.csv +21 -0
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +0 -87
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4.csv +21 -0
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json +0 -87
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5.csv +21 -0
- 2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +0 -87
- 2b855b14bc4seed1/evaluation/generation/merged.csv +53 -0
- 2b855b14bc4seed1/evaluation/generation/merged.json +1 -0
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0.csv +21 -0
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json +0 -87
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1.csv +21 -0
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json +0 -87
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2.csv +21 -0
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json +0 -87
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3.csv +21 -0
- 2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json +0 -87
2b855b11bc4seed2/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.05603098508787396
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.05603098508787396
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.11813763333164416
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.11813763333164416
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.13757318526100176
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.13757318526100176
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14464807696490528
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.14464807696490528
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.14604424103058358
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.14604424103058358
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.15173507276703452
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.15173507276703452
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.1256948657405072
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.044139249367473786
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.044139249367473786
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03540453263680104
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.03540453263680104
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.0348723834844472
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.0348723834844472
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03171913029877269
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.03171913029877269
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.007074958138573533
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.007074958138573533
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00012314549854098596
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,0.00012314549854098596
|
27 |
+
gem_xsum,5,average,multiple,0.025555566570768205
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04899451308031972
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.04899451308031972
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.050978314953605365
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.050978314953605365
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05107703767051227
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.05107703767051227
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.053756988677712864
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.053756988677712864
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05523496800965618
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.05523496800965618
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054826378486590326
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.054826378486590326
|
40 |
+
web_nlg_en,5,average,multiple,0.05247803347973279
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03498414446894612
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.03498414446894612
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.044212366766547766
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.044212366766547766
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05013925437862328
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.05013925437862328
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04125017703720899
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.04125017703720899
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01492978001583821
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01492978001583821
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0022181250558980193
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0022181250558980193
|
53 |
+
wiki_lingua_en,5,average,multiple,0.0312889746205104
|
2b855b11bc4seed2/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3669019692085673, "bleu_stderr": 0.03735576208579507, "rouge1_fmeasure": 0.10488940315310863, "rouge1_fmeasure_stderr": 0.002182807125149089, "rouge1_precision": 0.07108949670602635, "rouge1_precision_stderr": 0.0018605155075932162, "rouge1_recall": 0.2944008471177708, "rouge1_recall_stderr": 0.00496048505670551, "rouge2_fmeasure": 0.04899451308031972, "rouge2_fmeasure_stderr": 0.0013409522053557752, "rouge2_precision": 0.03282356988116583, "rouge2_precision_stderr": 0.0011424627167812825, "rouge2_recall": 0.14057659944269588, "rouge2_recall_stderr": 0.003256076102514853, "rougeL_fmeasure": 0.10057011768365899, "rougeL_fmeasure_stderr": 0.002005974356548281, "rougeL_precision": 0.06788803543605783, "rougeL_precision_stderr": 0.0017222927384213094, "rougeL_recall": 0.2859195847396627, "rougeL_recall_stderr": 0.00482334950262861, "rougeLsum_fmeasure": 0.0995943061498225, "rougeLsum_fmeasure_stderr": 0.002034666255159939, "rougeLsum_precision": 0.0675573531009672, "rougeLsum_precision_stderr": 0.001764984902165412, "rougeLsum_recall": 0.2797874789377513, "rougeLsum_recall_stderr": 0.004633335830609517}}, "1": {"PALM_prompt": {"bleu": 0.4457796396280548, "bleu_stderr": 0.03663789805602827, "rouge1_fmeasure": 0.11290267432024997, "rouge1_fmeasure_stderr": 0.002086212254260406, "rouge1_precision": 0.07328276677295399, "rouge1_precision_stderr": 0.001595296009075034, "rouge1_recall": 0.357873529297584, "rouge1_recall_stderr": 0.005356216012775593, "rouge2_fmeasure": 0.050978314953605365, "rouge2_fmeasure_stderr": 0.0012648239214954904, "rouge2_precision": 0.032960565662680366, "rouge2_precision_stderr": 0.00092590184441572, "rouge2_recall": 0.1681640544477541, "rouge2_recall_stderr": 0.003577309598153851, "rougeL_fmeasure": 0.10461737503039704, "rougeL_fmeasure_stderr": 0.0018238001476137153, "rougeL_precision": 0.0676147386351206, "rougeL_precision_stderr": 0.0013707383628966691, "rougeL_recall": 0.3328818182866075, "rougeL_recall_stderr": 0.004895092850273884, "rougeLsum_fmeasure": 0.10672422186964665, "rougeLsum_fmeasure_stderr": 0.0019474862069458747, "rougeLsum_precision": 0.06926122229303046, "rougeLsum_precision_stderr": 0.0014823373285525329, "rougeLsum_recall": 0.3364706883571429, "rougeLsum_recall_stderr": 0.004858431069268795}}, "2": {"PALM_prompt": {"bleu": 0.46605282718698626, "bleu_stderr": 0.02996676777558062, "rouge1_fmeasure": 0.11300207247927123, "rouge1_fmeasure_stderr": 0.0019165980987731685, "rouge1_precision": 0.07223534858221925, "rouge1_precision_stderr": 0.0014523280167324517, "rouge1_recall": 0.3709874761998271, "rouge1_recall_stderr": 0.005246128541242479, "rouge2_fmeasure": 0.05107703767051227, "rouge2_fmeasure_stderr": 0.001189241631450374, "rouge2_precision": 0.032619601917078085, "rouge2_precision_stderr": 0.0008732560599076257, "rouge2_recall": 0.17538727298076712, "rouge2_recall_stderr": 0.0036277508794783438, "rougeL_fmeasure": 0.104830081432862, "rougeL_fmeasure_stderr": 0.0017334774718182442, "rougeL_precision": 0.06700314943348495, "rougeL_precision_stderr": 0.0013017430875308894, "rougeL_recall": 0.341280675217576, "rougeL_recall_stderr": 0.004680138815460377, "rougeLsum_fmeasure": 0.10664511244354742, "rougeLsum_fmeasure_stderr": 0.0017935730802929586, "rougeLsum_precision": 0.0682192697317043, "rougeLsum_precision_stderr": 0.0013544348051587117, "rougeLsum_recall": 0.347890761022798, "rougeLsum_recall_stderr": 0.004814945746675548}}, "3": {"PALM_prompt": {"bleu": 0.5506448987587048, "bleu_stderr": 0.028040930117572985, "rouge1_fmeasure": 0.11789825375153909, "rouge1_fmeasure_stderr": 0.001932195619461612, "rouge1_precision": 0.07573922996380036, "rouge1_precision_stderr": 0.0015170397473250185, "rouge1_recall": 0.38492634002858683, "rouge1_recall_stderr": 0.005176207474155607, "rouge2_fmeasure": 0.053756988677712864, "rouge2_fmeasure_stderr": 0.0011965966106312165, "rouge2_precision": 0.03435743387037634, "rouge2_precision_stderr": 0.0008961471238384336, "rouge2_recall": 0.18616259178006056, "rouge2_recall_stderr": 0.0037451350035256482, "rougeL_fmeasure": 0.10879682748659922, "rougeL_fmeasure_stderr": 0.0017295701095651076, "rougeL_precision": 0.06978173508855735, "rougeL_precision_stderr": 0.0013414658300881625, "rougeL_recall": 0.3536412782595861, "rougeL_recall_stderr": 0.004596054949628527, "rougeLsum_fmeasure": 0.11158082524412793, "rougeLsum_fmeasure_stderr": 0.0018211626697717824, "rougeLsum_precision": 0.07172260100588973, "rougeLsum_precision_stderr": 0.0014274208122089145, "rougeLsum_recall": 0.36244343249377947, "rougeLsum_recall_stderr": 0.004772754547214642}}, "4": {"PALM_prompt": {"bleu": 0.5881318297710302, "bleu_stderr": 0.039908239363835654, "rouge1_fmeasure": 0.12053757788822311, "rouge1_fmeasure_stderr": 0.0018961787838997102, "rouge1_precision": 0.07735300568910049, "rouge1_precision_stderr": 0.0015086378866700828, "rouge1_recall": 0.3985419746518129, "rouge1_recall_stderr": 0.005029418888525787, "rouge2_fmeasure": 0.05523496800965618, "rouge2_fmeasure_stderr": 0.0011740287332534603, "rouge2_precision": 0.035199176173288736, "rouge2_precision_stderr": 0.0008620977702919277, "rouge2_recall": 0.19638451832650627, "rouge2_recall_stderr": 0.0037186121018020772, "rougeL_fmeasure": 0.10988623597552064, "rougeL_fmeasure_stderr": 0.0016628282980400671, "rougeL_precision": 0.07030939128233044, "rougeL_precision_stderr": 0.0012828117493288978, "rougeL_recall": 0.36285260508288164, "rougeL_recall_stderr": 0.0044527827738133135, "rougeLsum_fmeasure": 0.11360120534535015, "rougeLsum_fmeasure_stderr": 0.0017628360499995425, "rougeLsum_precision": 0.07278434161088057, "rougeLsum_precision_stderr": 0.0013675084372612996, "rougeLsum_recall": 0.3756859898920006, "rougeLsum_recall_stderr": 0.004687342906896715}}, "5": {"PALM_prompt": {"bleu": 0.6100325324177117, "bleu_stderr": 0.02703478355184964, "rouge1_fmeasure": 0.11969950579850888, "rouge1_fmeasure_stderr": 0.0019228477421699686, "rouge1_precision": 0.07722442196685407, "rouge1_precision_stderr": 0.0016109314348509566, "rouge1_recall": 0.4017267929127603, "rouge1_recall_stderr": 0.005110092029665619, "rouge2_fmeasure": 0.054826378486590326, "rouge2_fmeasure_stderr": 0.0011899938265323704, "rouge2_precision": 0.035279538401147746, "rouge2_precision_stderr": 0.0009640245304949675, "rouge2_recall": 0.19826855938594587, "rouge2_recall_stderr": 0.003837030403723965, "rougeL_fmeasure": 0.1091894265235256, "rougeL_fmeasure_stderr": 0.0017016342146881787, "rougeL_precision": 0.07042983922009842, "rougeL_precision_stderr": 0.0014289861593103398, "rougeL_recall": 0.3644310690944154, "rougeL_recall_stderr": 0.004466552122874037, "rougeLsum_fmeasure": 0.11289648560916707, "rougeLsum_fmeasure_stderr": 0.001777647929316373, "rougeLsum_precision": 0.07279721150667341, "rougeLsum_precision_stderr": 0.0014832472297194098, "rougeLsum_recall": 0.3778701373910634, "rougeLsum_recall_stderr": 0.0046818712318910235}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5110660909639972, "bleu_stderr": 0.07401749441784054, "rouge1_fmeasure": 0.17437063929496616, "rouge1_fmeasure_stderr": 0.0018679203169137393, "rouge1_precision": 0.14841430227531063, "rouge1_precision_stderr": 0.0018849317623447377, "rouge1_recall": 0.2546402796166594, "rouge1_recall_stderr": 0.002706292482883891, "rouge2_fmeasure": 0.03498414446894612, "rouge2_fmeasure_stderr": 0.0008298176462282185, "rouge2_precision": 0.02943881399707129, "rouge2_precision_stderr": 0.0007248235247046832, "rouge2_recall": 0.053608397228254355, "rouge2_recall_stderr": 0.0014462144916590092, "rougeL_fmeasure": 0.13752082994315118, "rougeL_fmeasure_stderr": 0.0013441166414505977, "rougeL_precision": 0.1156713065994801, "rougeL_precision_stderr": 0.0013243962160205257, "rougeL_recall": 0.2058047431206632, "rougeL_recall_stderr": 0.002206261287530996, "rougeLsum_fmeasure": 0.15963577366030096, "rougeLsum_fmeasure_stderr": 0.0017015868824249002, "rougeLsum_precision": 0.13571633871621264, "rougeLsum_precision_stderr": 0.0017186096958220363, "rougeLsum_recall": 0.2340095218448455, "rougeLsum_recall_stderr": 0.002497602615389906}}, "1": {"tldr_en": {"bleu": 2.246682518650239, "bleu_stderr": 0.06775703875806485, "rouge1_fmeasure": 0.20178154664890433, "rouge1_fmeasure_stderr": 0.0019091801402858672, "rouge1_precision": 0.1730537365856929, "rouge1_precision_stderr": 0.0020147050749397054, "rouge1_recall": 0.29341449100333866, "rouge1_recall_stderr": 0.0027275267975047585, "rouge2_fmeasure": 0.044212366766547766, "rouge2_fmeasure_stderr": 0.0009611400325850913, "rouge2_precision": 0.0378044364023744, "rouge2_precision_stderr": 0.0008707368086399108, "rouge2_recall": 0.06598843832993921, "rouge2_recall_stderr": 0.0015965497648305382, "rougeL_fmeasure": 0.14567255875431964, "rougeL_fmeasure_stderr": 0.0012969079642798744, "rougeL_precision": 0.12349613685503863, "rougeL_precision_stderr": 0.0013414079107970274, "rougeL_recall": 0.21768049847327345, "rougeL_recall_stderr": 0.002171417223641686, "rougeLsum_fmeasure": 0.18855005555536836, "rougeLsum_fmeasure_stderr": 0.0017792963187320534, "rougeLsum_precision": 0.16149868456320995, "rougeLsum_precision_stderr": 0.0018740327287461542, "rougeLsum_recall": 0.2751854506345283, "rougeLsum_recall_stderr": 0.0025841619520934733}}, "2": {"tldr_en": {"bleu": 2.5041236014249213, "bleu_stderr": 0.054677815274649495, "rouge1_fmeasure": 0.21020781216935502, "rouge1_fmeasure_stderr": 0.0019360400703538375, "rouge1_precision": 0.1819807478308173, "rouge1_precision_stderr": 0.002091157564590908, "rouge1_recall": 0.3029305112527695, "rouge1_recall_stderr": 0.0027548325089732, "rouge2_fmeasure": 0.05013925437862328, "rouge2_fmeasure_stderr": 0.0009903485147246848, "rouge2_precision": 0.04325856897701525, "rouge2_precision_stderr": 0.000932364650414809, "rouge2_recall": 0.07429867293210415, "rouge2_recall_stderr": 0.0016111887959417725, "rougeL_fmeasure": 0.15515714332881575, "rougeL_fmeasure_stderr": 0.0013346431577317703, "rougeL_precision": 0.13305722535113657, "rougeL_precision_stderr": 0.0014277373936989982, "rougeL_recall": 0.22871882340075878, "rougeL_recall_stderr": 0.0021821233762823362, "rougeLsum_fmeasure": 0.1968617160132362, "rougeLsum_fmeasure_stderr": 0.0018143106376436737, "rougeLsum_precision": 0.17024159081815748, "rougeLsum_precision_stderr": 0.001957592483299715, "rougeLsum_recall": 0.2845967556685176, "rougeLsum_recall_stderr": 0.002621760111154941}}, "3": {"tldr_en": {"bleu": 2.414166858396001, "bleu_stderr": 0.1017204202701052, "rouge1_fmeasure": 0.1772632037488201, "rouge1_fmeasure_stderr": 0.002218928925982093, "rouge1_precision": 0.16185089345243805, "rouge1_precision_stderr": 0.002447244225379841, "rouge1_recall": 0.251517278638771, "rouge1_recall_stderr": 0.003226010774604141, "rouge2_fmeasure": 0.04125017703720899, "rouge2_fmeasure_stderr": 0.0009395476601233144, "rouge2_precision": 0.03709929137355734, "rouge2_precision_stderr": 0.0009517935690523235, "rouge2_recall": 0.061570751938897314, "rouge2_recall_stderr": 0.0015824433764131248, "rougeL_fmeasure": 0.13317965374627014, "rougeL_fmeasure_stderr": 0.0016023423958461526, "rougeL_precision": 0.12128193808204789, "rougeL_precision_stderr": 0.00183443739135039, "rougeL_recall": 0.19343716089196752, "rougeL_recall_stderr": 0.002584189617132911, "rougeLsum_fmeasure": 0.1666081331968882, "rougeLsum_fmeasure_stderr": 0.0020738729000881525, "rougeLsum_precision": 0.15219158593894744, "rougeLsum_precision_stderr": 0.0023055319887651145, "rougeLsum_recall": 0.23707015010427476, "rougeLsum_recall_stderr": 0.003060030836900186}}, "4": {"tldr_en": {"bleu": 0.6242302767315615, "bleu_stderr": 0.04253215138946399, "rouge1_fmeasure": 0.059095668151701794, "rouge1_fmeasure_stderr": 0.0019914573168015462, "rouge1_precision": 0.05589660634771713, "rouge1_precision_stderr": 0.0020789247458225762, "rouge1_recall": 0.0866210676290151, "rouge1_recall_stderr": 0.002970118175189146, "rouge2_fmeasure": 0.01492978001583821, "rouge2_fmeasure_stderr": 0.0007051123182883665, "rouge2_precision": 0.013794438861716635, "rouge2_precision_stderr": 0.0007198996863920006, "rouge2_recall": 0.02323840526022537, "rouge2_recall_stderr": 0.0012081229235487365, "rougeL_fmeasure": 0.04527379830332456, "rougeL_fmeasure_stderr": 0.0014982426681793449, "rougeL_precision": 0.04287508113995099, "rougeL_precision_stderr": 0.00159737758143571, "rougeL_recall": 0.06772654959144096, "rougeL_recall_stderr": 0.002355177689293077, "rougeLsum_fmeasure": 0.055240332031071544, "rougeLsum_fmeasure_stderr": 0.001858674299324512, "rougeLsum_precision": 0.05244988982418966, "rougeLsum_precision_stderr": 0.0019680061386683676, "rougeLsum_recall": 0.0810586452425775, "rougeLsum_recall_stderr": 0.002782405355135347}}, "5": {"tldr_en": {"bleu": 1.615827815577086e-06, "bleu_stderr": 3.2333552495423247e-06, "rouge1_fmeasure": 0.009318253326635167, "rouge1_fmeasure_stderr": 0.000866152474725978, "rouge1_precision": 0.00946125436925863, "rouge1_precision_stderr": 0.0009815109693569397, "rouge1_recall": 0.013468009711675888, "rouge1_recall_stderr": 0.001269746421645073, "rouge2_fmeasure": 0.0022181250558980193, "rouge2_fmeasure_stderr": 0.0002700454897219965, "rouge2_precision": 0.0020886287050252525, "rouge2_precision_stderr": 0.00026488805810467837, "rouge2_recall": 0.0034252359070913153, "rouge2_recall_stderr": 0.0004431407661930414, "rougeL_fmeasure": 0.007216579385654348, "rougeL_fmeasure_stderr": 0.0006514725881035319, "rougeL_precision": 0.007442143655450109, "rougeL_precision_stderr": 0.000775111822355702, "rougeL_recall": 0.010621704812097997, "rougeL_recall_stderr": 0.0010052653546245714, "rougeLsum_fmeasure": 0.008729835862025952, "rougeLsum_fmeasure_stderr": 0.0008108342616344646, "rougeLsum_precision": 0.008854227589439165, "rougeLsum_precision_stderr": 0.0009238074004769046, "rougeLsum_recall": 0.012656577138490685, "rougeLsum_recall_stderr": 0.0011982797701887724}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.0378761453257783, "bleu_stderr": 0.08518203279465711, "rouge1_fmeasure": 0.15284466517030987, "rouge1_fmeasure_stderr": 0.0020733229264210833, "rouge1_precision": 0.12346238513217829, "rouge1_precision_stderr": 0.0018953361220130477, "rouge1_recall": 0.22095983282063245, "rouge1_recall_stderr": 0.0028901633809294567, "rouge2_fmeasure": 0.05603098508787396, "rouge2_fmeasure_stderr": 0.001104580558808195, "rouge2_precision": 0.04442101205623975, "rouge2_precision_stderr": 0.0009174859090253856, "rouge2_recall": 0.08189624776685947, "rouge2_recall_stderr": 0.0016402680486423498, "rougeL_fmeasure": 0.14424875152214156, "rougeL_fmeasure_stderr": 0.0018997489713444592, "rougeL_precision": 0.11579616240640865, "rougeL_precision_stderr": 0.0016878357390951878, "rougeL_recall": 0.21005341644582798, "rougeL_recall_stderr": 0.0027249280433669612, "rougeLsum_fmeasure": 0.13266404353961783, "rougeLsum_fmeasure_stderr": 0.0018444407314532878, "rougeLsum_precision": 0.10723369718745066, "rougeLsum_precision_stderr": 0.0016785659862114447, "rougeLsum_recall": 0.19175382965940935, "rougeLsum_recall_stderr": 0.002581731401648066}}, "1": {"generate_text_restaurant": {"bleu": 5.126169908039335, "bleu_stderr": 0.060019707849600396, "rouge1_fmeasure": 0.30084375583488465, "rouge1_fmeasure_stderr": 0.0017666196438528153, "rouge1_precision": 0.2374397743893787, "rouge1_precision_stderr": 0.001827366166961729, "rouge1_recall": 0.4538895860871574, "rouge1_recall_stderr": 0.002739145525628191, "rouge2_fmeasure": 0.11813763333164416, "rouge2_fmeasure_stderr": 0.001265066374684441, "rouge2_precision": 0.09311186153520964, "rouge2_precision_stderr": 0.0012117859139284565, "rouge2_recall": 0.18237560978508383, "rouge2_recall_stderr": 0.0020325975329361454, "rougeL_fmeasure": 0.24433694694761532, "rougeL_fmeasure_stderr": 0.00130346416067227, "rougeL_precision": 0.19149503022784875, "rougeL_precision_stderr": 0.0013376484927668058, "rougeL_recall": 0.3734418164524156, "rougeL_recall_stderr": 0.0023461065260632426, "rougeLsum_fmeasure": 0.24256192147463654, "rougeLsum_fmeasure_stderr": 0.0016652743992374153, "rougeLsum_precision": 0.19159105345376767, "rougeLsum_precision_stderr": 0.0016482031013163958, "rougeLsum_recall": 0.3662200697989403, "rougeLsum_recall_stderr": 0.0026084151410531906}}, "2": {"generate_text_restaurant": {"bleu": 6.249384385334651, "bleu_stderr": 0.07770725715172515, "rouge1_fmeasure": 0.3236620882311894, "rouge1_fmeasure_stderr": 0.001730869351887234, "rouge1_precision": 0.2571488513560848, "rouge1_precision_stderr": 0.0017479087453492585, "rouge1_recall": 0.4776474806351213, "rouge1_recall_stderr": 0.0027381915088070803, "rouge2_fmeasure": 0.13757318526100176, "rouge2_fmeasure_stderr": 0.0013355463995447833, "rouge2_precision": 0.1085707308611601, "rouge2_precision_stderr": 0.0012335636842772677, "rouge2_recall": 0.20861118989211233, "rouge2_recall_stderr": 0.0021888224556685905, "rougeL_fmeasure": 0.265325935238766, "rougeL_fmeasure_stderr": 0.001349636506785451, "rougeL_precision": 0.21005072572789943, "rougeL_precision_stderr": 0.0013866447459134966, "rougeL_recall": 0.39533407328249814, "rougeL_recall_stderr": 0.0024077424449100897, "rougeLsum_fmeasure": 0.2652383774880513, "rougeLsum_fmeasure_stderr": 0.0016824765629729075, "rougeLsum_precision": 0.21084881314164047, "rougeLsum_precision_stderr": 0.0016535970510521015, "rougeLsum_recall": 0.39227019176650024, "rougeLsum_recall_stderr": 0.0026723625566665876}}, "3": {"generate_text_restaurant": {"bleu": 6.757816195559476, "bleu_stderr": 0.0636105382851002, "rouge1_fmeasure": 0.3311529384971786, "rouge1_fmeasure_stderr": 0.0017193353624414865, "rouge1_precision": 0.2649025981125516, "rouge1_precision_stderr": 0.001816521271723974, "rouge1_recall": 0.485500501669586, "rouge1_recall_stderr": 0.0027035556710176397, "rouge2_fmeasure": 0.14464807696490528, "rouge2_fmeasure_stderr": 0.0013667792152310628, "rouge2_precision": 0.1156613222917535, "rouge2_precision_stderr": 0.00136914737695428, "rouge2_recall": 0.2172590676961178, "rouge2_recall_stderr": 0.0021909769319400805, "rougeL_fmeasure": 0.27382376721341406, "rougeL_fmeasure_stderr": 0.0013534412847209798, "rougeL_precision": 0.21833801769994254, "rougeL_precision_stderr": 0.0014604899732955515, "rougeL_recall": 0.40505182586725, "rougeL_recall_stderr": 0.0023977022968250496, "rougeLsum_fmeasure": 0.27330735067492545, "rougeLsum_fmeasure_stderr": 0.0016914308676584363, "rougeLsum_precision": 0.2188141627190453, "rougeLsum_precision_stderr": 0.0017198980966469132, "rougeLsum_recall": 0.40131515523410427, "rougeLsum_recall_stderr": 0.0026468309652264163}}, "4": {"generate_text_restaurant": {"bleu": 6.8116932588036025, "bleu_stderr": 0.08803986085496286, "rouge1_fmeasure": 0.33109808130241025, "rouge1_fmeasure_stderr": 0.001743234134850443, "rouge1_precision": 0.2692618389240567, "rouge1_precision_stderr": 0.0019668736185400296, "rouge1_recall": 0.4771107096079804, "rouge1_recall_stderr": 0.0026181488455626965, "rouge2_fmeasure": 0.14604424103058358, "rouge2_fmeasure_stderr": 0.0013887223383308772, "rouge2_precision": 0.11873708025959091, "rouge2_precision_stderr": 0.001398837418169464, "rouge2_recall": 0.21498597349348783, "rouge2_recall_stderr": 0.002140162295467893, "rougeL_fmeasure": 0.27557083918753333, "rougeL_fmeasure_stderr": 0.0013751463805022712, "rougeL_precision": 0.22307248979456698, "rougeL_precision_stderr": 0.0015556609206750656, "rougeL_recall": 0.40100397544077326, "rougeL_recall_stderr": 0.002350965909261926, "rougeLsum_fmeasure": 0.27488278035950847, "rougeLsum_fmeasure_stderr": 0.0017432716698613845, "rougeLsum_precision": 0.22375069182479146, "rougeLsum_precision_stderr": 0.0018651442719424599, "rougeLsum_recall": 0.39667186960697276, "rougeLsum_recall_stderr": 0.002625562673886146}}, "5": {"generate_text_restaurant": {"bleu": 7.123004030201217, "bleu_stderr": 0.10755431721392276, "rouge1_fmeasure": 0.34009425919881386, "rouge1_fmeasure_stderr": 0.0018127528383437095, "rouge1_precision": 0.283387194738878, "rouge1_precision_stderr": 0.0021889859805422174, "rouge1_recall": 0.47762279214889136, "rouge1_recall_stderr": 0.002598204572656884, "rouge2_fmeasure": 0.15173507276703452, "rouge2_fmeasure_stderr": 0.0014423284554178045, "rouge2_precision": 0.1268317618878522, "rouge2_precision_stderr": 0.001539845631916355, "rouge2_recall": 0.21704921295074978, "rouge2_recall_stderr": 0.00213381640113868, "rougeL_fmeasure": 0.28167675573246964, "rougeL_fmeasure_stderr": 0.0014174528695001733, "rougeL_precision": 0.23348226143885964, "rougeL_precision_stderr": 0.0017364200503098236, "rougeL_recall": 0.4000256021331328, "rougeL_recall_stderr": 0.0023393501528240785, "rougeLsum_fmeasure": 0.2841776706521971, "rougeLsum_fmeasure_stderr": 0.0017988587175840989, "rougeLsum_precision": 0.23733946015237703, "rougeLsum_precision_stderr": 0.0020776364265581017, "rougeLsum_recall": 0.3992233197709026, "rougeLsum_recall_stderr": 0.002574196241305113}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9362576802818214, "bleu_stderr": 0.07613560025591426, "rouge1_fmeasure": 0.20774854380832464, "rouge1_fmeasure_stderr": 0.002559300503441153, "rouge1_precision": 0.1555958457764852, "rouge1_precision_stderr": 0.0021160845746587115, "rouge1_recall": 0.34158447671046027, "rouge1_recall_stderr": 0.004323062909638541, "rouge2_fmeasure": 0.044139249367473786, "rouge2_fmeasure_stderr": 0.0015460831827267061, "rouge2_precision": 0.032455922926097835, "rouge2_precision_stderr": 0.0011708356708942493, "rouge2_recall": 0.07494285935393781, "rouge2_recall_stderr": 0.0026689396207543992, "rougeL_fmeasure": 0.15374732499060603, "rougeL_fmeasure_stderr": 0.0019079112053866937, "rougeL_precision": 0.11511600394721422, "rougeL_precision_stderr": 0.001594627834760935, "rougeL_recall": 0.25430483568408224, "rougeL_recall_stderr": 0.003345749633090871, "rougeLsum_fmeasure": 0.16253188243527864, "rougeLsum_fmeasure_stderr": 0.002156777336291824, "rougeLsum_precision": 0.1213281516988994, "rougeLsum_precision_stderr": 0.0017356602287848383, "rougeLsum_recall": 0.26933797612929455, "rougeLsum_recall_stderr": 0.003780314309352455}}, "1": {"article_DOC_summary": {"bleu": 1.4112544524316093, "bleu_stderr": 0.08496214042954833, "rouge1_fmeasure": 0.17760255904564273, "rouge1_fmeasure_stderr": 0.0024690797538052664, "rouge1_precision": 0.126369802264852, "rouge1_precision_stderr": 0.0018405489799639353, "rouge1_recall": 0.31122732461789854, "rouge1_recall_stderr": 0.004193666873674977, "rouge2_fmeasure": 0.03540453263680104, "rouge2_fmeasure_stderr": 0.0013817982389489629, "rouge2_precision": 0.024966878604215887, "rouge2_precision_stderr": 0.0009821028802625116, "rouge2_recall": 0.06352834342639134, "rouge2_recall_stderr": 0.002525805879626373, "rougeL_fmeasure": 0.13621833396997876, "rougeL_fmeasure_stderr": 0.0017888518922440696, "rougeL_precision": 0.09670754836321287, "rougeL_precision_stderr": 0.0013239157948012734, "rougeL_recall": 0.2405099629356952, "rougeL_recall_stderr": 0.0031904466101414556, "rougeLsum_fmeasure": 0.1430983873534407, "rougeLsum_fmeasure_stderr": 0.002024072232741044, "rougeLsum_precision": 0.10153767215572433, "rougeLsum_precision_stderr": 0.0014859568414341794, "rougeLsum_recall": 0.2527563825359106, "rougeLsum_recall_stderr": 0.0036111698335435823}}, "2": {"article_DOC_summary": {"bleu": 1.3507302866689068, "bleu_stderr": 0.08510431402948973, "rouge1_fmeasure": 0.17585370691736185, "rouge1_fmeasure_stderr": 0.0023925094764087468, "rouge1_precision": 0.1246367268120949, "rouge1_precision_stderr": 0.001773757664330906, "rouge1_recall": 0.31045490731458164, "rouge1_recall_stderr": 0.004084886636922553, "rouge2_fmeasure": 0.0348723834844472, "rouge2_fmeasure_stderr": 0.0013509469241876884, "rouge2_precision": 0.024475004166713072, "rouge2_precision_stderr": 0.0009533956524831488, "rouge2_recall": 0.06310551153540954, "rouge2_recall_stderr": 0.002485011154241004, "rougeL_fmeasure": 0.13534716420373505, "rougeL_fmeasure_stderr": 0.001731586302015437, "rougeL_precision": 0.09570739798294506, "rougeL_precision_stderr": 0.0012693669937521557, "rougeL_recall": 0.24064083206853418, "rougeL_recall_stderr": 0.003131371167507212, "rougeLsum_fmeasure": 0.14114161105688136, "rougeLsum_fmeasure_stderr": 0.001994499763031248, "rougeLsum_precision": 0.0998302330407736, "rougeLsum_precision_stderr": 0.0014584981746615054, "rougeLsum_recall": 0.250757479147429, "rougeLsum_recall_stderr": 0.0035444274589021135}}, "3": {"article_DOC_summary": {"bleu": 1.2479004603054775, "bleu_stderr": 0.10042150687708706, "rouge1_fmeasure": 0.16561635893672155, "rouge1_fmeasure_stderr": 0.0025006010127745777, "rouge1_precision": 0.1198460419958428, "rouge1_precision_stderr": 0.0019417773960241746, "rouge1_recall": 0.28812863730867544, "rouge1_recall_stderr": 0.00435221479942355, "rouge2_fmeasure": 0.03171913029877269, "rouge2_fmeasure_stderr": 0.0012890667321138701, "rouge2_precision": 0.022378408876613878, "rouge2_precision_stderr": 0.0009076721878497753, "rouge2_recall": 0.05703336298601347, "rouge2_recall_stderr": 0.002389320400830883, "rougeL_fmeasure": 0.12816630446404473, "rougeL_fmeasure_stderr": 0.0018586126201792224, "rougeL_precision": 0.0923882446291546, "rougeL_precision_stderr": 0.001403733929268059, "rougeL_recall": 0.2245336745602999, "rougeL_recall_stderr": 0.003387032194341533, "rougeLsum_fmeasure": 0.1325551184027993, "rougeLsum_fmeasure_stderr": 0.002071515875578705, "rougeLsum_precision": 0.09550627197662054, "rougeLsum_precision_stderr": 0.0015531216409860379, "rougeLsum_recall": 0.23229017215385045, "rougeLsum_recall_stderr": 0.003730217543869211}}, "4": {"article_DOC_summary": {"bleu": 0.47215686015269726, "bleu_stderr": 0.08574887203656281, "rouge1_fmeasure": 0.04413918217227094, "rouge1_fmeasure_stderr": 0.002480156364042011, "rouge1_precision": 0.03751349089373983, "rouge1_precision_stderr": 0.0022800813493760543, "rouge1_recall": 0.0697871819923432, "rouge1_recall_stderr": 0.0040381330725425294, "rouge2_fmeasure": 0.007074958138573533, "rouge2_fmeasure_stderr": 0.0006827311475169871, "rouge2_precision": 0.005207350657842113, "rouge2_precision_stderr": 0.0005060299660598674, "rouge2_recall": 0.012175067651071106, "rouge2_recall_stderr": 0.0012112660495086575, "rougeL_fmeasure": 0.03395950866138848, "rougeL_fmeasure_stderr": 0.0018808375195373738, "rougeL_precision": 0.029249072317561912, "rougeL_precision_stderr": 0.0018195030363634698, "rougeL_recall": 0.053925106681511445, "rougeL_recall_stderr": 0.003092654840712885, "rougeLsum_fmeasure": 0.035243014423520146, "rougeLsum_fmeasure_stderr": 0.0019800538937599426, "rougeLsum_precision": 0.03021312469117469, "rougeLsum_precision_stderr": 0.0018780320378658164, "rougeLsum_recall": 0.05599613612313265, "rougeLsum_recall_stderr": 0.0032612293404024133}}, "5": {"article_DOC_summary": {"bleu": 1.2727719743841858e-37, "bleu_stderr": 1.2076447049036407e-32, "rouge1_fmeasure": 0.002321071661055069, "rouge1_fmeasure_stderr": 0.0006408973331857279, "rouge1_precision": 0.0026069444960735337, "rouge1_precision_stderr": 0.0007388856823274387, "rouge1_recall": 0.0021572935558770803, "rouge1_recall_stderr": 0.0005864279242216826, "rouge2_fmeasure": 0.00012314549854098596, "rouge2_fmeasure_stderr": 7.122462679033626e-05, "rouge2_precision": 0.00014854426619132502, "rouge2_precision_stderr": 8.571900312776038e-05, "rouge2_recall": 0.00010649727630859706, "rouge2_recall_stderr": 6.211817611366337e-05, "rougeL_fmeasure": 0.0016508273325058706, "rougeL_fmeasure_stderr": 0.00043880923309574715, "rougeL_precision": 0.0018449802658868611, "rougeL_precision_stderr": 0.0005037469801884364, "rougeL_recall": 0.0015501254308975295, "rougeL_recall_stderr": 0.0004091731091328618, "rougeLsum_fmeasure": 0.0016438057763284056, "rougeLsum_fmeasure_stderr": 0.00044670243892416974, "rougeLsum_precision": 0.0018421775438832511, "rougeLsum_precision_stderr": 0.0005163110280087291, "rougeLsum_recall": 0.0015410499501239353, "rougeLsum_recall_stderr": 0.0004141584827452071}}}}
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.326,0.01483050720454104,0
|
3 |
+
anli_r2,acc,0.332,0.014899597242811475,0
|
4 |
+
anli_r3,acc,0.3308333333333333,0.013588208070709002,0
|
5 |
+
arc_challenge,acc,0.23720136518771331,0.012430399829260842,0
|
6 |
+
arc_challenge,acc_norm,0.2721843003412969,0.013006600406423704,0
|
7 |
+
arc_easy,acc,0.5753367003367004,0.01014265368748041,0
|
8 |
+
arc_easy,acc_norm,0.49873737373737376,0.010259750807991153,0
|
9 |
+
boolq,acc,0.5559633027522936,0.008690105214920793,1
|
10 |
+
cb,acc,0.4642857142857143,0.06724777654937658,1
|
11 |
+
cb,f1,0.30718954248366015,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.4340768771161123,0.0049462215121452765,0
|
14 |
+
hellaswag,acc_norm,0.5581557458673571,0.004955914693717967,0
|
15 |
+
piqa,acc,0.7328618063112078,0.01032344049261244,0
|
16 |
+
piqa,acc_norm,0.735582154515778,0.010289787244767158,0
|
17 |
+
rte,acc,0.516245487364621,0.030080573208738064,0
|
18 |
+
sciq,acc,0.813,0.01233625482807413,0
|
19 |
+
sciq,acc_norm,0.724,0.014142984975740666,0
|
20 |
+
storycloze_2016,acc,0.6873329770176376,0.010720223172953174,0
|
21 |
+
winogrande,acc,0.5627466456195738,0.013941393310695924,0
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.326,
|
5 |
-
"acc_stderr": 0.01483050720454104
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.332,
|
9 |
-
"acc_stderr": 0.014899597242811475
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3308333333333333,
|
13 |
-
"acc_stderr": 0.013588208070709002
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4642857142857143,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.30718954248366015
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4340768771161123,
|
26 |
-
"acc_stderr": 0.0049462215121452765,
|
27 |
-
"acc_norm": 0.5581557458673571,
|
28 |
-
"acc_norm_stderr": 0.004955914693717967
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.516245487364621,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5627466456195738,
|
36 |
-
"acc_stderr": 0.013941393310695924
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6873329770176376,
|
40 |
-
"acc_stderr": 0.010720223172953174
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5559633027522936,
|
44 |
-
"acc_stderr": 0.008690105214920793
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5753367003367004,
|
48 |
-
"acc_stderr": 0.01014265368748041,
|
49 |
-
"acc_norm": 0.49873737373737376,
|
50 |
-
"acc_norm_stderr": 0.010259750807991153
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.23720136518771331,
|
54 |
-
"acc_stderr": 0.012430399829260842,
|
55 |
-
"acc_norm": 0.2721843003412969,
|
56 |
-
"acc_norm_stderr": 0.013006600406423704
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.813,
|
60 |
-
"acc_stderr": 0.01233625482807413,
|
61 |
-
"acc_norm": 0.724,
|
62 |
-
"acc_norm_stderr": 0.014142984975740666
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7328618063112078,
|
66 |
-
"acc_stderr": 0.01032344049261244,
|
67 |
-
"acc_norm": 0.735582154515778,
|
68 |
-
"acc_norm_stderr": 0.010289787244767158
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.356,0.015149042659306628,0
|
3 |
+
anli_r2,acc,0.332,0.014899597242811483,0
|
4 |
+
anli_r3,acc,0.34833333333333333,0.01375943749887408,0
|
5 |
+
arc_challenge,acc,0.2568259385665529,0.0127669237941168,0
|
6 |
+
arc_challenge,acc_norm,0.2935153583617747,0.013307250444941127,0
|
7 |
+
arc_easy,acc,0.5728114478114478,0.010150415974210868,0
|
8 |
+
arc_easy,acc_norm,0.5256734006734006,0.010246249665591215,0
|
9 |
+
boolq,acc,0.5758409785932722,0.00864386902338812,1
|
10 |
+
cb,acc,0.5892857142857143,0.0663363415035954,1
|
11 |
+
cb,f1,0.4111718275652702,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.4314877514439355,0.004942716091996078,0
|
14 |
+
hellaswag,acc_norm,0.5596494722166899,0.004954146286513344,0
|
15 |
+
piqa,acc,0.7323177366702938,0.01033011118937043,0
|
16 |
+
piqa,acc_norm,0.7334058759521219,0.010316749863541365,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.842,0.011539894677559562,0
|
19 |
+
sciq,acc_norm,0.812,0.01236158601510375,0
|
20 |
+
storycloze_2016,acc,0.6835916622127205,0.010754780097940887,0
|
21 |
+
winogrande,acc,0.56353591160221,0.013938569465677023,0
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.356,
|
5 |
-
"acc_stderr": 0.015149042659306628
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.332,
|
9 |
-
"acc_stderr": 0.014899597242811483
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34833333333333333,
|
13 |
-
"acc_stderr": 0.01375943749887408
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5892857142857143,
|
17 |
-
"acc_stderr": 0.0663363415035954,
|
18 |
-
"f1": 0.4111718275652702
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4314877514439355,
|
26 |
-
"acc_stderr": 0.004942716091996078,
|
27 |
-
"acc_norm": 0.5596494722166899,
|
28 |
-
"acc_norm_stderr": 0.004954146286513344
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.03006330041190266
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.56353591160221,
|
36 |
-
"acc_stderr": 0.013938569465677023
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6835916622127205,
|
40 |
-
"acc_stderr": 0.010754780097940887
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5758409785932722,
|
44 |
-
"acc_stderr": 0.00864386902338812
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5728114478114478,
|
48 |
-
"acc_stderr": 0.010150415974210868,
|
49 |
-
"acc_norm": 0.5256734006734006,
|
50 |
-
"acc_norm_stderr": 0.010246249665591215
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2568259385665529,
|
54 |
-
"acc_stderr": 0.0127669237941168,
|
55 |
-
"acc_norm": 0.2935153583617747,
|
56 |
-
"acc_norm_stderr": 0.013307250444941127
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.842,
|
60 |
-
"acc_stderr": 0.011539894677559562,
|
61 |
-
"acc_norm": 0.812,
|
62 |
-
"acc_norm_stderr": 0.01236158601510375
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7323177366702938,
|
66 |
-
"acc_stderr": 0.01033011118937043,
|
67 |
-
"acc_norm": 0.7334058759521219,
|
68 |
-
"acc_norm_stderr": 0.010316749863541365
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.343,0.015019206922356953,0
|
3 |
+
anli_r2,acc,0.335,0.014933117490932573,0
|
4 |
+
anli_r3,acc,0.34833333333333333,0.013759437498874066,0
|
5 |
+
arc_challenge,acc,0.25853242320819114,0.012794553754288686,0
|
6 |
+
arc_challenge,acc_norm,0.2738907849829352,0.013032004972989503,0
|
7 |
+
arc_easy,acc,0.5841750841750841,0.01011334824464787,0
|
8 |
+
arc_easy,acc_norm,0.5555555555555556,0.010196254838691682,0
|
9 |
+
boolq,acc,0.5767584097859327,0.008641391399113586,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.2908054169636493,,1
|
12 |
+
copa,acc,0.71,0.045604802157206845,0
|
13 |
+
hellaswag,acc,0.4312885879306911,0.004942440746328496,0
|
14 |
+
hellaswag,acc_norm,0.5573590918143796,0.004956839256162738,0
|
15 |
+
piqa,acc,0.735038084874864,0.010296557993316056,0
|
16 |
+
piqa,acc_norm,0.7388465723612623,0.010248738649935592,0
|
17 |
+
rte,acc,0.5523465703971119,0.029931070362939526,0
|
18 |
+
sciq,acc,0.862,0.010912152632504401,0
|
19 |
+
sciq,acc_norm,0.838,0.011657267771304413,0
|
20 |
+
storycloze_2016,acc,0.6878674505611972,0.010715220346279683,0
|
21 |
+
winogrande,acc,0.5698500394632992,0.013914685094716696,0
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.343,
|
5 |
-
"acc_stderr": 0.015019206922356953
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.335,
|
9 |
-
"acc_stderr": 0.014933117490932573
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34833333333333333,
|
13 |
-
"acc_stderr": 0.013759437498874066
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.2908054169636493
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.71,
|
22 |
-
"acc_stderr": 0.045604802157206845
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4312885879306911,
|
26 |
-
"acc_stderr": 0.004942440746328496,
|
27 |
-
"acc_norm": 0.5573590918143796,
|
28 |
-
"acc_norm_stderr": 0.004956839256162738
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5523465703971119,
|
32 |
-
"acc_stderr": 0.029931070362939526
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5698500394632992,
|
36 |
-
"acc_stderr": 0.013914685094716696
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6878674505611972,
|
40 |
-
"acc_stderr": 0.010715220346279683
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5767584097859327,
|
44 |
-
"acc_stderr": 0.008641391399113586
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5841750841750841,
|
48 |
-
"acc_stderr": 0.01011334824464787,
|
49 |
-
"acc_norm": 0.5555555555555556,
|
50 |
-
"acc_norm_stderr": 0.010196254838691682
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.25853242320819114,
|
54 |
-
"acc_stderr": 0.012794553754288686,
|
55 |
-
"acc_norm": 0.2738907849829352,
|
56 |
-
"acc_norm_stderr": 0.013032004972989503
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.862,
|
60 |
-
"acc_stderr": 0.010912152632504401,
|
61 |
-
"acc_norm": 0.838,
|
62 |
-
"acc_norm_stderr": 0.011657267771304413
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.735038084874864,
|
66 |
-
"acc_stderr": 0.010296557993316056,
|
67 |
-
"acc_norm": 0.7388465723612623,
|
68 |
-
"acc_norm_stderr": 0.010248738649935592
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.336,0.014944140233795021,0
|
3 |
+
anli_r2,acc,0.351,0.015100563798316403,0
|
4 |
+
anli_r3,acc,0.35333333333333333,0.013804572162314928,0
|
5 |
+
arc_challenge,acc,0.25853242320819114,0.012794553754288687,0
|
6 |
+
arc_challenge,acc_norm,0.2645051194539249,0.012889272949313368,0
|
7 |
+
arc_easy,acc,0.5837542087542088,0.010114819404500873,0
|
8 |
+
arc_easy,acc_norm,0.5686026936026936,0.010162752847747506,0
|
9 |
+
boolq,acc,0.5694189602446483,0.00866036014598874,1
|
10 |
+
cb,acc,0.5357142857142857,0.06724777654937658,1
|
11 |
+
cb,f1,0.35846267553584626,,1
|
12 |
+
copa,acc,0.77,0.04229525846816506,0
|
13 |
+
hellaswag,acc,0.4303923521210914,0.004941191607317909,0
|
14 |
+
hellaswag,acc_norm,0.5641306512646883,0.004948567856373861,0
|
15 |
+
piqa,acc,0.735038084874864,0.010296557993316056,0
|
16 |
+
piqa,acc_norm,0.7421109902067464,0.010206956662056245,0
|
17 |
+
rte,acc,0.5306859205776173,0.030039730592197812,0
|
18 |
+
sciq,acc,0.862,0.010912152632504397,0
|
19 |
+
sciq,acc_norm,0.845,0.011450157470799456,0
|
20 |
+
storycloze_2016,acc,0.6857295563869589,0.01073513228510818,0
|
21 |
+
winogrande,acc,0.5714285714285714,0.013908353814606693,0
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.336,
|
5 |
-
"acc_stderr": 0.014944140233795021
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.351,
|
9 |
-
"acc_stderr": 0.015100563798316403
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.35333333333333333,
|
13 |
-
"acc_stderr": 0.013804572162314928
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5357142857142857,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.35846267553584626
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.77,
|
22 |
-
"acc_stderr": 0.04229525846816506
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4303923521210914,
|
26 |
-
"acc_stderr": 0.004941191607317909,
|
27 |
-
"acc_norm": 0.5641306512646883,
|
28 |
-
"acc_norm_stderr": 0.004948567856373861
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5306859205776173,
|
32 |
-
"acc_stderr": 0.030039730592197812
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5714285714285714,
|
36 |
-
"acc_stderr": 0.013908353814606693
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6857295563869589,
|
40 |
-
"acc_stderr": 0.01073513228510818
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5694189602446483,
|
44 |
-
"acc_stderr": 0.00866036014598874
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5837542087542088,
|
48 |
-
"acc_stderr": 0.010114819404500873,
|
49 |
-
"acc_norm": 0.5686026936026936,
|
50 |
-
"acc_norm_stderr": 0.010162752847747506
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.25853242320819114,
|
54 |
-
"acc_stderr": 0.012794553754288687,
|
55 |
-
"acc_norm": 0.2645051194539249,
|
56 |
-
"acc_norm_stderr": 0.012889272949313368
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.862,
|
60 |
-
"acc_stderr": 0.010912152632504397,
|
61 |
-
"acc_norm": 0.845,
|
62 |
-
"acc_norm_stderr": 0.011450157470799456
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.735038084874864,
|
66 |
-
"acc_stderr": 0.010296557993316056,
|
67 |
-
"acc_norm": 0.7421109902067464,
|
68 |
-
"acc_norm_stderr": 0.010206956662056245
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.367,0.015249378464171749,0
|
3 |
+
anli_r2,acc,0.355,0.015139491543780532,0
|
4 |
+
anli_r3,acc,0.35583333333333333,0.01382651874849331,0
|
5 |
+
arc_challenge,acc,0.2525597269624573,0.012696728980207708,0
|
6 |
+
arc_challenge,acc_norm,0.2832764505119454,0.013167478735134576,0
|
7 |
+
arc_easy,acc,0.5917508417508418,0.010085566195791245,0
|
8 |
+
arc_easy,acc_norm,0.5669191919191919,0.010167478013701789,0
|
9 |
+
boolq,acc,0.5724770642201835,0.008652692997177337,1
|
10 |
+
cb,acc,0.5178571428571429,0.06737697508644647,1
|
11 |
+
cb,f1,0.3175,,1
|
12 |
+
copa,acc,0.78,0.04163331998932261,0
|
13 |
+
hellaswag,acc,0.4297948615813583,0.004940349676769324,0
|
14 |
+
hellaswag,acc_norm,0.5615415255925115,0.0049518409782196935,0
|
15 |
+
piqa,acc,0.7295973884657236,0.010363167031620798,0
|
16 |
+
piqa,acc_norm,0.733949945593036,0.010310039263352826,0
|
17 |
+
rte,acc,0.5487364620938628,0.029953149241808946,0
|
18 |
+
sciq,acc,0.874,0.010499249222408047,0
|
19 |
+
sciq,acc_norm,0.853,0.011203415395160328,0
|
20 |
+
storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0
|
21 |
+
winogrande,acc,0.5666929755327546,0.013926915052757345,0
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.367,
|
5 |
-
"acc_stderr": 0.015249378464171749
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.355,
|
9 |
-
"acc_stderr": 0.015139491543780532
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.35583333333333333,
|
13 |
-
"acc_stderr": 0.01382651874849331
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5178571428571429,
|
17 |
-
"acc_stderr": 0.06737697508644647,
|
18 |
-
"f1": 0.3175
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932261
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4297948615813583,
|
26 |
-
"acc_stderr": 0.004940349676769324,
|
27 |
-
"acc_norm": 0.5615415255925115,
|
28 |
-
"acc_norm_stderr": 0.0049518409782196935
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5487364620938628,
|
32 |
-
"acc_stderr": 0.029953149241808946
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5666929755327546,
|
36 |
-
"acc_stderr": 0.013926915052757345
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6932121859967931,
|
40 |
-
"acc_stderr": 0.010664275190473634
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5724770642201835,
|
44 |
-
"acc_stderr": 0.008652692997177337
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5917508417508418,
|
48 |
-
"acc_stderr": 0.010085566195791245,
|
49 |
-
"acc_norm": 0.5669191919191919,
|
50 |
-
"acc_norm_stderr": 0.010167478013701789
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2525597269624573,
|
54 |
-
"acc_stderr": 0.012696728980207708,
|
55 |
-
"acc_norm": 0.2832764505119454,
|
56 |
-
"acc_norm_stderr": 0.013167478735134576
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.874,
|
60 |
-
"acc_stderr": 0.010499249222408047,
|
61 |
-
"acc_norm": 0.853,
|
62 |
-
"acc_norm_stderr": 0.011203415395160328
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7295973884657236,
|
66 |
-
"acc_stderr": 0.010363167031620798,
|
67 |
-
"acc_norm": 0.733949945593036,
|
68 |
-
"acc_norm_stderr": 0.010310039263352826
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.385,0.015395194445410808,0
|
3 |
+
anli_r2,acc,0.336,0.014944140233795021,0
|
4 |
+
anli_r3,acc,0.36333333333333334,0.013889898953170563,0
|
5 |
+
arc_challenge,acc,0.257679180887372,0.012780770562768402,0
|
6 |
+
arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0
|
7 |
+
arc_easy,acc,0.5904882154882155,0.010090368160990059,0
|
8 |
+
arc_easy,acc_norm,0.5736531986531986,0.01014785860383514,0
|
9 |
+
boolq,acc,0.5666666666666667,0.008666972565214514,1
|
10 |
+
cb,acc,0.5535714285714286,0.06703189227942394,1
|
11 |
+
cb,f1,0.3077154912597951,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.4266082453694483,0.0049357353003488666,0
|
14 |
+
hellaswag,acc_norm,0.566620195180243,0.004945291270072436,0
|
15 |
+
piqa,acc,0.7285092491838956,0.010376251176596135,0
|
16 |
+
piqa,acc_norm,0.7486398258977149,0.01012115601681925,0
|
17 |
+
rte,acc,0.5523465703971119,0.02993107036293953,0
|
18 |
+
sciq,acc,0.872,0.010570133761108665,0
|
19 |
+
sciq,acc_norm,0.854,0.0111717862854965,0
|
20 |
+
storycloze_2016,acc,0.6841261357562801,0.010749892827011113,0
|
21 |
+
winogrande,acc,0.5445935280189423,0.013996485037729782,0
|
2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.385,
|
5 |
-
"acc_stderr": 0.015395194445410808
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.336,
|
9 |
-
"acc_stderr": 0.014944140233795021
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.36333333333333334,
|
13 |
-
"acc_stderr": 0.013889898953170563
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5535714285714286,
|
17 |
-
"acc_stderr": 0.06703189227942394,
|
18 |
-
"f1": 0.3077154912597951
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4266082453694483,
|
26 |
-
"acc_stderr": 0.0049357353003488666,
|
27 |
-
"acc_norm": 0.566620195180243,
|
28 |
-
"acc_norm_stderr": 0.004945291270072436
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5523465703971119,
|
32 |
-
"acc_stderr": 0.02993107036293953
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5445935280189423,
|
36 |
-
"acc_stderr": 0.013996485037729782
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6841261357562801,
|
40 |
-
"acc_stderr": 0.010749892827011113
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5666666666666667,
|
44 |
-
"acc_stderr": 0.008666972565214514
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5904882154882155,
|
48 |
-
"acc_stderr": 0.010090368160990059,
|
49 |
-
"acc_norm": 0.5736531986531986,
|
50 |
-
"acc_norm_stderr": 0.01014785860383514
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.257679180887372,
|
54 |
-
"acc_stderr": 0.012780770562768402,
|
55 |
-
"acc_norm": 0.27559726962457337,
|
56 |
-
"acc_norm_stderr": 0.013057169655761838
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.872,
|
60 |
-
"acc_stderr": 0.010570133761108665,
|
61 |
-
"acc_norm": 0.854,
|
62 |
-
"acc_norm_stderr": 0.0111717862854965
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7285092491838956,
|
66 |
-
"acc_stderr": 0.010376251176596135,
|
67 |
-
"acc_norm": 0.7486398258977149,
|
68 |
-
"acc_norm_stderr": 0.01012115601681925
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed3/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.033687666872503644
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.033687666872503644
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.16855819384473125
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.16855819384473125
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.18538437537983646
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18538437537983646
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19219003937378554
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.19219003937378554
|
10 |
+
e2e_nlg_cleaned,3,average,multiple,0.14495506886771423
|
11 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04347043245633625
|
12 |
+
gem_xsum,0,median,rouge2_fmeasure,0.04347043245633625
|
13 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.030277850873655133
|
14 |
+
gem_xsum,1,median,rouge2_fmeasure,0.030277850873655133
|
15 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03015426920788573
|
16 |
+
gem_xsum,2,median,rouge2_fmeasure,0.03015426920788573
|
17 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.028265095806897757
|
18 |
+
gem_xsum,3,median,rouge2_fmeasure,0.028265095806897757
|
19 |
+
gem_xsum,3,average,multiple,0.03304191208619372
|
20 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.048503194247737774
|
21 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.048503194247737774
|
22 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.04633905642415022
|
23 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.04633905642415022
|
24 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.0482705113559789
|
25 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.0482705113559789
|
26 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0486263549461623
|
27 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.0486263549461623
|
28 |
+
web_nlg_en,3,average,multiple,0.047934779243507294
|
29 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03524633277968111
|
30 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.03524633277968111
|
31 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04022404252550308
|
32 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.04022404252550308
|
33 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04709721853020564
|
34 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.04709721853020564
|
35 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.039320031366549095
|
36 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.039320031366549095
|
37 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01267310048414024
|
38 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01267310048414024
|
39 |
+
wiki_lingua_en,4,average,multiple,0.034912145137215835
|
2b855b11bc4seed3/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3003906758006876, "bleu_stderr": 0.027331800335698254, "rouge1_fmeasure": 0.10522756302657624, "rouge1_fmeasure_stderr": 0.0020564917165852665, "rouge1_precision": 0.06963270011987571, "rouge1_precision_stderr": 0.0016168408610962013, "rouge1_recall": 0.29729464197458877, "rouge1_recall_stderr": 0.0047714353450774415, "rouge2_fmeasure": 0.048503194247737774, "rouge2_fmeasure_stderr": 0.001258248656235562, "rouge2_precision": 0.03200346886185345, "rouge2_precision_stderr": 0.000957625394662931, "rouge2_recall": 0.140470675875411, "rouge2_recall_stderr": 0.0032183793551318674, "rougeL_fmeasure": 0.10106959021191211, "rougeL_fmeasure_stderr": 0.0018911974850607549, "rougeL_precision": 0.06646767562482549, "rougeL_precision_stderr": 0.0014351668395974513, "rougeL_recall": 0.28816568568111717, "rougeL_recall_stderr": 0.004622479804486804, "rougeLsum_fmeasure": 0.10010042995723817, "rougeLsum_fmeasure_stderr": 0.001929318666410727, "rougeLsum_precision": 0.06620461518234684, "rougeLsum_precision_stderr": 0.00151460835911578, "rougeLsum_recall": 0.282597952018841, "rougeLsum_recall_stderr": 0.004481787483630335}}, "1": {"PALM_prompt": {"bleu": 0.32423805540348105, "bleu_stderr": 0.024690542644449438, "rouge1_fmeasure": 0.10249916346044939, "rouge1_fmeasure_stderr": 0.0018656887279984073, "rouge1_precision": 0.06685340388470082, "rouge1_precision_stderr": 0.0015253847039623312, "rouge1_recall": 0.323514427623318, "rouge1_recall_stderr": 0.004639043635695871, "rouge2_fmeasure": 0.04633905642415022, "rouge2_fmeasure_stderr": 0.0011829999453798483, "rouge2_precision": 0.030000286950072674, "rouge2_precision_stderr": 0.0008643544653956381, "rouge2_recall": 0.14824220519105014, "rouge2_recall_stderr": 0.0031972015283258494, "rougeL_fmeasure": 0.09712356208435262, "rougeL_fmeasure_stderr": 0.0017347095170401298, "rougeL_precision": 0.06336375493461167, "rougeL_precision_stderr": 0.0014215770633909588, "rougeL_recall": 0.303094617033746, "rougeL_recall_stderr": 0.004191983337899737, "rougeLsum_fmeasure": 0.09830433934297841, "rougeLsum_fmeasure_stderr": 0.0017883137206561263, "rougeLsum_precision": 0.06416576148923457, "rougeLsum_precision_stderr": 0.0014659099244540854, "rougeLsum_recall": 0.30832609036704645, "rougeLsum_recall_stderr": 0.004339774784201872}}, "2": {"PALM_prompt": {"bleu": 0.33841996991287543, "bleu_stderr": 0.013366615983706327, "rouge1_fmeasure": 0.10597979981258299, "rouge1_fmeasure_stderr": 0.0017714336298557038, "rouge1_precision": 0.06765631897767002, "rouge1_precision_stderr": 0.0013046336481251101, "rouge1_recall": 0.3402361179206784, "rouge1_recall_stderr": 0.004704696452770623, "rouge2_fmeasure": 0.0482705113559789, "rouge2_fmeasure_stderr": 0.001132968575984557, "rouge2_precision": 0.03077023716855104, "rouge2_precision_stderr": 0.0008032546127820459, "rouge2_recall": 0.1606656566085052, "rouge2_recall_stderr": 0.0033693273708689585, "rougeL_fmeasure": 0.10046626150365084, "rougeL_fmeasure_stderr": 0.0016642034527337758, "rougeL_precision": 0.06412349572198735, "rougeL_precision_stderr": 0.0012122011908497355, "rougeL_recall": 0.31907776335571136, "rougeL_recall_stderr": 0.004279931218687767, "rougeLsum_fmeasure": 0.10141858866959458, "rougeLsum_fmeasure_stderr": 0.001700099907526394, "rougeLsum_precision": 0.06475854902642648, "rougeLsum_precision_stderr": 0.0012483946453731626, "rougeLsum_recall": 0.3241391405150877, "rougeLsum_recall_stderr": 0.004418718258324822}}, "3": {"PALM_prompt": {"bleu": 0.40584434052309054, "bleu_stderr": 0.02616462426887616, "rouge1_fmeasure": 0.10602175173953958, "rouge1_fmeasure_stderr": 0.001741987498486231, "rouge1_precision": 0.06757351073723121, "rouge1_precision_stderr": 0.0012679743548778455, "rouge1_recall": 0.3440037162848658, "rouge1_recall_stderr": 0.0049119409175133995, "rouge2_fmeasure": 0.0486263549461623, "rouge2_fmeasure_stderr": 0.0010900986383698252, "rouge2_precision": 0.030886249533870456, "rouge2_precision_stderr": 0.0007629220092981514, "rouge2_recall": 0.16556453686704345, "rouge2_recall_stderr": 0.0034492328057988348, "rougeL_fmeasure": 0.10011209012325656, "rougeL_fmeasure_stderr": 0.0016356941636110295, "rougeL_precision": 0.06384738095162408, "rougeL_precision_stderr": 0.0011862226633864972, "rougeL_recall": 0.3209539378169773, "rougeL_recall_stderr": 0.004410637240630364, "rougeLsum_fmeasure": 0.10161889625106496, "rougeLsum_fmeasure_stderr": 0.0016751216394668587, "rougeLsum_precision": 0.06483859720845632, "rougeLsum_precision_stderr": 0.0012202108024313588, "rougeLsum_recall": 0.3270171129897123, "rougeLsum_recall_stderr": 0.0045709512856975595}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4622087249250515, "bleu_stderr": 0.04207910392007385, "rouge1_fmeasure": 0.1767157419817045, "rouge1_fmeasure_stderr": 0.0018123734169975135, "rouge1_precision": 0.1506827740817673, "rouge1_precision_stderr": 0.001852883253242218, "rouge1_recall": 0.25686230777525454, "rouge1_recall_stderr": 0.00260278766504943, "rouge2_fmeasure": 0.03524633277968111, "rouge2_fmeasure_stderr": 0.0008101513042416649, "rouge2_precision": 0.02971277123757481, "rouge2_precision_stderr": 0.000707355818382236, "rouge2_recall": 0.05307915600648516, "rouge2_recall_stderr": 0.0013686059525554353, "rougeL_fmeasure": 0.13759253266149163, "rougeL_fmeasure_stderr": 0.001281432881200856, "rougeL_precision": 0.11575873439522533, "rougeL_precision_stderr": 0.0012737353859565011, "rougeL_recall": 0.2052922273561128, "rougeL_recall_stderr": 0.0021096414533161158, "rougeLsum_fmeasure": 0.16173441706919453, "rougeLsum_fmeasure_stderr": 0.0016450542485275806, "rougeLsum_precision": 0.1374963134041478, "rougeLsum_precision_stderr": 0.0016657330727227028, "rougeLsum_recall": 0.2362819209208244, "rougeLsum_recall_stderr": 0.002418312889630038}}, "1": {"tldr_en": {"bleu": 1.972919037460516, "bleu_stderr": 0.05682843274487135, "rouge1_fmeasure": 0.19185955437187688, "rouge1_fmeasure_stderr": 0.001834411329990832, "rouge1_precision": 0.1655651982573561, "rouge1_precision_stderr": 0.001967434204420624, "rouge1_recall": 0.2778367078547306, "rouge1_recall_stderr": 0.0026376055817718498, "rouge2_fmeasure": 0.04022404252550308, "rouge2_fmeasure_stderr": 0.0009067575883966806, "rouge2_precision": 0.03468172497365085, "rouge2_precision_stderr": 0.0008414429849505186, "rouge2_recall": 0.06032773756466252, "rouge2_recall_stderr": 0.0014755233968311602, "rougeL_fmeasure": 0.1394283655875047, "rougeL_fmeasure_stderr": 0.0012494508344161473, "rougeL_precision": 0.1190010376024555, "rougeL_precision_stderr": 0.0013201442960543233, "rougeL_recall": 0.20730876012944102, "rougeL_recall_stderr": 0.0020918631972439913, "rougeLsum_fmeasure": 0.1798108549879984, "rougeLsum_fmeasure_stderr": 0.00170035102934959, "rougeLsum_precision": 0.15494002660111156, "rougeLsum_precision_stderr": 0.001825015399111526, "rougeLsum_recall": 0.26137076597079567, "rougeLsum_recall_stderr": 0.002487191761867969}}, "2": {"tldr_en": {"bleu": 2.2743708557193902, "bleu_stderr": 0.06189529268097383, "rouge1_fmeasure": 0.20283722636774154, "rouge1_fmeasure_stderr": 0.001838028419312966, "rouge1_precision": 0.17638523863069297, "rouge1_precision_stderr": 0.002020755519678356, "rouge1_recall": 0.2911840829982234, "rouge1_recall_stderr": 0.0026772675380155376, "rouge2_fmeasure": 0.04709721853020564, "rouge2_fmeasure_stderr": 0.000949364569447851, "rouge2_precision": 0.04064975395757813, "rouge2_precision_stderr": 0.0008728139678303188, "rouge2_recall": 0.0700705095693168, "rouge2_recall_stderr": 0.0016162550664557144, "rougeL_fmeasure": 0.14915790559695724, "rougeL_fmeasure_stderr": 0.001266186326972188, "rougeL_precision": 0.128540063496423, "rougeL_precision_stderr": 0.00138883173221342, "rougeL_recall": 0.21916109079967522, "rougeL_recall_stderr": 0.002145845723677153, "rougeLsum_fmeasure": 0.1899159372074719, "rougeLsum_fmeasure_stderr": 0.0017127052022618613, "rougeLsum_precision": 0.16486271659279425, "rougeLsum_precision_stderr": 0.0018790326981489205, "rougeLsum_recall": 0.2736737223320281, "rougeLsum_recall_stderr": 0.0025545391606520626}}, "3": {"tldr_en": {"bleu": 2.261325582734008, "bleu_stderr": 0.07440671017406066, "rouge1_fmeasure": 0.17007228676988045, "rouge1_fmeasure_stderr": 0.0021295726072665254, "rouge1_precision": 0.15457296707448556, "rouge1_precision_stderr": 0.0023843459800257866, "rouge1_recall": 0.24244601688859282, "rouge1_recall_stderr": 0.0031047631304705336, "rouge2_fmeasure": 0.039320031366549095, "rouge2_fmeasure_stderr": 0.0009156298192238671, "rouge2_precision": 0.03563497026047448, "rouge2_precision_stderr": 0.0009697902407615488, "rouge2_recall": 0.05775187966143085, "rouge2_recall_stderr": 0.0014973785566537818, "rougeL_fmeasure": 0.12582015691412737, "rougeL_fmeasure_stderr": 0.0015236321853914047, "rougeL_precision": 0.11385012472654804, "rougeL_precision_stderr": 0.0017484820235318678, "rougeL_recall": 0.18331103625638695, "rougeL_recall_stderr": 0.0024579268481887276, "rougeLsum_fmeasure": 0.15899932831508662, "rougeLsum_fmeasure_stderr": 0.0019839606960568193, "rougeLsum_precision": 0.1443112386465537, "rougeLsum_precision_stderr": 0.002222016297651524, "rougeLsum_recall": 0.22741868584839184, "rougeLsum_recall_stderr": 0.0029377295753792703}}, "4": {"tldr_en": {"bleu": 0.5600187004785872, "bleu_stderr": 0.034329899401054034, "rouge1_fmeasure": 0.054359788577698374, "rouge1_fmeasure_stderr": 0.001834775379938349, "rouge1_precision": 0.05049412311337655, "rouge1_precision_stderr": 0.0018775705845440543, "rouge1_recall": 0.07991007309327065, "rouge1_recall_stderr": 0.002748908637336395, "rouge2_fmeasure": 0.01267310048414024, "rouge2_fmeasure_stderr": 0.0006260839320397936, "rouge2_precision": 0.01142264448143536, "rouge2_precision_stderr": 0.0006812961216581213, "rouge2_recall": 0.02008142422629143, "rouge2_recall_stderr": 0.0010993317750224779, "rougeL_fmeasure": 0.04187829090557222, "rougeL_fmeasure_stderr": 0.001389559278464916, "rougeL_precision": 0.038958076750695146, "rougeL_precision_stderr": 0.0014576150060918941, "rougeL_recall": 0.0629580148095348, "rougeL_recall_stderr": 0.002201566030491766, "rougeLsum_fmeasure": 0.05081727487259366, "rougeLsum_fmeasure_stderr": 0.0017142240431756424, "rougeLsum_precision": 0.047269915920336, "rougeLsum_precision_stderr": 0.0017705102362424203, "rougeLsum_recall": 0.07497632668531767, "rougeLsum_recall_stderr": 0.002589565865103579}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.2918074829245511, "bleu_stderr": 0.03334721114977758, "rouge1_fmeasure": 0.1600243176507681, "rouge1_fmeasure_stderr": 0.0010733278398958096, "rouge1_precision": 0.12561487707508034, "rouge1_precision_stderr": 0.0013825187846368096, "rouge1_recall": 0.25305668157788624, "rouge1_recall_stderr": 0.0013315623225900873, "rouge2_fmeasure": 0.033687666872503644, "rouge2_fmeasure_stderr": 0.0005133425312314838, "rouge2_precision": 0.026668315991920277, "rouge2_precision_stderr": 0.0005306038751357611, "rouge2_recall": 0.05412553888370808, "rouge2_recall_stderr": 0.0007946156915472112, "rougeL_fmeasure": 0.15622553271845338, "rougeL_fmeasure_stderr": 0.0010043221686026895, "rougeL_precision": 0.12177712671014992, "rougeL_precision_stderr": 0.001240068402500465, "rougeL_recall": 0.24840753894632903, "rougeL_recall_stderr": 0.001318527645790754, "rougeLsum_fmeasure": 0.13789602591344347, "rougeLsum_fmeasure_stderr": 0.0009870480569216915, "rougeLsum_precision": 0.10841189553256889, "rougeLsum_precision_stderr": 0.0012618615939597366, "rougeLsum_recall": 0.21839285442742476, "rougeLsum_recall_stderr": 0.0012454131726385412}}, "1": {"generate_text_restaurant": {"bleu": 9.67590369741921, "bleu_stderr": 0.09901656476390079, "rouge1_fmeasure": 0.39957774138885144, "rouge1_fmeasure_stderr": 0.0022278577374241887, "rouge1_precision": 0.45349765710233814, "rouge1_precision_stderr": 0.0029408652429081406, "rouge1_recall": 0.39709628674803366, "rouge1_recall_stderr": 0.002799520545788188, "rouge2_fmeasure": 0.16855819384473125, "rouge2_fmeasure_stderr": 0.001773629505406216, "rouge2_precision": 0.19448143773761467, "rouge2_precision_stderr": 0.002225394284814737, "rouge2_recall": 0.16778603132296563, "rouge2_recall_stderr": 0.0019654958159185396, "rougeL_fmeasure": 0.2890014884493394, "rougeL_fmeasure_stderr": 0.0018207916834884527, "rougeL_precision": 0.33043423858845145, "rougeL_precision_stderr": 0.0024856924454187785, "rougeL_recall": 0.2872561015612187, "rougeL_recall_stderr": 0.0022337982582298155, "rougeLsum_fmeasure": 0.3277959949146152, "rougeLsum_fmeasure_stderr": 0.0021102604579282852, "rougeLsum_precision": 0.3738597805796585, "rougeLsum_precision_stderr": 0.002773351046379512, "rougeLsum_recall": 0.3249821747731812, "rougeLsum_recall_stderr": 0.002515405285337629}}, "2": {"generate_text_restaurant": {"bleu": 10.91262584267496, "bleu_stderr": 0.12763668087089558, "rouge1_fmeasure": 0.4158184072582917, "rouge1_fmeasure_stderr": 0.0021875821824657578, "rouge1_precision": 0.4745007823219277, "rouge1_precision_stderr": 0.003096779743530949, "rouge1_recall": 0.41114421373702703, "rouge1_recall_stderr": 0.002795411314326619, "rouge2_fmeasure": 0.18538437537983646, "rouge2_fmeasure_stderr": 0.001799521509176773, "rouge2_precision": 0.21433784297140893, "rouge2_precision_stderr": 0.002331836767735763, "rouge2_recall": 0.18401740949966608, "rouge2_recall_stderr": 0.002021878491927928, "rougeL_fmeasure": 0.30338837358825094, "rougeL_fmeasure_stderr": 0.001840114059497487, "rougeL_precision": 0.34796254903830837, "rougeL_precision_stderr": 0.002621056636644695, "rougeL_recall": 0.2999843023857228, "rougeL_recall_stderr": 0.002269391756360346, "rougeLsum_fmeasure": 0.3438444001935511, "rougeLsum_fmeasure_stderr": 0.0020872281136323037, "rougeLsum_precision": 0.39290245826296444, "rougeLsum_precision_stderr": 0.0028605438328826085, "rougeLsum_recall": 0.3399505800342592, "rougeLsum_recall_stderr": 0.0025542752245032124}}, "3": {"generate_text_restaurant": {"bleu": 11.413052941435677, "bleu_stderr": 0.20476608953978004, "rouge1_fmeasure": 0.42003617329938386, "rouge1_fmeasure_stderr": 0.002154444681203657, "rouge1_precision": 0.4836426637277247, "rouge1_precision_stderr": 0.003041696429497936, "rouge1_recall": 0.4077462553545315, "rouge1_recall_stderr": 0.002698117142879397, "rouge2_fmeasure": 0.19219003937378554, "rouge2_fmeasure_stderr": 0.0018883864751046853, "rouge2_precision": 0.22368030301486044, "rouge2_precision_stderr": 0.0024069791642568504, "rouge2_recall": 0.18735894030210265, "rouge2_recall_stderr": 0.0020871489281372663, "rougeL_fmeasure": 0.31324626443128384, "rougeL_fmeasure_stderr": 0.0019216880227233767, "rougeL_precision": 0.36189364939827273, "rougeL_precision_stderr": 0.0026889223009721307, "rougeL_recall": 0.30429571782108805, "rougeL_recall_stderr": 0.0023058551990661367, "rougeLsum_fmeasure": 0.35143133030157764, "rougeLsum_fmeasure_stderr": 0.002142111791450537, "rougeLsum_precision": 0.40514213772293617, "rougeLsum_precision_stderr": 0.0029172807069507275, "rougeLsum_recall": 0.341129701516605, "rougeLsum_recall_stderr": 0.0025465447925427515}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7452599273208484, "bleu_stderr": 0.0710626335421352, "rouge1_fmeasure": 0.2032531810901225, "rouge1_fmeasure_stderr": 0.002372876068699817, "rouge1_precision": 0.14818700118728248, "rouge1_precision_stderr": 0.0018676651463230222, "rouge1_recall": 0.34586554355363025, "rouge1_recall_stderr": 0.0041641658204401725, "rouge2_fmeasure": 0.04347043245633625, "rouge2_fmeasure_stderr": 0.001442207400667641, "rouge2_precision": 0.03119345685386172, "rouge2_precision_stderr": 0.0010373843022792317, "rouge2_recall": 0.07667587881403153, "rouge2_recall_stderr": 0.002653660348427432, "rougeL_fmeasure": 0.1528350870269447, "rougeL_fmeasure_stderr": 0.0017715223191776047, "rougeL_precision": 0.11119723252661128, "rougeL_precision_stderr": 0.0013663860116280308, "rougeL_recall": 0.2616100987791669, "rougeL_recall_stderr": 0.003275713171944408, "rougeLsum_fmeasure": 0.16043210769047098, "rougeLsum_fmeasure_stderr": 0.0020055581696195052, "rougeLsum_precision": 0.11658163341774634, "rougeLsum_precision_stderr": 0.0015235968275521983, "rougeLsum_recall": 0.275178892490744, "rougeLsum_recall_stderr": 0.0036968416298391823}}, "1": {"article_DOC_summary": {"bleu": 1.1604575025128918, "bleu_stderr": 0.0877746649601935, "rouge1_fmeasure": 0.16834116026562992, "rouge1_fmeasure_stderr": 0.0023007208182279906, "rouge1_precision": 0.11944768188603072, "rouge1_precision_stderr": 0.0017101747872860968, "rouge1_recall": 0.2967764173333572, "rouge1_recall_stderr": 0.003944563464315701, "rouge2_fmeasure": 0.030277850873655133, "rouge2_fmeasure_stderr": 0.0012385139115491765, "rouge2_precision": 0.021232879052259627, "rouge2_precision_stderr": 0.0008714791712295506, "rouge2_recall": 0.05526393564583485, "rouge2_recall_stderr": 0.0023240893343291387, "rougeL_fmeasure": 0.12978985120588027, "rougeL_fmeasure_stderr": 0.0017321643764184145, "rougeL_precision": 0.09192594935218694, "rougeL_precision_stderr": 0.0012769463345594439, "rougeL_recall": 0.23019911126154574, "rougeL_recall_stderr": 0.003091431421711302, "rougeLsum_fmeasure": 0.13564693066989447, "rougeLsum_fmeasure_stderr": 0.0018863533793062786, "rougeLsum_precision": 0.09602849667357606, "rougeLsum_precision_stderr": 0.001386138103889359, "rougeLsum_recall": 0.2407455765901895, "rougeLsum_recall_stderr": 0.003358547528460653}}, "2": {"article_DOC_summary": {"bleu": 1.1805459815971808, "bleu_stderr": 0.08801398952015627, "rouge1_fmeasure": 0.16960096474510714, "rouge1_fmeasure_stderr": 0.002303219538316631, "rouge1_precision": 0.12017928149855549, "rouge1_precision_stderr": 0.0017155806595076542, "rouge1_recall": 0.3000656582759754, "rouge1_recall_stderr": 0.003942220611475929, "rouge2_fmeasure": 0.03015426920788573, "rouge2_fmeasure_stderr": 0.001263243793733084, "rouge2_precision": 0.02115270486812927, "rouge2_precision_stderr": 0.0008856577807583306, "rouge2_recall": 0.054787883745542026, "rouge2_recall_stderr": 0.0023745228517959276, "rougeL_fmeasure": 0.131124075957363, "rougeL_fmeasure_stderr": 0.0017424293214564424, "rougeL_precision": 0.0927144051400674, "rougeL_precision_stderr": 0.0012857006594715614, "rougeL_recall": 0.23362729816209124, "rougeL_recall_stderr": 0.0031066367525268844, "rougeLsum_fmeasure": 0.1367233861657065, "rougeLsum_fmeasure_stderr": 0.0018905430841393567, "rougeLsum_precision": 0.09669042239875854, "rougeLsum_precision_stderr": 0.0013887653900610069, "rougeLsum_recall": 0.24335819875183146, "rougeLsum_recall_stderr": 0.003364341261182289}}, "3": {"article_DOC_summary": {"bleu": 1.2117893477080361, "bleu_stderr": 0.1149778380988921, "rouge1_fmeasure": 0.15889837582063066, "rouge1_fmeasure_stderr": 0.0024042734536394335, "rouge1_precision": 0.11536457953211869, "rouge1_precision_stderr": 0.0019139955193594944, "rouge1_recall": 0.2768098516237596, "rouge1_recall_stderr": 0.004205225171304428, "rouge2_fmeasure": 0.028265095806897757, "rouge2_fmeasure_stderr": 0.0012619861741006862, "rouge2_precision": 0.020015331184403896, "rouge2_precision_stderr": 0.0009021836723852284, "rouge2_recall": 0.05127981526296904, "rouge2_recall_stderr": 0.002364022279427071, "rougeL_fmeasure": 0.12453970375141683, "rougeL_fmeasure_stderr": 0.0018237860596141172, "rougeL_precision": 0.08997967190382496, "rougeL_precision_stderr": 0.0014235921883162194, "rougeL_recall": 0.21857370135939078, "rougeL_recall_stderr": 0.003310622426465869, "rougeLsum_fmeasure": 0.1276545465727294, "rougeLsum_fmeasure_stderr": 0.001972377201117121, "rougeLsum_precision": 0.09224753114626984, "rougeLsum_precision_stderr": 0.0015272098267094347, "rougeLsum_recall": 0.2238960772414025, "rougeLsum_recall_stderr": 0.003566353512085928}}}}
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.339,0.014976758771620344,0
|
3 |
+
anli_r2,acc,0.335,0.014933117490932573,0
|
4 |
+
anli_r3,acc,0.3416666666666667,0.013696658778002515,0
|
5 |
+
arc_challenge,acc,0.2354948805460751,0.012399451855004757,0
|
6 |
+
arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0
|
7 |
+
arc_easy,acc,0.5593434343434344,0.010187264635711986,0
|
8 |
+
arc_easy,acc_norm,0.49537037037037035,0.010259343705889734,0
|
9 |
+
boolq,acc,0.544954128440367,0.008709637955263414,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.19555555555555557,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.4342760406293567,0.0049464854665446254,0
|
14 |
+
hellaswag,acc_norm,0.5590519816769568,0.0049548591067816675,0
|
15 |
+
piqa,acc,0.736126224156692,0.010282996367695562,0
|
16 |
+
piqa,acc_norm,0.7383025027203483,0.01025563077270823,0
|
17 |
+
rte,acc,0.516245487364621,0.030080573208738064,0
|
18 |
+
sciq,acc,0.801,0.01263164908309918,0
|
19 |
+
sciq,acc_norm,0.72,0.0142056961040915,0
|
20 |
+
storycloze_2016,acc,0.6905398182789952,0.010689956745189074,0
|
21 |
+
winogrande,acc,0.5540647198105761,0.01397009348233069,0
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.339,
|
5 |
-
"acc_stderr": 0.014976758771620344
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.335,
|
9 |
-
"acc_stderr": 0.014933117490932573
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3416666666666667,
|
13 |
-
"acc_stderr": 0.013696658778002515
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.19555555555555557
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4342760406293567,
|
26 |
-
"acc_stderr": 0.0049464854665446254,
|
27 |
-
"acc_norm": 0.5590519816769568,
|
28 |
-
"acc_norm_stderr": 0.0049548591067816675
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.516245487364621,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5540647198105761,
|
36 |
-
"acc_stderr": 0.01397009348233069
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6905398182789952,
|
40 |
-
"acc_stderr": 0.010689956745189074
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.544954128440367,
|
44 |
-
"acc_stderr": 0.008709637955263414
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5593434343434344,
|
48 |
-
"acc_stderr": 0.010187264635711986,
|
49 |
-
"acc_norm": 0.49537037037037035,
|
50 |
-
"acc_norm_stderr": 0.010259343705889734
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2354948805460751,
|
54 |
-
"acc_stderr": 0.012399451855004757,
|
55 |
-
"acc_norm": 0.2764505119453925,
|
56 |
-
"acc_norm_stderr": 0.013069662474252425
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.801,
|
60 |
-
"acc_stderr": 0.01263164908309918,
|
61 |
-
"acc_norm": 0.72,
|
62 |
-
"acc_norm_stderr": 0.0142056961040915
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.736126224156692,
|
66 |
-
"acc_stderr": 0.010282996367695562,
|
67 |
-
"acc_norm": 0.7383025027203483,
|
68 |
-
"acc_norm_stderr": 0.01025563077270823
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.34,0.014987482264363937,0
|
3 |
+
anli_r2,acc,0.329,0.014865395385928364,0
|
4 |
+
anli_r3,acc,0.3308333333333333,0.013588208070709002,0
|
5 |
+
arc_challenge,acc,0.24573378839590443,0.012581033453730107,0
|
6 |
+
arc_challenge,acc_norm,0.2713310580204778,0.012993807727545796,0
|
7 |
+
arc_easy,acc,0.5660774410774411,0.010169795770462111,0
|
8 |
+
arc_easy,acc_norm,0.5446127946127947,0.010218861787618725,0
|
9 |
+
boolq,acc,0.5339449541284403,0.00872487854852522,1
|
10 |
+
cb,acc,0.32142857142857145,0.06297362289056341,1
|
11 |
+
cb,f1,0.2684950416948389,,1
|
12 |
+
copa,acc,0.72,0.04512608598542127,0
|
13 |
+
hellaswag,acc,0.4312885879306911,0.0049424407463284975,0
|
14 |
+
hellaswag,acc_norm,0.5588528181637125,0.0049550950962647085,0
|
15 |
+
piqa,acc,0.7393906420021763,0.010241826155811623,0
|
16 |
+
piqa,acc_norm,0.7410228509249184,0.01022096603140561,0
|
17 |
+
rte,acc,0.5451263537906137,0.029973636495415252,0
|
18 |
+
sciq,acc,0.855,0.01113997751789014,0
|
19 |
+
sciq,acc_norm,0.83,0.01188449583454167,0
|
20 |
+
storycloze_2016,acc,0.6734366648850882,0.010844543793668893,0
|
21 |
+
winogrande,acc,0.5414364640883977,0.014004146853791906,0
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.34,
|
5 |
-
"acc_stderr": 0.014987482264363937
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.329,
|
9 |
-
"acc_stderr": 0.014865395385928364
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3308333333333333,
|
13 |
-
"acc_stderr": 0.013588208070709002
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.32142857142857145,
|
17 |
-
"acc_stderr": 0.06297362289056341,
|
18 |
-
"f1": 0.2684950416948389
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.72,
|
22 |
-
"acc_stderr": 0.04512608598542127
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4312885879306911,
|
26 |
-
"acc_stderr": 0.0049424407463284975,
|
27 |
-
"acc_norm": 0.5588528181637125,
|
28 |
-
"acc_norm_stderr": 0.0049550950962647085
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5451263537906137,
|
32 |
-
"acc_stderr": 0.029973636495415252
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5414364640883977,
|
36 |
-
"acc_stderr": 0.014004146853791906
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6734366648850882,
|
40 |
-
"acc_stderr": 0.010844543793668893
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5339449541284403,
|
44 |
-
"acc_stderr": 0.00872487854852522
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5660774410774411,
|
48 |
-
"acc_stderr": 0.010169795770462111,
|
49 |
-
"acc_norm": 0.5446127946127947,
|
50 |
-
"acc_norm_stderr": 0.010218861787618725
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24573378839590443,
|
54 |
-
"acc_stderr": 0.012581033453730107,
|
55 |
-
"acc_norm": 0.2713310580204778,
|
56 |
-
"acc_norm_stderr": 0.012993807727545796
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.855,
|
60 |
-
"acc_stderr": 0.01113997751789014,
|
61 |
-
"acc_norm": 0.83,
|
62 |
-
"acc_norm_stderr": 0.01188449583454167
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7393906420021763,
|
66 |
-
"acc_stderr": 0.010241826155811623,
|
67 |
-
"acc_norm": 0.7410228509249184,
|
68 |
-
"acc_norm_stderr": 0.01022096603140561
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.329,0.014865395385928366,0
|
3 |
+
anli_r2,acc,0.325,0.014818724459095526,0
|
4 |
+
anli_r3,acc,0.3358333333333333,0.013639261190932882,0
|
5 |
+
arc_challenge,acc,0.2525597269624573,0.012696728980207708,0
|
6 |
+
arc_challenge,acc_norm,0.2764505119453925,0.013069662474252427,0
|
7 |
+
arc_easy,acc,0.5774410774410774,0.010135978222981078,0
|
8 |
+
arc_easy,acc_norm,0.5576599326599326,0.010191334444220851,0
|
9 |
+
boolq,acc,0.5397553516819572,0.008717368239786054,1
|
10 |
+
cb,acc,0.17857142857142858,0.05164277182008721,1
|
11 |
+
cb,f1,0.15455455455455455,,1
|
12 |
+
copa,acc,0.73,0.044619604333847394,0
|
13 |
+
hellaswag,acc,0.4291973710416252,0.004939500404882189,0
|
14 |
+
hellaswag,acc_norm,0.5610436168094005,0.004952454721934799,0
|
15 |
+
piqa,acc,0.7383025027203483,0.010255630772708229,0
|
16 |
+
piqa,acc_norm,0.735038084874864,0.010296557993316044,0
|
17 |
+
rte,acc,0.5018050541516246,0.030096267148976626,0
|
18 |
+
sciq,acc,0.875,0.010463483381956722,0
|
19 |
+
sciq,acc_norm,0.85,0.011297239823409296,0
|
20 |
+
storycloze_2016,acc,0.6819882415820417,0.010769343495248553,0
|
21 |
+
winogrande,acc,0.5706393054459353,0.01391153749996916,0
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.329,
|
5 |
-
"acc_stderr": 0.014865395385928366
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.325,
|
9 |
-
"acc_stderr": 0.014818724459095526
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3358333333333333,
|
13 |
-
"acc_stderr": 0.013639261190932882
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.17857142857142858,
|
17 |
-
"acc_stderr": 0.05164277182008721,
|
18 |
-
"f1": 0.15455455455455455
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.73,
|
22 |
-
"acc_stderr": 0.044619604333847394
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4291973710416252,
|
26 |
-
"acc_stderr": 0.004939500404882189,
|
27 |
-
"acc_norm": 0.5610436168094005,
|
28 |
-
"acc_norm_stderr": 0.004952454721934799
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5018050541516246,
|
32 |
-
"acc_stderr": 0.030096267148976626
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5706393054459353,
|
36 |
-
"acc_stderr": 0.01391153749996916
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6819882415820417,
|
40 |
-
"acc_stderr": 0.010769343495248553
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5397553516819572,
|
44 |
-
"acc_stderr": 0.008717368239786054
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5774410774410774,
|
48 |
-
"acc_stderr": 0.010135978222981078,
|
49 |
-
"acc_norm": 0.5576599326599326,
|
50 |
-
"acc_norm_stderr": 0.010191334444220851
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2525597269624573,
|
54 |
-
"acc_stderr": 0.012696728980207708,
|
55 |
-
"acc_norm": 0.2764505119453925,
|
56 |
-
"acc_norm_stderr": 0.013069662474252427
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.875,
|
60 |
-
"acc_stderr": 0.010463483381956722,
|
61 |
-
"acc_norm": 0.85,
|
62 |
-
"acc_norm_stderr": 0.011297239823409296
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7383025027203483,
|
66 |
-
"acc_stderr": 0.010255630772708229,
|
67 |
-
"acc_norm": 0.735038084874864,
|
68 |
-
"acc_norm_stderr": 0.010296557993316044
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.337,0.0149550879186536,0
|
3 |
+
anli_r2,acc,0.335,0.014933117490932573,0
|
4 |
+
anli_r3,acc,0.3233333333333333,0.013508372867300219,0
|
5 |
+
arc_challenge,acc,0.24829351535836178,0.012624912868089753,0
|
6 |
+
arc_challenge,acc_norm,0.2764505119453925,0.013069662474252428,0
|
7 |
+
arc_easy,acc,0.5816498316498316,0.010122061470742861,0
|
8 |
+
arc_easy,acc_norm,0.5627104377104377,0.010178768429321588,0
|
9 |
+
boolq,acc,0.544954128440367,0.008709637955263421,1
|
10 |
+
cb,acc,0.2857142857142857,0.06091449038731724,1
|
11 |
+
cb,f1,0.24848484848484845,,1
|
12 |
+
copa,acc,0.81,0.03942772444036623,0
|
13 |
+
hellaswag,acc,0.43248356901015733,0.004944080605048776,0
|
14 |
+
hellaswag,acc_norm,0.5600477992431786,0.004953667028654382,0
|
15 |
+
piqa,acc,0.7415669205658324,0.01021397163677332,0
|
16 |
+
piqa,acc_norm,0.735038084874864,0.010296557993316038,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.884,0.010131468138756995,0
|
19 |
+
sciq,acc_norm,0.861,0.010945263761042965,0
|
20 |
+
storycloze_2016,acc,0.6787814003206841,0.010798029402794916,0
|
21 |
+
winogrande,acc,0.5406471981057617,0.014005973823825135,0
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.337,
|
5 |
-
"acc_stderr": 0.0149550879186536
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.335,
|
9 |
-
"acc_stderr": 0.014933117490932573
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3233333333333333,
|
13 |
-
"acc_stderr": 0.013508372867300219
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.2857142857142857,
|
17 |
-
"acc_stderr": 0.06091449038731724,
|
18 |
-
"f1": 0.24848484848484845
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.81,
|
22 |
-
"acc_stderr": 0.03942772444036623
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43248356901015733,
|
26 |
-
"acc_stderr": 0.004944080605048776,
|
27 |
-
"acc_norm": 0.5600477992431786,
|
28 |
-
"acc_norm_stderr": 0.004953667028654382
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.03006330041190266
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5406471981057617,
|
36 |
-
"acc_stderr": 0.014005973823825135
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6787814003206841,
|
40 |
-
"acc_stderr": 0.010798029402794916
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.544954128440367,
|
44 |
-
"acc_stderr": 0.008709637955263421
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5816498316498316,
|
48 |
-
"acc_stderr": 0.010122061470742861,
|
49 |
-
"acc_norm": 0.5627104377104377,
|
50 |
-
"acc_norm_stderr": 0.010178768429321588
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24829351535836178,
|
54 |
-
"acc_stderr": 0.012624912868089753,
|
55 |
-
"acc_norm": 0.2764505119453925,
|
56 |
-
"acc_norm_stderr": 0.013069662474252428
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.884,
|
60 |
-
"acc_stderr": 0.010131468138756995,
|
61 |
-
"acc_norm": 0.861,
|
62 |
-
"acc_norm_stderr": 0.010945263761042965
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7415669205658324,
|
66 |
-
"acc_stderr": 0.01021397163677332,
|
67 |
-
"acc_norm": 0.735038084874864,
|
68 |
-
"acc_norm_stderr": 0.010296557993316038
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.336,0.014944140233795025,0
|
3 |
+
anli_r2,acc,0.33,0.014876872027456732,0
|
4 |
+
anli_r3,acc,0.3375,0.013655897185463655,0
|
5 |
+
arc_challenge,acc,0.24829351535836178,0.012624912868089762,0
|
6 |
+
arc_challenge,acc_norm,0.2764505119453925,0.013069662474252428,0
|
7 |
+
arc_easy,acc,0.5845959595959596,0.010111869494911519,0
|
8 |
+
arc_easy,acc_norm,0.5715488215488216,0.010154195733990965,0
|
9 |
+
boolq,acc,0.5516819571865443,0.008698213008694273,1
|
10 |
+
cb,acc,0.2857142857142857,0.06091449038731725,1
|
11 |
+
cb,f1,0.26612466124661244,,1
|
12 |
+
copa,acc,0.78,0.041633319989322626,0
|
13 |
+
hellaswag,acc,0.43168691495717987,0.004942990623131125,0
|
14 |
+
hellaswag,acc_norm,0.5638319059948218,0.004948952519517512,0
|
15 |
+
piqa,acc,0.735582154515778,0.01028978724476717,0
|
16 |
+
piqa,acc_norm,0.735582154515778,0.01028978724476716,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.882,0.010206869264381791,0
|
19 |
+
sciq,acc_norm,0.864,0.01084535023047299,0
|
20 |
+
storycloze_2016,acc,0.6862640299305185,0.010730179119317623,0
|
21 |
+
winogrande,acc,0.5501183898973955,0.013981711904049732,0
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.336,
|
5 |
-
"acc_stderr": 0.014944140233795025
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.33,
|
9 |
-
"acc_stderr": 0.014876872027456732
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3375,
|
13 |
-
"acc_stderr": 0.013655897185463655
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.2857142857142857,
|
17 |
-
"acc_stderr": 0.06091449038731725,
|
18 |
-
"f1": 0.26612466124661244
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.041633319989322626
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43168691495717987,
|
26 |
-
"acc_stderr": 0.004942990623131125,
|
27 |
-
"acc_norm": 0.5638319059948218,
|
28 |
-
"acc_norm_stderr": 0.004948952519517512
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.03006330041190266
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5501183898973955,
|
36 |
-
"acc_stderr": 0.013981711904049732
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6862640299305185,
|
40 |
-
"acc_stderr": 0.010730179119317623
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5516819571865443,
|
44 |
-
"acc_stderr": 0.008698213008694273
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5845959595959596,
|
48 |
-
"acc_stderr": 0.010111869494911519,
|
49 |
-
"acc_norm": 0.5715488215488216,
|
50 |
-
"acc_norm_stderr": 0.010154195733990965
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24829351535836178,
|
54 |
-
"acc_stderr": 0.012624912868089762,
|
55 |
-
"acc_norm": 0.2764505119453925,
|
56 |
-
"acc_norm_stderr": 0.013069662474252428
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.882,
|
60 |
-
"acc_stderr": 0.010206869264381791,
|
61 |
-
"acc_norm": 0.864,
|
62 |
-
"acc_norm_stderr": 0.01084535023047299
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.735582154515778,
|
66 |
-
"acc_stderr": 0.01028978724476717,
|
67 |
-
"acc_norm": 0.735582154515778,
|
68 |
-
"acc_norm_stderr": 0.01028978724476716
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.35,0.015090650341444233,0
|
3 |
+
anli_r2,acc,0.334,0.014922019523732967,0
|
4 |
+
anli_r3,acc,0.3175,0.013443538681348061,0
|
5 |
+
arc_challenge,acc,0.25341296928327645,0.012710896778378606,0
|
6 |
+
arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0
|
7 |
+
arc_easy,acc,0.5879629629629629,0.010099765857562762,0
|
8 |
+
arc_easy,acc_norm,0.569023569023569,0.010161552863493746,0
|
9 |
+
boolq,acc,0.5467889908256881,0.008706681265872488,1
|
10 |
+
cb,acc,0.26785714285714285,0.05971290310957636,1
|
11 |
+
cb,f1,0.24955436720142601,,1
|
12 |
+
copa,acc,0.79,0.040936018074033256,0
|
13 |
+
hellaswag,acc,0.42999402509460266,0.004940631135803533,0
|
14 |
+
hellaswag,acc_norm,0.566620195180243,0.004945291270072436,0
|
15 |
+
piqa,acc,0.735038084874864,0.010296557993316052,0
|
16 |
+
piqa,acc_norm,0.7448313384113167,0.01017157159252183,0
|
17 |
+
rte,acc,0.5054151624548736,0.030094698123239966,0
|
18 |
+
sciq,acc,0.889,0.009938701010583726,0
|
19 |
+
sciq,acc_norm,0.879,0.010318210380946097,0
|
20 |
+
storycloze_2016,acc,0.6803848209513629,0.01078375973373075,0
|
21 |
+
winogrande,acc,0.5390686661404893,0.014009521680980306,0
|
2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.35,
|
5 |
-
"acc_stderr": 0.015090650341444233
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.334,
|
9 |
-
"acc_stderr": 0.014922019523732967
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3175,
|
13 |
-
"acc_stderr": 0.013443538681348061
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.26785714285714285,
|
17 |
-
"acc_stderr": 0.05971290310957636,
|
18 |
-
"f1": 0.24955436720142601
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.42999402509460266,
|
26 |
-
"acc_stderr": 0.004940631135803533,
|
27 |
-
"acc_norm": 0.566620195180243,
|
28 |
-
"acc_norm_stderr": 0.004945291270072436
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5054151624548736,
|
32 |
-
"acc_stderr": 0.030094698123239966
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5390686661404893,
|
36 |
-
"acc_stderr": 0.014009521680980306
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6803848209513629,
|
40 |
-
"acc_stderr": 0.01078375973373075
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5467889908256881,
|
44 |
-
"acc_stderr": 0.008706681265872488
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5879629629629629,
|
48 |
-
"acc_stderr": 0.010099765857562762,
|
49 |
-
"acc_norm": 0.569023569023569,
|
50 |
-
"acc_norm_stderr": 0.010161552863493746
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.25341296928327645,
|
54 |
-
"acc_stderr": 0.012710896778378606,
|
55 |
-
"acc_norm": 0.2832764505119454,
|
56 |
-
"acc_norm_stderr": 0.013167478735134575
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.889,
|
60 |
-
"acc_stderr": 0.009938701010583726,
|
61 |
-
"acc_norm": 0.879,
|
62 |
-
"acc_norm_stderr": 0.010318210380946097
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.735038084874864,
|
66 |
-
"acc_stderr": 0.010296557993316052,
|
67 |
-
"acc_norm": 0.7448313384113167,
|
68 |
-
"acc_norm_stderr": 0.01017157159252183
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.326,0.014830507204541033,0
|
3 |
+
anli_r2,acc,0.332,0.014899597242811476,0
|
4 |
+
anli_r3,acc,0.33,0.013579531277800925,0
|
5 |
+
arc_challenge,acc,0.2431740614334471,0.01253655414458709,0
|
6 |
+
arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0
|
7 |
+
arc_easy,acc,0.5698653198653199,0.010159130445178499,0
|
8 |
+
arc_easy,acc_norm,0.49957912457912457,0.010259779886094424,0
|
9 |
+
boolq,acc,0.6055045871559633,0.008548152025770936,1
|
10 |
+
cb,acc,0.35714285714285715,0.06460957383809221,1
|
11 |
+
cb,f1,0.2884615384615385,,1
|
12 |
+
copa,acc,0.76,0.04292346959909283,0
|
13 |
+
hellaswag,acc,0.43686516630153355,0.004949842967331425,0
|
14 |
+
hellaswag,acc_norm,0.5622385978888668,0.004950973231188733,0
|
15 |
+
piqa,acc,0.7399347116430903,0.010234893249061308,0
|
16 |
+
piqa,acc_norm,0.7431991294885746,0.010192864802278033,0
|
17 |
+
rte,acc,0.5523465703971119,0.02993107036293953,0
|
18 |
+
sciq,acc,0.819,0.012181436179177923,0
|
19 |
+
sciq,acc_norm,0.747,0.01375427861358708,0
|
20 |
+
storycloze_2016,acc,0.6980224478888295,0.010616985436073357,0
|
21 |
+
winogrande,acc,0.5611681136543015,0.013946933444507032,0
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.326,
|
5 |
-
"acc_stderr": 0.014830507204541033
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.332,
|
9 |
-
"acc_stderr": 0.014899597242811476
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33,
|
13 |
-
"acc_stderr": 0.013579531277800925
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.35714285714285715,
|
17 |
-
"acc_stderr": 0.06460957383809221,
|
18 |
-
"f1": 0.2884615384615385
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.76,
|
22 |
-
"acc_stderr": 0.04292346959909283
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43686516630153355,
|
26 |
-
"acc_stderr": 0.004949842967331425,
|
27 |
-
"acc_norm": 0.5622385978888668,
|
28 |
-
"acc_norm_stderr": 0.004950973231188733
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5523465703971119,
|
32 |
-
"acc_stderr": 0.02993107036293953
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5611681136543015,
|
36 |
-
"acc_stderr": 0.013946933444507032
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6980224478888295,
|
40 |
-
"acc_stderr": 0.010616985436073357
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6055045871559633,
|
44 |
-
"acc_stderr": 0.008548152025770936
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5698653198653199,
|
48 |
-
"acc_stderr": 0.010159130445178499,
|
49 |
-
"acc_norm": 0.49957912457912457,
|
50 |
-
"acc_norm_stderr": 0.010259779886094424
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2431740614334471,
|
54 |
-
"acc_stderr": 0.01253655414458709,
|
55 |
-
"acc_norm": 0.2773037542662116,
|
56 |
-
"acc_norm_stderr": 0.013082095839059374
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.819,
|
60 |
-
"acc_stderr": 0.012181436179177923,
|
61 |
-
"acc_norm": 0.747,
|
62 |
-
"acc_norm_stderr": 0.01375427861358708
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7399347116430903,
|
66 |
-
"acc_stderr": 0.010234893249061308,
|
67 |
-
"acc_norm": 0.7431991294885746,
|
68 |
-
"acc_norm_stderr": 0.010192864802278033
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.306,0.01458000605543697,0
|
3 |
+
anli_r2,acc,0.322,0.014782913600996669,0
|
4 |
+
anli_r3,acc,0.3416666666666667,0.013696658778002519,0
|
5 |
+
arc_challenge,acc,0.24744027303754265,0.012610352663292673,0
|
6 |
+
arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0
|
7 |
+
arc_easy,acc,0.5744949494949495,0.01014527118259102,0
|
8 |
+
arc_easy,acc_norm,0.5391414141414141,0.010228298200766126,0
|
9 |
+
boolq,acc,0.5761467889908257,0.008643046537505764,1
|
10 |
+
cb,acc,0.35714285714285715,0.06460957383809221,1
|
11 |
+
cb,f1,0.3268421052631579,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.43596893049193386,0.004948696280312425,0
|
14 |
+
hellaswag,acc_norm,0.5642302330213105,0.004948439229523909,0
|
15 |
+
piqa,acc,0.7366702937976061,0.010276185322196766,0
|
16 |
+
piqa,acc_norm,0.7372143634385201,0.010269354068140783,0
|
17 |
+
rte,acc,0.5090252707581228,0.030091559826331334,0
|
18 |
+
sciq,acc,0.869,0.010674874844837952,0
|
19 |
+
sciq,acc_norm,0.853,0.011203415395160333,0
|
20 |
+
storycloze_2016,acc,0.6926777124532336,0.01066944508186666,0
|
21 |
+
winogrande,acc,0.55327545382794,0.013972488371616692,0
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.306,
|
5 |
-
"acc_stderr": 0.01458000605543697
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.322,
|
9 |
-
"acc_stderr": 0.014782913600996669
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3416666666666667,
|
13 |
-
"acc_stderr": 0.013696658778002519
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.35714285714285715,
|
17 |
-
"acc_stderr": 0.06460957383809221,
|
18 |
-
"f1": 0.3268421052631579
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43596893049193386,
|
26 |
-
"acc_stderr": 0.004948696280312425,
|
27 |
-
"acc_norm": 0.5642302330213105,
|
28 |
-
"acc_norm_stderr": 0.004948439229523909
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5090252707581228,
|
32 |
-
"acc_stderr": 0.030091559826331334
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.55327545382794,
|
36 |
-
"acc_stderr": 0.013972488371616692
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6926777124532336,
|
40 |
-
"acc_stderr": 0.01066944508186666
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5761467889908257,
|
44 |
-
"acc_stderr": 0.008643046537505764
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5744949494949495,
|
48 |
-
"acc_stderr": 0.01014527118259102,
|
49 |
-
"acc_norm": 0.5391414141414141,
|
50 |
-
"acc_norm_stderr": 0.010228298200766126
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24744027303754265,
|
54 |
-
"acc_stderr": 0.012610352663292673,
|
55 |
-
"acc_norm": 0.27559726962457337,
|
56 |
-
"acc_norm_stderr": 0.013057169655761838
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.869,
|
60 |
-
"acc_stderr": 0.010674874844837952,
|
61 |
-
"acc_norm": 0.853,
|
62 |
-
"acc_norm_stderr": 0.011203415395160333
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7366702937976061,
|
66 |
-
"acc_stderr": 0.010276185322196766,
|
67 |
-
"acc_norm": 0.7372143634385201,
|
68 |
-
"acc_norm_stderr": 0.010269354068140783
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.323,0.014794927843348639,0
|
3 |
+
anli_r2,acc,0.311,0.014645596385722699,0
|
4 |
+
anli_r3,acc,0.32666666666666666,0.013544340907003663,0
|
5 |
+
arc_challenge,acc,0.2593856655290102,0.012808273573927104,0
|
6 |
+
arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0
|
7 |
+
arc_easy,acc,0.5833333333333334,0.010116282977781239,0
|
8 |
+
arc_easy,acc_norm,0.5711279461279462,0.010155440652900152,0
|
9 |
+
boolq,acc,0.573394495412844,0.008650327037726273,1
|
10 |
+
cb,acc,0.2857142857142857,0.060914490387317236,1
|
11 |
+
cb,f1,0.21909633418584828,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.43537143995220073,0.004947922692688842,0
|
14 |
+
hellaswag,acc_norm,0.5636327424815774,0.004949207947265913,0
|
15 |
+
piqa,acc,0.736126224156692,0.010282996367695564,0
|
16 |
+
piqa,acc_norm,0.7442872687704026,0.010178690109459872,0
|
17 |
+
rte,acc,0.48375451263537905,0.030080573208738064,0
|
18 |
+
sciq,acc,0.877,0.010391293421849874,0
|
19 |
+
sciq,acc_norm,0.858,0.011043457699378215,0
|
20 |
+
storycloze_2016,acc,0.6937466595403528,0.010659088460112754,0
|
21 |
+
winogrande,acc,0.5453827940015785,0.013994481027066,0
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.323,
|
5 |
-
"acc_stderr": 0.014794927843348639
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.311,
|
9 |
-
"acc_stderr": 0.014645596385722699
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32666666666666666,
|
13 |
-
"acc_stderr": 0.013544340907003663
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.2857142857142857,
|
17 |
-
"acc_stderr": 0.060914490387317236,
|
18 |
-
"f1": 0.21909633418584828
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43537143995220073,
|
26 |
-
"acc_stderr": 0.004947922692688842,
|
27 |
-
"acc_norm": 0.5636327424815774,
|
28 |
-
"acc_norm_stderr": 0.004949207947265913
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48375451263537905,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5453827940015785,
|
36 |
-
"acc_stderr": 0.013994481027066
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6937466595403528,
|
40 |
-
"acc_stderr": 0.010659088460112754
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.573394495412844,
|
44 |
-
"acc_stderr": 0.008650327037726273
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5833333333333334,
|
48 |
-
"acc_stderr": 0.010116282977781239,
|
49 |
-
"acc_norm": 0.5711279461279462,
|
50 |
-
"acc_norm_stderr": 0.010155440652900152
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2593856655290102,
|
54 |
-
"acc_stderr": 0.012808273573927104,
|
55 |
-
"acc_norm": 0.2773037542662116,
|
56 |
-
"acc_norm_stderr": 0.013082095839059374
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.877,
|
60 |
-
"acc_stderr": 0.010391293421849874,
|
61 |
-
"acc_norm": 0.858,
|
62 |
-
"acc_norm_stderr": 0.011043457699378215
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.736126224156692,
|
66 |
-
"acc_stderr": 0.010282996367695564,
|
67 |
-
"acc_norm": 0.7442872687704026,
|
68 |
-
"acc_norm_stderr": 0.010178690109459872
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.298,0.014470846741134694,0
|
3 |
+
anli_r2,acc,0.353,0.015120172605483687,0
|
4 |
+
anli_r3,acc,0.3425,0.013704669762934732,0
|
5 |
+
arc_challenge,acc,0.25170648464163825,0.01268249633404296,0
|
6 |
+
arc_challenge,acc_norm,0.2790102389078498,0.013106784883601336,0
|
7 |
+
arc_easy,acc,0.5757575757575758,0.010141333654958562,0
|
8 |
+
arc_easy,acc_norm,0.5648148148148148,0.010173216430370917,0
|
9 |
+
boolq,acc,0.5865443425076453,0.008613059239942643,1
|
10 |
+
cb,acc,0.35714285714285715,0.06460957383809221,1
|
11 |
+
cb,f1,0.30450234601177995,,1
|
12 |
+
copa,acc,0.78,0.04163331998932261,0
|
13 |
+
hellaswag,acc,0.43527185819557856,0.004947793051042668,0
|
14 |
+
hellaswag,acc_norm,0.5651264688309102,0.00494727245422622,0
|
15 |
+
piqa,acc,0.7399347116430903,0.010234893249061306,0
|
16 |
+
piqa,acc_norm,0.750816104461371,0.010091882770120214,0
|
17 |
+
rte,acc,0.516245487364621,0.030080573208738064,0
|
18 |
+
sciq,acc,0.883,0.010169287802713329,0
|
19 |
+
sciq,acc_norm,0.859,0.011010914595992441,0
|
20 |
+
storycloze_2016,acc,0.6974879743452699,0.010622307774396943,0
|
21 |
+
winogrande,acc,0.5595895816890292,0.0139523303119156,0
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.298,
|
5 |
-
"acc_stderr": 0.014470846741134694
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.353,
|
9 |
-
"acc_stderr": 0.015120172605483687
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3425,
|
13 |
-
"acc_stderr": 0.013704669762934732
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.35714285714285715,
|
17 |
-
"acc_stderr": 0.06460957383809221,
|
18 |
-
"f1": 0.30450234601177995
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932261
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43527185819557856,
|
26 |
-
"acc_stderr": 0.004947793051042668,
|
27 |
-
"acc_norm": 0.5651264688309102,
|
28 |
-
"acc_norm_stderr": 0.00494727245422622
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.516245487364621,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5595895816890292,
|
36 |
-
"acc_stderr": 0.0139523303119156
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6974879743452699,
|
40 |
-
"acc_stderr": 0.010622307774396943
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5865443425076453,
|
44 |
-
"acc_stderr": 0.008613059239942643
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5757575757575758,
|
48 |
-
"acc_stderr": 0.010141333654958562,
|
49 |
-
"acc_norm": 0.5648148148148148,
|
50 |
-
"acc_norm_stderr": 0.010173216430370917
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.25170648464163825,
|
54 |
-
"acc_stderr": 0.01268249633404296,
|
55 |
-
"acc_norm": 0.2790102389078498,
|
56 |
-
"acc_norm_stderr": 0.013106784883601336
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.883,
|
60 |
-
"acc_stderr": 0.010169287802713329,
|
61 |
-
"acc_norm": 0.859,
|
62 |
-
"acc_norm_stderr": 0.011010914595992441
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7399347116430903,
|
66 |
-
"acc_stderr": 0.010234893249061306,
|
67 |
-
"acc_norm": 0.750816104461371,
|
68 |
-
"acc_norm_stderr": 0.010091882770120214
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.329,0.01486539538592837,0
|
3 |
+
anli_r2,acc,0.318,0.0147340793093119,0
|
4 |
+
anli_r3,acc,0.3325,0.013605417345710528,0
|
5 |
+
arc_challenge,acc,0.2551194539249147,0.012739038695202104,0
|
6 |
+
arc_challenge,acc_norm,0.2841296928327645,0.013179442447653886,0
|
7 |
+
arc_easy,acc,0.5744949494949495,0.010145271182591018,0
|
8 |
+
arc_easy,acc_norm,0.5669191919191919,0.010167478013701792,0
|
9 |
+
boolq,acc,0.582262996941896,0.008625883905552707,1
|
10 |
+
cb,acc,0.48214285714285715,0.0673769750864465,1
|
11 |
+
cb,f1,0.3471907281431091,,1
|
12 |
+
copa,acc,0.73,0.044619604333847394,0
|
13 |
+
hellaswag,acc,0.43487353116908983,0.004947272454226208,0
|
14 |
+
hellaswag,acc_norm,0.5681139215295757,0.004943264339868658,0
|
15 |
+
piqa,acc,0.7426550598476604,0.01019992106479251,0
|
16 |
+
piqa,acc_norm,0.7470076169749728,0.010142888698862455,0
|
17 |
+
rte,acc,0.51985559566787,0.030072723167317184,0
|
18 |
+
sciq,acc,0.884,0.010131468138757,0
|
19 |
+
sciq,acc_norm,0.875,0.010463483381956722,0
|
20 |
+
storycloze_2016,acc,0.6937466595403528,0.010659088460112754,0
|
21 |
+
winogrande,acc,0.5714285714285714,0.013908353814606691,0
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.329,
|
5 |
-
"acc_stderr": 0.01486539538592837
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.318,
|
9 |
-
"acc_stderr": 0.0147340793093119
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3325,
|
13 |
-
"acc_stderr": 0.013605417345710528
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.48214285714285715,
|
17 |
-
"acc_stderr": 0.0673769750864465,
|
18 |
-
"f1": 0.3471907281431091
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.73,
|
22 |
-
"acc_stderr": 0.044619604333847394
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43487353116908983,
|
26 |
-
"acc_stderr": 0.004947272454226208,
|
27 |
-
"acc_norm": 0.5681139215295757,
|
28 |
-
"acc_norm_stderr": 0.004943264339868658
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.51985559566787,
|
32 |
-
"acc_stderr": 0.030072723167317184
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5714285714285714,
|
36 |
-
"acc_stderr": 0.013908353814606691
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6937466595403528,
|
40 |
-
"acc_stderr": 0.010659088460112754
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.582262996941896,
|
44 |
-
"acc_stderr": 0.008625883905552707
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5744949494949495,
|
48 |
-
"acc_stderr": 0.010145271182591018,
|
49 |
-
"acc_norm": 0.5669191919191919,
|
50 |
-
"acc_norm_stderr": 0.010167478013701792
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2551194539249147,
|
54 |
-
"acc_stderr": 0.012739038695202104,
|
55 |
-
"acc_norm": 0.2841296928327645,
|
56 |
-
"acc_norm_stderr": 0.013179442447653886
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.884,
|
60 |
-
"acc_stderr": 0.010131468138757,
|
61 |
-
"acc_norm": 0.875,
|
62 |
-
"acc_norm_stderr": 0.010463483381956722
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7426550598476604,
|
66 |
-
"acc_stderr": 0.01019992106479251,
|
67 |
-
"acc_norm": 0.7470076169749728,
|
68 |
-
"acc_norm_stderr": 0.010142888698862455
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.318,0.014734079309311901,0
|
3 |
+
anli_r2,acc,0.346,0.015050266127564443,0
|
4 |
+
anli_r3,acc,0.3383333333333333,0.013664144006618261,0
|
5 |
+
arc_challenge,acc,0.2687713310580205,0.012955065963710695,0
|
6 |
+
arc_challenge,acc_norm,0.27986348122866894,0.013119040897725923,0
|
7 |
+
arc_easy,acc,0.5782828282828283,0.01013325528401233,0
|
8 |
+
arc_easy,acc_norm,0.5711279461279462,0.010155440652900152,0
|
9 |
+
boolq,acc,0.5886850152905199,0.008606395426309215,1
|
10 |
+
cb,acc,0.35714285714285715,0.06460957383809221,1
|
11 |
+
cb,f1,0.2634620436038876,,1
|
12 |
+
copa,acc,0.76,0.04292346959909282,0
|
13 |
+
hellaswag,acc,0.43547102170882296,0.004948052131344501,0
|
14 |
+
hellaswag,acc_norm,0.5694084843656642,0.004941470620074855,0
|
15 |
+
piqa,acc,0.7388465723612623,0.010248738649935576,0
|
16 |
+
piqa,acc_norm,0.7442872687704026,0.010178690109459867,0
|
17 |
+
rte,acc,0.5451263537906137,0.029973636495415255,0
|
18 |
+
sciq,acc,0.89,0.00989939381972445,0
|
19 |
+
sciq,acc_norm,0.885,0.010093407594904631,0
|
20 |
+
storycloze_2016,acc,0.6985569214323891,0.010611646032767584,0
|
21 |
+
winogrande,acc,0.5524861878453039,0.013974847640536199,0
|
2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.318,
|
5 |
-
"acc_stderr": 0.014734079309311901
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.346,
|
9 |
-
"acc_stderr": 0.015050266127564443
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3383333333333333,
|
13 |
-
"acc_stderr": 0.013664144006618261
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.35714285714285715,
|
17 |
-
"acc_stderr": 0.06460957383809221,
|
18 |
-
"f1": 0.2634620436038876
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.76,
|
22 |
-
"acc_stderr": 0.04292346959909282
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43547102170882296,
|
26 |
-
"acc_stderr": 0.004948052131344501,
|
27 |
-
"acc_norm": 0.5694084843656642,
|
28 |
-
"acc_norm_stderr": 0.004941470620074855
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5451263537906137,
|
32 |
-
"acc_stderr": 0.029973636495415255
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5524861878453039,
|
36 |
-
"acc_stderr": 0.013974847640536199
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6985569214323891,
|
40 |
-
"acc_stderr": 0.010611646032767584
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5886850152905199,
|
44 |
-
"acc_stderr": 0.008606395426309215
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5782828282828283,
|
48 |
-
"acc_stderr": 0.01013325528401233,
|
49 |
-
"acc_norm": 0.5711279461279462,
|
50 |
-
"acc_norm_stderr": 0.010155440652900152
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2687713310580205,
|
54 |
-
"acc_stderr": 0.012955065963710695,
|
55 |
-
"acc_norm": 0.27986348122866894,
|
56 |
-
"acc_norm_stderr": 0.013119040897725923
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.89,
|
60 |
-
"acc_stderr": 0.00989939381972445,
|
61 |
-
"acc_norm": 0.885,
|
62 |
-
"acc_norm_stderr": 0.010093407594904631
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7388465723612623,
|
66 |
-
"acc_stderr": 0.010248738649935576,
|
67 |
-
"acc_norm": 0.7442872687704026,
|
68 |
-
"acc_norm_stderr": 0.010178690109459867
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b14bc4seed1/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00012949433318118156
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00012949433318118156
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.18224154640367043
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.18224154640367043
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20392339700886974
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20392339700886974
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.20537187889646735
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.20537187889646735
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.200121282410508
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.200121282410508
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19429987532612708
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19429987532612708
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.16434791239647062
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.044020752612575986
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.044020752612575986
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.02968537882036386
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.02968537882036386
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.030040223235727605
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.030040223235727605
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.030050510110479087
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.030050510110479087
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.007253870650249119
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.007253870650249119
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,8.848593754254134e-05
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,8.848593754254134e-05
|
27 |
+
gem_xsum,5,average,multiple,0.023523203561156367
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.052796392998268285
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.052796392998268285
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05226153564491622
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.05226153564491622
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05325778753738895
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.05325778753738895
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05116400683224674
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.05116400683224674
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0543334322796211
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.0543334322796211
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054406851908666655
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.054406851908666655
|
40 |
+
web_nlg_en,5,average,multiple,0.05303666786685132
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03281166114649338
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.03281166114649338
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03952348883731413
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.03952348883731413
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04385052776628551
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.04385052776628551
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03576925556291925
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.03576925556291925
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.0115081158992026
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.0115081158992026
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0019115778864576577
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0019115778864576577
|
53 |
+
wiki_lingua_en,5,average,multiple,0.027562437849778753
|
2b855b14bc4seed1/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3859706493814145, "bleu_stderr": 0.02887493788243282, "rouge1_fmeasure": 0.11155152422680106, "rouge1_fmeasure_stderr": 0.002282891780760996, "rouge1_precision": 0.07428960078453671, "rouge1_precision_stderr": 0.0018063226287394017, "rouge1_recall": 0.30350663746210643, "rouge1_recall_stderr": 0.00473697872155852, "rouge2_fmeasure": 0.052796392998268285, "rouge2_fmeasure_stderr": 0.0013951384332651713, "rouge2_precision": 0.03531981209056992, "rouge2_precision_stderr": 0.0011595867013095926, "rouge2_recall": 0.14540106420377438, "rouge2_recall_stderr": 0.003142842212970267, "rougeL_fmeasure": 0.10653891951191909, "rougeL_fmeasure_stderr": 0.0020787485289465255, "rougeL_precision": 0.07055647736310881, "rougeL_precision_stderr": 0.0016283929067047295, "rougeL_recall": 0.2935720196205313, "rougeL_recall_stderr": 0.0045875931258363735, "rougeLsum_fmeasure": 0.10565401215529986, "rougeLsum_fmeasure_stderr": 0.0021263606523734837, "rougeLsum_precision": 0.07024334129648914, "rougeLsum_precision_stderr": 0.0016843891256844708, "rougeLsum_recall": 0.288622206208257, "rougeLsum_recall_stderr": 0.004462405964590755}}, "1": {"PALM_prompt": {"bleu": 0.4498997351689477, "bleu_stderr": 0.03674862617096384, "rouge1_fmeasure": 0.11423642365402245, "rouge1_fmeasure_stderr": 0.002125553076824774, "rouge1_precision": 0.07474282486730215, "rouge1_precision_stderr": 0.0017060899291641667, "rouge1_recall": 0.35037818287785766, "rouge1_recall_stderr": 0.005089644035448327, "rouge2_fmeasure": 0.05226153564491622, "rouge2_fmeasure_stderr": 0.0012866470017600612, "rouge2_precision": 0.033947865346798244, "rouge2_precision_stderr": 0.0009605222606600222, "rouge2_recall": 0.16651493031375508, "rouge2_recall_stderr": 0.0034551314473355676, "rougeL_fmeasure": 0.1075089911234956, "rougeL_fmeasure_stderr": 0.0019013555079034736, "rougeL_precision": 0.07006889806462963, "rougeL_precision_stderr": 0.0015199133453655971, "rougeL_recall": 0.3313741675277014, "rougeL_recall_stderr": 0.004724938074588536, "rougeLsum_fmeasure": 0.1080259820412502, "rougeLsum_fmeasure_stderr": 0.001985969459394615, "rougeLsum_precision": 0.07068151425631596, "rougeLsum_precision_stderr": 0.0016036772272344858, "rougeLsum_recall": 0.330968980956117, "rougeLsum_recall_stderr": 0.004694246590162869}}, "2": {"PALM_prompt": {"bleu": 0.4753049001409591, "bleu_stderr": 0.022662210056026047, "rouge1_fmeasure": 0.11668735202155088, "rouge1_fmeasure_stderr": 0.001969072957510736, "rouge1_precision": 0.07509396872848731, "rouge1_precision_stderr": 0.001495770634509107, "rouge1_recall": 0.369650217038538, "rouge1_recall_stderr": 0.004969240424808801, "rouge2_fmeasure": 0.05325778753738895, "rouge2_fmeasure_stderr": 0.0012240840116235934, "rouge2_precision": 0.034222534671223666, "rouge2_precision_stderr": 0.0009009097779622381, "rouge2_recall": 0.17744398005391065, "rouge2_recall_stderr": 0.003538000874923632, "rougeL_fmeasure": 0.10948225690231367, "rougeL_fmeasure_stderr": 0.0017896005091475441, "rougeL_precision": 0.07035789575974258, "rougeL_precision_stderr": 0.0013479485808054492, "rougeL_recall": 0.346088142522385, "rougeL_recall_stderr": 0.004573219497516567, "rougeLsum_fmeasure": 0.11040902543720352, "rougeLsum_fmeasure_stderr": 0.001845965020973935, "rougeLsum_precision": 0.07105991740872057, "rougeLsum_precision_stderr": 0.0014021861498776512, "rougeLsum_recall": 0.34990113897565156, "rougeLsum_recall_stderr": 0.004653894399099468}}, "3": {"PALM_prompt": {"bleu": 0.49330582546586715, "bleu_stderr": 0.026760635766622045, "rouge1_fmeasure": 0.11255875083875559, "rouge1_fmeasure_stderr": 0.0018899560091881686, "rouge1_precision": 0.07197595815311898, "rouge1_precision_stderr": 0.0014085873060936218, "rouge1_recall": 0.36596279105771895, "rouge1_recall_stderr": 0.005060110953265633, "rouge2_fmeasure": 0.05116400683224674, "rouge2_fmeasure_stderr": 0.0011637012566298453, "rouge2_precision": 0.032559872031436, "rouge2_precision_stderr": 0.0008258506188369471, "rouge2_recall": 0.17582832452926037, "rouge2_recall_stderr": 0.0036102167860794358, "rougeL_fmeasure": 0.1053125965181327, "rougeL_fmeasure_stderr": 0.0017380106035689842, "rougeL_precision": 0.06730885297435893, "rougeL_precision_stderr": 0.0012874209586825013, "rougeL_recall": 0.3410263606925172, "rougeL_recall_stderr": 0.004632395084547858, "rougeLsum_fmeasure": 0.10651933217629937, "rougeLsum_fmeasure_stderr": 0.0017835590190595765, "rougeLsum_precision": 0.06816617828713667, "rougeLsum_precision_stderr": 0.0013317358072988438, "rougeLsum_recall": 0.3452024329512138, "rougeLsum_recall_stderr": 0.0046680401031925535}}, "4": {"PALM_prompt": {"bleu": 0.5805875464248518, "bleu_stderr": 0.02467601476988524, "rouge1_fmeasure": 0.1185079312492515, "rouge1_fmeasure_stderr": 0.0019283588373999788, "rouge1_precision": 0.0759876905383805, "rouge1_precision_stderr": 0.0014999294305126894, "rouge1_recall": 0.38441082579620744, "rouge1_recall_stderr": 0.005176874183483729, "rouge2_fmeasure": 0.0543334322796211, "rouge2_fmeasure_stderr": 0.001167839949370806, "rouge2_precision": 0.034693214028384364, "rouge2_precision_stderr": 0.0008716169777471336, "rouge2_recall": 0.18834133037111078, "rouge2_recall_stderr": 0.0037005417550254813, "rougeL_fmeasure": 0.10928166815352991, "rougeL_fmeasure_stderr": 0.0017079278026699558, "rougeL_precision": 0.06996833075999682, "rougeL_precision_stderr": 0.0013122027020668525, "rougeL_recall": 0.35440915717841187, "rougeL_recall_stderr": 0.0046597414750728676, "rougeLsum_fmeasure": 0.11155898719590468, "rougeLsum_fmeasure_stderr": 0.0018082007870370246, "rougeLsum_precision": 0.07161964114325686, "rougeLsum_precision_stderr": 0.00141486011529458, "rougeLsum_recall": 0.36094525512943, "rougeLsum_recall_stderr": 0.004743097077998619}}, "5": {"PALM_prompt": {"bleu": 0.6262213710856557, "bleu_stderr": 0.03220830109957928, "rouge1_fmeasure": 0.11957471813564298, "rouge1_fmeasure_stderr": 0.0018632136844397324, "rouge1_precision": 0.07568606204419236, "rouge1_precision_stderr": 0.0013727260179330872, "rouge1_recall": 0.4004498860844127, "rouge1_recall_stderr": 0.005164039996437608, "rouge2_fmeasure": 0.054406851908666655, "rouge2_fmeasure_stderr": 0.0011337345810570566, "rouge2_precision": 0.03424858760150629, "rouge2_precision_stderr": 0.0007988668794381154, "rouge2_recall": 0.19500849332230508, "rouge2_recall_stderr": 0.003726245247160497, "rougeL_fmeasure": 0.10937979253378736, "rougeL_fmeasure_stderr": 0.0016473303165533246, "rougeL_precision": 0.06919783619593119, "rougeL_precision_stderr": 0.0012097192783486648, "rougeL_recall": 0.3665037791378817, "rougeL_recall_stderr": 0.004626331345429801, "rougeLsum_fmeasure": 0.11268811047269227, "rougeLsum_fmeasure_stderr": 0.0017367441620354863, "rougeLsum_precision": 0.07131226400764427, "rougeLsum_precision_stderr": 0.0012731468955373129, "rougeLsum_recall": 0.37701365596712055, "rougeLsum_recall_stderr": 0.004768186337938744}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4085781866227987, "bleu_stderr": 0.06765316931490462, "rouge1_fmeasure": 0.1735271187409542, "rouge1_fmeasure_stderr": 0.0017420641997891633, "rouge1_precision": 0.14862761907514144, "rouge1_precision_stderr": 0.0018170184325446725, "rouge1_recall": 0.25162969277056396, "rouge1_recall_stderr": 0.0025110750991919678, "rouge2_fmeasure": 0.03281166114649338, "rouge2_fmeasure_stderr": 0.0007678027921869415, "rouge2_precision": 0.027931710583823308, "rouge2_precision_stderr": 0.0006852910799466808, "rouge2_recall": 0.04947292517867473, "rouge2_recall_stderr": 0.001323152681280402, "rougeL_fmeasure": 0.13211424580114145, "rougeL_fmeasure_stderr": 0.0012189992528884612, "rougeL_precision": 0.11167291160760119, "rougeL_precision_stderr": 0.0012338979014086847, "rougeL_recall": 0.1967670614485443, "rougeL_recall_stderr": 0.0020238976303286155, "rougeLsum_fmeasure": 0.16049275586848605, "rougeLsum_fmeasure_stderr": 0.0015915307859795458, "rougeLsum_precision": 0.13726434831918505, "rougeLsum_precision_stderr": 0.001658580287033731, "rougeLsum_recall": 0.23359610303376463, "rougeLsum_recall_stderr": 0.0023331163283850007}}, "1": {"tldr_en": {"bleu": 1.8695588017385163, "bleu_stderr": 0.05321662312142784, "rouge1_fmeasure": 0.18920766306464926, "rouge1_fmeasure_stderr": 0.0019111483043418977, "rouge1_precision": 0.16277193183652522, "rouge1_precision_stderr": 0.0019791861140128785, "rouge1_recall": 0.27320755514294287, "rouge1_recall_stderr": 0.0027529027179660705, "rouge2_fmeasure": 0.03952348883731413, "rouge2_fmeasure_stderr": 0.0009017689028399563, "rouge2_precision": 0.03377477444135655, "rouge2_precision_stderr": 0.0008162143350541909, "rouge2_recall": 0.05888454857622772, "rouge2_recall_stderr": 0.0014868106734162374, "rougeL_fmeasure": 0.14124032964831484, "rougeL_fmeasure_stderr": 0.0012998701147887161, "rougeL_precision": 0.12024619489337642, "rougeL_precision_stderr": 0.0013281691582764977, "rougeL_recall": 0.2091152162389091, "rougeL_recall_stderr": 0.00215651070348592, "rougeLsum_fmeasure": 0.17618136263300152, "rougeLsum_fmeasure_stderr": 0.0017768481626556554, "rougeLsum_precision": 0.15124075642915727, "rougeLsum_precision_stderr": 0.001833497711400087, "rougeLsum_recall": 0.2556042591652215, "rougeLsum_recall_stderr": 0.002599027550285205}}, "2": {"tldr_en": {"bleu": 2.0895250564263526, "bleu_stderr": 0.05260673114945821, "rouge1_fmeasure": 0.19547280360886987, "rouge1_fmeasure_stderr": 0.001894454328895725, "rouge1_precision": 0.1705052900654322, "rouge1_precision_stderr": 0.0020623873190335836, "rouge1_recall": 0.27934387948105693, "rouge1_recall_stderr": 0.00269250839204282, "rouge2_fmeasure": 0.04385052776628551, "rouge2_fmeasure_stderr": 0.0009414966007970309, "rouge2_precision": 0.03848650920945066, "rouge2_precision_stderr": 0.000929187611532398, "rouge2_recall": 0.06443059864859675, "rouge2_recall_stderr": 0.0015228056774988696, "rougeL_fmeasure": 0.147954778208212, "rougeL_fmeasure_stderr": 0.0013306013768007523, "rougeL_precision": 0.12791269972430605, "rougeL_precision_stderr": 0.0014524157223196957, "rougeL_recall": 0.2160525397509754, "rougeL_recall_stderr": 0.0021509404278420674, "rougeLsum_fmeasure": 0.18149103619605517, "rougeLsum_fmeasure_stderr": 0.0017456373642829746, "rougeLsum_precision": 0.15803063427007255, "rougeLsum_precision_stderr": 0.0019019671150593402, "rougeLsum_recall": 0.26032488790855124, "rougeLsum_recall_stderr": 0.0025259372632087294}}, "3": {"tldr_en": {"bleu": 1.9845021837810688, "bleu_stderr": 0.06643615389489499, "rouge1_fmeasure": 0.1627969919759764, "rouge1_fmeasure_stderr": 0.002102839789628136, "rouge1_precision": 0.14794605221202178, "rouge1_precision_stderr": 0.0023288106629145966, "rouge1_recall": 0.23196887225200322, "rouge1_recall_stderr": 0.0030906899098327623, "rouge2_fmeasure": 0.03576925556291925, "rouge2_fmeasure_stderr": 0.0008895897244987175, "rouge2_precision": 0.03234477098093676, "rouge2_precision_stderr": 0.0009482837463457055, "rouge2_recall": 0.052607551971036656, "rouge2_recall_stderr": 0.001441559167188345, "rougeL_fmeasure": 0.12292479290951157, "rougeL_fmeasure_stderr": 0.0015261254326798437, "rougeL_precision": 0.11098220652021759, "rougeL_precision_stderr": 0.0017249339058787237, "rougeL_recall": 0.17882334047937634, "rougeL_recall_stderr": 0.002441176945496047, "rougeLsum_fmeasure": 0.15163244869667974, "rougeLsum_fmeasure_stderr": 0.001949601678988161, "rougeLsum_precision": 0.1379056588631655, "rougeLsum_precision_stderr": 0.002188641876469295, "rougeLsum_recall": 0.2165921381447695, "rougeLsum_recall_stderr": 0.002892803013438996}}, "4": {"tldr_en": {"bleu": 0.43692124770591884, "bleu_stderr": 0.034589878125136274, "rouge1_fmeasure": 0.05320898989288258, "rouge1_fmeasure_stderr": 0.0018000803783716336, "rouge1_precision": 0.05106771575902374, "rouge1_precision_stderr": 0.0019659203953794374, "rouge1_recall": 0.07804501442013374, "rouge1_recall_stderr": 0.002681543421696101, "rouge2_fmeasure": 0.0115081158992026, "rouge2_fmeasure_stderr": 0.0005924515945289618, "rouge2_precision": 0.010915560953831725, "rouge2_precision_stderr": 0.0006843232384822272, "rouge2_recall": 0.017333650821899818, "rouge2_recall_stderr": 0.0009520410019829541, "rougeL_fmeasure": 0.0406022993863923, "rougeL_fmeasure_stderr": 0.001350639486420207, "rougeL_precision": 0.03912914603871578, "rougeL_precision_stderr": 0.0015292266274958602, "rougeL_recall": 0.06051029913811969, "rougeL_recall_stderr": 0.002087906460503723, "rougeLsum_fmeasure": 0.04978211979263513, "rougeLsum_fmeasure_stderr": 0.0016808277076301248, "rougeLsum_precision": 0.04795995322666103, "rougeLsum_precision_stderr": 0.0018654956003536299, "rougeLsum_recall": 0.0730595855068419, "rougeLsum_recall_stderr": 0.002511680618409503}}, "5": {"tldr_en": {"bleu": 8.693135496106579e-07, "bleu_stderr": 2.0347209537079987e-06, "rouge1_fmeasure": 0.009154755966022539, "rouge1_fmeasure_stderr": 0.0008437188329211569, "rouge1_precision": 0.008888598534123287, "rouge1_precision_stderr": 0.0009074304457843781, "rouge1_recall": 0.013732262968419945, "rouge1_recall_stderr": 0.0012775310603661385, "rouge2_fmeasure": 0.0019115778864576577, "rouge2_fmeasure_stderr": 0.0002415435510558745, "rouge2_precision": 0.0020329208663477155, "rouge2_precision_stderr": 0.0004034253796867396, "rouge2_recall": 0.0030032403059095103, "rouge2_recall_stderr": 0.0004244983130075312, "rougeL_fmeasure": 0.006902885273901401, "rougeL_fmeasure_stderr": 0.0006295358749076249, "rougeL_precision": 0.006715096170350463, "rougeL_precision_stderr": 0.000704577952426742, "rougeL_recall": 0.010603012416762791, "rougeL_recall_stderr": 0.0010070501511240425, "rougeLsum_fmeasure": 0.008435486741119774, "rougeLsum_fmeasure_stderr": 0.0007773340965961406, "rougeLsum_precision": 0.00825151495932399, "rougeLsum_precision_stderr": 0.0008497435087111472, "rougeLsum_recall": 0.0126360914173471, "rougeLsum_recall_stderr": 0.0011775115731198504}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.015996891880542208, "bleu_stderr": 0.004310800871207989, "rouge1_fmeasure": 0.01581057979538024, "rouge1_fmeasure_stderr": 0.0003268681446111073, "rouge1_precision": 0.012607142857143292, "rouge1_precision_stderr": 0.0002620854055855113, "rouge1_recall": 0.022364589648673473, "rouge1_recall_stderr": 0.0004833744939072677, "rouge2_fmeasure": 0.00012949433318118156, "rouge2_fmeasure_stderr": 3.494334474550083e-05, "rouge2_precision": 0.00011382113821138212, "rouge2_precision_stderr": 3.0353975217624302e-05, "rouge2_recall": 0.00016007109878338666, "rouge2_recall_stderr": 4.471587603586543e-05, "rougeL_fmeasure": 0.01581057979538024, "rougeL_fmeasure_stderr": 0.0003268681446111073, "rougeL_precision": 0.012607142857143292, "rougeL_precision_stderr": 0.0002620854055855113, "rougeL_recall": 0.022364589648673473, "rougeL_recall_stderr": 0.0004833744939072677, "rougeLsum_fmeasure": 0.015304580133401937, "rougeLsum_fmeasure_stderr": 0.00030990342484444515, "rougeLsum_precision": 0.012202380952381383, "rougeLsum_precision_stderr": 0.00024852907150074173, "rougeLsum_recall": 0.02165931137163564, "rougeLsum_recall_stderr": 0.000459555205796444}}, "1": {"generate_text_restaurant": {"bleu": 10.160164880301407, "bleu_stderr": 0.09870361519554652, "rouge1_fmeasure": 0.4183603415646924, "rouge1_fmeasure_stderr": 0.0021915886347872913, "rouge1_precision": 0.4723645042246602, "rouge1_precision_stderr": 0.003093544138341839, "rouge1_recall": 0.41769731726334886, "rouge1_recall_stderr": 0.0027761936199660036, "rouge2_fmeasure": 0.18224154640367043, "rouge2_fmeasure_stderr": 0.001744075951680649, "rouge2_precision": 0.20824659603338538, "rouge2_precision_stderr": 0.0022062119262852707, "rouge2_recall": 0.1819755290759944, "rouge2_recall_stderr": 0.0019466842287792123, "rougeL_fmeasure": 0.29563040377094635, "rougeL_fmeasure_stderr": 0.0018175171603569897, "rougeL_precision": 0.3351327678327539, "rougeL_precision_stderr": 0.0025523703114727794, "rougeL_recall": 0.295579474348389, "rougeL_recall_stderr": 0.0022293299165326882, "rougeLsum_fmeasure": 0.33772472890005967, "rougeLsum_fmeasure_stderr": 0.0020840945335635214, "rougeLsum_precision": 0.38216044781782504, "rougeLsum_precision_stderr": 0.002840674624973531, "rougeLsum_recall": 0.33669899123255476, "rougeLsum_recall_stderr": 0.002497483152367024}}, "2": {"generate_text_restaurant": {"bleu": 11.639342595686514, "bleu_stderr": 0.1372398556772298, "rouge1_fmeasure": 0.4403748840276186, "rouge1_fmeasure_stderr": 0.0021469015406322322, "rouge1_precision": 0.5008112994914655, "rouge1_precision_stderr": 0.0030901508544956993, "rouge1_recall": 0.4331586166619688, "rouge1_recall_stderr": 0.0027669107693439566, "rouge2_fmeasure": 0.20392339700886974, "rouge2_fmeasure_stderr": 0.0017713110997826917, "rouge2_precision": 0.23532637411562363, "rouge2_precision_stderr": 0.0023540846426583248, "rouge2_recall": 0.20098225485559104, "rouge2_recall_stderr": 0.002000098072621399, "rougeL_fmeasure": 0.3101075960976144, "rougeL_fmeasure_stderr": 0.0018538357828329025, "rougeL_precision": 0.3541570070163828, "rougeL_precision_stderr": 0.0026216980786301844, "rougeL_recall": 0.30536324071501125, "rougeL_recall_stderr": 0.0022959263532433085, "rougeLsum_fmeasure": 0.3568310755537105, "rougeLsum_fmeasure_stderr": 0.0021021781653240523, "rougeLsum_precision": 0.4060963348848002, "rougeLsum_precision_stderr": 0.00287308069523248, "rougeLsum_recall": 0.35125355708974765, "rougeLsum_recall_stderr": 0.002574327739391992}}, "3": {"generate_text_restaurant": {"bleu": 12.12151131378298, "bleu_stderr": 0.11915100827089956, "rouge1_fmeasure": 0.4393124622349425, "rouge1_fmeasure_stderr": 0.002105146952485857, "rouge1_precision": 0.48319371001463535, "rouge1_precision_stderr": 0.002959676881031021, "rouge1_recall": 0.4422942530679121, "rouge1_recall_stderr": 0.002745785251032174, "rouge2_fmeasure": 0.20537187889646735, "rouge2_fmeasure_stderr": 0.0017679657376305042, "rouge2_precision": 0.22776533967342907, "rouge2_precision_stderr": 0.002199578780588574, "rouge2_recall": 0.20780683933237928, "rouge2_recall_stderr": 0.0020519338118908715, "rougeL_fmeasure": 0.30827570526127646, "rougeL_fmeasure_stderr": 0.001838483091209031, "rougeL_precision": 0.34008741895966316, "rougeL_precision_stderr": 0.002497623460553464, "rougeL_recall": 0.3107412356396501, "rougeL_recall_stderr": 0.0022904606355026516, "rougeLsum_fmeasure": 0.3590927766034753, "rougeLsum_fmeasure_stderr": 0.0020747916723764977, "rougeLsum_precision": 0.3948986426501321, "rougeLsum_precision_stderr": 0.0027465008011313763, "rougeLsum_recall": 0.36193901740453144, "rougeLsum_recall_stderr": 0.0025838482295284003}}, "4": {"generate_text_restaurant": {"bleu": 11.559015928325927, "bleu_stderr": 0.18803967369078045, "rouge1_fmeasure": 0.4317309670284675, "rouge1_fmeasure_stderr": 0.002056102439511285, "rouge1_precision": 0.45547982198924447, "rouge1_precision_stderr": 0.0028623291621968612, "rouge1_recall": 0.4496956638404531, "rouge1_recall_stderr": 0.002644952608966691, "rouge2_fmeasure": 0.200121282410508, "rouge2_fmeasure_stderr": 0.001720634914837748, "rouge2_precision": 0.21266205937348281, "rouge2_precision_stderr": 0.0020948347718174865, "rouge2_recall": 0.2094291861066839, "rouge2_recall_stderr": 0.0020099108277091484, "rougeL_fmeasure": 0.3022050091930064, "rougeL_fmeasure_stderr": 0.001805017629409357, "rougeL_precision": 0.31926641390021887, "rougeL_precision_stderr": 0.0023710826604788795, "rougeL_recall": 0.3157467625853494, "rougeL_recall_stderr": 0.0022866495141614174, "rougeLsum_fmeasure": 0.3556255169379216, "rougeLsum_fmeasure_stderr": 0.0020739273107426392, "rougeLsum_precision": 0.3747187674171436, "rougeLsum_precision_stderr": 0.002662920050950552, "rougeLsum_recall": 0.37116215718680456, "rougeLsum_recall_stderr": 0.0025805030574394894}}, "5": {"generate_text_restaurant": {"bleu": 10.718543658577635, "bleu_stderr": 0.13557644877495797, "rouge1_fmeasure": 0.4260986497991647, "rouge1_fmeasure_stderr": 0.0019558770098863274, "rouge1_precision": 0.43624590990474293, "rouge1_precision_stderr": 0.002755359910046268, "rouge1_recall": 0.45743821166090926, "rouge1_recall_stderr": 0.0025961821255812874, "rouge2_fmeasure": 0.19429987532612708, "rouge2_fmeasure_stderr": 0.0016645032075996245, "rouge2_precision": 0.19962516392680765, "rouge2_precision_stderr": 0.0019681113841036675, "rouge2_recall": 0.21014052881618187, "rouge2_recall_stderr": 0.001996018471125935, "rougeL_fmeasure": 0.29713861834051897, "rougeL_fmeasure_stderr": 0.0017327548341535156, "rougeL_precision": 0.30445962134555027, "rougeL_precision_stderr": 0.0022570971354846756, "rougeL_recall": 0.319832443105098, "rougeL_recall_stderr": 0.002242049253249801, "rougeLsum_fmeasure": 0.3541205313176406, "rougeLsum_fmeasure_stderr": 0.0019745070250081604, "rougeLsum_precision": 0.36210497757070403, "rougeLsum_precision_stderr": 0.0025479433447278678, "rougeLsum_recall": 0.38092539069745174, "rougeLsum_recall_stderr": 0.00253423877318417}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.893091380773236, "bleu_stderr": 0.07155879041797279, "rouge1_fmeasure": 0.2061985807221383, "rouge1_fmeasure_stderr": 0.0025614797423455142, "rouge1_precision": 0.1594590911124501, "rouge1_precision_stderr": 0.0022782733083245854, "rouge1_recall": 0.3283512452700197, "rouge1_recall_stderr": 0.004306520729443716, "rouge2_fmeasure": 0.044020752612575986, "rouge2_fmeasure_stderr": 0.0015514733272418211, "rouge2_precision": 0.03310698595501658, "rouge2_precision_stderr": 0.001224021675086901, "rouge2_recall": 0.07342630951021904, "rouge2_recall_stderr": 0.0026234650563968604, "rougeL_fmeasure": 0.15357121051296377, "rougeL_fmeasure_stderr": 0.0019035866015213703, "rougeL_precision": 0.11840027221905174, "rougeL_precision_stderr": 0.00166823447110928, "rougeL_recall": 0.24641286838690638, "rougeL_recall_stderr": 0.003356144965161673, "rougeLsum_fmeasure": 0.15982417698859158, "rougeLsum_fmeasure_stderr": 0.0021047185959146048, "rougeLsum_precision": 0.12282873435732601, "rougeLsum_precision_stderr": 0.0017745649485105812, "rougeLsum_recall": 0.25756924529549197, "rougeLsum_recall_stderr": 0.0037577915871248534}}, "1": {"article_DOC_summary": {"bleu": 1.1760544673245734, "bleu_stderr": 0.08133234405781672, "rouge1_fmeasure": 0.1644736490786607, "rouge1_fmeasure_stderr": 0.002289825635722979, "rouge1_precision": 0.11666645790427581, "rouge1_precision_stderr": 0.0017014755837046706, "rouge1_recall": 0.28978527393441794, "rouge1_recall_stderr": 0.0038991635104653646, "rouge2_fmeasure": 0.02968537882036386, "rouge2_fmeasure_stderr": 0.0012639179125521537, "rouge2_precision": 0.020871037943975598, "rouge2_precision_stderr": 0.0008902042501889368, "rouge2_recall": 0.05355457980003852, "rouge2_recall_stderr": 0.0023314697329144688, "rougeL_fmeasure": 0.12816828557628004, "rougeL_fmeasure_stderr": 0.0017421199503176153, "rougeL_precision": 0.09068714229050606, "rougeL_precision_stderr": 0.0012801677393527455, "rougeL_recall": 0.22742162679707073, "rougeL_recall_stderr": 0.0030968609463424912, "rougeLsum_fmeasure": 0.13227157946601684, "rougeLsum_fmeasure_stderr": 0.0018913833448635142, "rougeLsum_precision": 0.09360786095706375, "rougeLsum_precision_stderr": 0.0013862407744166395, "rougeLsum_recall": 0.23441366999836272, "rougeLsum_recall_stderr": 0.0033283603169613893}}, "2": {"article_DOC_summary": {"bleu": 1.1923931410727253, "bleu_stderr": 0.096157535871299, "rouge1_fmeasure": 0.16654675192354085, "rouge1_fmeasure_stderr": 0.0022693587032095544, "rouge1_precision": 0.11788198252469814, "rouge1_precision_stderr": 0.0016840268338500827, "rouge1_recall": 0.29510022232622024, "rouge1_recall_stderr": 0.003931015696733789, "rouge2_fmeasure": 0.030040223235727605, "rouge2_fmeasure_stderr": 0.001269654630895902, "rouge2_precision": 0.02100747266618839, "rouge2_precision_stderr": 0.0008812797803659204, "rouge2_recall": 0.05497523294613044, "rouge2_recall_stderr": 0.0024458906023431502, "rougeL_fmeasure": 0.13083582888851075, "rougeL_fmeasure_stderr": 0.0017198624344877728, "rougeL_precision": 0.09243404834906441, "rougeL_precision_stderr": 0.0012651350635372006, "rougeL_recall": 0.23303993493040487, "rougeL_recall_stderr": 0.003075264655112234, "rougeLsum_fmeasure": 0.13353807347880225, "rougeLsum_fmeasure_stderr": 0.001873990491132275, "rougeLsum_precision": 0.09431829617737796, "rougeLsum_precision_stderr": 0.0013719159321866803, "rougeLsum_recall": 0.23802791061424466, "rougeLsum_recall_stderr": 0.0033649952401285928}}, "3": {"article_DOC_summary": {"bleu": 1.3191345882756016, "bleu_stderr": 0.09421067837162811, "rouge1_fmeasure": 0.16270135093872945, "rouge1_fmeasure_stderr": 0.0024951785759190037, "rouge1_precision": 0.11793417351785487, "rouge1_precision_stderr": 0.002017377276366173, "rouge1_recall": 0.2836315744233739, "rouge1_recall_stderr": 0.004290159570320814, "rouge2_fmeasure": 0.030050510110479087, "rouge2_fmeasure_stderr": 0.0013009231624033775, "rouge2_precision": 0.021590058499460735, "rouge2_precision_stderr": 0.000983046658889197, "rouge2_recall": 0.05387363083458545, "rouge2_recall_stderr": 0.0023820277892373596, "rougeL_fmeasure": 0.1279568957546, "rougeL_fmeasure_stderr": 0.0018961537785853696, "rougeL_precision": 0.09245687482962514, "rougeL_precision_stderr": 0.0015075983809124832, "rougeL_recall": 0.22467473031951235, "rougeL_recall_stderr": 0.003391085028163447, "rougeLsum_fmeasure": 0.13073618241924947, "rougeLsum_fmeasure_stderr": 0.0020835700028823006, "rougeLsum_precision": 0.09456074152436214, "rougeLsum_precision_stderr": 0.0016592694935628211, "rougeLsum_recall": 0.22933200389418706, "rougeLsum_recall_stderr": 0.0036933254855523244}}, "4": {"article_DOC_summary": {"bleu": 0.5682110373693539, "bleu_stderr": 0.08502122706344573, "rouge1_fmeasure": 0.04429948639681006, "rouge1_fmeasure_stderr": 0.0024800638787534734, "rouge1_precision": 0.0367038903617586, "rouge1_precision_stderr": 0.002159913173134104, "rouge1_recall": 0.06993097002986987, "rouge1_recall_stderr": 0.003981889947602205, "rouge2_fmeasure": 0.007253870650249119, "rouge2_fmeasure_stderr": 0.0007371032526022674, "rouge2_precision": 0.005663786671792409, "rouge2_precision_stderr": 0.0006131630864231121, "rouge2_recall": 0.011891116216861553, "rouge2_recall_stderr": 0.0012302505001022532, "rougeL_fmeasure": 0.03463725804962344, "rougeL_fmeasure_stderr": 0.0019206641793292863, "rougeL_precision": 0.029073862632456978, "rougeL_precision_stderr": 0.0017427910212302695, "rougeL_recall": 0.05493887124617477, "rougeL_recall_stderr": 0.00313835664453676, "rougeLsum_fmeasure": 0.035650820163199474, "rougeLsum_fmeasure_stderr": 0.002013697777536067, "rougeLsum_precision": 0.02992033353583286, "rougeLsum_precision_stderr": 0.0018236172991812475, "rougeLsum_recall": 0.056338516909847834, "rougeLsum_recall_stderr": 0.0032573814363183865}}, "5": {"article_DOC_summary": {"bleu": 7.234219387755872e-40, "bleu_stderr": 8.513383008510971e-35, "rouge1_fmeasure": 0.0021354726500630848, "rouge1_fmeasure_stderr": 0.0005598260412101052, "rouge1_precision": 0.0023302457662671455, "rouge1_precision_stderr": 0.0006112488722810845, "rouge1_recall": 0.002024827111990807, "rouge1_recall_stderr": 0.0005399672158721946, "rouge2_fmeasure": 8.848593754254134e-05, "rouge2_fmeasure_stderr": 6.272722707705515e-05, "rouge2_precision": 0.00010405105438401775, "rouge2_precision_stderr": 7.357743885200423e-05, "rouge2_recall": 7.71869639794168e-05, "rouge2_recall_stderr": 5.489230344136828e-05, "rougeL_fmeasure": 0.00164411367748071, "rougeL_fmeasure_stderr": 0.0004347092185887277, "rougeL_precision": 0.0018006566925166142, "rougeL_precision_stderr": 0.000476195417495383, "rougeL_recall": 0.001551384791989109, "rougeL_recall_stderr": 0.0004151810881735085, "rougeLsum_fmeasure": 0.0017350747461433223, "rougeLsum_fmeasure_stderr": 0.00045807565970479823, "rougeLsum_precision": 0.001898751962642961, "rougeLsum_precision_stderr": 0.0005005472882701675, "rougeLsum_recall": 0.001637972732350634, "rougeLsum_recall_stderr": 0.0004381341521417912}}}}
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.338,0.014965960710224485,0
|
3 |
+
anli_r2,acc,0.333,0.014910846164229863,0
|
4 |
+
anli_r3,acc,0.3358333333333333,0.013639261190932882,0
|
5 |
+
arc_challenge,acc,0.2440273037542662,0.01255144762785626,0
|
6 |
+
arc_challenge,acc_norm,0.28242320819112626,0.013155456884097224,0
|
7 |
+
arc_easy,acc,0.5673400673400674,0.010166307932642867,0
|
8 |
+
arc_easy,acc_norm,0.4962121212121212,0.010259489101351847,0
|
9 |
+
boolq,acc,0.5978593272171254,0.008575926383211252,1
|
10 |
+
cb,acc,0.4107142857142857,0.0663363415035954,1
|
11 |
+
cb,f1,0.21777777777777776,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.4374626568412667,0.004950598300667558,0
|
14 |
+
hellaswag,acc_norm,0.5612427803226449,0.004952209831856566,0
|
15 |
+
piqa,acc,0.733949945593036,0.010310039263352831,0
|
16 |
+
piqa,acc_norm,0.7372143634385201,0.010269354068140777,0
|
17 |
+
rte,acc,0.5306859205776173,0.030039730592197812,0
|
18 |
+
sciq,acc,0.805,0.012535235623319322,0
|
19 |
+
sciq,acc_norm,0.717,0.014251810906481742,0
|
20 |
+
storycloze_2016,acc,0.6916087653661144,0.010679734445487796,0
|
21 |
+
winogrande,acc,0.5603788476716653,0.013949649776015689,0
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.338,
|
5 |
-
"acc_stderr": 0.014965960710224485
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.333,
|
9 |
-
"acc_stderr": 0.014910846164229863
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3358333333333333,
|
13 |
-
"acc_stderr": 0.013639261190932882
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4107142857142857,
|
17 |
-
"acc_stderr": 0.0663363415035954,
|
18 |
-
"f1": 0.21777777777777776
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4374626568412667,
|
26 |
-
"acc_stderr": 0.004950598300667558,
|
27 |
-
"acc_norm": 0.5612427803226449,
|
28 |
-
"acc_norm_stderr": 0.004952209831856566
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5306859205776173,
|
32 |
-
"acc_stderr": 0.030039730592197812
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5603788476716653,
|
36 |
-
"acc_stderr": 0.013949649776015689
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6916087653661144,
|
40 |
-
"acc_stderr": 0.010679734445487796
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5978593272171254,
|
44 |
-
"acc_stderr": 0.008575926383211252
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5673400673400674,
|
48 |
-
"acc_stderr": 0.010166307932642867,
|
49 |
-
"acc_norm": 0.4962121212121212,
|
50 |
-
"acc_norm_stderr": 0.010259489101351847
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2440273037542662,
|
54 |
-
"acc_stderr": 0.01255144762785626,
|
55 |
-
"acc_norm": 0.28242320819112626,
|
56 |
-
"acc_norm_stderr": 0.013155456884097224
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.805,
|
60 |
-
"acc_stderr": 0.012535235623319322,
|
61 |
-
"acc_norm": 0.717,
|
62 |
-
"acc_norm_stderr": 0.014251810906481742
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.733949945593036,
|
66 |
-
"acc_stderr": 0.010310039263352831,
|
67 |
-
"acc_norm": 0.7372143634385201,
|
68 |
-
"acc_norm_stderr": 0.010269354068140777
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.344,0.015029633724408947,0
|
3 |
+
anli_r2,acc,0.333,0.01491084616422986,0
|
4 |
+
anli_r3,acc,0.35333333333333333,0.01380457216231493,0
|
5 |
+
arc_challenge,acc,0.24658703071672355,0.012595726268790124,0
|
6 |
+
arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0
|
7 |
+
arc_easy,acc,0.5765993265993266,0.01013867100528905,0
|
8 |
+
arc_easy,acc_norm,0.5361952861952862,0.010232865550346736,0
|
9 |
+
boolq,acc,0.5807339449541284,0.008630302070999097,1
|
10 |
+
cb,acc,0.5357142857142857,0.06724777654937658,1
|
11 |
+
cb,f1,0.37714285714285706,,1
|
12 |
+
copa,acc,0.73,0.0446196043338474,0
|
13 |
+
hellaswag,acc,0.433379804819757,0.00494529127007243,0
|
14 |
+
hellaswag,acc_norm,0.5638319059948218,0.00494895251951751,0
|
15 |
+
piqa,acc,0.7399347116430903,0.01023489324906129,0
|
16 |
+
piqa,acc_norm,0.736126224156692,0.01028299636769557,0
|
17 |
+
rte,acc,0.5379061371841155,0.030009848912529113,0
|
18 |
+
sciq,acc,0.846,0.011419913065098706,0
|
19 |
+
sciq,acc_norm,0.824,0.012048616898597498,0
|
20 |
+
storycloze_2016,acc,0.6803848209513629,0.01078375973373075,0
|
21 |
+
winogrande,acc,0.5430149960536701,0.01400038676159829,0
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.344,
|
5 |
-
"acc_stderr": 0.015029633724408947
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.333,
|
9 |
-
"acc_stderr": 0.01491084616422986
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.35333333333333333,
|
13 |
-
"acc_stderr": 0.01380457216231493
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5357142857142857,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.37714285714285706
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.73,
|
22 |
-
"acc_stderr": 0.0446196043338474
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.433379804819757,
|
26 |
-
"acc_stderr": 0.00494529127007243,
|
27 |
-
"acc_norm": 0.5638319059948218,
|
28 |
-
"acc_norm_stderr": 0.00494895251951751
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5379061371841155,
|
32 |
-
"acc_stderr": 0.030009848912529113
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5430149960536701,
|
36 |
-
"acc_stderr": 0.01400038676159829
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6803848209513629,
|
40 |
-
"acc_stderr": 0.01078375973373075
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5807339449541284,
|
44 |
-
"acc_stderr": 0.008630302070999097
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5765993265993266,
|
48 |
-
"acc_stderr": 0.01013867100528905,
|
49 |
-
"acc_norm": 0.5361952861952862,
|
50 |
-
"acc_norm_stderr": 0.010232865550346736
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24658703071672355,
|
54 |
-
"acc_stderr": 0.012595726268790124,
|
55 |
-
"acc_norm": 0.28924914675767915,
|
56 |
-
"acc_norm_stderr": 0.013250012579393443
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.846,
|
60 |
-
"acc_stderr": 0.011419913065098706,
|
61 |
-
"acc_norm": 0.824,
|
62 |
-
"acc_norm_stderr": 0.012048616898597498
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7399347116430903,
|
66 |
-
"acc_stderr": 0.01023489324906129,
|
67 |
-
"acc_norm": 0.736126224156692,
|
68 |
-
"acc_norm_stderr": 0.01028299636769557
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.331,0.014888272588203943,0
|
3 |
+
anli_r2,acc,0.329,0.01486539538592836,0
|
4 |
+
anli_r3,acc,0.3466666666666667,0.013744022550571946,0
|
5 |
+
arc_challenge,acc,0.25426621160409557,0.012724999945157744,0
|
6 |
+
arc_challenge,acc_norm,0.29692832764505117,0.013352025976725223,0
|
7 |
+
arc_easy,acc,0.5896464646464646,0.010093531255765457,0
|
8 |
+
arc_easy,acc_norm,0.5496632996632996,0.010209047724374143,0
|
9 |
+
boolq,acc,0.5767584097859327,0.008641391399113584,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.27939042089985483,,1
|
12 |
+
copa,acc,0.76,0.042923469599092816,0
|
13 |
+
hellaswag,acc,0.43019318860784705,0.004940911779273377,0
|
14 |
+
hellaswag,acc_norm,0.5633339972117108,0.00494958956767889,0
|
15 |
+
piqa,acc,0.7328618063112078,0.010323440492612437,0
|
16 |
+
piqa,acc_norm,0.7274211099020674,0.010389256803296007,0
|
17 |
+
rte,acc,0.51985559566787,0.030072723167317184,0
|
18 |
+
sciq,acc,0.869,0.010674874844837957,0
|
19 |
+
sciq,acc_norm,0.854,0.011171786285496497,0
|
20 |
+
storycloze_2016,acc,0.6873329770176376,0.010720223172953165,0
|
21 |
+
winogrande,acc,0.5580110497237569,0.013957584079108997,0
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.331,
|
5 |
-
"acc_stderr": 0.014888272588203943
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.329,
|
9 |
-
"acc_stderr": 0.01486539538592836
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3466666666666667,
|
13 |
-
"acc_stderr": 0.013744022550571946
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.27939042089985483
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.76,
|
22 |
-
"acc_stderr": 0.042923469599092816
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.43019318860784705,
|
26 |
-
"acc_stderr": 0.004940911779273377,
|
27 |
-
"acc_norm": 0.5633339972117108,
|
28 |
-
"acc_norm_stderr": 0.00494958956767889
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.51985559566787,
|
32 |
-
"acc_stderr": 0.030072723167317184
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5580110497237569,
|
36 |
-
"acc_stderr": 0.013957584079108997
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6873329770176376,
|
40 |
-
"acc_stderr": 0.010720223172953165
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5767584097859327,
|
44 |
-
"acc_stderr": 0.008641391399113584
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5896464646464646,
|
48 |
-
"acc_stderr": 0.010093531255765457,
|
49 |
-
"acc_norm": 0.5496632996632996,
|
50 |
-
"acc_norm_stderr": 0.010209047724374143
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.25426621160409557,
|
54 |
-
"acc_stderr": 0.012724999945157744,
|
55 |
-
"acc_norm": 0.29692832764505117,
|
56 |
-
"acc_norm_stderr": 0.013352025976725223
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.869,
|
60 |
-
"acc_stderr": 0.010674874844837957,
|
61 |
-
"acc_norm": 0.854,
|
62 |
-
"acc_norm_stderr": 0.011171786285496497
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7328618063112078,
|
66 |
-
"acc_stderr": 0.010323440492612437,
|
67 |
-
"acc_norm": 0.7274211099020674,
|
68 |
-
"acc_norm_stderr": 0.010389256803296007
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.341,0.014998131348402718,0
|
3 |
+
anli_r2,acc,0.344,0.015029633724408945,0
|
4 |
+
anli_r3,acc,0.3441666666666667,0.013720551062295755,0
|
5 |
+
arc_challenge,acc,0.2525597269624573,0.012696728980207706,0
|
6 |
+
arc_challenge,acc_norm,0.2883959044368601,0.013238394422428171,0
|
7 |
+
arc_easy,acc,0.5791245791245792,0.01013050216406633,0
|
8 |
+
arc_easy,acc_norm,0.5618686868686869,0.010180937100600067,0
|
9 |
+
boolq,acc,0.5837920489296636,0.008621380519419278,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.2988586070347077,,1
|
12 |
+
copa,acc,0.8,0.040201512610368445,0
|
13 |
+
hellaswag,acc,0.4304919338777136,0.004941331215598548,0
|
14 |
+
hellaswag,acc_norm,0.563433578968333,0.004949462563681335,0
|
15 |
+
piqa,acc,0.7383025027203483,0.010255630772708229,0
|
16 |
+
piqa,acc_norm,0.7366702937976061,0.010276185322196764,0
|
17 |
+
rte,acc,0.5126353790613718,0.030086851767188564,0
|
18 |
+
sciq,acc,0.868,0.010709373963528035,0
|
19 |
+
sciq,acc_norm,0.865,0.010811655372416051,0
|
20 |
+
storycloze_2016,acc,0.686798503474078,0.0107252094229294,0
|
21 |
+
winogrande,acc,0.5493291239147593,0.01398392886904024,0
|
2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.341,
|
5 |
-
"acc_stderr": 0.014998131348402718
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.344,
|
9 |
-
"acc_stderr": 0.015029633724408945
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3441666666666667,
|
13 |
-
"acc_stderr": 0.013720551062295755
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.2988586070347077
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.040201512610368445
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4304919338777136,
|
26 |
-
"acc_stderr": 0.004941331215598548,
|
27 |
-
"acc_norm": 0.563433578968333,
|
28 |
-
"acc_norm_stderr": 0.004949462563681335
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5126353790613718,
|
32 |
-
"acc_stderr": 0.030086851767188564
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5493291239147593,
|
36 |
-
"acc_stderr": 0.01398392886904024
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.686798503474078,
|
40 |
-
"acc_stderr": 0.0107252094229294
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5837920489296636,
|
44 |
-
"acc_stderr": 0.008621380519419278
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5791245791245792,
|
48 |
-
"acc_stderr": 0.01013050216406633,
|
49 |
-
"acc_norm": 0.5618686868686869,
|
50 |
-
"acc_norm_stderr": 0.010180937100600067
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2525597269624573,
|
54 |
-
"acc_stderr": 0.012696728980207706,
|
55 |
-
"acc_norm": 0.2883959044368601,
|
56 |
-
"acc_norm_stderr": 0.013238394422428171
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.868,
|
60 |
-
"acc_stderr": 0.010709373963528035,
|
61 |
-
"acc_norm": 0.865,
|
62 |
-
"acc_norm_stderr": 0.010811655372416051
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7383025027203483,
|
66 |
-
"acc_stderr": 0.010255630772708229,
|
67 |
-
"acc_norm": 0.7366702937976061,
|
68 |
-
"acc_norm_stderr": 0.010276185322196764
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|