Commit
·
770ebe7
1
Parent(s):
d6f25de
merge
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- 2b855b55boscarseed1/evaluation/generation/merged.csv +53 -0
- 2b855b55boscarseed1/evaluation/generation/merged.json +1 -0
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_0.csv +21 -0
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_0_lm-eval_global_step52452_2023-02-13-14-30-07_0shots_backup.json +0 -87
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_1.csv +21 -0
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_1_lm-eval_global_step52452_2023-02-13-14-30-07_1shots_backup.json +0 -87
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_2.csv +21 -0
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_2_lm-eval_global_step52452_2023-02-13-14-30-07_2shots_backup.json +0 -87
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_3.csv +21 -0
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_3.json +29 -1
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_3_lm-eval_global_step52452_2023-02-13-14-30-07_3shots_backup.json +0 -59
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_4.csv +21 -0
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_4.json +34 -1
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_4_lm-eval_global_step52452_2023-02-13-14-30-07_4shots_backup.json +0 -54
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_5.csv +21 -0
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_5.json +56 -1
- 2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_5_lm-eval_global_step52452_2023-02-13-14-30-07_5shots_backup.json +0 -32
- 2b855b55boscarseed2/evaluation/generation/merged.csv +53 -0
- 2b855b55boscarseed2/evaluation/generation/merged.json +1 -0
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_0.csv +21 -0
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_0_lm-eval_global_step52452_2023-02-13-14-30-06_0shots_backup.json +0 -87
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_1.csv +21 -0
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_1_lm-eval_global_step52452_2023-02-13-14-30-06_1shots_backup.json +0 -87
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_2.csv +21 -0
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_2_lm-eval_global_step52452_2023-02-13-14-30-06_2shots_backup.json +0 -87
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_3.csv +21 -0
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_3.json +15 -1
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_3_lm-eval_global_step52452_2023-02-13-14-30-06_3shots_backup.json +0 -73
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_4.csv +21 -0
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_4.json +34 -1
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_4_lm-eval_global_step52452_2023-02-13-14-30-06_4shots_backup.json +0 -54
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_5.csv +21 -0
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_5.json +34 -1
- 2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_5_lm-eval_global_step52452_2023-02-13-14-30-06_5shots_backup.json +0 -54
- 2b855b55boscarseed3/evaluation/generation/merged.csv +53 -0
- 2b855b55boscarseed3/evaluation/generation/merged.json +1 -0
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_0.csv +21 -0
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_0_lm-eval_global_step52452_2023-02-13-14-30-06_0shots_backup.json +0 -87
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_1.csv +21 -0
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_1_lm-eval_global_step52452_2023-02-13-14-30-06_1shots_backup.json +0 -87
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_2.csv +21 -0
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_2_lm-eval_global_step52452_2023-02-13-14-30-07_2shots_backup.json +0 -87
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_3.csv +21 -0
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_3_lm-eval_global_step52452_2023-02-13-14-30-06_3shots_backup.json +0 -87
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_4.csv +21 -0
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_4.json +22 -1
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_4_lm-eval_global_step52452_2023-02-13-14-30-06_4shots_backup.json +0 -66
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_5.csv +21 -0
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_5.json +34 -1
- 2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_5_lm-eval_global_step52452_2023-02-13-14-30-06_5shots_backup.json +0 -54
2b855b55boscarseed1/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.015075003955377725
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.015075003955377725
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2197193326260251
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2197193326260251
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.24462503044359898
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.24462503044359898
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24783052219620694
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.24783052219620694
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.24803460463857877
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.24803460463857877
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24688680363045243
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24688680363045243
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.20369521624837333
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03650064845163216
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.03650064845163216
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.042159228682119326
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.042159228682119326
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.047169327493735744
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.047169327493735744
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04773019417977251
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.04773019417977251
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012384552834017296
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.012384552834017296
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,9.281741700266572e-05
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,9.281741700266572e-05
|
27 |
+
gem_xsum,5,average,multiple,0.03100612817637995
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.044642962602441105
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.044642962602441105
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07208261868868578
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.07208261868868578
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09209494204875378
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.09209494204875378
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10424957521252359
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.10424957521252359
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10890579060188654
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.10890579060188654
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11883663273017754
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.11883663273017754
|
40 |
+
web_nlg_en,5,average,multiple,0.09013542031407806
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.02327004167912628
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.02327004167912628
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.031435380352664986
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.031435380352664986
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05229754159287467
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.05229754159287467
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04645910000265166
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.04645910000265166
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01548765621583035
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01548765621583035
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003396689228466778
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.003396689228466778
|
53 |
+
wiki_lingua_en,5,average,multiple,0.02872440151193579
|
2b855b55boscarseed1/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33202476289978655, "bleu_stderr": 0.03284628255303658, "rouge1_fmeasure": 0.10035078272046244, "rouge1_fmeasure_stderr": 0.0019783236676076898, "rouge1_precision": 0.06651480936453831, "rouge1_precision_stderr": 0.001650969998879038, "rouge1_recall": 0.297334737955042, "rouge1_recall_stderr": 0.004992706637890668, "rouge2_fmeasure": 0.044642962602441105, "rouge2_fmeasure_stderr": 0.001178797135452423, "rouge2_precision": 0.029110043712061957, "rouge2_precision_stderr": 0.0008947308938169874, "rouge2_recall": 0.13391480095345307, "rouge2_recall_stderr": 0.0032321395002733538, "rougeL_fmeasure": 0.09648383030825737, "rougeL_fmeasure_stderr": 0.0018521276667425775, "rougeL_precision": 0.06360280420840975, "rougeL_precision_stderr": 0.0015109805669500105, "rougeL_recall": 0.2880932061217916, "rougeL_recall_stderr": 0.004879650046655422, "rougeLsum_fmeasure": 0.0940598212200592, "rougeLsum_fmeasure_stderr": 0.0018664528054209402, "rougeLsum_precision": 0.06247062164223968, "rougeLsum_precision_stderr": 0.0015581650631781932, "rougeLsum_recall": 0.27694653889243487, "rougeLsum_recall_stderr": 0.004633639788952067}}, "1": {"PALM_prompt": {"bleu": 0.527115014551651, "bleu_stderr": 0.025666419950371343, "rouge1_fmeasure": 0.14851607672855913, "rouge1_fmeasure_stderr": 0.0033689657278829425, "rouge1_precision": 0.1270389619853191, "rouge1_precision_stderr": 0.004160322890958984, "rouge1_recall": 0.3113982585236776, "rouge1_recall_stderr": 0.005100689200994648, "rouge2_fmeasure": 0.07208261868868578, "rouge2_fmeasure_stderr": 0.002234096091379544, "rouge2_precision": 0.06294568402552247, "rouge2_precision_stderr": 0.002863056502770272, "rouge2_recall": 0.15340774472215243, "rouge2_recall_stderr": 0.003411319647719293, "rougeL_fmeasure": 0.13589968933657245, "rougeL_fmeasure_stderr": 0.0029353826676204973, "rougeL_precision": 0.1157116753063807, "rougeL_precision_stderr": 0.003769961491523323, "rougeL_recall": 0.29147344461366753, "rougeL_recall_stderr": 0.00468545606109597, "rougeLsum_fmeasure": 0.13804305933456154, "rougeLsum_fmeasure_stderr": 0.003006886444225815, "rougeLsum_precision": 0.11802827465869996, "rougeLsum_precision_stderr": 0.003850703320853075, "rougeLsum_recall": 0.2933246534286482, "rougeLsum_recall_stderr": 0.00468201951647078}}, "2": {"PALM_prompt": {"bleu": 0.6597997964856303, "bleu_stderr": 0.034728244254522804, "rouge1_fmeasure": 0.18002908296123593, "rouge1_fmeasure_stderr": 0.003968306495814761, "rouge1_precision": 0.16610339248030914, "rouge1_precision_stderr": 0.00507284326160864, "rouge1_recall": 0.3352959906245331, "rouge1_recall_stderr": 0.004999412504809714, "rouge2_fmeasure": 0.09209494204875378, "rouge2_fmeasure_stderr": 0.0026603000241755223, "rouge2_precision": 0.08678039894771569, "rouge2_precision_stderr": 0.003378136673520059, "rouge2_recall": 0.1742610345269654, "rouge2_recall_stderr": 0.0036505372153588563, "rougeL_fmeasure": 0.1614621956023238, "rougeL_fmeasure_stderr": 0.003324273428019681, "rougeL_precision": 0.14649813778279558, "rougeL_precision_stderr": 0.0043379793964259455, "rougeL_recall": 0.31289951361042795, "rougeL_recall_stderr": 0.004601477399031109, "rougeLsum_fmeasure": 0.164808372020027, "rougeLsum_fmeasure_stderr": 0.0034322550025239302, "rougeLsum_precision": 0.1507150810319521, "rougeLsum_precision_stderr": 0.004505163531248335, "rougeLsum_recall": 0.3160265105718741, "rougeLsum_recall_stderr": 0.0046507219012497545}}, "3": {"PALM_prompt": {"bleu": 0.823682659540449, "bleu_stderr": 0.03223216531212538, "rouge1_fmeasure": 0.19910319186529463, "rouge1_fmeasure_stderr": 0.004405170032341835, "rouge1_precision": 0.19000820798830498, "rouge1_precision_stderr": 0.005654761920275743, "rouge1_recall": 0.3542179698381772, "rouge1_recall_stderr": 0.00504611245447937, "rouge2_fmeasure": 0.10424957521252359, "rouge2_fmeasure_stderr": 0.0029705826763534444, "rouge2_precision": 0.1027389531668612, "rouge2_precision_stderr": 0.0038013449699206645, "rouge2_recall": 0.185929248611057, "rouge2_recall_stderr": 0.0037754838215154443, "rougeL_fmeasure": 0.17723376057155618, "rougeL_fmeasure_stderr": 0.0036982270883436853, "rougeL_precision": 0.16748846871920003, "rougeL_precision_stderr": 0.004927644927911881, "rougeL_recall": 0.32704498822538286, "rougeL_recall_stderr": 0.0045579213655376595, "rougeLsum_fmeasure": 0.18174531523702536, "rougeLsum_fmeasure_stderr": 0.003846822902949112, "rougeLsum_precision": 0.17304403574735655, "rougeLsum_precision_stderr": 0.005121196453837608, "rougeLsum_recall": 0.3314123051295653, "rougeLsum_recall_stderr": 0.0046234640987954925}}, "4": {"PALM_prompt": {"bleu": 0.9530919051940064, "bleu_stderr": 0.0577655503873056, "rouge1_fmeasure": 0.2079827747732359, "rouge1_fmeasure_stderr": 0.004259502031408432, "rouge1_precision": 0.19701770449821013, "rouge1_precision_stderr": 0.005404879841926881, "rouge1_recall": 0.36981751550824943, "rouge1_recall_stderr": 0.00493079961155932, "rouge2_fmeasure": 0.10890579060188654, "rouge2_fmeasure_stderr": 0.0029073811099355118, "rouge2_precision": 0.1051003568367592, "rouge2_precision_stderr": 0.0035978375653891058, "rouge2_recall": 0.19524745454962628, "rouge2_recall_stderr": 0.003766208065478121, "rougeL_fmeasure": 0.18418656332394442, "rougeL_fmeasure_stderr": 0.0035517977647474175, "rougeL_precision": 0.17219580163143128, "rougeL_precision_stderr": 0.004627732394600431, "rougeL_recall": 0.34023933463326017, "rougeL_recall_stderr": 0.004497949648052741, "rougeLsum_fmeasure": 0.1893169899986314, "rougeLsum_fmeasure_stderr": 0.003703689468392939, "rougeLsum_precision": 0.17855926303676908, "rougeLsum_precision_stderr": 0.004851287660726657, "rougeLsum_recall": 0.3451827474898279, "rougeLsum_recall_stderr": 0.004540414961524762}}, "5": {"PALM_prompt": {"bleu": 1.018884418225518, "bleu_stderr": 0.06063389090761142, "rouge1_fmeasure": 0.2245014633073898, "rouge1_fmeasure_stderr": 0.004545756965978833, "rouge1_precision": 0.22200655898499053, "rouge1_precision_stderr": 0.005962548162960268, "rouge1_recall": 0.3774826876317441, "rouge1_recall_stderr": 0.004921209540706189, "rouge2_fmeasure": 0.11883663273017754, "rouge2_fmeasure_stderr": 0.003113425284275466, "rouge2_precision": 0.12083626340688242, "rouge2_precision_stderr": 0.004049580076214986, "rouge2_recall": 0.20032221007341308, "rouge2_recall_stderr": 0.0037578095361692466, "rougeL_fmeasure": 0.19808696683914262, "rougeL_fmeasure_stderr": 0.003815012659940734, "rougeL_precision": 0.19385895825070826, "rougeL_precision_stderr": 0.005166002921250581, "rougeL_recall": 0.3463288815796092, "rougeL_recall_stderr": 0.004493315251838996, "rougeLsum_fmeasure": 0.20367651597482986, "rougeLsum_fmeasure_stderr": 0.00397975215283144, "rougeLsum_precision": 0.20080334312524634, "rougeLsum_precision_stderr": 0.00538475664354496, "rougeLsum_recall": 0.3509129204118928, "rougeLsum_recall_stderr": 0.00453081936993939}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.7850233361049752, "bleu_stderr": 0.08520743952252237, "rouge1_fmeasure": 0.09645178040850848, "rouge1_fmeasure_stderr": 0.0023932324184934455, "rouge1_precision": 0.10013364964570891, "rouge1_precision_stderr": 0.0029985387314871463, "rouge1_recall": 0.1289356843926102, "rouge1_recall_stderr": 0.003252442943612275, "rouge2_fmeasure": 0.02327004167912628, "rouge2_fmeasure_stderr": 0.0008647699244277772, "rouge2_precision": 0.021651764261573173, "rouge2_precision_stderr": 0.0008981720810698575, "rouge2_recall": 0.032598865567093065, "rouge2_recall_stderr": 0.0013688978962574704, "rougeL_fmeasure": 0.07529103428773028, "rougeL_fmeasure_stderr": 0.0018177481003977558, "rougeL_precision": 0.0798145635639849, "rougeL_precision_stderr": 0.0025581061221673636, "rougeL_recall": 0.1030194283338092, "rougeL_recall_stderr": 0.0026473619878516663, "rougeLsum_fmeasure": 0.08939472105780072, "rougeLsum_fmeasure_stderr": 0.0022284928823277507, "rougeLsum_precision": 0.09366562084598902, "rougeLsum_precision_stderr": 0.00288003307671834, "rougeLsum_recall": 0.1197163043259789, "rougeLsum_recall_stderr": 0.0030465061402892178}}, "1": {"tldr_en": {"bleu": 2.149034150653557, "bleu_stderr": 0.0880436859723492, "rouge1_fmeasure": 0.1571029592533628, "rouge1_fmeasure_stderr": 0.0019787964900554204, "rouge1_precision": 0.21111776699074944, "rouge1_precision_stderr": 0.0036795176175499007, "rouge1_recall": 0.18105322303497254, "rouge1_recall_stderr": 0.002704526953003645, "rouge2_fmeasure": 0.031435380352664986, "rouge2_fmeasure_stderr": 0.001023452617747968, "rouge2_precision": 0.05017605453973606, "rouge2_precision_stderr": 0.002257054306007703, "rouge2_recall": 0.036689033105982816, "rouge2_recall_stderr": 0.001304908170146679, "rougeL_fmeasure": 0.1212831892222724, "rougeL_fmeasure_stderr": 0.0014852539083793823, "rougeL_precision": 0.1681933611426401, "rougeL_precision_stderr": 0.00320801194833936, "rougeL_recall": 0.14021014034344625, "rougeL_recall_stderr": 0.0020838861579054162, "rougeLsum_fmeasure": 0.14814548337398184, "rougeLsum_fmeasure_stderr": 0.00183959107231936, "rougeLsum_precision": 0.2001981792177172, "rougeLsum_precision_stderr": 0.003545698165430484, "rougeLsum_recall": 0.17075722396717186, "rougeLsum_recall_stderr": 0.0025253030583168973}}, "2": {"tldr_en": {"bleu": 3.2305991931473317, "bleu_stderr": 0.07950533990543071, "rouge1_fmeasure": 0.19819144050955464, "rouge1_fmeasure_stderr": 0.002161181470745133, "rouge1_precision": 0.29659957692420796, "rouge1_precision_stderr": 0.004313993725725908, "rouge1_recall": 0.2126729727290148, "rouge1_recall_stderr": 0.0028286561111436205, "rouge2_fmeasure": 0.05229754159287467, "rouge2_fmeasure_stderr": 0.0012492320842947013, "rouge2_precision": 0.09032858272882092, "rouge2_precision_stderr": 0.0028501839568620016, "rouge2_recall": 0.05443855757508895, "rouge2_recall_stderr": 0.0014238340672832245, "rougeL_fmeasure": 0.1554803027070678, "rougeL_fmeasure_stderr": 0.0016597756677151324, "rougeL_precision": 0.24038327671489826, "rougeL_precision_stderr": 0.0037910715865012022, "rougeL_recall": 0.16660116248057652, "rougeL_recall_stderr": 0.002203413004362285, "rougeLsum_fmeasure": 0.1857602434310385, "rougeLsum_fmeasure_stderr": 0.002016281937901344, "rougeLsum_precision": 0.280082550942368, "rougeLsum_precision_stderr": 0.004156093711309705, "rougeLsum_recall": 0.19923648622767753, "rougeLsum_recall_stderr": 0.002642427103030138}}, "3": {"tldr_en": {"bleu": 2.4191797072532624, "bleu_stderr": 0.05559997787678139, "rouge1_fmeasure": 0.17120594609085021, "rouge1_fmeasure_stderr": 0.0023855136999745593, "rouge1_precision": 0.27191585815512404, "rouge1_precision_stderr": 0.004553033969089403, "rouge1_recall": 0.17785515989361886, "rouge1_recall_stderr": 0.0029945927607217264, "rouge2_fmeasure": 0.04645910000265166, "rouge2_fmeasure_stderr": 0.0012307845898654572, "rouge2_precision": 0.08471260929686349, "rouge2_precision_stderr": 0.0028304634396926127, "rouge2_recall": 0.04775389377827743, "rouge2_recall_stderr": 0.0014249088126759941, "rougeL_fmeasure": 0.13587385417087836, "rougeL_fmeasure_stderr": 0.0018675251007595766, "rougeL_precision": 0.22280436054883324, "rougeL_precision_stderr": 0.003987085100348492, "rougeL_recall": 0.14044912634248938, "rougeL_recall_stderr": 0.0023437321076624205, "rougeLsum_fmeasure": 0.16058553787086796, "rougeLsum_fmeasure_stderr": 0.0022305451173721368, "rougeLsum_precision": 0.2572403556839211, "rougeLsum_precision_stderr": 0.0043942959684568315, "rougeLsum_recall": 0.16664812554605501, "rougeLsum_recall_stderr": 0.002803799009075263}}, "4": {"tldr_en": {"bleu": 0.05821785911961025, "bleu_stderr": 0.006926474108387649, "rouge1_fmeasure": 0.05705943866163425, "rouge1_fmeasure_stderr": 0.002034238951249155, "rouge1_precision": 0.09582438104190298, "rouge1_precision_stderr": 0.0037073724803882197, "rouge1_recall": 0.058712498837730144, "rouge1_recall_stderr": 0.002338440354244753, "rouge2_fmeasure": 0.01548765621583035, "rouge2_fmeasure_stderr": 0.0008491139070832301, "rouge2_precision": 0.030302600329343992, "rouge2_precision_stderr": 0.002020034963991066, "rouge2_recall": 0.01624326433634945, "rouge2_recall_stderr": 0.0010266482066008744, "rougeL_fmeasure": 0.04588756613231766, "rougeL_fmeasure_stderr": 0.0016219219311521358, "rougeL_precision": 0.08030364270664823, "rougeL_precision_stderr": 0.003249592772348841, "rougeL_recall": 0.04709615722253022, "rougeL_recall_stderr": 0.001883208275739314, "rougeLsum_fmeasure": 0.05326890477987492, "rougeLsum_fmeasure_stderr": 0.0018972568367539395, "rougeLsum_precision": 0.09101484836816211, "rougeLsum_precision_stderr": 0.003576586055780137, "rougeLsum_recall": 0.05458046297521363, "rougeLsum_recall_stderr": 0.002174773191043919}}, "5": {"tldr_en": {"bleu": 9.084911261227677e-17, "bleu_stderr": 1.938771872220056e-15, "rouge1_fmeasure": 0.00965877650109712, "rouge1_fmeasure_stderr": 0.0009502280880678353, "rouge1_precision": 0.017339640048071717, "rouge1_precision_stderr": 0.0017914288612189862, "rouge1_recall": 0.009388948695729904, "rouge1_recall_stderr": 0.0009896129220143181, "rouge2_fmeasure": 0.003396689228466778, "rouge2_fmeasure_stderr": 0.00046945115771734425, "rouge2_precision": 0.007695847190713622, "rouge2_precision_stderr": 0.0011846920894618485, "rouge2_recall": 0.0028956408424041136, "rouge2_recall_stderr": 0.00040203067458898954, "rougeL_fmeasure": 0.008167446118792413, "rougeL_fmeasure_stderr": 0.0008056769564050579, "rougeL_precision": 0.015395436111568952, "rougeL_precision_stderr": 0.0016573876664127856, "rougeL_recall": 0.007940994530119429, "rougeL_recall_stderr": 0.0008467405048451344, "rougeLsum_fmeasure": 0.009282795853843133, "rougeLsum_fmeasure_stderr": 0.0009151896700507592, "rougeLsum_precision": 0.01678224909258122, "rougeLsum_precision_stderr": 0.0017500452777960934, "rougeLsum_recall": 0.009052865544143397, "rougeLsum_recall_stderr": 0.0009622730677349403}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.8705138531971153, "bleu_stderr": 0.04867582531264036, "rouge1_fmeasure": 0.03809056028402446, "rouge1_fmeasure_stderr": 0.0016784416647229426, "rouge1_precision": 0.034545848496052654, "rouge1_precision_stderr": 0.001566579723382951, "rouge1_recall": 0.045834929071352386, "rouge1_recall_stderr": 0.002016811830226931, "rouge2_fmeasure": 0.015075003955377725, "rouge2_fmeasure_stderr": 0.0007874787502852935, "rouge2_precision": 0.01357831342305598, "rouge2_precision_stderr": 0.0007278435374694024, "rouge2_recall": 0.01818137390819633, "rouge2_recall_stderr": 0.0009483762899268107, "rougeL_fmeasure": 0.0335161304527499, "rougeL_fmeasure_stderr": 0.0014703931309561242, "rougeL_precision": 0.030361727102885563, "rougeL_precision_stderr": 0.001369965689115654, "rougeL_recall": 0.040364202751189966, "rougeL_recall_stderr": 0.0017680282610923641, "rougeLsum_fmeasure": 0.03475700574859405, "rougeLsum_fmeasure_stderr": 0.0015808773938519858, "rougeLsum_precision": 0.03177258180002162, "rougeLsum_precision_stderr": 0.0014955658234812072, "rougeLsum_recall": 0.04147354656808662, "rougeLsum_recall_stderr": 0.0018699366373118296}}, "1": {"generate_text_restaurant": {"bleu": 12.008758968305228, "bleu_stderr": 0.12564745639649488, "rouge1_fmeasure": 0.46552795049805773, "rouge1_fmeasure_stderr": 0.00235640354584502, "rouge1_precision": 0.5631594638857529, "rouge1_precision_stderr": 0.00324421624132654, "rouge1_recall": 0.4365816544468671, "rouge1_recall_stderr": 0.0030105433233647487, "rouge2_fmeasure": 0.2197193326260251, "rouge2_fmeasure_stderr": 0.0020109962036813298, "rouge2_precision": 0.26937549997021565, "rouge2_precision_stderr": 0.0026440834252656653, "rouge2_recall": 0.2057658676796454, "rouge2_recall_stderr": 0.002143734858738364, "rougeL_fmeasure": 0.335604122613543, "rougeL_fmeasure_stderr": 0.002079991474543142, "rougeL_precision": 0.4087872898136732, "rougeL_precision_stderr": 0.002934069491949776, "rougeL_recall": 0.31377900067912107, "rougeL_recall_stderr": 0.0024396772769032255, "rougeLsum_fmeasure": 0.3790404958358403, "rougeLsum_fmeasure_stderr": 0.0023299555118255666, "rougeLsum_precision": 0.45969831228314867, "rougeLsum_precision_stderr": 0.0031623483646411057, "rougeLsum_recall": 0.3549682540172786, "rougeLsum_recall_stderr": 0.002746427422139494}}, "2": {"generate_text_restaurant": {"bleu": 14.14198056712505, "bleu_stderr": 0.24387528882104717, "rouge1_fmeasure": 0.4878980019648943, "rouge1_fmeasure_stderr": 0.00229400065900204, "rouge1_precision": 0.5832193324685282, "rouge1_precision_stderr": 0.0032787034277368537, "rouge1_recall": 0.4601966943407872, "rouge1_recall_stderr": 0.0029819842123464044, "rouge2_fmeasure": 0.24462503044359898, "rouge2_fmeasure_stderr": 0.0020943947868852283, "rouge2_precision": 0.2960872844323463, "rouge2_precision_stderr": 0.002769000714927548, "rouge2_recall": 0.2309770935674146, "rouge2_recall_stderr": 0.0022965170518954945, "rougeL_fmeasure": 0.3620900980791755, "rougeL_fmeasure_stderr": 0.002113460359566787, "rougeL_precision": 0.4349496583122404, "rougeL_precision_stderr": 0.003015743000538396, "rougeL_recall": 0.34082058480437577, "rougeL_recall_stderr": 0.0025066913309293682, "rougeLsum_fmeasure": 0.406567917534112, "rougeLsum_fmeasure_stderr": 0.0023455647214753387, "rougeLsum_precision": 0.4867174466000412, "rougeLsum_precision_stderr": 0.0032350689341699907, "rougeLsum_recall": 0.38300473271219204, "rougeLsum_recall_stderr": 0.0027921658474823886}}, "3": {"generate_text_restaurant": {"bleu": 14.568494170196605, "bleu_stderr": 0.16235845379384753, "rouge1_fmeasure": 0.48940623430349556, "rouge1_fmeasure_stderr": 0.002237489170426613, "rouge1_precision": 0.580960944087758, "rouge1_precision_stderr": 0.0032426085149854737, "rouge1_recall": 0.462205873383073, "rouge1_recall_stderr": 0.0029025682592738675, "rouge2_fmeasure": 0.24783052219620694, "rouge2_fmeasure_stderr": 0.002089515541436856, "rouge2_precision": 0.29731229285227795, "rouge2_precision_stderr": 0.002737785275611433, "rouge2_recall": 0.23475220291108906, "rouge2_recall_stderr": 0.0023139060262653333, "rougeL_fmeasure": 0.3647405346505457, "rougeL_fmeasure_stderr": 0.0021293095742188914, "rougeL_precision": 0.43459494765721124, "rougeL_precision_stderr": 0.003021280699774259, "rougeL_recall": 0.34401804948389253, "rougeL_recall_stderr": 0.002508909081833447, "rougeLsum_fmeasure": 0.4100565331346358, "rougeLsum_fmeasure_stderr": 0.0023413011884842946, "rougeLsum_precision": 0.4870002916061261, "rougeLsum_precision_stderr": 0.0032257758633302993, "rougeLsum_recall": 0.3872001617504186, "rougeLsum_recall_stderr": 0.0027831042016586783}}, "4": {"generate_text_restaurant": {"bleu": 14.734731736932146, "bleu_stderr": 0.16887071667451783, "rouge1_fmeasure": 0.4894791754818595, "rouge1_fmeasure_stderr": 0.0023005348739165796, "rouge1_precision": 0.5801184894727782, "rouge1_precision_stderr": 0.003304446650411685, "rouge1_recall": 0.46099783072021006, "rouge1_recall_stderr": 0.0028878553497999355, "rouge2_fmeasure": 0.24803460463857877, "rouge2_fmeasure_stderr": 0.00214528331500546, "rouge2_precision": 0.29812075093333956, "rouge2_precision_stderr": 0.002844755419824555, "rouge2_recall": 0.23325171781043572, "rouge2_recall_stderr": 0.0022772307735508773, "rougeL_fmeasure": 0.36343319118608863, "rougeL_fmeasure_stderr": 0.0021424532256284854, "rougeL_precision": 0.4323253611145228, "rougeL_precision_stderr": 0.0030220895477853227, "rougeL_recall": 0.3419042442881764, "rougeL_recall_stderr": 0.0024830197258768685, "rougeLsum_fmeasure": 0.41100693983913, "rougeLsum_fmeasure_stderr": 0.002385212896337812, "rougeLsum_precision": 0.48702805692607404, "rougeLsum_precision_stderr": 0.003251909391956583, "rougeLsum_recall": 0.3871750907489159, "rougeLsum_recall_stderr": 0.002775190310841645}}, "5": {"generate_text_restaurant": {"bleu": 14.417330528707561, "bleu_stderr": 0.23744062587048995, "rouge1_fmeasure": 0.48870465280584424, "rouge1_fmeasure_stderr": 0.002241402168765513, "rouge1_precision": 0.5804014340987179, "rouge1_precision_stderr": 0.0032819804536962584, "rouge1_recall": 0.4582313763378584, "rouge1_recall_stderr": 0.0028210153655625013, "rouge2_fmeasure": 0.24688680363045243, "rouge2_fmeasure_stderr": 0.002112718830613371, "rouge2_precision": 0.2973507676511209, "rouge2_precision_stderr": 0.002822434875050306, "rouge2_recall": 0.23140500698702424, "rouge2_recall_stderr": 0.0022546925569823587, "rougeL_fmeasure": 0.3634152205481943, "rougeL_fmeasure_stderr": 0.002132655133838495, "rougeL_precision": 0.43320881922904375, "rougeL_precision_stderr": 0.0030348704584789053, "rougeL_recall": 0.34027316974490135, "rougeL_recall_stderr": 0.002449545539483717, "rougeLsum_fmeasure": 0.40942633387323074, "rougeLsum_fmeasure_stderr": 0.0023623920101575194, "rougeLsum_precision": 0.48672630521219834, "rougeLsum_precision_stderr": 0.003280235985520434, "rougeLsum_recall": 0.38371077804603543, "rougeLsum_recall_stderr": 0.0027322740126673704}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.6445432935397932, "bleu_stderr": 0.11288880012114288, "rouge1_fmeasure": 0.16563824544624003, "rouge1_fmeasure_stderr": 0.0032253697331514498, "rouge1_precision": 0.11963298530443908, "rouge1_precision_stderr": 0.002368819248238803, "rouge1_recall": 0.28117303100546565, "rouge1_recall_stderr": 0.005529983454772786, "rouge2_fmeasure": 0.03650064845163216, "rouge2_fmeasure_stderr": 0.001407177754346794, "rouge2_precision": 0.025988362314328585, "rouge2_precision_stderr": 0.000997588416820755, "rouge2_recall": 0.06429373809950764, "rouge2_recall_stderr": 0.002624993265784442, "rougeL_fmeasure": 0.12335764205289244, "rougeL_fmeasure_stderr": 0.0023998257893507076, "rougeL_precision": 0.08900328361038783, "rougeL_precision_stderr": 0.0017538328575853156, "rougeL_recall": 0.21035687677527884, "rougeL_recall_stderr": 0.004212998263040422, "rougeLsum_fmeasure": 0.12971536873089445, "rougeLsum_fmeasure_stderr": 0.0026207364705836296, "rougeLsum_precision": 0.09348267589650253, "rougeLsum_precision_stderr": 0.0019094389936997007, "rougeLsum_recall": 0.22162305835248405, "rougeLsum_recall_stderr": 0.004602701558276795}}, "1": {"article_DOC_summary": {"bleu": 1.734128327285966, "bleu_stderr": 0.08924592886370027, "rouge1_fmeasure": 0.20565249149745046, "rouge1_fmeasure_stderr": 0.002804831346629974, "rouge1_precision": 0.16963198767042315, "rouge1_precision_stderr": 0.002937269648695778, "rouge1_recall": 0.30825637395558336, "rouge1_recall_stderr": 0.004370185872392451, "rouge2_fmeasure": 0.042159228682119326, "rouge2_fmeasure_stderr": 0.001590769484466558, "rouge2_precision": 0.03482662251402544, "rouge2_precision_stderr": 0.0014769947445053828, "rouge2_recall": 0.06455952984895309, "rouge2_recall_stderr": 0.0024217677572532996, "rougeL_fmeasure": 0.15284179702621556, "rougeL_fmeasure_stderr": 0.002143177827632909, "rougeL_precision": 0.12598747225029228, "rougeL_precision_stderr": 0.002280349516115101, "rougeL_recall": 0.23023461091477196, "rougeL_recall_stderr": 0.003384529702917665, "rougeLsum_fmeasure": 0.15914388511785377, "rougeLsum_fmeasure_stderr": 0.0022721119916273304, "rougeLsum_precision": 0.1304550598704216, "rougeLsum_precision_stderr": 0.002321857032815001, "rougeLsum_recall": 0.24130012289322367, "rougeLsum_recall_stderr": 0.0037023325680176954}}, "2": {"article_DOC_summary": {"bleu": 2.071346995064759, "bleu_stderr": 0.11437801801926932, "rouge1_fmeasure": 0.22539563476819588, "rouge1_fmeasure_stderr": 0.003133489804867722, "rouge1_precision": 0.22067542118387265, "rouge1_precision_stderr": 0.003859452585001004, "rouge1_recall": 0.27030517058690373, "rouge1_recall_stderr": 0.00391277652919709, "rouge2_fmeasure": 0.047169327493735744, "rouge2_fmeasure_stderr": 0.00199109164400243, "rouge2_precision": 0.047009502675708165, "rouge2_precision_stderr": 0.002173179401977634, "rouge2_recall": 0.05686441658011089, "rouge2_recall_stderr": 0.00247318036490619, "rougeL_fmeasure": 0.16770225679218967, "rougeL_fmeasure_stderr": 0.002496420037294455, "rougeL_precision": 0.1642240699512137, "rougeL_precision_stderr": 0.0030808505625225004, "rougeL_recall": 0.20216628044461474, "rougeL_recall_stderr": 0.0031256639466597388, "rougeLsum_fmeasure": 0.17110619274109168, "rougeLsum_fmeasure_stderr": 0.0025646249178584764, "rougeLsum_precision": 0.16692030908684236, "rougeLsum_precision_stderr": 0.0030981804279163147, "rougeLsum_recall": 0.20768035344064897, "rougeLsum_recall_stderr": 0.0033851444187716706}}, "3": {"article_DOC_summary": {"bleu": 2.318524945908214, "bleu_stderr": 0.08598566221567794, "rouge1_fmeasure": 0.21646380285670908, "rouge1_fmeasure_stderr": 0.003530035532648745, "rouge1_precision": 0.22140615543341866, "rouge1_precision_stderr": 0.004204679986334029, "rouge1_recall": 0.2444906635779359, "rouge1_recall_stderr": 0.004073874459614794, "rouge2_fmeasure": 0.04773019417977251, "rouge2_fmeasure_stderr": 0.0020540531313582152, "rouge2_precision": 0.04952151051084217, "rouge2_precision_stderr": 0.0022666923793748218, "rouge2_recall": 0.0533842351917708, "rouge2_recall_stderr": 0.0023446986110702743, "rougeL_fmeasure": 0.16386346574102725, "rougeL_fmeasure_stderr": 0.0027882855655311897, "rougeL_precision": 0.16807663783660487, "rougeL_precision_stderr": 0.003387549311395567, "rougeL_recall": 0.1859775029237818, "rougeL_recall_stderr": 0.00325443532625321, "rougeLsum_fmeasure": 0.16552783963705708, "rougeLsum_fmeasure_stderr": 0.0028417200102711917, "rougeLsum_precision": 0.16936961306971046, "rougeLsum_precision_stderr": 0.0034063239974255637, "rougeLsum_recall": 0.18841832491605268, "rougeLsum_recall_stderr": 0.003382609617563602}}, "4": {"article_DOC_summary": {"bleu": 0.2202628831493563, "bleu_stderr": 0.05888934471976352, "rouge1_fmeasure": 0.05671539048524648, "rouge1_fmeasure_stderr": 0.003330283869930652, "rouge1_precision": 0.06436084008158685, "rouge1_precision_stderr": 0.004004154969143478, "rouge1_recall": 0.06086433870277819, "rouge1_recall_stderr": 0.0036498255182998273, "rouge2_fmeasure": 0.012384552834017296, "rouge2_fmeasure_stderr": 0.0012566880181981682, "rouge2_precision": 0.014617474767198527, "rouge2_precision_stderr": 0.0017366601516105656, "rouge2_recall": 0.012749228311253753, "rouge2_recall_stderr": 0.0012424948236292898, "rougeL_fmeasure": 0.04272344374969, "rougeL_fmeasure_stderr": 0.002558877704383889, "rougeL_precision": 0.048955624036311166, "rougeL_precision_stderr": 0.003162711183472558, "rougeL_recall": 0.0456391933473409, "rougeL_recall_stderr": 0.002760702511080215, "rougeLsum_fmeasure": 0.04389506319896344, "rougeLsum_fmeasure_stderr": 0.0026138307035845822, "rougeLsum_precision": 0.050043427048033697, "rougeLsum_precision_stderr": 0.0032007871669195633, "rougeLsum_recall": 0.047172374717404854, "rougeLsum_recall_stderr": 0.0028612594448202094}}, "5": {"article_DOC_summary": {"bleu": 3.79693853289103e-57, "bleu_stderr": 1.663506575542054e-51, "rouge1_fmeasure": 0.0019356785814855194, "rouge1_fmeasure_stderr": 0.0005869452676229577, "rouge1_precision": 0.0026890331605918753, "rouge1_precision_stderr": 0.0008483268258212475, "rouge1_recall": 0.0017377836109564373, "rouge1_recall_stderr": 0.0005352032014968291, "rouge2_fmeasure": 9.281741700266572e-05, "rouge2_fmeasure_stderr": 6.607511454084363e-05, "rouge2_precision": 0.00011077758719268152, "rouge2_precision_stderr": 7.83387543031719e-05, "rouge2_recall": 8.063215610385422e-05, "rouge2_recall_stderr": 5.79270034349398e-05, "rougeL_fmeasure": 0.001515345186644524, "rougeL_fmeasure_stderr": 0.000460327548876044, "rougeL_precision": 0.002152537869033098, "rougeL_precision_stderr": 0.0006750309400673729, "rougeL_recall": 0.0013275557055278245, "rougeL_recall_stderr": 0.0004057676471998518, "rougeLsum_fmeasure": 0.001515345186644524, "rougeLsum_fmeasure_stderr": 0.000460327548876044, "rougeLsum_precision": 0.002152537869033098, "rougeLsum_precision_stderr": 0.0006750309400673729, "rougeLsum_recall": 0.0013275557055278245, "rougeLsum_recall_stderr": 0.0004057676471998518}}}}
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.325,0.014818724459095526,0
|
3 |
+
anli_r2,acc,0.341,0.014998131348402709,0
|
4 |
+
anli_r3,acc,0.3258333333333333,0.013535422043417459,0
|
5 |
+
arc_challenge,acc,0.21843003412969283,0.01207429160570098,0
|
6 |
+
arc_challenge,acc_norm,0.2525597269624573,0.012696728980207704,0
|
7 |
+
arc_easy,acc,0.5340909090909091,0.010235908103438685,0
|
8 |
+
arc_easy,acc_norm,0.4882154882154882,0.010256933475911006,0
|
9 |
+
boolq,acc,0.6073394495412844,0.008541161248702906,1
|
10 |
+
cb,acc,0.5178571428571429,0.06737697508644647,1
|
11 |
+
cb,f1,0.3478682170542636,,1
|
12 |
+
copa,acc,0.71,0.04560480215720684,0
|
13 |
+
hellaswag,acc,0.374726150169289,0.004830628620181023,0
|
14 |
+
hellaswag,acc_norm,0.46415056761601275,0.004976939333240077,0
|
15 |
+
piqa,acc,0.7013057671381937,0.010678556398149242,0
|
16 |
+
piqa,acc_norm,0.7105549510337323,0.010581014740675621,0
|
17 |
+
rte,acc,0.5415162454873647,0.029992535385373314,0
|
18 |
+
sciq,acc,0.802,0.012607733934175315,0
|
19 |
+
sciq,acc_norm,0.724,0.014142984975740668,0
|
20 |
+
storycloze_2016,acc,0.6488508818813469,0.011038179124113263,0
|
21 |
+
winogrande,acc,0.5232833464877664,0.01403724130957364,0
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_0_lm-eval_global_step52452_2023-02-13-14-30-07_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.325,
|
5 |
-
"acc_stderr": 0.014818724459095526
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.341,
|
9 |
-
"acc_stderr": 0.014998131348402709
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3258333333333333,
|
13 |
-
"acc_stderr": 0.013535422043417459
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5178571428571429,
|
17 |
-
"acc_stderr": 0.06737697508644647,
|
18 |
-
"f1": 0.3478682170542636
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.71,
|
22 |
-
"acc_stderr": 0.04560480215720684
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.374726150169289,
|
26 |
-
"acc_stderr": 0.004830628620181023,
|
27 |
-
"acc_norm": 0.46415056761601275,
|
28 |
-
"acc_norm_stderr": 0.004976939333240077
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5415162454873647,
|
32 |
-
"acc_stderr": 0.029992535385373314
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5232833464877664,
|
36 |
-
"acc_stderr": 0.01403724130957364
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6488508818813469,
|
40 |
-
"acc_stderr": 0.011038179124113263
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6073394495412844,
|
44 |
-
"acc_stderr": 0.008541161248702906
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5340909090909091,
|
48 |
-
"acc_stderr": 0.010235908103438685,
|
49 |
-
"acc_norm": 0.4882154882154882,
|
50 |
-
"acc_norm_stderr": 0.010256933475911006
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.21843003412969283,
|
54 |
-
"acc_stderr": 0.01207429160570098,
|
55 |
-
"acc_norm": 0.2525597269624573,
|
56 |
-
"acc_norm_stderr": 0.012696728980207704
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.802,
|
60 |
-
"acc_stderr": 0.012607733934175315,
|
61 |
-
"acc_norm": 0.724,
|
62 |
-
"acc_norm_stderr": 0.014142984975740668
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7013057671381937,
|
66 |
-
"acc_stderr": 0.010678556398149242,
|
67 |
-
"acc_norm": 0.7105549510337323,
|
68 |
-
"acc_norm_stderr": 0.010581014740675621
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.329,0.01486539538592836,0
|
3 |
+
anli_r2,acc,0.323,0.014794927843348639,0
|
4 |
+
anli_r3,acc,0.345,0.013728421539454876,0
|
5 |
+
arc_challenge,acc,0.2440273037542662,0.012551447627856255,0
|
6 |
+
arc_challenge,acc_norm,0.25426621160409557,0.012724999945157736,0
|
7 |
+
arc_easy,acc,0.5391414141414141,0.01022829820076613,0
|
8 |
+
arc_easy,acc_norm,0.5244107744107744,0.010247548905242272,0
|
9 |
+
boolq,acc,0.5850152905198777,0.008617716361921567,1
|
10 |
+
cb,acc,0.5,0.06741998624632421,1
|
11 |
+
cb,f1,0.35057471264367807,,1
|
12 |
+
copa,acc,0.72,0.04512608598542127,0
|
13 |
+
hellaswag,acc,0.3695478988249353,0.0048169588177260836,0
|
14 |
+
hellaswag,acc_norm,0.4667396932881896,0.0049787293000748915,0
|
15 |
+
piqa,acc,0.70620239390642,0.0106275740805148,0
|
16 |
+
piqa,acc_norm,0.7040261153427638,0.010650414317148131,0
|
17 |
+
rte,acc,0.5451263537906137,0.029973636495415252,0
|
18 |
+
sciq,acc,0.861,0.010945263761042962,0
|
19 |
+
sciq,acc_norm,0.843,0.011510146979230187,0
|
20 |
+
storycloze_2016,acc,0.6386958845537146,0.011108686479432282,0
|
21 |
+
winogrande,acc,0.5272296764009471,0.014031631629827696,0
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_1_lm-eval_global_step52452_2023-02-13-14-30-07_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.329,
|
5 |
-
"acc_stderr": 0.01486539538592836
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.323,
|
9 |
-
"acc_stderr": 0.014794927843348639
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.345,
|
13 |
-
"acc_stderr": 0.013728421539454876
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5,
|
17 |
-
"acc_stderr": 0.06741998624632421,
|
18 |
-
"f1": 0.35057471264367807
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.72,
|
22 |
-
"acc_stderr": 0.04512608598542127
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3695478988249353,
|
26 |
-
"acc_stderr": 0.0048169588177260836,
|
27 |
-
"acc_norm": 0.4667396932881896,
|
28 |
-
"acc_norm_stderr": 0.0049787293000748915
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5451263537906137,
|
32 |
-
"acc_stderr": 0.029973636495415252
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5272296764009471,
|
36 |
-
"acc_stderr": 0.014031631629827696
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6386958845537146,
|
40 |
-
"acc_stderr": 0.011108686479432282
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5850152905198777,
|
44 |
-
"acc_stderr": 0.008617716361921567
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5391414141414141,
|
48 |
-
"acc_stderr": 0.01022829820076613,
|
49 |
-
"acc_norm": 0.5244107744107744,
|
50 |
-
"acc_norm_stderr": 0.010247548905242272
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2440273037542662,
|
54 |
-
"acc_stderr": 0.012551447627856255,
|
55 |
-
"acc_norm": 0.25426621160409557,
|
56 |
-
"acc_norm_stderr": 0.012724999945157736
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.861,
|
60 |
-
"acc_stderr": 0.010945263761042962,
|
61 |
-
"acc_norm": 0.843,
|
62 |
-
"acc_norm_stderr": 0.011510146979230187
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.70620239390642,
|
66 |
-
"acc_stderr": 0.0106275740805148,
|
67 |
-
"acc_norm": 0.7040261153427638,
|
68 |
-
"acc_norm_stderr": 0.010650414317148131
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.323,0.014794927843348642,0
|
3 |
+
anli_r2,acc,0.337,0.014955087918653602,0
|
4 |
+
anli_r3,acc,0.32,0.013471620929769144,0
|
5 |
+
arc_challenge,acc,0.24744027303754265,0.01261035266329267,0
|
6 |
+
arc_challenge,acc_norm,0.26706484641638223,0.012928933196496349,0
|
7 |
+
arc_easy,acc,0.5505050505050505,0.010207308833916037,0
|
8 |
+
arc_easy,acc_norm,0.5340909090909091,0.010235908103438688,0
|
9 |
+
boolq,acc,0.5847094801223242,0.008618637526341675,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.28200928200928205,,1
|
12 |
+
copa,acc,0.7,0.046056618647183814,0
|
13 |
+
hellaswag,acc,0.369946225851424,0.004818031396138923,0
|
14 |
+
hellaswag,acc_norm,0.4690300736904999,0.00498020045185168,0
|
15 |
+
piqa,acc,0.7154515778019587,0.010527218464130614,0
|
16 |
+
piqa,acc_norm,0.705658324265506,0.010633311470347509,0
|
17 |
+
rte,acc,0.5270758122743683,0.030052303463143713,0
|
18 |
+
sciq,acc,0.892,0.009820001651345693,0
|
19 |
+
sciq,acc_norm,0.872,0.010570133761108668,0
|
20 |
+
storycloze_2016,acc,0.6376269374665954,0.011115793699210296,0
|
21 |
+
winogrande,acc,0.5382794001578532,0.01401124259496412,0
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_2_lm-eval_global_step52452_2023-02-13-14-30-07_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.323,
|
5 |
-
"acc_stderr": 0.014794927843348642
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.337,
|
9 |
-
"acc_stderr": 0.014955087918653602
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32,
|
13 |
-
"acc_stderr": 0.013471620929769144
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.28200928200928205
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.7,
|
22 |
-
"acc_stderr": 0.046056618647183814
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.369946225851424,
|
26 |
-
"acc_stderr": 0.004818031396138923,
|
27 |
-
"acc_norm": 0.4690300736904999,
|
28 |
-
"acc_norm_stderr": 0.00498020045185168
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5270758122743683,
|
32 |
-
"acc_stderr": 0.030052303463143713
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5382794001578532,
|
36 |
-
"acc_stderr": 0.01401124259496412
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6376269374665954,
|
40 |
-
"acc_stderr": 0.011115793699210296
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5847094801223242,
|
44 |
-
"acc_stderr": 0.008618637526341675
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5505050505050505,
|
48 |
-
"acc_stderr": 0.010207308833916037,
|
49 |
-
"acc_norm": 0.5340909090909091,
|
50 |
-
"acc_norm_stderr": 0.010235908103438688
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24744027303754265,
|
54 |
-
"acc_stderr": 0.01261035266329267,
|
55 |
-
"acc_norm": 0.26706484641638223,
|
56 |
-
"acc_norm_stderr": 0.012928933196496349
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.892,
|
60 |
-
"acc_stderr": 0.009820001651345693,
|
61 |
-
"acc_norm": 0.872,
|
62 |
-
"acc_norm_stderr": 0.010570133761108668
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7154515778019587,
|
66 |
-
"acc_stderr": 0.010527218464130614,
|
67 |
-
"acc_norm": 0.705658324265506,
|
68 |
-
"acc_norm_stderr": 0.010633311470347509
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.341,0.014998131348402706,0
|
3 |
+
anli_r2,acc,0.35,0.015090650341444233,0
|
4 |
+
anli_r3,acc,0.32666666666666666,0.013544340907003663,0
|
5 |
+
arc_challenge,acc,0.24061433447098976,0.012491468532390571,0
|
6 |
+
arc_challenge,acc_norm,0.2721843003412969,0.013006600406423706,0
|
7 |
+
arc_easy,acc,0.5509259259259259,0.010206428316323369,0
|
8 |
+
arc_easy,acc_norm,0.5366161616161617,0.01023223506393303,0
|
9 |
+
boolq,acc,0.5868501529051988,0.00861211754780359,1
|
10 |
+
cb,acc,0.5178571428571429,0.06737697508644648,1
|
11 |
+
cb,f1,0.41940672576964805,,1
|
12 |
+
copa,acc,0.72,0.045126085985421276,0
|
13 |
+
hellaswag,acc,0.3732324238199562,0.00482674616083019,0
|
14 |
+
hellaswag,acc_norm,0.46883091017725553,0.004980076707392432,0
|
15 |
+
piqa,acc,0.70620239390642,0.010627574080514797,0
|
16 |
+
piqa,acc_norm,0.7040261153427638,0.010650414317148131,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.887,0.010016552866696858,0
|
19 |
+
sciq,acc_norm,0.877,0.010391293421849877,0
|
20 |
+
storycloze_2016,acc,0.6424371993586317,0.011083341168827785,0
|
21 |
+
winogrande,acc,0.5382794001578532,0.014011242594964118,0
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_3.json
CHANGED
@@ -42,6 +42,30 @@
|
|
42 |
"boolq": {
|
43 |
"acc": 0.5868501529051988,
|
44 |
"acc_stderr": 0.00861211754780359
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
}
|
46 |
},
|
47 |
"versions": {
|
@@ -54,6 +78,10 @@
|
|
54 |
"rte": 0,
|
55 |
"winogrande": 0,
|
56 |
"storycloze_2016": 0,
|
57 |
-
"boolq": 1
|
|
|
|
|
|
|
|
|
58 |
}
|
59 |
}
|
|
|
42 |
"boolq": {
|
43 |
"acc": 0.5868501529051988,
|
44 |
"acc_stderr": 0.00861211754780359
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.5509259259259259,
|
48 |
+
"acc_stderr": 0.010206428316323369,
|
49 |
+
"acc_norm": 0.5366161616161617,
|
50 |
+
"acc_norm_stderr": 0.01023223506393303
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.24061433447098976,
|
54 |
+
"acc_stderr": 0.012491468532390571,
|
55 |
+
"acc_norm": 0.2721843003412969,
|
56 |
+
"acc_norm_stderr": 0.013006600406423706
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.887,
|
60 |
+
"acc_stderr": 0.010016552866696858,
|
61 |
+
"acc_norm": 0.877,
|
62 |
+
"acc_norm_stderr": 0.010391293421849877
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.70620239390642,
|
66 |
+
"acc_stderr": 0.010627574080514797,
|
67 |
+
"acc_norm": 0.7040261153427638,
|
68 |
+
"acc_norm_stderr": 0.010650414317148131
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_3_lm-eval_global_step52452_2023-02-13-14-30-07_3shots_backup.json
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.341,
|
5 |
-
"acc_stderr": 0.014998131348402706
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.35,
|
9 |
-
"acc_stderr": 0.015090650341444233
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32666666666666666,
|
13 |
-
"acc_stderr": 0.013544340907003663
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5178571428571429,
|
17 |
-
"acc_stderr": 0.06737697508644648,
|
18 |
-
"f1": 0.41940672576964805
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.72,
|
22 |
-
"acc_stderr": 0.045126085985421276
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3732324238199562,
|
26 |
-
"acc_stderr": 0.00482674616083019,
|
27 |
-
"acc_norm": 0.46883091017725553,
|
28 |
-
"acc_norm_stderr": 0.004980076707392432
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.03006330041190266
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5382794001578532,
|
36 |
-
"acc_stderr": 0.014011242594964118
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6424371993586317,
|
40 |
-
"acc_stderr": 0.011083341168827785
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5868501529051988,
|
44 |
-
"acc_stderr": 0.00861211754780359
|
45 |
-
}
|
46 |
-
},
|
47 |
-
"versions": {
|
48 |
-
"anli_r1": 0,
|
49 |
-
"anli_r2": 0,
|
50 |
-
"anli_r3": 0,
|
51 |
-
"cb": 1,
|
52 |
-
"copa": 0,
|
53 |
-
"hellaswag": 0,
|
54 |
-
"rte": 0,
|
55 |
-
"winogrande": 0,
|
56 |
-
"storycloze_2016": 0,
|
57 |
-
"boolq": 1
|
58 |
-
}
|
59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.35,0.015090650341444233,0
|
3 |
+
anli_r2,acc,0.329,0.014865395385928359,0
|
4 |
+
anli_r3,acc,0.335,0.013630871843821474,0
|
5 |
+
arc_challenge,acc,0.23293515358361774,0.012352507042617396,0
|
6 |
+
arc_challenge,acc_norm,0.2790102389078498,0.01310678488360134,0
|
7 |
+
arc_easy,acc,0.547979797979798,0.010212436978834099,0
|
8 |
+
arc_easy,acc_norm,0.5412457912457912,0.010224815730255816,0
|
9 |
+
boolq,acc,0.591743119266055,0.008596583869583202,1
|
10 |
+
cb,acc,0.4642857142857143,0.0672477765493766,1
|
11 |
+
cb,f1,0.2842465753424657,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.3746265684126668,0.004830371317841071,0
|
14 |
+
hellaswag,acc_norm,0.4667396932881896,0.004978729300074892,0
|
15 |
+
piqa,acc,0.7110990206746464,0.010575111841364905,0
|
16 |
+
piqa,acc_norm,0.7170837867247007,0.010508949177489676,0
|
17 |
+
rte,acc,0.4657039711191336,0.030025579819366422,0
|
18 |
+
sciq,acc,0.894,0.009739551265785134,0
|
19 |
+
sciq,acc_norm,0.892,0.009820001651345688,0
|
20 |
+
storycloze_2016,acc,0.6467129877071085,0.011053474766125627,0
|
21 |
+
winogrande,acc,0.5414364640883977,0.014004146853791914,0
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_4.json
CHANGED
@@ -38,6 +38,34 @@
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6467129877071085,
|
40 |
"acc_stderr": 0.011053474766125627
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
}
|
42 |
},
|
43 |
"versions": {
|
@@ -49,6 +77,11 @@
|
|
49 |
"hellaswag": 0,
|
50 |
"rte": 0,
|
51 |
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6467129877071085,
|
40 |
"acc_stderr": 0.011053474766125627
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.591743119266055,
|
44 |
+
"acc_stderr": 0.008596583869583202
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.547979797979798,
|
48 |
+
"acc_stderr": 0.010212436978834099,
|
49 |
+
"acc_norm": 0.5412457912457912,
|
50 |
+
"acc_norm_stderr": 0.010224815730255816
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.23293515358361774,
|
54 |
+
"acc_stderr": 0.012352507042617396,
|
55 |
+
"acc_norm": 0.2790102389078498,
|
56 |
+
"acc_norm_stderr": 0.01310678488360134
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.894,
|
60 |
+
"acc_stderr": 0.009739551265785134,
|
61 |
+
"acc_norm": 0.892,
|
62 |
+
"acc_norm_stderr": 0.009820001651345688
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7110990206746464,
|
66 |
+
"acc_stderr": 0.010575111841364905,
|
67 |
+
"acc_norm": 0.7170837867247007,
|
68 |
+
"acc_norm_stderr": 0.010508949177489676
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_4_lm-eval_global_step52452_2023-02-13-14-30-07_4shots_backup.json
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.35,
|
5 |
-
"acc_stderr": 0.015090650341444233
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.329,
|
9 |
-
"acc_stderr": 0.014865395385928359
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.335,
|
13 |
-
"acc_stderr": 0.013630871843821474
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4642857142857143,
|
17 |
-
"acc_stderr": 0.0672477765493766,
|
18 |
-
"f1": 0.2842465753424657
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3746265684126668,
|
26 |
-
"acc_stderr": 0.004830371317841071,
|
27 |
-
"acc_norm": 0.4667396932881896,
|
28 |
-
"acc_norm_stderr": 0.004978729300074892
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4657039711191336,
|
32 |
-
"acc_stderr": 0.030025579819366422
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5414364640883977,
|
36 |
-
"acc_stderr": 0.014004146853791914
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6467129877071085,
|
40 |
-
"acc_stderr": 0.011053474766125627
|
41 |
-
}
|
42 |
-
},
|
43 |
-
"versions": {
|
44 |
-
"anli_r1": 0,
|
45 |
-
"anli_r2": 0,
|
46 |
-
"anli_r3": 0,
|
47 |
-
"cb": 1,
|
48 |
-
"copa": 0,
|
49 |
-
"hellaswag": 0,
|
50 |
-
"rte": 0,
|
51 |
-
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
53 |
-
}
|
54 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.33,0.01487687202745673,0
|
3 |
+
anli_r2,acc,0.319,0.014746404865473484,0
|
4 |
+
anli_r3,acc,0.335,0.01363087184382147,0
|
5 |
+
arc_challenge,acc,0.25,0.012653835621466646,0
|
6 |
+
arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0
|
7 |
+
arc_easy,acc,0.5467171717171717,0.01021490151673162,0
|
8 |
+
arc_easy,acc_norm,0.5458754208754208,0.010216507710244115,0
|
9 |
+
boolq,acc,0.5880733944954128,0.008608316516029646,1
|
10 |
+
cb,acc,0.5714285714285714,0.06672848092813058,1
|
11 |
+
cb,f1,0.44974910394265244,,1
|
12 |
+
copa,acc,0.72,0.04512608598542128,0
|
13 |
+
hellaswag,acc,0.37273451503684524,0.004825441080261174,0
|
14 |
+
hellaswag,acc_norm,0.46922923720374426,0.004980323400031081,0
|
15 |
+
piqa,acc,0.7067464635473341,0.010621818421101926,0
|
16 |
+
piqa,acc_norm,0.7094668117519043,0.010592765034696534,0
|
17 |
+
rte,acc,0.5018050541516246,0.030096267148976626,0
|
18 |
+
sciq,acc,0.893,0.009779910359847167,0
|
19 |
+
sciq,acc_norm,0.898,0.009575368801653866,0
|
20 |
+
storycloze_2016,acc,0.6456440406199893,0.011061031791615487,0
|
21 |
+
winogrande,acc,0.5374901341752171,0.014012928183336578,0
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_5.json
CHANGED
@@ -20,6 +20,52 @@
|
|
20 |
"copa": {
|
21 |
"acc": 0.72,
|
22 |
"acc_stderr": 0.04512608598542128
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
}
|
24 |
},
|
25 |
"versions": {
|
@@ -27,6 +73,15 @@
|
|
27 |
"anli_r2": 0,
|
28 |
"anli_r3": 0,
|
29 |
"cb": 1,
|
30 |
-
"copa": 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
}
|
32 |
}
|
|
|
20 |
"copa": {
|
21 |
"acc": 0.72,
|
22 |
"acc_stderr": 0.04512608598542128
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.37273451503684524,
|
26 |
+
"acc_stderr": 0.004825441080261174,
|
27 |
+
"acc_norm": 0.46922923720374426,
|
28 |
+
"acc_norm_stderr": 0.004980323400031081
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5018050541516246,
|
32 |
+
"acc_stderr": 0.030096267148976626
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5374901341752171,
|
36 |
+
"acc_stderr": 0.014012928183336578
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.6456440406199893,
|
40 |
+
"acc_stderr": 0.011061031791615487
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5880733944954128,
|
44 |
+
"acc_stderr": 0.008608316516029646
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.5467171717171717,
|
48 |
+
"acc_stderr": 0.01021490151673162,
|
49 |
+
"acc_norm": 0.5458754208754208,
|
50 |
+
"acc_norm_stderr": 0.010216507710244115
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.25,
|
54 |
+
"acc_stderr": 0.012653835621466646,
|
55 |
+
"acc_norm": 0.2832764505119454,
|
56 |
+
"acc_norm_stderr": 0.013167478735134575
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.893,
|
60 |
+
"acc_stderr": 0.009779910359847167,
|
61 |
+
"acc_norm": 0.898,
|
62 |
+
"acc_norm_stderr": 0.009575368801653866
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7067464635473341,
|
66 |
+
"acc_stderr": 0.010621818421101926,
|
67 |
+
"acc_norm": 0.7094668117519043,
|
68 |
+
"acc_norm_stderr": 0.010592765034696534
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
73 |
"anli_r2": 0,
|
74 |
"anli_r3": 0,
|
75 |
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed1/evaluation/rankeval/2b855b55boscarseed1_5_lm-eval_global_step52452_2023-02-13-14-30-07_5shots_backup.json
DELETED
@@ -1,32 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.33,
|
5 |
-
"acc_stderr": 0.01487687202745673
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.319,
|
9 |
-
"acc_stderr": 0.014746404865473484
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.335,
|
13 |
-
"acc_stderr": 0.01363087184382147
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5714285714285714,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.44974910394265244
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.72,
|
22 |
-
"acc_stderr": 0.04512608598542128
|
23 |
-
}
|
24 |
-
},
|
25 |
-
"versions": {
|
26 |
-
"anli_r1": 0,
|
27 |
-
"anli_r2": 0,
|
28 |
-
"anli_r3": 0,
|
29 |
-
"cb": 1,
|
30 |
-
"copa": 0
|
31 |
-
}
|
32 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed2/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.01536092726240684
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.01536092726240684
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21213479459536141
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21213479459536141
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2341310438118546
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2341310438118546
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24305694094335184
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.24305694094335184
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.24457201060976938
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.24457201060976938
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24829610593304763
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24829610593304763
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.19959197052596528
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03387029006729459
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.03387029006729459
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.041673153288058665
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.041673153288058665
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04639087859085017
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.04639087859085017
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04306159259575956
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.04306159259575956
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.011538671061568423
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.011538671061568423
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00019482690735564718
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,0.00019482690735564718
|
27 |
+
gem_xsum,5,average,multiple,0.02945490208514784
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05177887034670445
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.05177887034670445
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07646296897485043
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.07646296897485043
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09686673325919606
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.09686673325919606
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10785897009616782
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.10785897009616782
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10969476589024942
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.10969476589024942
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11818892524508536
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.11818892524508536
|
40 |
+
web_nlg_en,5,average,multiple,0.09347520563537559
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.028664153276260776
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.028664153276260776
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.033014749953558985
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.033014749953558985
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.057107197307562536
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.057107197307562536
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04908335278882542
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.04908335278882542
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01586522511044624
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01586522511044624
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024499806068106226
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0024499806068106226
|
53 |
+
wiki_lingua_en,5,average,multiple,0.031030776507244097
|
2b855b55boscarseed2/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3874653621452948, "bleu_stderr": 0.03723542414830446, "rouge1_fmeasure": 0.1140383770287106, "rouge1_fmeasure_stderr": 0.0022130882625086713, "rouge1_precision": 0.07924120193077218, "rouge1_precision_stderr": 0.0020339612048906955, "rouge1_recall": 0.30832426939003943, "rouge1_recall_stderr": 0.005373189440858598, "rouge2_fmeasure": 0.05177887034670445, "rouge2_fmeasure_stderr": 0.0013077433915666854, "rouge2_precision": 0.03549078256911552, "rouge2_precision_stderr": 0.0012342524299106018, "rouge2_recall": 0.14419042385916486, "rouge2_recall_stderr": 0.0033307772208794675, "rougeL_fmeasure": 0.10885502560700408, "rougeL_fmeasure_stderr": 0.0020560929299673985, "rougeL_precision": 0.07533765079946074, "rougeL_precision_stderr": 0.0018908789584572414, "rougeL_recall": 0.29736676086786396, "rougeL_recall_stderr": 0.00519056968040173, "rougeLsum_fmeasure": 0.10656318707161754, "rougeLsum_fmeasure_stderr": 0.002059972579462119, "rougeLsum_precision": 0.07432576889319907, "rougeLsum_precision_stderr": 0.0019431481640472651, "rougeLsum_recall": 0.2880250733368785, "rougeLsum_recall_stderr": 0.004941073276117669}}, "1": {"PALM_prompt": {"bleu": 0.5020545893819387, "bleu_stderr": 0.03350584724315388, "rouge1_fmeasure": 0.15905776387607035, "rouge1_fmeasure_stderr": 0.0035245313721274083, "rouge1_precision": 0.13813597303100697, "rouge1_precision_stderr": 0.004343575234469626, "rouge1_recall": 0.313939693535177, "rouge1_recall_stderr": 0.004834625843692428, "rouge2_fmeasure": 0.07646296897485043, "rouge2_fmeasure_stderr": 0.0022702110297491826, "rouge2_precision": 0.06858927457884743, "rouge2_precision_stderr": 0.002958291745513936, "rouge2_recall": 0.15381670240011833, "rouge2_recall_stderr": 0.0033293044618979994, "rougeL_fmeasure": 0.1447493827945491, "rougeL_fmeasure_stderr": 0.0030370089229668153, "rougeL_precision": 0.1243830823397838, "rougeL_precision_stderr": 0.0038611095047569543, "rougeL_recall": 0.2942884315802642, "rougeL_recall_stderr": 0.004464599540458812, "rougeLsum_fmeasure": 0.1470618072831102, "rougeLsum_fmeasure_stderr": 0.003105657904059638, "rougeLsum_precision": 0.12701853538684826, "rougeLsum_precision_stderr": 0.003954181779126239, "rougeLsum_recall": 0.29696884678328234, "rougeLsum_recall_stderr": 0.004497750513527319}}, "2": {"PALM_prompt": {"bleu": 0.6474595663852917, "bleu_stderr": 0.033749261818264216, "rouge1_fmeasure": 0.18919345304479693, "rouge1_fmeasure_stderr": 0.004020906539943112, "rouge1_precision": 0.16682426470303552, "rouge1_precision_stderr": 0.004904035191755809, "rouge1_recall": 0.35518992720321907, "rouge1_recall_stderr": 0.004850634911987141, "rouge2_fmeasure": 0.09686673325919606, "rouge2_fmeasure_stderr": 0.0027323521411920545, "rouge2_precision": 0.08647846859568918, "rouge2_precision_stderr": 0.0031877568766792075, "rouge2_recall": 0.18512271044410877, "rouge2_recall_stderr": 0.0036273314316476895, "rougeL_fmeasure": 0.16989290956803452, "rougeL_fmeasure_stderr": 0.0034200883376219672, "rougeL_precision": 0.1474814848454657, "rougeL_precision_stderr": 0.004204053910516139, "rougeL_recall": 0.3299647242946164, "rougeL_recall_stderr": 0.004464709102476085, "rougeLsum_fmeasure": 0.17258237312480734, "rougeLsum_fmeasure_stderr": 0.003498237968429449, "rougeLsum_precision": 0.1505215372888856, "rougeLsum_precision_stderr": 0.004312762058101885, "rougeLsum_recall": 0.33271084790657673, "rougeLsum_recall_stderr": 0.004485077981721643}}, "3": {"PALM_prompt": {"bleu": 0.7974101000388205, "bleu_stderr": 0.035300995555352134, "rouge1_fmeasure": 0.20539523785240246, "rouge1_fmeasure_stderr": 0.0043441834312713794, "rouge1_precision": 0.19185041972546665, "rouge1_precision_stderr": 0.005538584270396667, "rouge1_recall": 0.3632943877982833, "rouge1_recall_stderr": 0.004901577309937019, "rouge2_fmeasure": 0.10785897009616782, "rouge2_fmeasure_stderr": 0.002976531129127704, "rouge2_precision": 0.10304352194037955, "rouge2_precision_stderr": 0.003729359321688117, "rouge2_recall": 0.19246399337065445, "rouge2_recall_stderr": 0.003792863001786476, "rougeL_fmeasure": 0.1832800576071327, "rougeL_fmeasure_stderr": 0.003685139906384128, "rougeL_precision": 0.1684332660334212, "rougeL_precision_stderr": 0.004768794698575043, "rougeL_recall": 0.33651638026223396, "rougeL_recall_stderr": 0.004495083026483739, "rougeLsum_fmeasure": 0.18724164222590742, "rougeLsum_fmeasure_stderr": 0.0038138315445584562, "rougeLsum_precision": 0.17356177830514777, "rougeLsum_precision_stderr": 0.004960226025778066, "rougeLsum_recall": 0.33988666777362586, "rougeLsum_recall_stderr": 0.004521495013562381}}, "4": {"PALM_prompt": {"bleu": 0.856187036773294, "bleu_stderr": 0.04208566609468901, "rouge1_fmeasure": 0.212854782343808, "rouge1_fmeasure_stderr": 0.004174314070051531, "rouge1_precision": 0.1941558979984076, "rouge1_precision_stderr": 0.005202551258696686, "rouge1_recall": 0.3806245354946727, "rouge1_recall_stderr": 0.004829195346257543, "rouge2_fmeasure": 0.10969476589024942, "rouge2_fmeasure_stderr": 0.0027731900950575474, "rouge2_precision": 0.10199546993794886, "rouge2_precision_stderr": 0.0033928104800875873, "rouge2_recall": 0.19965936669757708, "rouge2_recall_stderr": 0.003703551341074504, "rougeL_fmeasure": 0.18764372619171368, "rougeL_fmeasure_stderr": 0.0034255366263522004, "rougeL_precision": 0.1683634546629711, "rougeL_precision_stderr": 0.0043566832823716635, "rougeL_recall": 0.3488728378700193, "rougeL_recall_stderr": 0.004348803789084399, "rougeLsum_fmeasure": 0.19274181890169947, "rougeLsum_fmeasure_stderr": 0.0035913856572666724, "rougeLsum_precision": 0.17464962919276103, "rougeLsum_precision_stderr": 0.004600388675331363, "rougeLsum_recall": 0.3540949550262167, "rougeLsum_recall_stderr": 0.004425708982720254}}, "5": {"PALM_prompt": {"bleu": 0.9506007572156446, "bleu_stderr": 0.04343708104997622, "rouge1_fmeasure": 0.22311633980185275, "rouge1_fmeasure_stderr": 0.004346060849945347, "rouge1_precision": 0.21379476324957591, "rouge1_precision_stderr": 0.005726509093593779, "rouge1_recall": 0.38096154342423794, "rouge1_recall_stderr": 0.004858836768304506, "rouge2_fmeasure": 0.11818892524508536, "rouge2_fmeasure_stderr": 0.00295085514697514, "rouge2_precision": 0.11715119839250678, "rouge2_precision_stderr": 0.003890940240399591, "rouge2_recall": 0.20363784223133286, "rouge2_recall_stderr": 0.0037762362083992823, "rougeL_fmeasure": 0.19809568277405426, "rougeL_fmeasure_stderr": 0.003676881205999463, "rougeL_precision": 0.18686697869726804, "rougeL_precision_stderr": 0.00491293618971321, "rougeL_recall": 0.3511228954551617, "rougeL_recall_stderr": 0.004496699468485909, "rougeLsum_fmeasure": 0.20382788462421034, "rougeLsum_fmeasure_stderr": 0.0038417909033861986, "rougeLsum_precision": 0.19426453812080635, "rougeLsum_precision_stderr": 0.0051801806590510155, "rougeLsum_recall": 0.3562973317382755, "rougeLsum_recall_stderr": 0.004543508679359547}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.4474573358567544, "bleu_stderr": 0.16056028474727593, "rouge1_fmeasure": 0.11185950681616857, "rouge1_fmeasure_stderr": 0.0024820754785304356, "rouge1_precision": 0.11066253706914085, "rouge1_precision_stderr": 0.002928910263269492, "rouge1_recall": 0.1523983253994007, "rouge1_recall_stderr": 0.003399666506325113, "rouge2_fmeasure": 0.028664153276260776, "rouge2_fmeasure_stderr": 0.0009869120448643051, "rouge2_precision": 0.026167327632978802, "rouge2_precision_stderr": 0.0009590721493005132, "rouge2_recall": 0.039605087620781305, "rouge2_recall_stderr": 0.0014476441062954658, "rougeL_fmeasure": 0.08843546851316388, "rougeL_fmeasure_stderr": 0.0019092354099808754, "rougeL_precision": 0.08866909900482028, "rougeL_precision_stderr": 0.0024688540819133784, "rougeL_recall": 0.12260304933149747, "rougeL_recall_stderr": 0.002735237769190538, "rougeLsum_fmeasure": 0.10382759172597593, "rougeLsum_fmeasure_stderr": 0.00231759949718986, "rougeLsum_precision": 0.10335856371931597, "rougeLsum_precision_stderr": 0.002800320206450295, "rougeLsum_recall": 0.14173554566197402, "rougeLsum_recall_stderr": 0.0031924599240493227}}, "1": {"tldr_en": {"bleu": 2.234222113045526, "bleu_stderr": 0.11399020845442871, "rouge1_fmeasure": 0.16361987434365396, "rouge1_fmeasure_stderr": 0.0020659560046142464, "rouge1_precision": 0.21131636140922483, "rouge1_precision_stderr": 0.0035337158034419104, "rouge1_recall": 0.1866870269167913, "rouge1_recall_stderr": 0.00282099186687827, "rouge2_fmeasure": 0.033014749953558985, "rouge2_fmeasure_stderr": 0.0011114874793354045, "rouge2_precision": 0.0478517770664011, "rouge2_precision_stderr": 0.002038490805684234, "rouge2_recall": 0.037962875213077084, "rouge2_recall_stderr": 0.001371092507838403, "rougeL_fmeasure": 0.12521582817345858, "rougeL_fmeasure_stderr": 0.001558164893603696, "rougeL_precision": 0.16582293281238897, "rougeL_precision_stderr": 0.0029834300036000964, "rougeL_recall": 0.14259279499083446, "rougeL_recall_stderr": 0.0021300867673961586, "rougeLsum_fmeasure": 0.15406437689304697, "rougeLsum_fmeasure_stderr": 0.0019216050506638991, "rougeLsum_precision": 0.19965802567234178, "rougeLsum_precision_stderr": 0.003376519676950631, "rougeLsum_recall": 0.1758403732489615, "rougeLsum_recall_stderr": 0.00263481088532624}}, "2": {"tldr_en": {"bleu": 3.3676607764003217, "bleu_stderr": 0.09972910399869098, "rouge1_fmeasure": 0.21327790105912361, "rouge1_fmeasure_stderr": 0.002123626367765033, "rouge1_precision": 0.3141094418773111, "rouge1_precision_stderr": 0.00416573278001532, "rouge1_recall": 0.22940008952218904, "rouge1_recall_stderr": 0.00293965228839845, "rouge2_fmeasure": 0.057107197307562536, "rouge2_fmeasure_stderr": 0.001244259966149911, "rouge2_precision": 0.09549762938217742, "rouge2_precision_stderr": 0.00273530312183561, "rouge2_recall": 0.061398178149434976, "rouge2_recall_stderr": 0.0015372222367727667, "rougeL_fmeasure": 0.16372305515226618, "rougeL_fmeasure_stderr": 0.0016337413562063383, "rougeL_precision": 0.24992452403202214, "rougeL_precision_stderr": 0.003654335121206201, "rougeL_recall": 0.17501635595351986, "rougeL_recall_stderr": 0.002271613295698419, "rougeLsum_fmeasure": 0.20003228082390748, "rougeLsum_fmeasure_stderr": 0.0019992738211581395, "rougeLsum_precision": 0.2969029848611265, "rougeLsum_precision_stderr": 0.004037856450512545, "rougeLsum_recall": 0.21505288961058072, "rougeLsum_recall_stderr": 0.002773337400418779}}, "3": {"tldr_en": {"bleu": 2.5100531240020096, "bleu_stderr": 0.08355534400349254, "rouge1_fmeasure": 0.1768067526444316, "rouge1_fmeasure_stderr": 0.0024103919936753083, "rouge1_precision": 0.28268685911370317, "rouge1_precision_stderr": 0.004637188156668055, "rouge1_recall": 0.18409327360839817, "rouge1_recall_stderr": 0.0031019865635269804, "rouge2_fmeasure": 0.04908335278882542, "rouge2_fmeasure_stderr": 0.001259419567652984, "rouge2_precision": 0.08893786285650683, "rouge2_precision_stderr": 0.0028809029879869066, "rouge2_recall": 0.051451700030867215, "rouge2_recall_stderr": 0.0015100945662155152, "rougeL_fmeasure": 0.1398727094077612, "rougeL_fmeasure_stderr": 0.00190370757505087, "rougeL_precision": 0.23112334382781183, "rougeL_precision_stderr": 0.00404785682668799, "rougeL_recall": 0.14465585707078885, "rougeL_recall_stderr": 0.0024508713978240565, "rougeLsum_fmeasure": 0.16596299501958178, "rougeLsum_fmeasure_stderr": 0.002268558499185917, "rougeLsum_precision": 0.2676146693697874, "rougeLsum_precision_stderr": 0.0044929343631950796, "rougeLsum_recall": 0.1727964717658116, "rougeLsum_recall_stderr": 0.002934531649979208}}, "4": {"tldr_en": {"bleu": 0.04917871997945816, "bleu_stderr": 0.011759933442199213, "rouge1_fmeasure": 0.05785729663477462, "rouge1_fmeasure_stderr": 0.00206713711962407, "rouge1_precision": 0.09970745779830552, "rouge1_precision_stderr": 0.0038043727328119364, "rouge1_recall": 0.058330320058960515, "rouge1_recall_stderr": 0.002304682727068326, "rouge2_fmeasure": 0.01586522511044624, "rouge2_fmeasure_stderr": 0.0008738621376063839, "rouge2_precision": 0.03251777234466793, "rouge2_precision_stderr": 0.002118565236435376, "rouge2_recall": 0.0157312964921023, "rouge2_recall_stderr": 0.0009417925496167887, "rougeL_fmeasure": 0.04614935936982993, "rougeL_fmeasure_stderr": 0.0016418345149214004, "rougeL_precision": 0.08319245348009295, "rougeL_precision_stderr": 0.003314160751463669, "rougeL_recall": 0.045957921356138506, "rougeL_recall_stderr": 0.0017994917564425523, "rougeLsum_fmeasure": 0.054430641995148826, "rougeLsum_fmeasure_stderr": 0.0019486405975030696, "rougeLsum_precision": 0.09506233436053896, "rougeLsum_precision_stderr": 0.0036891395010224814, "rougeLsum_recall": 0.05471724465904111, "rougeLsum_recall_stderr": 0.002160632018166085}}, "5": {"tldr_en": {"bleu": 3.251297751095593e-15, "bleu_stderr": 2.2756569350525874e-13, "rouge1_fmeasure": 0.009268964916814223, "rouge1_fmeasure_stderr": 0.0009052508868439835, "rouge1_precision": 0.016926529938962757, "rouge1_precision_stderr": 0.0017542537795902154, "rouge1_recall": 0.009125253685177769, "rouge1_recall_stderr": 0.000969098392727483, "rouge2_fmeasure": 0.0024499806068106226, "rouge2_fmeasure_stderr": 0.0003527027815852848, "rouge2_precision": 0.005744462226327518, "rouge2_precision_stderr": 0.0010007163239292981, "rouge2_recall": 0.0023078050122082674, "rouge2_recall_stderr": 0.0003236324380479218, "rougeL_fmeasure": 0.007481300849596843, "rougeL_fmeasure_stderr": 0.0007363124175643151, "rougeL_precision": 0.014476931308522548, "rougeL_precision_stderr": 0.0015849642921362528, "rougeL_recall": 0.007236663297279324, "rougeL_recall_stderr": 0.0007586205468140797, "rougeLsum_fmeasure": 0.008756700641596853, "rougeLsum_fmeasure_stderr": 0.0008553971822440313, "rougeLsum_precision": 0.016248179255626216, "rougeLsum_precision_stderr": 0.0017036429753655277, "rougeLsum_recall": 0.008608059036178293, "rougeLsum_recall_stderr": 0.0009159521418304884}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.437766723519246, "bleu_stderr": 0.09601693208778553, "rouge1_fmeasure": 0.05697592047597413, "rouge1_fmeasure_stderr": 0.001797926433458445, "rouge1_precision": 0.06682593340511514, "rouge1_precision_stderr": 0.002952306748243577, "rouge1_recall": 0.07411634430724061, "rouge1_recall_stderr": 0.002440217152157383, "rouge2_fmeasure": 0.01536092726240684, "rouge2_fmeasure_stderr": 0.0007275923931318573, "rouge2_precision": 0.03302807726183616, "rouge2_precision_stderr": 0.0026842391549936058, "rouge2_recall": 0.020186894041125805, "rouge2_recall_stderr": 0.0010247172485624153, "rougeL_fmeasure": 0.04817663930211532, "rougeL_fmeasure_stderr": 0.001441653552877007, "rougeL_precision": 0.059300337799532137, "rougeL_precision_stderr": 0.0028179539898848588, "rougeL_recall": 0.0627631568875468, "rougeL_recall_stderr": 0.0019877464517148377, "rougeLsum_fmeasure": 0.05013042205677747, "rougeLsum_fmeasure_stderr": 0.0015967631420436168, "rougeLsum_precision": 0.06115951525215789, "rougeLsum_precision_stderr": 0.0028822223580796964, "rougeLsum_recall": 0.06504695760327411, "rougeLsum_recall_stderr": 0.0021677061434374}}, "1": {"generate_text_restaurant": {"bleu": 11.62900114602928, "bleu_stderr": 0.12375373723225518, "rouge1_fmeasure": 0.4532632284631901, "rouge1_fmeasure_stderr": 0.0023357245319248128, "rouge1_precision": 0.5451742466970191, "rouge1_precision_stderr": 0.0031871274029679347, "rouge1_recall": 0.42699010077430416, "rouge1_recall_stderr": 0.002993746517475021, "rouge2_fmeasure": 0.21213479459536141, "rouge2_fmeasure_stderr": 0.00200420243212873, "rouge2_precision": 0.25901594004684103, "rouge2_precision_stderr": 0.0026481164571408537, "rouge2_recall": 0.19967940412954954, "rouge2_recall_stderr": 0.002150539393155852, "rougeL_fmeasure": 0.3285423507597364, "rougeL_fmeasure_stderr": 0.0020415117574899455, "rougeL_precision": 0.39849798994000396, "rougeL_precision_stderr": 0.002891246679965464, "rougeL_recall": 0.30848615876058605, "rougeL_recall_stderr": 0.0024225677805289614, "rougeLsum_fmeasure": 0.36907713647334306, "rougeLsum_fmeasure_stderr": 0.002300458864383123, "rougeLsum_precision": 0.4456143620680348, "rougeLsum_precision_stderr": 0.003122680182394469, "rougeLsum_recall": 0.3470464325365321, "rougeLsum_recall_stderr": 0.0027268741807027325}}, "2": {"generate_text_restaurant": {"bleu": 13.729160049438256, "bleu_stderr": 0.1736907281701951, "rouge1_fmeasure": 0.47530706522941923, "rouge1_fmeasure_stderr": 0.002269457728441495, "rouge1_precision": 0.5634456592765337, "rouge1_precision_stderr": 0.0032605825884623875, "rouge1_recall": 0.4507541061463029, "rouge1_recall_stderr": 0.002906328181160322, "rouge2_fmeasure": 0.2341310438118546, "rouge2_fmeasure_stderr": 0.0020588137466063974, "rouge2_precision": 0.2815290448207918, "rouge2_precision_stderr": 0.002753285147028329, "rouge2_recall": 0.22195870594530004, "rouge2_recall_stderr": 0.0022195041064945104, "rougeL_fmeasure": 0.35308286491366014, "rougeL_fmeasure_stderr": 0.0020611951680921453, "rougeL_precision": 0.4209016470779559, "rougeL_precision_stderr": 0.002977541526452533, "rougeL_recall": 0.3340077842371586, "rougeL_recall_stderr": 0.002421139901332985, "rougeLsum_fmeasure": 0.39646432858915454, "rougeLsum_fmeasure_stderr": 0.002297137968372773, "rougeLsum_precision": 0.4708991644421341, "rougeLsum_precision_stderr": 0.0032013103972900166, "rougeLsum_recall": 0.37549861015631975, "rougeLsum_recall_stderr": 0.0027163777945055085}}, "3": {"generate_text_restaurant": {"bleu": 14.656321484968338, "bleu_stderr": 0.16972615882626144, "rouge1_fmeasure": 0.481855176889266, "rouge1_fmeasure_stderr": 0.0023194359466880616, "rouge1_precision": 0.562305630265398, "rouge1_precision_stderr": 0.0031652462189463455, "rouge1_recall": 0.45987404198942383, "rouge1_recall_stderr": 0.0029703575294102538, "rouge2_fmeasure": 0.24305694094335184, "rouge2_fmeasure_stderr": 0.002123297563978059, "rouge2_precision": 0.28607085855671466, "rouge2_precision_stderr": 0.0026818690615633698, "rouge2_recall": 0.23244587918587184, "rouge2_recall_stderr": 0.0023199138687154393, "rougeL_fmeasure": 0.3609373253216462, "rougeL_fmeasure_stderr": 0.0021623298108111337, "rougeL_precision": 0.42325154418181776, "rougeL_precision_stderr": 0.0029627688959181155, "rougeL_recall": 0.3438696118575136, "rougeL_recall_stderr": 0.002544580090336099, "rougeLsum_fmeasure": 0.4057493785678476, "rougeLsum_fmeasure_stderr": 0.0023733460830390393, "rougeLsum_precision": 0.4738281598993863, "rougeLsum_precision_stderr": 0.0031390310182449546, "rougeLsum_recall": 0.3870708802610871, "rougeLsum_recall_stderr": 0.0028114899441298146}}, "4": {"generate_text_restaurant": {"bleu": 14.86567209647234, "bleu_stderr": 0.1698994576553632, "rouge1_fmeasure": 0.48520025166396413, "rouge1_fmeasure_stderr": 0.0022710972512261737, "rouge1_precision": 0.5617781131802088, "rouge1_precision_stderr": 0.0031498802155263354, "rouge1_recall": 0.4644614761582281, "rouge1_recall_stderr": 0.002913234539030529, "rouge2_fmeasure": 0.24457201060976938, "rouge2_fmeasure_stderr": 0.0020933772966934307, "rouge2_precision": 0.28619393128692944, "rouge2_precision_stderr": 0.0026729329824444277, "rouge2_recall": 0.23420939809091432, "rouge2_recall_stderr": 0.002276451514506276, "rougeL_fmeasure": 0.3625475863783199, "rougeL_fmeasure_stderr": 0.0021245993785308807, "rougeL_precision": 0.42146275702407654, "rougeL_precision_stderr": 0.002914643656813693, "rougeL_recall": 0.3464822229612734, "rougeL_recall_stderr": 0.002492385305150263, "rougeLsum_fmeasure": 0.4093821222952128, "rougeLsum_fmeasure_stderr": 0.0023455160101698755, "rougeLsum_precision": 0.47411718930833535, "rougeLsum_precision_stderr": 0.0031026386741017796, "rougeLsum_recall": 0.39167901823656365, "rougeLsum_recall_stderr": 0.0027780135939656702}}, "5": {"generate_text_restaurant": {"bleu": 15.289969326745096, "bleu_stderr": 0.18607000102245602, "rouge1_fmeasure": 0.4906980372483691, "rouge1_fmeasure_stderr": 0.002239893074756519, "rouge1_precision": 0.5632204022613888, "rouge1_precision_stderr": 0.003103135022734702, "rouge1_recall": 0.47164683809384095, "rouge1_recall_stderr": 0.002908230821621064, "rouge2_fmeasure": 0.24829610593304763, "rouge2_fmeasure_stderr": 0.0020949289601966516, "rouge2_precision": 0.2877460964400698, "rouge2_precision_stderr": 0.0026429503525263634, "rouge2_recall": 0.2390728657869137, "rouge2_recall_stderr": 0.002310643034275551, "rougeL_fmeasure": 0.36759351204975677, "rougeL_fmeasure_stderr": 0.0021027612292422067, "rougeL_precision": 0.4237547074950468, "rougeL_precision_stderr": 0.0028847192725157873, "rougeL_recall": 0.35261589040821234, "rougeL_recall_stderr": 0.0024851798387174827, "rougeLsum_fmeasure": 0.4159536808731392, "rougeLsum_fmeasure_stderr": 0.0023281286496698744, "rougeLsum_precision": 0.47777427942316386, "rougeLsum_precision_stderr": 0.0030842277651955876, "rougeLsum_recall": 0.39955550811910373, "rougeLsum_recall_stderr": 0.0027834183799787483}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.4545481947126708, "bleu_stderr": 0.09713541074149565, "rouge1_fmeasure": 0.1689835758831468, "rouge1_fmeasure_stderr": 0.002909677887536588, "rouge1_precision": 0.128407969365843, "rouge1_precision_stderr": 0.002769379137931314, "rouge1_recall": 0.28081510742998406, "rouge1_recall_stderr": 0.0050120899952934604, "rouge2_fmeasure": 0.03387029006729459, "rouge2_fmeasure_stderr": 0.0013021207744595693, "rouge2_precision": 0.024652996119864618, "rouge2_precision_stderr": 0.000955948532331516, "rouge2_recall": 0.05810032212253773, "rouge2_recall_stderr": 0.0022927268363733713, "rougeL_fmeasure": 0.12615365625808, "rougeL_fmeasure_stderr": 0.0021035092514434853, "rougeL_precision": 0.09666837873309425, "rougeL_precision_stderr": 0.0022730104891178654, "rougeL_recall": 0.20987985909058215, "rougeL_recall_stderr": 0.0036739360369203336, "rougeLsum_fmeasure": 0.13565709106740156, "rougeLsum_fmeasure_stderr": 0.0023697011210948855, "rougeLsum_precision": 0.10360125474806216, "rougeLsum_precision_stderr": 0.002405028582077681, "rougeLsum_recall": 0.22574783474612775, "rougeLsum_recall_stderr": 0.0041146322853027936}}, "1": {"article_DOC_summary": {"bleu": 1.6998688208970665, "bleu_stderr": 0.0635045004591692, "rouge1_fmeasure": 0.20926024778288918, "rouge1_fmeasure_stderr": 0.003005889502894419, "rouge1_precision": 0.19023841390883325, "rouge1_precision_stderr": 0.0035309340744832805, "rouge1_recall": 0.27962343690881464, "rouge1_recall_stderr": 0.004132211352100015, "rouge2_fmeasure": 0.041673153288058665, "rouge2_fmeasure_stderr": 0.0017602506848527716, "rouge2_precision": 0.03891937014424581, "rouge2_precision_stderr": 0.0019275156487078334, "rouge2_recall": 0.05543377598085919, "rouge2_recall_stderr": 0.002244661735424776, "rougeL_fmeasure": 0.15651748983436384, "rougeL_fmeasure_stderr": 0.0023572357647388027, "rougeL_precision": 0.1431295220245165, "rougeL_precision_stderr": 0.0028617657540101794, "rougeL_recall": 0.2090088704299264, "rougeL_recall_stderr": 0.003179674014248957, "rougeLsum_fmeasure": 0.16204529127719666, "rougeLsum_fmeasure_stderr": 0.0024330869873563756, "rougeLsum_precision": 0.14718107674906517, "rougeLsum_precision_stderr": 0.0028690839015025824, "rougeLsum_recall": 0.21825500900582057, "rougeLsum_recall_stderr": 0.003449567102877007}}, "2": {"article_DOC_summary": {"bleu": 1.9501501324779182, "bleu_stderr": 0.14611784928893765, "rouge1_fmeasure": 0.21872784412701274, "rouge1_fmeasure_stderr": 0.003178250857438001, "rouge1_precision": 0.21424459004855737, "rouge1_precision_stderr": 0.003920695419573458, "rouge1_recall": 0.2624779464711849, "rouge1_recall_stderr": 0.0037294949681331594, "rouge2_fmeasure": 0.04639087859085017, "rouge2_fmeasure_stderr": 0.001947418699616045, "rouge2_precision": 0.04672293178100336, "rouge2_precision_stderr": 0.0021812637389958867, "rouge2_recall": 0.05465091873648108, "rouge2_recall_stderr": 0.0022123458296598327, "rougeL_fmeasure": 0.16660605329632028, "rougeL_fmeasure_stderr": 0.00250959150469725, "rougeL_precision": 0.1634764389407365, "rougeL_precision_stderr": 0.003131246677174094, "rougeL_recall": 0.200339897015504, "rougeL_recall_stderr": 0.0029094756092193936, "rougeLsum_fmeasure": 0.1695028174798023, "rougeLsum_fmeasure_stderr": 0.0025624042698553767, "rougeLsum_precision": 0.16575697101021414, "rougeLsum_precision_stderr": 0.0031483621075441776, "rougeLsum_recall": 0.20515038564912177, "rougeLsum_recall_stderr": 0.0031240747920875116}}, "3": {"article_DOC_summary": {"bleu": 2.0076275724787602, "bleu_stderr": 0.11212873434350258, "rouge1_fmeasure": 0.20434572391689246, "rouge1_fmeasure_stderr": 0.0034706073334128465, "rouge1_precision": 0.20437733612799427, "rouge1_precision_stderr": 0.004074795118159076, "rouge1_recall": 0.23878554843604352, "rouge1_recall_stderr": 0.004029638009617848, "rouge2_fmeasure": 0.04306159259575956, "rouge2_fmeasure_stderr": 0.0018777349352262806, "rouge2_precision": 0.043708098742975064, "rouge2_precision_stderr": 0.0020491597551648465, "rouge2_recall": 0.04956386041008167, "rouge2_recall_stderr": 0.0021286911262099997, "rougeL_fmeasure": 0.15409584255901532, "rougeL_fmeasure_stderr": 0.002676167396402293, "rougeL_precision": 0.15428188125603903, "rougeL_precision_stderr": 0.0031910918756160554, "rougeL_recall": 0.1810777416600256, "rougeL_recall_stderr": 0.003111455126901863, "rougeLsum_fmeasure": 0.15601886005302468, "rougeLsum_fmeasure_stderr": 0.0027270011126486237, "rougeLsum_precision": 0.1559693979368864, "rougeLsum_precision_stderr": 0.0032193278481836844, "rougeLsum_recall": 0.18388798573315704, "rougeLsum_recall_stderr": 0.003248853518755984}}, "4": {"article_DOC_summary": {"bleu": 0.19792231261244875, "bleu_stderr": 0.05396644461383391, "rouge1_fmeasure": 0.054815129113975035, "rouge1_fmeasure_stderr": 0.003185657149241013, "rouge1_precision": 0.064203475659168, "rouge1_precision_stderr": 0.004005751047576481, "rouge1_recall": 0.05912018551776059, "rouge1_recall_stderr": 0.003541759965969919, "rouge2_fmeasure": 0.011538671061568423, "rouge2_fmeasure_stderr": 0.0011279494735547844, "rouge2_precision": 0.014296893055854982, "rouge2_precision_stderr": 0.00167495513512877, "rouge2_recall": 0.012433579102778388, "rouge2_recall_stderr": 0.0012488062780850729, "rougeL_fmeasure": 0.04104305147864034, "rougeL_fmeasure_stderr": 0.002397020963803412, "rougeL_precision": 0.04883563717679719, "rougeL_precision_stderr": 0.003126685980664847, "rougeL_recall": 0.044466619309858256, "rougeL_recall_stderr": 0.0027081849463861368, "rougeLsum_fmeasure": 0.04194817002510108, "rougeLsum_fmeasure_stderr": 0.002441184716293297, "rougeLsum_precision": 0.049557293153701935, "rougeLsum_precision_stderr": 0.0031472184711958118, "rougeLsum_recall": 0.04587016139594867, "rougeLsum_recall_stderr": 0.0028101728309766413}}, "5": {"article_DOC_summary": {"bleu": 2.7352950413968226e-52, "bleu_stderr": 8.556725479397893e-38, "rouge1_fmeasure": 0.0015387951120581591, "rouge1_fmeasure_stderr": 0.0005347695829075357, "rouge1_precision": 0.0017440472784149934, "rouge1_precision_stderr": 0.0006359230951207035, "rouge1_recall": 0.0014111726662419768, "rouge1_recall_stderr": 0.00047596721200036436, "rouge2_fmeasure": 0.00019482690735564718, "rouge2_fmeasure_stderr": 0.00012504657939333306, "rouge2_precision": 0.00024693455968647276, "rouge2_precision_stderr": 0.00016570986035138282, "rouge2_recall": 0.00016295025728987994, "rouge2_recall_stderr": 0.00010177004951804534, "rougeL_fmeasure": 0.001230272095389995, "rougeL_fmeasure_stderr": 0.0004196974441483657, "rougeL_precision": 0.001394159775364338, "rougeL_precision_stderr": 0.0005079057065205211, "rougeL_recall": 0.0011335604934685137, "rougeL_recall_stderr": 0.00037243699653226857, "rougeLsum_fmeasure": 0.001230272095389995, "rougeLsum_fmeasure_stderr": 0.0004196974441483657, "rougeLsum_precision": 0.001394159775364338, "rougeLsum_precision_stderr": 0.0005079057065205211, "rougeLsum_recall": 0.0011335604934685137, "rougeLsum_recall_stderr": 0.00037243699653226857}}}}
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.338,0.014965960710224489,0
|
3 |
+
anli_r2,acc,0.356,0.015149042659306618,0
|
4 |
+
anli_r3,acc,0.34833333333333333,0.013759437498874073,0
|
5 |
+
arc_challenge,acc,0.25,0.012653835621466646,0
|
6 |
+
arc_challenge,acc_norm,0.27303754266211605,0.013019332762635737,0
|
7 |
+
arc_easy,acc,0.5281986531986532,0.010243454104071783,0
|
8 |
+
arc_easy,acc_norm,0.4684343434343434,0.010239317603199497,0
|
9 |
+
boolq,acc,0.6012232415902141,0.008563973987729907,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.23395931142410017,,1
|
12 |
+
copa,acc,0.73,0.0446196043338474,0
|
13 |
+
hellaswag,acc,0.3746265684126668,0.004830371317841067,0
|
14 |
+
hellaswag,acc_norm,0.45717984465245964,0.004971449552787177,0
|
15 |
+
piqa,acc,0.7078346028291621,0.010610252174513658,0
|
16 |
+
piqa,acc_norm,0.7094668117519043,0.010592765034696534,0
|
17 |
+
rte,acc,0.5379061371841155,0.030009848912529117,0
|
18 |
+
sciq,acc,0.814,0.012310790208412789,0
|
19 |
+
sciq,acc_norm,0.712,0.01432694179723156,0
|
20 |
+
storycloze_2016,acc,0.6574024585783004,0.010974556525299994,0
|
21 |
+
winogrande,acc,0.5130228887134964,0.014047718393997663,0
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_0_lm-eval_global_step52452_2023-02-13-14-30-06_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.338,
|
5 |
-
"acc_stderr": 0.014965960710224489
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.356,
|
9 |
-
"acc_stderr": 0.015149042659306618
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34833333333333333,
|
13 |
-
"acc_stderr": 0.013759437498874073
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.23395931142410017
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.73,
|
22 |
-
"acc_stderr": 0.0446196043338474
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3746265684126668,
|
26 |
-
"acc_stderr": 0.004830371317841067,
|
27 |
-
"acc_norm": 0.45717984465245964,
|
28 |
-
"acc_norm_stderr": 0.004971449552787177
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5379061371841155,
|
32 |
-
"acc_stderr": 0.030009848912529117
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5130228887134964,
|
36 |
-
"acc_stderr": 0.014047718393997663
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6574024585783004,
|
40 |
-
"acc_stderr": 0.010974556525299994
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6012232415902141,
|
44 |
-
"acc_stderr": 0.008563973987729907
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5281986531986532,
|
48 |
-
"acc_stderr": 0.010243454104071783,
|
49 |
-
"acc_norm": 0.4684343434343434,
|
50 |
-
"acc_norm_stderr": 0.010239317603199497
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.25,
|
54 |
-
"acc_stderr": 0.012653835621466646,
|
55 |
-
"acc_norm": 0.27303754266211605,
|
56 |
-
"acc_norm_stderr": 0.013019332762635737
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.814,
|
60 |
-
"acc_stderr": 0.012310790208412789,
|
61 |
-
"acc_norm": 0.712,
|
62 |
-
"acc_norm_stderr": 0.01432694179723156
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7078346028291621,
|
66 |
-
"acc_stderr": 0.010610252174513658,
|
67 |
-
"acc_norm": 0.7094668117519043,
|
68 |
-
"acc_norm_stderr": 0.010592765034696534
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.325,0.014818724459095524,0
|
3 |
+
anli_r2,acc,0.308,0.01460648312734276,0
|
4 |
+
anli_r3,acc,0.3308333333333333,0.013588208070709002,0
|
5 |
+
arc_challenge,acc,0.24829351535836178,0.012624912868089753,0
|
6 |
+
arc_challenge,acc_norm,0.2696245733788396,0.012968040686869147,0
|
7 |
+
arc_easy,acc,0.547979797979798,0.010212436978834095,0
|
8 |
+
arc_easy,acc_norm,0.5281986531986532,0.010243454104071787,0
|
9 |
+
boolq,acc,0.5480122324159021,0.008704643851177519,1
|
10 |
+
cb,acc,0.48214285714285715,0.06737697508644648,1
|
11 |
+
cb,f1,0.3421052631578947,,1
|
12 |
+
copa,acc,0.69,0.04648231987117316,0
|
13 |
+
hellaswag,acc,0.3707428799044015,0.0048201660022530735,0
|
14 |
+
hellaswag,acc_norm,0.46116311491734713,0.004974706428434293,0
|
15 |
+
piqa,acc,0.6996735582154516,0.010695225308183133,0
|
16 |
+
piqa,acc_norm,0.7007616974972797,0.010684130673134581,0
|
17 |
+
rte,acc,0.5342960288808665,0.030025579819366422,0
|
18 |
+
sciq,acc,0.869,0.010674874844837954,0
|
19 |
+
sciq,acc_norm,0.845,0.011450157470799463,0
|
20 |
+
storycloze_2016,acc,0.6509887760555852,0.011022640519108543,0
|
21 |
+
winogrande,acc,0.5256511444356748,0.01403398095610855,0
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_1_lm-eval_global_step52452_2023-02-13-14-30-06_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.325,
|
5 |
-
"acc_stderr": 0.014818724459095524
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.308,
|
9 |
-
"acc_stderr": 0.01460648312734276
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3308333333333333,
|
13 |
-
"acc_stderr": 0.013588208070709002
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.48214285714285715,
|
17 |
-
"acc_stderr": 0.06737697508644648,
|
18 |
-
"f1": 0.3421052631578947
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.69,
|
22 |
-
"acc_stderr": 0.04648231987117316
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3707428799044015,
|
26 |
-
"acc_stderr": 0.0048201660022530735,
|
27 |
-
"acc_norm": 0.46116311491734713,
|
28 |
-
"acc_norm_stderr": 0.004974706428434293
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5342960288808665,
|
32 |
-
"acc_stderr": 0.030025579819366422
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5256511444356748,
|
36 |
-
"acc_stderr": 0.01403398095610855
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6509887760555852,
|
40 |
-
"acc_stderr": 0.011022640519108543
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5480122324159021,
|
44 |
-
"acc_stderr": 0.008704643851177519
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.547979797979798,
|
48 |
-
"acc_stderr": 0.010212436978834095,
|
49 |
-
"acc_norm": 0.5281986531986532,
|
50 |
-
"acc_norm_stderr": 0.010243454104071787
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24829351535836178,
|
54 |
-
"acc_stderr": 0.012624912868089753,
|
55 |
-
"acc_norm": 0.2696245733788396,
|
56 |
-
"acc_norm_stderr": 0.012968040686869147
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.869,
|
60 |
-
"acc_stderr": 0.010674874844837954,
|
61 |
-
"acc_norm": 0.845,
|
62 |
-
"acc_norm_stderr": 0.011450157470799463
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.6996735582154516,
|
66 |
-
"acc_stderr": 0.010695225308183133,
|
67 |
-
"acc_norm": 0.7007616974972797,
|
68 |
-
"acc_norm_stderr": 0.010684130673134581
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.329,0.014865395385928355,0
|
3 |
+
anli_r2,acc,0.332,0.014899597242811485,0
|
4 |
+
anli_r3,acc,0.3258333333333333,0.013535422043417464,0
|
5 |
+
arc_challenge,acc,0.25,0.012653835621466646,0
|
6 |
+
arc_challenge,acc_norm,0.2781569965870307,0.013094469919538805,0
|
7 |
+
arc_easy,acc,0.5585016835016835,0.010189314382749922,0
|
8 |
+
arc_easy,acc_norm,0.5374579124579124,0.0102309521045708,0
|
9 |
+
boolq,acc,0.5376146788990825,0.008720273736433685,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.2872985170857511,,1
|
12 |
+
copa,acc,0.69,0.04648231987117316,0
|
13 |
+
hellaswag,acc,0.37094204341764586,0.004820697457420423,0
|
14 |
+
hellaswag,acc_norm,0.4651463851822346,0.004977643730848598,0
|
15 |
+
piqa,acc,0.70620239390642,0.01062757408051479,0
|
16 |
+
piqa,acc_norm,0.7040261153427638,0.010650414317148128,0
|
17 |
+
rte,acc,0.516245487364621,0.030080573208738064,0
|
18 |
+
sciq,acc,0.89,0.009899393819724446,0
|
19 |
+
sciq,acc_norm,0.889,0.009938701010583726,0
|
20 |
+
storycloze_2016,acc,0.6397648316408338,0.011101519668493525,0
|
21 |
+
winogrande,acc,0.5209155485398579,0.014040185494212945,0
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_2_lm-eval_global_step52452_2023-02-13-14-30-06_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.329,
|
5 |
-
"acc_stderr": 0.014865395385928355
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.332,
|
9 |
-
"acc_stderr": 0.014899597242811485
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3258333333333333,
|
13 |
-
"acc_stderr": 0.013535422043417464
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.2872985170857511
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.69,
|
22 |
-
"acc_stderr": 0.04648231987117316
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.37094204341764586,
|
26 |
-
"acc_stderr": 0.004820697457420423,
|
27 |
-
"acc_norm": 0.4651463851822346,
|
28 |
-
"acc_norm_stderr": 0.004977643730848598
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.516245487364621,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5209155485398579,
|
36 |
-
"acc_stderr": 0.014040185494212945
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6397648316408338,
|
40 |
-
"acc_stderr": 0.011101519668493525
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5376146788990825,
|
44 |
-
"acc_stderr": 0.008720273736433685
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5585016835016835,
|
48 |
-
"acc_stderr": 0.010189314382749922,
|
49 |
-
"acc_norm": 0.5374579124579124,
|
50 |
-
"acc_norm_stderr": 0.0102309521045708
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.25,
|
54 |
-
"acc_stderr": 0.012653835621466646,
|
55 |
-
"acc_norm": 0.2781569965870307,
|
56 |
-
"acc_norm_stderr": 0.013094469919538805
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.89,
|
60 |
-
"acc_stderr": 0.009899393819724446,
|
61 |
-
"acc_norm": 0.889,
|
62 |
-
"acc_norm_stderr": 0.009938701010583726
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.70620239390642,
|
66 |
-
"acc_stderr": 0.01062757408051479,
|
67 |
-
"acc_norm": 0.7040261153427638,
|
68 |
-
"acc_norm_stderr": 0.010650414317148128
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.315,0.014696631960792522,0
|
3 |
+
anli_r2,acc,0.358,0.015167928865407559,0
|
4 |
+
anli_r3,acc,0.335,0.01363087184382148,0
|
5 |
+
arc_challenge,acc,0.24914675767918087,0.01263940711192644,0
|
6 |
+
arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0
|
7 |
+
arc_easy,acc,0.5526094276094277,0.010202832385415646,0
|
8 |
+
arc_easy,acc_norm,0.5395622895622896,0.010227616386289013,0
|
9 |
+
boolq,acc,0.5305810397553516,0.008728682900189714,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.3333858888450927,,1
|
12 |
+
copa,acc,0.74,0.04408440022768078,0
|
13 |
+
hellaswag,acc,0.3721370244971121,0.004823867761332468,0
|
14 |
+
hellaswag,acc_norm,0.46176060545708025,0.00497516738206183,0
|
15 |
+
piqa,acc,0.7029379760609358,0.010661725404814795,0
|
16 |
+
piqa,acc_norm,0.705658324265506,0.010633311470347514,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.887,0.01001655286669686,0
|
19 |
+
sciq,acc_norm,0.881,0.010244215145336664,0
|
20 |
+
storycloze_2016,acc,0.652592196686264,0.011010826502718736,0
|
21 |
+
winogrande,acc,0.5335438042620363,0.014020826677598098,0
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_3.json
CHANGED
@@ -54,6 +54,18 @@
|
|
54 |
"acc_stderr": 0.01263940711192644,
|
55 |
"acc_norm": 0.2773037542662116,
|
56 |
"acc_norm_stderr": 0.013082095839059374
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
}
|
58 |
},
|
59 |
"versions": {
|
@@ -68,6 +80,8 @@
|
|
68 |
"storycloze_2016": 0,
|
69 |
"boolq": 1,
|
70 |
"arc_easy": 0,
|
71 |
-
"arc_challenge": 0
|
|
|
|
|
72 |
}
|
73 |
}
|
|
|
54 |
"acc_stderr": 0.01263940711192644,
|
55 |
"acc_norm": 0.2773037542662116,
|
56 |
"acc_norm_stderr": 0.013082095839059374
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.887,
|
60 |
+
"acc_stderr": 0.01001655286669686,
|
61 |
+
"acc_norm": 0.881,
|
62 |
+
"acc_norm_stderr": 0.010244215145336664
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7029379760609358,
|
66 |
+
"acc_stderr": 0.010661725404814795,
|
67 |
+
"acc_norm": 0.705658324265506,
|
68 |
+
"acc_norm_stderr": 0.010633311470347514
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_3_lm-eval_global_step52452_2023-02-13-14-30-06_3shots_backup.json
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.315,
|
5 |
-
"acc_stderr": 0.014696631960792522
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.358,
|
9 |
-
"acc_stderr": 0.015167928865407559
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.335,
|
13 |
-
"acc_stderr": 0.01363087184382148
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.3333858888450927
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.74,
|
22 |
-
"acc_stderr": 0.04408440022768078
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3721370244971121,
|
26 |
-
"acc_stderr": 0.004823867761332468,
|
27 |
-
"acc_norm": 0.46176060545708025,
|
28 |
-
"acc_norm_stderr": 0.00497516738206183
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.03006330041190266
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5335438042620363,
|
36 |
-
"acc_stderr": 0.014020826677598098
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.652592196686264,
|
40 |
-
"acc_stderr": 0.011010826502718736
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5305810397553516,
|
44 |
-
"acc_stderr": 0.008728682900189714
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5526094276094277,
|
48 |
-
"acc_stderr": 0.010202832385415646,
|
49 |
-
"acc_norm": 0.5395622895622896,
|
50 |
-
"acc_norm_stderr": 0.010227616386289013
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24914675767918087,
|
54 |
-
"acc_stderr": 0.01263940711192644,
|
55 |
-
"acc_norm": 0.2773037542662116,
|
56 |
-
"acc_norm_stderr": 0.013082095839059374
|
57 |
-
}
|
58 |
-
},
|
59 |
-
"versions": {
|
60 |
-
"anli_r1": 0,
|
61 |
-
"anli_r2": 0,
|
62 |
-
"anli_r3": 0,
|
63 |
-
"cb": 1,
|
64 |
-
"copa": 0,
|
65 |
-
"hellaswag": 0,
|
66 |
-
"rte": 0,
|
67 |
-
"winogrande": 0,
|
68 |
-
"storycloze_2016": 0,
|
69 |
-
"boolq": 1,
|
70 |
-
"arc_easy": 0,
|
71 |
-
"arc_challenge": 0
|
72 |
-
}
|
73 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.348,0.01507060460376841,0
|
3 |
+
anli_r2,acc,0.351,0.015100563798316405,0
|
4 |
+
anli_r3,acc,0.33166666666666667,0.013596836729485164,0
|
5 |
+
arc_challenge,acc,0.2568259385665529,0.0127669237941168,0
|
6 |
+
arc_challenge,acc_norm,0.28242320819112626,0.013155456884097224,0
|
7 |
+
arc_easy,acc,0.553030303030303,0.010201914927791676,0
|
8 |
+
arc_easy,acc_norm,0.5382996632996633,0.010229639820610516,0
|
9 |
+
boolq,acc,0.5180428134556575,0.008739359336700274,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.3448659295971534,,1
|
12 |
+
copa,acc,0.74,0.04408440022768077,0
|
13 |
+
hellaswag,acc,0.3714399522007568,0.00482202225488602,0
|
14 |
+
hellaswag,acc_norm,0.4623580959968134,0.004975621147406105,0
|
15 |
+
piqa,acc,0.7018498367791077,0.010672964114008308,0
|
16 |
+
piqa,acc_norm,0.704570184983678,0.010644731559342467,0
|
17 |
+
rte,acc,0.4548736462093863,0.029973636495415252,0
|
18 |
+
sciq,acc,0.894,0.009739551265785133,0
|
19 |
+
sciq,acc_norm,0.89,0.009899393819724453,0
|
20 |
+
storycloze_2016,acc,0.6445750935328701,0.011068528452399879,0
|
21 |
+
winogrande,acc,0.5422257300710339,0.014002284504422436,0
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_4.json
CHANGED
@@ -38,6 +38,34 @@
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6445750935328701,
|
40 |
"acc_stderr": 0.011068528452399879
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
}
|
42 |
},
|
43 |
"versions": {
|
@@ -49,6 +77,11 @@
|
|
49 |
"hellaswag": 0,
|
50 |
"rte": 0,
|
51 |
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6445750935328701,
|
40 |
"acc_stderr": 0.011068528452399879
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5180428134556575,
|
44 |
+
"acc_stderr": 0.008739359336700274
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.553030303030303,
|
48 |
+
"acc_stderr": 0.010201914927791676,
|
49 |
+
"acc_norm": 0.5382996632996633,
|
50 |
+
"acc_norm_stderr": 0.010229639820610516
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.2568259385665529,
|
54 |
+
"acc_stderr": 0.0127669237941168,
|
55 |
+
"acc_norm": 0.28242320819112626,
|
56 |
+
"acc_norm_stderr": 0.013155456884097224
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.894,
|
60 |
+
"acc_stderr": 0.009739551265785133,
|
61 |
+
"acc_norm": 0.89,
|
62 |
+
"acc_norm_stderr": 0.009899393819724453
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7018498367791077,
|
66 |
+
"acc_stderr": 0.010672964114008308,
|
67 |
+
"acc_norm": 0.704570184983678,
|
68 |
+
"acc_norm_stderr": 0.010644731559342467
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_4_lm-eval_global_step52452_2023-02-13-14-30-06_4shots_backup.json
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.348,
|
5 |
-
"acc_stderr": 0.01507060460376841
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.351,
|
9 |
-
"acc_stderr": 0.015100563798316405
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33166666666666667,
|
13 |
-
"acc_stderr": 0.013596836729485164
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.3448659295971534
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.74,
|
22 |
-
"acc_stderr": 0.04408440022768077
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3714399522007568,
|
26 |
-
"acc_stderr": 0.00482202225488602,
|
27 |
-
"acc_norm": 0.4623580959968134,
|
28 |
-
"acc_norm_stderr": 0.004975621147406105
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4548736462093863,
|
32 |
-
"acc_stderr": 0.029973636495415252
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5422257300710339,
|
36 |
-
"acc_stderr": 0.014002284504422436
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6445750935328701,
|
40 |
-
"acc_stderr": 0.011068528452399879
|
41 |
-
}
|
42 |
-
},
|
43 |
-
"versions": {
|
44 |
-
"anli_r1": 0,
|
45 |
-
"anli_r2": 0,
|
46 |
-
"anli_r3": 0,
|
47 |
-
"cb": 1,
|
48 |
-
"copa": 0,
|
49 |
-
"hellaswag": 0,
|
50 |
-
"rte": 0,
|
51 |
-
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
53 |
-
}
|
54 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.335,0.014933117490932568,0
|
3 |
+
anli_r2,acc,0.337,0.0149550879186536,0
|
4 |
+
anli_r3,acc,0.3358333333333333,0.013639261190932889,0
|
5 |
+
arc_challenge,acc,0.24829351535836178,0.012624912868089753,0
|
6 |
+
arc_challenge,acc_norm,0.2841296928327645,0.013179442447653886,0
|
7 |
+
arc_easy,acc,0.5509259259259259,0.010206428316323365,0
|
8 |
+
arc_easy,acc_norm,0.5471380471380471,0.010214087372211392,0
|
9 |
+
boolq,acc,0.5241590214067279,0.008734840763194172,1
|
10 |
+
cb,acc,0.4107142857142857,0.0663363415035954,1
|
11 |
+
cb,f1,0.27783930109511507,,1
|
12 |
+
copa,acc,0.67,0.04725815626252609,0
|
13 |
+
hellaswag,acc,0.371539533957379,0.004822286556305215,0
|
14 |
+
hellaswag,acc_norm,0.46594303923521213,0.004978192893406277,0
|
15 |
+
piqa,acc,0.704570184983678,0.010644731559342459,0
|
16 |
+
piqa,acc_norm,0.70620239390642,0.010627574080514813,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.893,0.009779910359847167,0
|
19 |
+
sciq,acc_norm,0.889,0.009938701010583726,0
|
20 |
+
storycloze_2016,acc,0.6413682522715125,0.011090657465688193,0
|
21 |
+
winogrande,acc,0.5311760063141279,0.014025142640639515,0
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_5.json
CHANGED
@@ -38,6 +38,34 @@
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6413682522715125,
|
40 |
"acc_stderr": 0.011090657465688193
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
}
|
42 |
},
|
43 |
"versions": {
|
@@ -49,6 +77,11 @@
|
|
49 |
"hellaswag": 0,
|
50 |
"rte": 0,
|
51 |
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6413682522715125,
|
40 |
"acc_stderr": 0.011090657465688193
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5241590214067279,
|
44 |
+
"acc_stderr": 0.008734840763194172
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.5509259259259259,
|
48 |
+
"acc_stderr": 0.010206428316323365,
|
49 |
+
"acc_norm": 0.5471380471380471,
|
50 |
+
"acc_norm_stderr": 0.010214087372211392
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.24829351535836178,
|
54 |
+
"acc_stderr": 0.012624912868089753,
|
55 |
+
"acc_norm": 0.2841296928327645,
|
56 |
+
"acc_norm_stderr": 0.013179442447653886
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.893,
|
60 |
+
"acc_stderr": 0.009779910359847167,
|
61 |
+
"acc_norm": 0.889,
|
62 |
+
"acc_norm_stderr": 0.009938701010583726
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.704570184983678,
|
66 |
+
"acc_stderr": 0.010644731559342459,
|
67 |
+
"acc_norm": 0.70620239390642,
|
68 |
+
"acc_norm_stderr": 0.010627574080514813
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed2/evaluation/rankeval/2b855b55boscarseed2_5_lm-eval_global_step52452_2023-02-13-14-30-06_5shots_backup.json
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.335,
|
5 |
-
"acc_stderr": 0.014933117490932568
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.337,
|
9 |
-
"acc_stderr": 0.0149550879186536
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3358333333333333,
|
13 |
-
"acc_stderr": 0.013639261190932889
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4107142857142857,
|
17 |
-
"acc_stderr": 0.0663363415035954,
|
18 |
-
"f1": 0.27783930109511507
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.67,
|
22 |
-
"acc_stderr": 0.04725815626252609
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.371539533957379,
|
26 |
-
"acc_stderr": 0.004822286556305215,
|
27 |
-
"acc_norm": 0.46594303923521213,
|
28 |
-
"acc_norm_stderr": 0.004978192893406277
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.03006330041190266
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5311760063141279,
|
36 |
-
"acc_stderr": 0.014025142640639515
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6413682522715125,
|
40 |
-
"acc_stderr": 0.011090657465688193
|
41 |
-
}
|
42 |
-
},
|
43 |
-
"versions": {
|
44 |
-
"anli_r1": 0,
|
45 |
-
"anli_r2": 0,
|
46 |
-
"anli_r3": 0,
|
47 |
-
"cb": 1,
|
48 |
-
"copa": 0,
|
49 |
-
"hellaswag": 0,
|
50 |
-
"rte": 0,
|
51 |
-
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
53 |
-
}
|
54 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed3/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.015075003955377725
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.015075003955377725
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2197193326260251
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2197193326260251
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.24462503044359898
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.24462503044359898
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24783052219620694
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.24783052219620694
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.24803460463857877
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.24803460463857877
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24688680363045243
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24688680363045243
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.20369521624837333
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03650064845163216
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.03650064845163216
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.042159228682119326
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.042159228682119326
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.047169327493735744
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.047169327493735744
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04773019417977251
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.04773019417977251
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012384552834017296
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.012384552834017296
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,9.281741700266572e-05
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,9.281741700266572e-05
|
27 |
+
gem_xsum,5,average,multiple,0.03100612817637995
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.044642962602441105
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.044642962602441105
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07208261868868578
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.07208261868868578
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09209494204875378
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.09209494204875378
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10424957521252359
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.10424957521252359
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10890579060188654
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.10890579060188654
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11883663273017754
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.11883663273017754
|
40 |
+
web_nlg_en,5,average,multiple,0.09013542031407806
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.02327004167912628
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.02327004167912628
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.031435380352664986
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.031435380352664986
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05229754159287467
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.05229754159287467
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04645910000265166
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.04645910000265166
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01548765621583035
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01548765621583035
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003396689228466778
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.003396689228466778
|
53 |
+
wiki_lingua_en,5,average,multiple,0.02872440151193579
|
2b855b55boscarseed3/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33202476289978655, "bleu_stderr": 0.03284628255303658, "rouge1_fmeasure": 0.10035078272046244, "rouge1_fmeasure_stderr": 0.0019783236676076898, "rouge1_precision": 0.06651480936453831, "rouge1_precision_stderr": 0.001650969998879038, "rouge1_recall": 0.297334737955042, "rouge1_recall_stderr": 0.004992706637890668, "rouge2_fmeasure": 0.044642962602441105, "rouge2_fmeasure_stderr": 0.001178797135452423, "rouge2_precision": 0.029110043712061957, "rouge2_precision_stderr": 0.0008947308938169874, "rouge2_recall": 0.13391480095345307, "rouge2_recall_stderr": 0.0032321395002733538, "rougeL_fmeasure": 0.09648383030825737, "rougeL_fmeasure_stderr": 0.0018521276667425775, "rougeL_precision": 0.06360280420840975, "rougeL_precision_stderr": 0.0015109805669500105, "rougeL_recall": 0.2880932061217916, "rougeL_recall_stderr": 0.004879650046655422, "rougeLsum_fmeasure": 0.0940598212200592, "rougeLsum_fmeasure_stderr": 0.0018664528054209402, "rougeLsum_precision": 0.06247062164223968, "rougeLsum_precision_stderr": 0.0015581650631781932, "rougeLsum_recall": 0.27694653889243487, "rougeLsum_recall_stderr": 0.004633639788952067}}, "1": {"PALM_prompt": {"bleu": 0.527115014551651, "bleu_stderr": 0.025666419950371343, "rouge1_fmeasure": 0.14851607672855913, "rouge1_fmeasure_stderr": 0.0033689657278829425, "rouge1_precision": 0.1270389619853191, "rouge1_precision_stderr": 0.004160322890958984, "rouge1_recall": 0.3113982585236776, "rouge1_recall_stderr": 0.005100689200994648, "rouge2_fmeasure": 0.07208261868868578, "rouge2_fmeasure_stderr": 0.002234096091379544, "rouge2_precision": 0.06294568402552247, "rouge2_precision_stderr": 0.002863056502770272, "rouge2_recall": 0.15340774472215243, "rouge2_recall_stderr": 0.003411319647719293, "rougeL_fmeasure": 0.13589968933657245, "rougeL_fmeasure_stderr": 0.0029353826676204973, "rougeL_precision": 0.1157116753063807, "rougeL_precision_stderr": 0.003769961491523323, "rougeL_recall": 0.29147344461366753, "rougeL_recall_stderr": 0.00468545606109597, "rougeLsum_fmeasure": 0.13804305933456154, "rougeLsum_fmeasure_stderr": 0.003006886444225815, "rougeLsum_precision": 0.11802827465869996, "rougeLsum_precision_stderr": 0.003850703320853075, "rougeLsum_recall": 0.2933246534286482, "rougeLsum_recall_stderr": 0.00468201951647078}}, "2": {"PALM_prompt": {"bleu": 0.6597997964856303, "bleu_stderr": 0.034728244254522804, "rouge1_fmeasure": 0.18002908296123593, "rouge1_fmeasure_stderr": 0.003968306495814761, "rouge1_precision": 0.16610339248030914, "rouge1_precision_stderr": 0.00507284326160864, "rouge1_recall": 0.3352959906245331, "rouge1_recall_stderr": 0.004999412504809714, "rouge2_fmeasure": 0.09209494204875378, "rouge2_fmeasure_stderr": 0.0026603000241755223, "rouge2_precision": 0.08678039894771569, "rouge2_precision_stderr": 0.003378136673520059, "rouge2_recall": 0.1742610345269654, "rouge2_recall_stderr": 0.0036505372153588563, "rougeL_fmeasure": 0.1614621956023238, "rougeL_fmeasure_stderr": 0.003324273428019681, "rougeL_precision": 0.14649813778279558, "rougeL_precision_stderr": 0.0043379793964259455, "rougeL_recall": 0.31289951361042795, "rougeL_recall_stderr": 0.004601477399031109, "rougeLsum_fmeasure": 0.164808372020027, "rougeLsum_fmeasure_stderr": 0.0034322550025239302, "rougeLsum_precision": 0.1507150810319521, "rougeLsum_precision_stderr": 0.004505163531248335, "rougeLsum_recall": 0.3160265105718741, "rougeLsum_recall_stderr": 0.0046507219012497545}}, "3": {"PALM_prompt": {"bleu": 0.823682659540449, "bleu_stderr": 0.03223216531212538, "rouge1_fmeasure": 0.19910319186529463, "rouge1_fmeasure_stderr": 0.004405170032341835, "rouge1_precision": 0.19000820798830498, "rouge1_precision_stderr": 0.005654761920275743, "rouge1_recall": 0.3542179698381772, "rouge1_recall_stderr": 0.00504611245447937, "rouge2_fmeasure": 0.10424957521252359, "rouge2_fmeasure_stderr": 0.0029705826763534444, "rouge2_precision": 0.1027389531668612, "rouge2_precision_stderr": 0.0038013449699206645, "rouge2_recall": 0.185929248611057, "rouge2_recall_stderr": 0.0037754838215154443, "rougeL_fmeasure": 0.17723376057155618, "rougeL_fmeasure_stderr": 0.0036982270883436853, "rougeL_precision": 0.16748846871920003, "rougeL_precision_stderr": 0.004927644927911881, "rougeL_recall": 0.32704498822538286, "rougeL_recall_stderr": 0.0045579213655376595, "rougeLsum_fmeasure": 0.18174531523702536, "rougeLsum_fmeasure_stderr": 0.003846822902949112, "rougeLsum_precision": 0.17304403574735655, "rougeLsum_precision_stderr": 0.005121196453837608, "rougeLsum_recall": 0.3314123051295653, "rougeLsum_recall_stderr": 0.0046234640987954925}}, "4": {"PALM_prompt": {"bleu": 0.9530919051940064, "bleu_stderr": 0.0577655503873056, "rouge1_fmeasure": 0.2079827747732359, "rouge1_fmeasure_stderr": 0.004259502031408432, "rouge1_precision": 0.19701770449821013, "rouge1_precision_stderr": 0.005404879841926881, "rouge1_recall": 0.36981751550824943, "rouge1_recall_stderr": 0.00493079961155932, "rouge2_fmeasure": 0.10890579060188654, "rouge2_fmeasure_stderr": 0.0029073811099355118, "rouge2_precision": 0.1051003568367592, "rouge2_precision_stderr": 0.0035978375653891058, "rouge2_recall": 0.19524745454962628, "rouge2_recall_stderr": 0.003766208065478121, "rougeL_fmeasure": 0.18418656332394442, "rougeL_fmeasure_stderr": 0.0035517977647474175, "rougeL_precision": 0.17219580163143128, "rougeL_precision_stderr": 0.004627732394600431, "rougeL_recall": 0.34023933463326017, "rougeL_recall_stderr": 0.004497949648052741, "rougeLsum_fmeasure": 0.1893169899986314, "rougeLsum_fmeasure_stderr": 0.003703689468392939, "rougeLsum_precision": 0.17855926303676908, "rougeLsum_precision_stderr": 0.004851287660726657, "rougeLsum_recall": 0.3451827474898279, "rougeLsum_recall_stderr": 0.004540414961524762}}, "5": {"PALM_prompt": {"bleu": 1.018884418225518, "bleu_stderr": 0.06063389090761142, "rouge1_fmeasure": 0.2245014633073898, "rouge1_fmeasure_stderr": 0.004545756965978833, "rouge1_precision": 0.22200655898499053, "rouge1_precision_stderr": 0.005962548162960268, "rouge1_recall": 0.3774826876317441, "rouge1_recall_stderr": 0.004921209540706189, "rouge2_fmeasure": 0.11883663273017754, "rouge2_fmeasure_stderr": 0.003113425284275466, "rouge2_precision": 0.12083626340688242, "rouge2_precision_stderr": 0.004049580076214986, "rouge2_recall": 0.20032221007341308, "rouge2_recall_stderr": 0.0037578095361692466, "rougeL_fmeasure": 0.19808696683914262, "rougeL_fmeasure_stderr": 0.003815012659940734, "rougeL_precision": 0.19385895825070826, "rougeL_precision_stderr": 0.005166002921250581, "rougeL_recall": 0.3463288815796092, "rougeL_recall_stderr": 0.004493315251838996, "rougeLsum_fmeasure": 0.20367651597482986, "rougeLsum_fmeasure_stderr": 0.00397975215283144, "rougeLsum_precision": 0.20080334312524634, "rougeLsum_precision_stderr": 0.00538475664354496, "rougeLsum_recall": 0.3509129204118928, "rougeLsum_recall_stderr": 0.00453081936993939}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.7850233361049752, "bleu_stderr": 0.08520743952252237, "rouge1_fmeasure": 0.09645178040850848, "rouge1_fmeasure_stderr": 0.0023932324184934455, "rouge1_precision": 0.10013364964570891, "rouge1_precision_stderr": 0.0029985387314871463, "rouge1_recall": 0.1289356843926102, "rouge1_recall_stderr": 0.003252442943612275, "rouge2_fmeasure": 0.02327004167912628, "rouge2_fmeasure_stderr": 0.0008647699244277772, "rouge2_precision": 0.021651764261573173, "rouge2_precision_stderr": 0.0008981720810698575, "rouge2_recall": 0.032598865567093065, "rouge2_recall_stderr": 0.0013688978962574704, "rougeL_fmeasure": 0.07529103428773028, "rougeL_fmeasure_stderr": 0.0018177481003977558, "rougeL_precision": 0.0798145635639849, "rougeL_precision_stderr": 0.0025581061221673636, "rougeL_recall": 0.1030194283338092, "rougeL_recall_stderr": 0.0026473619878516663, "rougeLsum_fmeasure": 0.08939472105780072, "rougeLsum_fmeasure_stderr": 0.0022284928823277507, "rougeLsum_precision": 0.09366562084598902, "rougeLsum_precision_stderr": 0.00288003307671834, "rougeLsum_recall": 0.1197163043259789, "rougeLsum_recall_stderr": 0.0030465061402892178}}, "1": {"tldr_en": {"bleu": 2.149034150653557, "bleu_stderr": 0.0880436859723492, "rouge1_fmeasure": 0.1571029592533628, "rouge1_fmeasure_stderr": 0.0019787964900554204, "rouge1_precision": 0.21111776699074944, "rouge1_precision_stderr": 0.0036795176175499007, "rouge1_recall": 0.18105322303497254, "rouge1_recall_stderr": 0.002704526953003645, "rouge2_fmeasure": 0.031435380352664986, "rouge2_fmeasure_stderr": 0.001023452617747968, "rouge2_precision": 0.05017605453973606, "rouge2_precision_stderr": 0.002257054306007703, "rouge2_recall": 0.036689033105982816, "rouge2_recall_stderr": 0.001304908170146679, "rougeL_fmeasure": 0.1212831892222724, "rougeL_fmeasure_stderr": 0.0014852539083793823, "rougeL_precision": 0.1681933611426401, "rougeL_precision_stderr": 0.00320801194833936, "rougeL_recall": 0.14021014034344625, "rougeL_recall_stderr": 0.0020838861579054162, "rougeLsum_fmeasure": 0.14814548337398184, "rougeLsum_fmeasure_stderr": 0.00183959107231936, "rougeLsum_precision": 0.2001981792177172, "rougeLsum_precision_stderr": 0.003545698165430484, "rougeLsum_recall": 0.17075722396717186, "rougeLsum_recall_stderr": 0.0025253030583168973}}, "2": {"tldr_en": {"bleu": 3.2305991931473317, "bleu_stderr": 0.07950533990543071, "rouge1_fmeasure": 0.19819144050955464, "rouge1_fmeasure_stderr": 0.002161181470745133, "rouge1_precision": 0.29659957692420796, "rouge1_precision_stderr": 0.004313993725725908, "rouge1_recall": 0.2126729727290148, "rouge1_recall_stderr": 0.0028286561111436205, "rouge2_fmeasure": 0.05229754159287467, "rouge2_fmeasure_stderr": 0.0012492320842947013, "rouge2_precision": 0.09032858272882092, "rouge2_precision_stderr": 0.0028501839568620016, "rouge2_recall": 0.05443855757508895, "rouge2_recall_stderr": 0.0014238340672832245, "rougeL_fmeasure": 0.1554803027070678, "rougeL_fmeasure_stderr": 0.0016597756677151324, "rougeL_precision": 0.24038327671489826, "rougeL_precision_stderr": 0.0037910715865012022, "rougeL_recall": 0.16660116248057652, "rougeL_recall_stderr": 0.002203413004362285, "rougeLsum_fmeasure": 0.1857602434310385, "rougeLsum_fmeasure_stderr": 0.002016281937901344, "rougeLsum_precision": 0.280082550942368, "rougeLsum_precision_stderr": 0.004156093711309705, "rougeLsum_recall": 0.19923648622767753, "rougeLsum_recall_stderr": 0.002642427103030138}}, "3": {"tldr_en": {"bleu": 2.4191797072532624, "bleu_stderr": 0.05559997787678139, "rouge1_fmeasure": 0.17120594609085021, "rouge1_fmeasure_stderr": 0.0023855136999745593, "rouge1_precision": 0.27191585815512404, "rouge1_precision_stderr": 0.004553033969089403, "rouge1_recall": 0.17785515989361886, "rouge1_recall_stderr": 0.0029945927607217264, "rouge2_fmeasure": 0.04645910000265166, "rouge2_fmeasure_stderr": 0.0012307845898654572, "rouge2_precision": 0.08471260929686349, "rouge2_precision_stderr": 0.0028304634396926127, "rouge2_recall": 0.04775389377827743, "rouge2_recall_stderr": 0.0014249088126759941, "rougeL_fmeasure": 0.13587385417087836, "rougeL_fmeasure_stderr": 0.0018675251007595766, "rougeL_precision": 0.22280436054883324, "rougeL_precision_stderr": 0.003987085100348492, "rougeL_recall": 0.14044912634248938, "rougeL_recall_stderr": 0.0023437321076624205, "rougeLsum_fmeasure": 0.16058553787086796, "rougeLsum_fmeasure_stderr": 0.0022305451173721368, "rougeLsum_precision": 0.2572403556839211, "rougeLsum_precision_stderr": 0.0043942959684568315, "rougeLsum_recall": 0.16664812554605501, "rougeLsum_recall_stderr": 0.002803799009075263}}, "4": {"tldr_en": {"bleu": 0.05821785911961025, "bleu_stderr": 0.006926474108387649, "rouge1_fmeasure": 0.05705943866163425, "rouge1_fmeasure_stderr": 0.002034238951249155, "rouge1_precision": 0.09582438104190298, "rouge1_precision_stderr": 0.0037073724803882197, "rouge1_recall": 0.058712498837730144, "rouge1_recall_stderr": 0.002338440354244753, "rouge2_fmeasure": 0.01548765621583035, "rouge2_fmeasure_stderr": 0.0008491139070832301, "rouge2_precision": 0.030302600329343992, "rouge2_precision_stderr": 0.002020034963991066, "rouge2_recall": 0.01624326433634945, "rouge2_recall_stderr": 0.0010266482066008744, "rougeL_fmeasure": 0.04588756613231766, "rougeL_fmeasure_stderr": 0.0016219219311521358, "rougeL_precision": 0.08030364270664823, "rougeL_precision_stderr": 0.003249592772348841, "rougeL_recall": 0.04709615722253022, "rougeL_recall_stderr": 0.001883208275739314, "rougeLsum_fmeasure": 0.05326890477987492, "rougeLsum_fmeasure_stderr": 0.0018972568367539395, "rougeLsum_precision": 0.09101484836816211, "rougeLsum_precision_stderr": 0.003576586055780137, "rougeLsum_recall": 0.05458046297521363, "rougeLsum_recall_stderr": 0.002174773191043919}}, "5": {"tldr_en": {"bleu": 9.084911261227677e-17, "bleu_stderr": 1.938771872220056e-15, "rouge1_fmeasure": 0.00965877650109712, "rouge1_fmeasure_stderr": 0.0009502280880678353, "rouge1_precision": 0.017339640048071717, "rouge1_precision_stderr": 0.0017914288612189862, "rouge1_recall": 0.009388948695729904, "rouge1_recall_stderr": 0.0009896129220143181, "rouge2_fmeasure": 0.003396689228466778, "rouge2_fmeasure_stderr": 0.00046945115771734425, "rouge2_precision": 0.007695847190713622, "rouge2_precision_stderr": 0.0011846920894618485, "rouge2_recall": 0.0028956408424041136, "rouge2_recall_stderr": 0.00040203067458898954, "rougeL_fmeasure": 0.008167446118792413, "rougeL_fmeasure_stderr": 0.0008056769564050579, "rougeL_precision": 0.015395436111568952, "rougeL_precision_stderr": 0.0016573876664127856, "rougeL_recall": 0.007940994530119429, "rougeL_recall_stderr": 0.0008467405048451344, "rougeLsum_fmeasure": 0.009282795853843133, "rougeLsum_fmeasure_stderr": 0.0009151896700507592, "rougeLsum_precision": 0.01678224909258122, "rougeLsum_precision_stderr": 0.0017500452777960934, "rougeLsum_recall": 0.009052865544143397, "rougeLsum_recall_stderr": 0.0009622730677349403}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.8705138531971153, "bleu_stderr": 0.04867582531264036, "rouge1_fmeasure": 0.03809056028402446, "rouge1_fmeasure_stderr": 0.0016784416647229426, "rouge1_precision": 0.034545848496052654, "rouge1_precision_stderr": 0.001566579723382951, "rouge1_recall": 0.045834929071352386, "rouge1_recall_stderr": 0.002016811830226931, "rouge2_fmeasure": 0.015075003955377725, "rouge2_fmeasure_stderr": 0.0007874787502852935, "rouge2_precision": 0.01357831342305598, "rouge2_precision_stderr": 0.0007278435374694024, "rouge2_recall": 0.01818137390819633, "rouge2_recall_stderr": 0.0009483762899268107, "rougeL_fmeasure": 0.0335161304527499, "rougeL_fmeasure_stderr": 0.0014703931309561242, "rougeL_precision": 0.030361727102885563, "rougeL_precision_stderr": 0.001369965689115654, "rougeL_recall": 0.040364202751189966, "rougeL_recall_stderr": 0.0017680282610923641, "rougeLsum_fmeasure": 0.03475700574859405, "rougeLsum_fmeasure_stderr": 0.0015808773938519858, "rougeLsum_precision": 0.03177258180002162, "rougeLsum_precision_stderr": 0.0014955658234812072, "rougeLsum_recall": 0.04147354656808662, "rougeLsum_recall_stderr": 0.0018699366373118296}}, "1": {"generate_text_restaurant": {"bleu": 12.008758968305228, "bleu_stderr": 0.12564745639649488, "rouge1_fmeasure": 0.46552795049805773, "rouge1_fmeasure_stderr": 0.00235640354584502, "rouge1_precision": 0.5631594638857529, "rouge1_precision_stderr": 0.00324421624132654, "rouge1_recall": 0.4365816544468671, "rouge1_recall_stderr": 0.0030105433233647487, "rouge2_fmeasure": 0.2197193326260251, "rouge2_fmeasure_stderr": 0.0020109962036813298, "rouge2_precision": 0.26937549997021565, "rouge2_precision_stderr": 0.0026440834252656653, "rouge2_recall": 0.2057658676796454, "rouge2_recall_stderr": 0.002143734858738364, "rougeL_fmeasure": 0.335604122613543, "rougeL_fmeasure_stderr": 0.002079991474543142, "rougeL_precision": 0.4087872898136732, "rougeL_precision_stderr": 0.002934069491949776, "rougeL_recall": 0.31377900067912107, "rougeL_recall_stderr": 0.0024396772769032255, "rougeLsum_fmeasure": 0.3790404958358403, "rougeLsum_fmeasure_stderr": 0.0023299555118255666, "rougeLsum_precision": 0.45969831228314867, "rougeLsum_precision_stderr": 0.0031623483646411057, "rougeLsum_recall": 0.3549682540172786, "rougeLsum_recall_stderr": 0.002746427422139494}}, "2": {"generate_text_restaurant": {"bleu": 14.14198056712505, "bleu_stderr": 0.24387528882104717, "rouge1_fmeasure": 0.4878980019648943, "rouge1_fmeasure_stderr": 0.00229400065900204, "rouge1_precision": 0.5832193324685282, "rouge1_precision_stderr": 0.0032787034277368537, "rouge1_recall": 0.4601966943407872, "rouge1_recall_stderr": 0.0029819842123464044, "rouge2_fmeasure": 0.24462503044359898, "rouge2_fmeasure_stderr": 0.0020943947868852283, "rouge2_precision": 0.2960872844323463, "rouge2_precision_stderr": 0.002769000714927548, "rouge2_recall": 0.2309770935674146, "rouge2_recall_stderr": 0.0022965170518954945, "rougeL_fmeasure": 0.3620900980791755, "rougeL_fmeasure_stderr": 0.002113460359566787, "rougeL_precision": 0.4349496583122404, "rougeL_precision_stderr": 0.003015743000538396, "rougeL_recall": 0.34082058480437577, "rougeL_recall_stderr": 0.0025066913309293682, "rougeLsum_fmeasure": 0.406567917534112, "rougeLsum_fmeasure_stderr": 0.0023455647214753387, "rougeLsum_precision": 0.4867174466000412, "rougeLsum_precision_stderr": 0.0032350689341699907, "rougeLsum_recall": 0.38300473271219204, "rougeLsum_recall_stderr": 0.0027921658474823886}}, "3": {"generate_text_restaurant": {"bleu": 14.568494170196605, "bleu_stderr": 0.16235845379384753, "rouge1_fmeasure": 0.48940623430349556, "rouge1_fmeasure_stderr": 0.002237489170426613, "rouge1_precision": 0.580960944087758, "rouge1_precision_stderr": 0.0032426085149854737, "rouge1_recall": 0.462205873383073, "rouge1_recall_stderr": 0.0029025682592738675, "rouge2_fmeasure": 0.24783052219620694, "rouge2_fmeasure_stderr": 0.002089515541436856, "rouge2_precision": 0.29731229285227795, "rouge2_precision_stderr": 0.002737785275611433, "rouge2_recall": 0.23475220291108906, "rouge2_recall_stderr": 0.0023139060262653333, "rougeL_fmeasure": 0.3647405346505457, "rougeL_fmeasure_stderr": 0.0021293095742188914, "rougeL_precision": 0.43459494765721124, "rougeL_precision_stderr": 0.003021280699774259, "rougeL_recall": 0.34401804948389253, "rougeL_recall_stderr": 0.002508909081833447, "rougeLsum_fmeasure": 0.4100565331346358, "rougeLsum_fmeasure_stderr": 0.0023413011884842946, "rougeLsum_precision": 0.4870002916061261, "rougeLsum_precision_stderr": 0.0032257758633302993, "rougeLsum_recall": 0.3872001617504186, "rougeLsum_recall_stderr": 0.0027831042016586783}}, "4": {"generate_text_restaurant": {"bleu": 14.734731736932146, "bleu_stderr": 0.16887071667451783, "rouge1_fmeasure": 0.4894791754818595, "rouge1_fmeasure_stderr": 0.0023005348739165796, "rouge1_precision": 0.5801184894727782, "rouge1_precision_stderr": 0.003304446650411685, "rouge1_recall": 0.46099783072021006, "rouge1_recall_stderr": 0.0028878553497999355, "rouge2_fmeasure": 0.24803460463857877, "rouge2_fmeasure_stderr": 0.00214528331500546, "rouge2_precision": 0.29812075093333956, "rouge2_precision_stderr": 0.002844755419824555, "rouge2_recall": 0.23325171781043572, "rouge2_recall_stderr": 0.0022772307735508773, "rougeL_fmeasure": 0.36343319118608863, "rougeL_fmeasure_stderr": 0.0021424532256284854, "rougeL_precision": 0.4323253611145228, "rougeL_precision_stderr": 0.0030220895477853227, "rougeL_recall": 0.3419042442881764, "rougeL_recall_stderr": 0.0024830197258768685, "rougeLsum_fmeasure": 0.41100693983913, "rougeLsum_fmeasure_stderr": 0.002385212896337812, "rougeLsum_precision": 0.48702805692607404, "rougeLsum_precision_stderr": 0.003251909391956583, "rougeLsum_recall": 0.3871750907489159, "rougeLsum_recall_stderr": 0.002775190310841645}}, "5": {"generate_text_restaurant": {"bleu": 14.417330528707561, "bleu_stderr": 0.23744062587048995, "rouge1_fmeasure": 0.48870465280584424, "rouge1_fmeasure_stderr": 0.002241402168765513, "rouge1_precision": 0.5804014340987179, "rouge1_precision_stderr": 0.0032819804536962584, "rouge1_recall": 0.4582313763378584, "rouge1_recall_stderr": 0.0028210153655625013, "rouge2_fmeasure": 0.24688680363045243, "rouge2_fmeasure_stderr": 0.002112718830613371, "rouge2_precision": 0.2973507676511209, "rouge2_precision_stderr": 0.002822434875050306, "rouge2_recall": 0.23140500698702424, "rouge2_recall_stderr": 0.0022546925569823587, "rougeL_fmeasure": 0.3634152205481943, "rougeL_fmeasure_stderr": 0.002132655133838495, "rougeL_precision": 0.43320881922904375, "rougeL_precision_stderr": 0.0030348704584789053, "rougeL_recall": 0.34027316974490135, "rougeL_recall_stderr": 0.002449545539483717, "rougeLsum_fmeasure": 0.40942633387323074, "rougeLsum_fmeasure_stderr": 0.0023623920101575194, "rougeLsum_precision": 0.48672630521219834, "rougeLsum_precision_stderr": 0.003280235985520434, "rougeLsum_recall": 0.38371077804603543, "rougeLsum_recall_stderr": 0.0027322740126673704}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.6445432935397932, "bleu_stderr": 0.11288880012114288, "rouge1_fmeasure": 0.16563824544624003, "rouge1_fmeasure_stderr": 0.0032253697331514498, "rouge1_precision": 0.11963298530443908, "rouge1_precision_stderr": 0.002368819248238803, "rouge1_recall": 0.28117303100546565, "rouge1_recall_stderr": 0.005529983454772786, "rouge2_fmeasure": 0.03650064845163216, "rouge2_fmeasure_stderr": 0.001407177754346794, "rouge2_precision": 0.025988362314328585, "rouge2_precision_stderr": 0.000997588416820755, "rouge2_recall": 0.06429373809950764, "rouge2_recall_stderr": 0.002624993265784442, "rougeL_fmeasure": 0.12335764205289244, "rougeL_fmeasure_stderr": 0.0023998257893507076, "rougeL_precision": 0.08900328361038783, "rougeL_precision_stderr": 0.0017538328575853156, "rougeL_recall": 0.21035687677527884, "rougeL_recall_stderr": 0.004212998263040422, "rougeLsum_fmeasure": 0.12971536873089445, "rougeLsum_fmeasure_stderr": 0.0026207364705836296, "rougeLsum_precision": 0.09348267589650253, "rougeLsum_precision_stderr": 0.0019094389936997007, "rougeLsum_recall": 0.22162305835248405, "rougeLsum_recall_stderr": 0.004602701558276795}}, "1": {"article_DOC_summary": {"bleu": 1.734128327285966, "bleu_stderr": 0.08924592886370027, "rouge1_fmeasure": 0.20565249149745046, "rouge1_fmeasure_stderr": 0.002804831346629974, "rouge1_precision": 0.16963198767042315, "rouge1_precision_stderr": 0.002937269648695778, "rouge1_recall": 0.30825637395558336, "rouge1_recall_stderr": 0.004370185872392451, "rouge2_fmeasure": 0.042159228682119326, "rouge2_fmeasure_stderr": 0.001590769484466558, "rouge2_precision": 0.03482662251402544, "rouge2_precision_stderr": 0.0014769947445053828, "rouge2_recall": 0.06455952984895309, "rouge2_recall_stderr": 0.0024217677572532996, "rougeL_fmeasure": 0.15284179702621556, "rougeL_fmeasure_stderr": 0.002143177827632909, "rougeL_precision": 0.12598747225029228, "rougeL_precision_stderr": 0.002280349516115101, "rougeL_recall": 0.23023461091477196, "rougeL_recall_stderr": 0.003384529702917665, "rougeLsum_fmeasure": 0.15914388511785377, "rougeLsum_fmeasure_stderr": 0.0022721119916273304, "rougeLsum_precision": 0.1304550598704216, "rougeLsum_precision_stderr": 0.002321857032815001, "rougeLsum_recall": 0.24130012289322367, "rougeLsum_recall_stderr": 0.0037023325680176954}}, "2": {"article_DOC_summary": {"bleu": 2.071346995064759, "bleu_stderr": 0.11437801801926932, "rouge1_fmeasure": 0.22539563476819588, "rouge1_fmeasure_stderr": 0.003133489804867722, "rouge1_precision": 0.22067542118387265, "rouge1_precision_stderr": 0.003859452585001004, "rouge1_recall": 0.27030517058690373, "rouge1_recall_stderr": 0.00391277652919709, "rouge2_fmeasure": 0.047169327493735744, "rouge2_fmeasure_stderr": 0.00199109164400243, "rouge2_precision": 0.047009502675708165, "rouge2_precision_stderr": 0.002173179401977634, "rouge2_recall": 0.05686441658011089, "rouge2_recall_stderr": 0.00247318036490619, "rougeL_fmeasure": 0.16770225679218967, "rougeL_fmeasure_stderr": 0.002496420037294455, "rougeL_precision": 0.1642240699512137, "rougeL_precision_stderr": 0.0030808505625225004, "rougeL_recall": 0.20216628044461474, "rougeL_recall_stderr": 0.0031256639466597388, "rougeLsum_fmeasure": 0.17110619274109168, "rougeLsum_fmeasure_stderr": 0.0025646249178584764, "rougeLsum_precision": 0.16692030908684236, "rougeLsum_precision_stderr": 0.0030981804279163147, "rougeLsum_recall": 0.20768035344064897, "rougeLsum_recall_stderr": 0.0033851444187716706}}, "3": {"article_DOC_summary": {"bleu": 2.318524945908214, "bleu_stderr": 0.08598566221567794, "rouge1_fmeasure": 0.21646380285670908, "rouge1_fmeasure_stderr": 0.003530035532648745, "rouge1_precision": 0.22140615543341866, "rouge1_precision_stderr": 0.004204679986334029, "rouge1_recall": 0.2444906635779359, "rouge1_recall_stderr": 0.004073874459614794, "rouge2_fmeasure": 0.04773019417977251, "rouge2_fmeasure_stderr": 0.0020540531313582152, "rouge2_precision": 0.04952151051084217, "rouge2_precision_stderr": 0.0022666923793748218, "rouge2_recall": 0.0533842351917708, "rouge2_recall_stderr": 0.0023446986110702743, "rougeL_fmeasure": 0.16386346574102725, "rougeL_fmeasure_stderr": 0.0027882855655311897, "rougeL_precision": 0.16807663783660487, "rougeL_precision_stderr": 0.003387549311395567, "rougeL_recall": 0.1859775029237818, "rougeL_recall_stderr": 0.00325443532625321, "rougeLsum_fmeasure": 0.16552783963705708, "rougeLsum_fmeasure_stderr": 0.0028417200102711917, "rougeLsum_precision": 0.16936961306971046, "rougeLsum_precision_stderr": 0.0034063239974255637, "rougeLsum_recall": 0.18841832491605268, "rougeLsum_recall_stderr": 0.003382609617563602}}, "4": {"article_DOC_summary": {"bleu": 0.2202628831493563, "bleu_stderr": 0.05888934471976352, "rouge1_fmeasure": 0.05671539048524648, "rouge1_fmeasure_stderr": 0.003330283869930652, "rouge1_precision": 0.06436084008158685, "rouge1_precision_stderr": 0.004004154969143478, "rouge1_recall": 0.06086433870277819, "rouge1_recall_stderr": 0.0036498255182998273, "rouge2_fmeasure": 0.012384552834017296, "rouge2_fmeasure_stderr": 0.0012566880181981682, "rouge2_precision": 0.014617474767198527, "rouge2_precision_stderr": 0.0017366601516105656, "rouge2_recall": 0.012749228311253753, "rouge2_recall_stderr": 0.0012424948236292898, "rougeL_fmeasure": 0.04272344374969, "rougeL_fmeasure_stderr": 0.002558877704383889, "rougeL_precision": 0.048955624036311166, "rougeL_precision_stderr": 0.003162711183472558, "rougeL_recall": 0.0456391933473409, "rougeL_recall_stderr": 0.002760702511080215, "rougeLsum_fmeasure": 0.04389506319896344, "rougeLsum_fmeasure_stderr": 0.0026138307035845822, "rougeLsum_precision": 0.050043427048033697, "rougeLsum_precision_stderr": 0.0032007871669195633, "rougeLsum_recall": 0.047172374717404854, "rougeLsum_recall_stderr": 0.0028612594448202094}}, "5": {"article_DOC_summary": {"bleu": 3.79693853289103e-57, "bleu_stderr": 1.663506575542054e-51, "rouge1_fmeasure": 0.0019356785814855194, "rouge1_fmeasure_stderr": 0.0005869452676229577, "rouge1_precision": 0.0026890331605918753, "rouge1_precision_stderr": 0.0008483268258212475, "rouge1_recall": 0.0017377836109564373, "rouge1_recall_stderr": 0.0005352032014968291, "rouge2_fmeasure": 9.281741700266572e-05, "rouge2_fmeasure_stderr": 6.607511454084363e-05, "rouge2_precision": 0.00011077758719268152, "rouge2_precision_stderr": 7.83387543031719e-05, "rouge2_recall": 8.063215610385422e-05, "rouge2_recall_stderr": 5.79270034349398e-05, "rougeL_fmeasure": 0.001515345186644524, "rougeL_fmeasure_stderr": 0.000460327548876044, "rougeL_precision": 0.002152537869033098, "rougeL_precision_stderr": 0.0006750309400673729, "rougeL_recall": 0.0013275557055278245, "rougeL_recall_stderr": 0.0004057676471998518, "rougeLsum_fmeasure": 0.001515345186644524, "rougeLsum_fmeasure_stderr": 0.000460327548876044, "rougeLsum_precision": 0.002152537869033098, "rougeLsum_precision_stderr": 0.0006750309400673729, "rougeLsum_recall": 0.0013275557055278245, "rougeLsum_recall_stderr": 0.0004057676471998518}}}}
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.325,0.014818724459095526,0
|
3 |
+
anli_r2,acc,0.341,0.014998131348402709,0
|
4 |
+
anli_r3,acc,0.3258333333333333,0.013535422043417459,0
|
5 |
+
arc_challenge,acc,0.21843003412969283,0.01207429160570098,0
|
6 |
+
arc_challenge,acc_norm,0.2525597269624573,0.012696728980207704,0
|
7 |
+
arc_easy,acc,0.5340909090909091,0.010235908103438685,0
|
8 |
+
arc_easy,acc_norm,0.4882154882154882,0.010256933475911006,0
|
9 |
+
boolq,acc,0.6073394495412844,0.008541161248702906,1
|
10 |
+
cb,acc,0.5178571428571429,0.06737697508644647,1
|
11 |
+
cb,f1,0.3478682170542636,,1
|
12 |
+
copa,acc,0.71,0.04560480215720684,0
|
13 |
+
hellaswag,acc,0.374726150169289,0.004830628620181023,0
|
14 |
+
hellaswag,acc_norm,0.46415056761601275,0.004976939333240077,0
|
15 |
+
piqa,acc,0.7013057671381937,0.010678556398149242,0
|
16 |
+
piqa,acc_norm,0.7105549510337323,0.010581014740675621,0
|
17 |
+
rte,acc,0.5415162454873647,0.029992535385373314,0
|
18 |
+
sciq,acc,0.802,0.012607733934175315,0
|
19 |
+
sciq,acc_norm,0.724,0.014142984975740668,0
|
20 |
+
storycloze_2016,acc,0.6488508818813469,0.011038179124113263,0
|
21 |
+
winogrande,acc,0.5232833464877664,0.01403724130957364,0
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_0_lm-eval_global_step52452_2023-02-13-14-30-06_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.325,
|
5 |
-
"acc_stderr": 0.014818724459095526
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.341,
|
9 |
-
"acc_stderr": 0.014998131348402709
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3258333333333333,
|
13 |
-
"acc_stderr": 0.013535422043417459
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5178571428571429,
|
17 |
-
"acc_stderr": 0.06737697508644647,
|
18 |
-
"f1": 0.3478682170542636
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.71,
|
22 |
-
"acc_stderr": 0.04560480215720684
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.374726150169289,
|
26 |
-
"acc_stderr": 0.004830628620181023,
|
27 |
-
"acc_norm": 0.46415056761601275,
|
28 |
-
"acc_norm_stderr": 0.004976939333240077
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5415162454873647,
|
32 |
-
"acc_stderr": 0.029992535385373314
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5232833464877664,
|
36 |
-
"acc_stderr": 0.01403724130957364
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6488508818813469,
|
40 |
-
"acc_stderr": 0.011038179124113263
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6073394495412844,
|
44 |
-
"acc_stderr": 0.008541161248702906
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5340909090909091,
|
48 |
-
"acc_stderr": 0.010235908103438685,
|
49 |
-
"acc_norm": 0.4882154882154882,
|
50 |
-
"acc_norm_stderr": 0.010256933475911006
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.21843003412969283,
|
54 |
-
"acc_stderr": 0.01207429160570098,
|
55 |
-
"acc_norm": 0.2525597269624573,
|
56 |
-
"acc_norm_stderr": 0.012696728980207704
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.802,
|
60 |
-
"acc_stderr": 0.012607733934175315,
|
61 |
-
"acc_norm": 0.724,
|
62 |
-
"acc_norm_stderr": 0.014142984975740668
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7013057671381937,
|
66 |
-
"acc_stderr": 0.010678556398149242,
|
67 |
-
"acc_norm": 0.7105549510337323,
|
68 |
-
"acc_norm_stderr": 0.010581014740675621
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.329,0.01486539538592836,0
|
3 |
+
anli_r2,acc,0.323,0.014794927843348639,0
|
4 |
+
anli_r3,acc,0.345,0.013728421539454876,0
|
5 |
+
arc_challenge,acc,0.2440273037542662,0.012551447627856255,0
|
6 |
+
arc_challenge,acc_norm,0.25426621160409557,0.012724999945157736,0
|
7 |
+
arc_easy,acc,0.5391414141414141,0.01022829820076613,0
|
8 |
+
arc_easy,acc_norm,0.5244107744107744,0.010247548905242272,0
|
9 |
+
boolq,acc,0.5850152905198777,0.008617716361921567,1
|
10 |
+
cb,acc,0.5,0.06741998624632421,1
|
11 |
+
cb,f1,0.35057471264367807,,1
|
12 |
+
copa,acc,0.72,0.04512608598542127,0
|
13 |
+
hellaswag,acc,0.3695478988249353,0.0048169588177260836,0
|
14 |
+
hellaswag,acc_norm,0.4667396932881896,0.0049787293000748915,0
|
15 |
+
piqa,acc,0.70620239390642,0.0106275740805148,0
|
16 |
+
piqa,acc_norm,0.7040261153427638,0.010650414317148131,0
|
17 |
+
rte,acc,0.5451263537906137,0.029973636495415252,0
|
18 |
+
sciq,acc,0.861,0.010945263761042962,0
|
19 |
+
sciq,acc_norm,0.843,0.011510146979230187,0
|
20 |
+
storycloze_2016,acc,0.6386958845537146,0.011108686479432282,0
|
21 |
+
winogrande,acc,0.5272296764009471,0.014031631629827696,0
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_1_lm-eval_global_step52452_2023-02-13-14-30-06_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.329,
|
5 |
-
"acc_stderr": 0.01486539538592836
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.323,
|
9 |
-
"acc_stderr": 0.014794927843348639
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.345,
|
13 |
-
"acc_stderr": 0.013728421539454876
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5,
|
17 |
-
"acc_stderr": 0.06741998624632421,
|
18 |
-
"f1": 0.35057471264367807
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.72,
|
22 |
-
"acc_stderr": 0.04512608598542127
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3695478988249353,
|
26 |
-
"acc_stderr": 0.0048169588177260836,
|
27 |
-
"acc_norm": 0.4667396932881896,
|
28 |
-
"acc_norm_stderr": 0.0049787293000748915
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5451263537906137,
|
32 |
-
"acc_stderr": 0.029973636495415252
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5272296764009471,
|
36 |
-
"acc_stderr": 0.014031631629827696
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6386958845537146,
|
40 |
-
"acc_stderr": 0.011108686479432282
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5850152905198777,
|
44 |
-
"acc_stderr": 0.008617716361921567
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5391414141414141,
|
48 |
-
"acc_stderr": 0.01022829820076613,
|
49 |
-
"acc_norm": 0.5244107744107744,
|
50 |
-
"acc_norm_stderr": 0.010247548905242272
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2440273037542662,
|
54 |
-
"acc_stderr": 0.012551447627856255,
|
55 |
-
"acc_norm": 0.25426621160409557,
|
56 |
-
"acc_norm_stderr": 0.012724999945157736
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.861,
|
60 |
-
"acc_stderr": 0.010945263761042962,
|
61 |
-
"acc_norm": 0.843,
|
62 |
-
"acc_norm_stderr": 0.011510146979230187
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.70620239390642,
|
66 |
-
"acc_stderr": 0.0106275740805148,
|
67 |
-
"acc_norm": 0.7040261153427638,
|
68 |
-
"acc_norm_stderr": 0.010650414317148131
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.323,0.014794927843348642,0
|
3 |
+
anli_r2,acc,0.337,0.014955087918653602,0
|
4 |
+
anli_r3,acc,0.32,0.013471620929769144,0
|
5 |
+
arc_challenge,acc,0.24744027303754265,0.01261035266329267,0
|
6 |
+
arc_challenge,acc_norm,0.26706484641638223,0.012928933196496349,0
|
7 |
+
arc_easy,acc,0.5505050505050505,0.010207308833916037,0
|
8 |
+
arc_easy,acc_norm,0.5340909090909091,0.010235908103438688,0
|
9 |
+
boolq,acc,0.5847094801223242,0.008618637526341675,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.28200928200928205,,1
|
12 |
+
copa,acc,0.7,0.046056618647183814,0
|
13 |
+
hellaswag,acc,0.369946225851424,0.004818031396138923,0
|
14 |
+
hellaswag,acc_norm,0.4690300736904999,0.00498020045185168,0
|
15 |
+
piqa,acc,0.7154515778019587,0.010527218464130614,0
|
16 |
+
piqa,acc_norm,0.705658324265506,0.010633311470347509,0
|
17 |
+
rte,acc,0.5270758122743683,0.030052303463143713,0
|
18 |
+
sciq,acc,0.892,0.009820001651345693,0
|
19 |
+
sciq,acc_norm,0.872,0.010570133761108668,0
|
20 |
+
storycloze_2016,acc,0.6376269374665954,0.011115793699210296,0
|
21 |
+
winogrande,acc,0.5382794001578532,0.01401124259496412,0
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_2_lm-eval_global_step52452_2023-02-13-14-30-07_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.323,
|
5 |
-
"acc_stderr": 0.014794927843348642
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.337,
|
9 |
-
"acc_stderr": 0.014955087918653602
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32,
|
13 |
-
"acc_stderr": 0.013471620929769144
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.28200928200928205
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.7,
|
22 |
-
"acc_stderr": 0.046056618647183814
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.369946225851424,
|
26 |
-
"acc_stderr": 0.004818031396138923,
|
27 |
-
"acc_norm": 0.4690300736904999,
|
28 |
-
"acc_norm_stderr": 0.00498020045185168
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5270758122743683,
|
32 |
-
"acc_stderr": 0.030052303463143713
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5382794001578532,
|
36 |
-
"acc_stderr": 0.01401124259496412
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6376269374665954,
|
40 |
-
"acc_stderr": 0.011115793699210296
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5847094801223242,
|
44 |
-
"acc_stderr": 0.008618637526341675
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5505050505050505,
|
48 |
-
"acc_stderr": 0.010207308833916037,
|
49 |
-
"acc_norm": 0.5340909090909091,
|
50 |
-
"acc_norm_stderr": 0.010235908103438688
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24744027303754265,
|
54 |
-
"acc_stderr": 0.01261035266329267,
|
55 |
-
"acc_norm": 0.26706484641638223,
|
56 |
-
"acc_norm_stderr": 0.012928933196496349
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.892,
|
60 |
-
"acc_stderr": 0.009820001651345693,
|
61 |
-
"acc_norm": 0.872,
|
62 |
-
"acc_norm_stderr": 0.010570133761108668
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7154515778019587,
|
66 |
-
"acc_stderr": 0.010527218464130614,
|
67 |
-
"acc_norm": 0.705658324265506,
|
68 |
-
"acc_norm_stderr": 0.010633311470347509
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.341,0.014998131348402706,0
|
3 |
+
anli_r2,acc,0.35,0.015090650341444233,0
|
4 |
+
anli_r3,acc,0.32666666666666666,0.013544340907003663,0
|
5 |
+
arc_challenge,acc,0.24061433447098976,0.012491468532390571,0
|
6 |
+
arc_challenge,acc_norm,0.2721843003412969,0.013006600406423706,0
|
7 |
+
arc_easy,acc,0.5509259259259259,0.010206428316323369,0
|
8 |
+
arc_easy,acc_norm,0.5366161616161617,0.01023223506393303,0
|
9 |
+
boolq,acc,0.5868501529051988,0.00861211754780359,1
|
10 |
+
cb,acc,0.5178571428571429,0.06737697508644648,1
|
11 |
+
cb,f1,0.41940672576964805,,1
|
12 |
+
copa,acc,0.72,0.045126085985421276,0
|
13 |
+
hellaswag,acc,0.3732324238199562,0.00482674616083019,0
|
14 |
+
hellaswag,acc_norm,0.46883091017725553,0.004980076707392432,0
|
15 |
+
piqa,acc,0.70620239390642,0.010627574080514797,0
|
16 |
+
piqa,acc_norm,0.7040261153427638,0.010650414317148131,0
|
17 |
+
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.887,0.010016552866696858,0
|
19 |
+
sciq,acc_norm,0.877,0.010391293421849877,0
|
20 |
+
storycloze_2016,acc,0.6424371993586317,0.011083341168827785,0
|
21 |
+
winogrande,acc,0.5382794001578532,0.014011242594964118,0
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_3_lm-eval_global_step52452_2023-02-13-14-30-06_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.341,
|
5 |
-
"acc_stderr": 0.014998131348402706
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.35,
|
9 |
-
"acc_stderr": 0.015090650341444233
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32666666666666666,
|
13 |
-
"acc_stderr": 0.013544340907003663
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5178571428571429,
|
17 |
-
"acc_stderr": 0.06737697508644648,
|
18 |
-
"f1": 0.41940672576964805
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.72,
|
22 |
-
"acc_stderr": 0.045126085985421276
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3732324238199562,
|
26 |
-
"acc_stderr": 0.00482674616083019,
|
27 |
-
"acc_norm": 0.46883091017725553,
|
28 |
-
"acc_norm_stderr": 0.004980076707392432
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.03006330041190266
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5382794001578532,
|
36 |
-
"acc_stderr": 0.014011242594964118
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6424371993586317,
|
40 |
-
"acc_stderr": 0.011083341168827785
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5868501529051988,
|
44 |
-
"acc_stderr": 0.00861211754780359
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5509259259259259,
|
48 |
-
"acc_stderr": 0.010206428316323369,
|
49 |
-
"acc_norm": 0.5366161616161617,
|
50 |
-
"acc_norm_stderr": 0.01023223506393303
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.24061433447098976,
|
54 |
-
"acc_stderr": 0.012491468532390571,
|
55 |
-
"acc_norm": 0.2721843003412969,
|
56 |
-
"acc_norm_stderr": 0.013006600406423706
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.887,
|
60 |
-
"acc_stderr": 0.010016552866696858,
|
61 |
-
"acc_norm": 0.877,
|
62 |
-
"acc_norm_stderr": 0.010391293421849877
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.70620239390642,
|
66 |
-
"acc_stderr": 0.010627574080514797,
|
67 |
-
"acc_norm": 0.7040261153427638,
|
68 |
-
"acc_norm_stderr": 0.010650414317148131
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.35,0.015090650341444233,0
|
3 |
+
anli_r2,acc,0.329,0.014865395385928359,0
|
4 |
+
anli_r3,acc,0.335,0.013630871843821474,0
|
5 |
+
arc_challenge,acc,0.23293515358361774,0.012352507042617396,0
|
6 |
+
arc_challenge,acc_norm,0.2790102389078498,0.01310678488360134,0
|
7 |
+
arc_easy,acc,0.547979797979798,0.010212436978834099,0
|
8 |
+
arc_easy,acc_norm,0.5412457912457912,0.010224815730255816,0
|
9 |
+
boolq,acc,0.591743119266055,0.008596583869583202,1
|
10 |
+
cb,acc,0.4642857142857143,0.0672477765493766,1
|
11 |
+
cb,f1,0.2842465753424657,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.3746265684126668,0.004830371317841071,0
|
14 |
+
hellaswag,acc_norm,0.4667396932881896,0.004978729300074892,0
|
15 |
+
piqa,acc,0.7110990206746464,0.010575111841364905,0
|
16 |
+
piqa,acc_norm,0.7170837867247007,0.010508949177489676,0
|
17 |
+
rte,acc,0.4657039711191336,0.030025579819366422,0
|
18 |
+
sciq,acc,0.894,0.009739551265785134,0
|
19 |
+
sciq,acc_norm,0.892,0.009820001651345688,0
|
20 |
+
storycloze_2016,acc,0.6467129877071085,0.011053474766125627,0
|
21 |
+
winogrande,acc,0.5414364640883977,0.014004146853791914,0
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_4.json
CHANGED
@@ -48,6 +48,24 @@
|
|
48 |
"acc_stderr": 0.010212436978834099,
|
49 |
"acc_norm": 0.5412457912457912,
|
50 |
"acc_norm_stderr": 0.010224815730255816
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
52 |
},
|
53 |
"versions": {
|
@@ -61,6 +79,9 @@
|
|
61 |
"winogrande": 0,
|
62 |
"storycloze_2016": 0,
|
63 |
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
|
|
|
|
|
|
65 |
}
|
66 |
}
|
|
|
48 |
"acc_stderr": 0.010212436978834099,
|
49 |
"acc_norm": 0.5412457912457912,
|
50 |
"acc_norm_stderr": 0.010224815730255816
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.23293515358361774,
|
54 |
+
"acc_stderr": 0.012352507042617396,
|
55 |
+
"acc_norm": 0.2790102389078498,
|
56 |
+
"acc_norm_stderr": 0.01310678488360134
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.894,
|
60 |
+
"acc_stderr": 0.009739551265785134,
|
61 |
+
"acc_norm": 0.892,
|
62 |
+
"acc_norm_stderr": 0.009820001651345688
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7110990206746464,
|
66 |
+
"acc_stderr": 0.010575111841364905,
|
67 |
+
"acc_norm": 0.7170837867247007,
|
68 |
+
"acc_norm_stderr": 0.010508949177489676
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
79 |
"winogrande": 0,
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_4_lm-eval_global_step52452_2023-02-13-14-30-06_4shots_backup.json
DELETED
@@ -1,66 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.35,
|
5 |
-
"acc_stderr": 0.015090650341444233
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.329,
|
9 |
-
"acc_stderr": 0.014865395385928359
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.335,
|
13 |
-
"acc_stderr": 0.013630871843821474
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4642857142857143,
|
17 |
-
"acc_stderr": 0.0672477765493766,
|
18 |
-
"f1": 0.2842465753424657
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.3746265684126668,
|
26 |
-
"acc_stderr": 0.004830371317841071,
|
27 |
-
"acc_norm": 0.4667396932881896,
|
28 |
-
"acc_norm_stderr": 0.004978729300074892
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4657039711191336,
|
32 |
-
"acc_stderr": 0.030025579819366422
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5414364640883977,
|
36 |
-
"acc_stderr": 0.014004146853791914
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6467129877071085,
|
40 |
-
"acc_stderr": 0.011053474766125627
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.591743119266055,
|
44 |
-
"acc_stderr": 0.008596583869583202
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.547979797979798,
|
48 |
-
"acc_stderr": 0.010212436978834099,
|
49 |
-
"acc_norm": 0.5412457912457912,
|
50 |
-
"acc_norm_stderr": 0.010224815730255816
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"versions": {
|
54 |
-
"anli_r1": 0,
|
55 |
-
"anli_r2": 0,
|
56 |
-
"anli_r3": 0,
|
57 |
-
"cb": 1,
|
58 |
-
"copa": 0,
|
59 |
-
"hellaswag": 0,
|
60 |
-
"rte": 0,
|
61 |
-
"winogrande": 0,
|
62 |
-
"storycloze_2016": 0,
|
63 |
-
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
65 |
-
}
|
66 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.33,0.01487687202745673,0
|
3 |
+
anli_r2,acc,0.319,0.014746404865473484,0
|
4 |
+
anli_r3,acc,0.335,0.01363087184382147,0
|
5 |
+
arc_challenge,acc,0.25,0.012653835621466646,0
|
6 |
+
arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0
|
7 |
+
arc_easy,acc,0.5467171717171717,0.01021490151673162,0
|
8 |
+
arc_easy,acc_norm,0.5458754208754208,0.010216507710244115,0
|
9 |
+
boolq,acc,0.5880733944954128,0.008608316516029646,1
|
10 |
+
cb,acc,0.5714285714285714,0.06672848092813058,1
|
11 |
+
cb,f1,0.44974910394265244,,1
|
12 |
+
copa,acc,0.72,0.04512608598542128,0
|
13 |
+
hellaswag,acc,0.37273451503684524,0.004825441080261174,0
|
14 |
+
hellaswag,acc_norm,0.46922923720374426,0.004980323400031081,0
|
15 |
+
piqa,acc,0.7067464635473341,0.010621818421101926,0
|
16 |
+
piqa,acc_norm,0.7094668117519043,0.010592765034696534,0
|
17 |
+
rte,acc,0.5018050541516246,0.030096267148976626,0
|
18 |
+
sciq,acc,0.893,0.009779910359847167,0
|
19 |
+
sciq,acc_norm,0.898,0.009575368801653866,0
|
20 |
+
storycloze_2016,acc,0.6456440406199893,0.011061031791615487,0
|
21 |
+
winogrande,acc,0.5374901341752171,0.014012928183336578,0
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_5.json
CHANGED
@@ -38,6 +38,34 @@
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6456440406199893,
|
40 |
"acc_stderr": 0.011061031791615487
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
}
|
42 |
},
|
43 |
"versions": {
|
@@ -49,6 +77,11 @@
|
|
49 |
"hellaswag": 0,
|
50 |
"rte": 0,
|
51 |
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.6456440406199893,
|
40 |
"acc_stderr": 0.011061031791615487
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5880733944954128,
|
44 |
+
"acc_stderr": 0.008608316516029646
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.5467171717171717,
|
48 |
+
"acc_stderr": 0.01021490151673162,
|
49 |
+
"acc_norm": 0.5458754208754208,
|
50 |
+
"acc_norm_stderr": 0.010216507710244115
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.25,
|
54 |
+
"acc_stderr": 0.012653835621466646,
|
55 |
+
"acc_norm": 0.2832764505119454,
|
56 |
+
"acc_norm_stderr": 0.013167478735134575
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.893,
|
60 |
+
"acc_stderr": 0.009779910359847167,
|
61 |
+
"acc_norm": 0.898,
|
62 |
+
"acc_norm_stderr": 0.009575368801653866
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7067464635473341,
|
66 |
+
"acc_stderr": 0.010621818421101926,
|
67 |
+
"acc_norm": 0.7094668117519043,
|
68 |
+
"acc_norm_stderr": 0.010592765034696534
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
2b855b55boscarseed3/evaluation/rankeval/2b855b55boscarseed3_5_lm-eval_global_step52452_2023-02-13-14-30-06_5shots_backup.json
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.33,
|
5 |
-
"acc_stderr": 0.01487687202745673
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.319,
|
9 |
-
"acc_stderr": 0.014746404865473484
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.335,
|
13 |
-
"acc_stderr": 0.01363087184382147
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5714285714285714,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.44974910394265244
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.72,
|
22 |
-
"acc_stderr": 0.04512608598542128
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.37273451503684524,
|
26 |
-
"acc_stderr": 0.004825441080261174,
|
27 |
-
"acc_norm": 0.46922923720374426,
|
28 |
-
"acc_norm_stderr": 0.004980323400031081
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5018050541516246,
|
32 |
-
"acc_stderr": 0.030096267148976626
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5374901341752171,
|
36 |
-
"acc_stderr": 0.014012928183336578
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6456440406199893,
|
40 |
-
"acc_stderr": 0.011061031791615487
|
41 |
-
}
|
42 |
-
},
|
43 |
-
"versions": {
|
44 |
-
"anli_r1": 0,
|
45 |
-
"anli_r2": 0,
|
46 |
-
"anli_r3": 0,
|
47 |
-
"cb": 1,
|
48 |
-
"copa": 0,
|
49 |
-
"hellaswag": 0,
|
50 |
-
"rte": 0,
|
51 |
-
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
53 |
-
}
|
54 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|