Muennighoff commited on
Commit
84b8e78
·
1 Parent(s): df81cae
4b284b84b50c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.017935100054319638
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.017935100054319638
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.24926963692397494
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.24926963692397494
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.28377360056353806
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.28377360056353806
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2914955959372599
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2914955959372599
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.29520676971635557
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.29520676971635557
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2962576738633393
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2962576738633393
14
+ e2e_nlg_cleaned,5,average,multiple,0.23898972950979788
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04680369312820704
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04680369312820704
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.0417008337343324
18
+ gem_xsum,1,median,rouge2_fmeasure,0.0417008337343324
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04850591809798841
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04850591809798841
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04967459047773307
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04967459047773307
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010664105243991476
24
+ gem_xsum,4,median,rouge2_fmeasure,0.010664105243991476
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0
27
+ gem_xsum,5,average,multiple,0.03289152344704207
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.054561293953947154
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.054561293953947154
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08055044249705505
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08055044249705505
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.11096714396611368
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.11096714396611368
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.1237969599257094
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.1237969599257094
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.12704926086398677
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.12704926086398677
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1379410586457312
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.1379410586457312
40
+ web_nlg_en,5,average,multiple,0.10581102664209054
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04006088372076606
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.04006088372076606
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.0563656735247714
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.0563656735247714
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.0694369521302777
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.0694369521302777
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05874455584632557
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05874455584632557
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.019086213209709144
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.019086213209709144
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0031377384402291207
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0031377384402291207
53
+ wiki_lingua_en,5,average,multiple,0.041138669478679835
4b284b84b50c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.5098138218469542, "bleu_stderr": 0.05060096360447959, "rouge1_fmeasure": 0.12518327032816767, "rouge1_fmeasure_stderr": 0.0020886600736863613, "rouge1_precision": 0.07997908962467618, "rouge1_precision_stderr": 0.0015470706939544915, "rouge1_recall": 0.39288588314861, "rouge1_recall_stderr": 0.005146822884118532, "rouge2_fmeasure": 0.054561293953947154, "rouge2_fmeasure_stderr": 0.0013114125578239729, "rouge2_precision": 0.03497906523132334, "rouge2_precision_stderr": 0.0009482186398467601, "rouge2_recall": 0.17442376421512773, "rouge2_recall_stderr": 0.0034708393927687826, "rougeL_fmeasure": 0.11622278605370577, "rougeL_fmeasure_stderr": 0.0018214505590851086, "rougeL_precision": 0.074005651678666, "rougeL_precision_stderr": 0.0013303805628979073, "rougeL_recall": 0.36848905125450354, "rougeL_recall_stderr": 0.004754000625118938, "rougeLsum_fmeasure": 0.11504001739123668, "rougeLsum_fmeasure_stderr": 0.0019083954925248243, "rougeLsum_precision": 0.07357051299191295, "rougeLsum_precision_stderr": 0.0014135363132826366, "rougeLsum_recall": 0.36055432587149516, "rougeLsum_recall_stderr": 0.004671200673802032}}, "1": {"PALM_prompt": {"bleu": 0.6895070382715364, "bleu_stderr": 0.04766983041296649, "rouge1_fmeasure": 0.15692004313455504, "rouge1_fmeasure_stderr": 0.003951322820929551, "rouge1_precision": 0.14059948527478203, "rouge1_precision_stderr": 0.00479074974537321, "rouge1_recall": 0.3008111137356406, "rouge1_recall_stderr": 0.005248426675337502, "rouge2_fmeasure": 0.08055044249705505, "rouge2_fmeasure_stderr": 0.002687882118841805, "rouge2_precision": 0.07251876896103553, "rouge2_precision_stderr": 0.003196165167747362, "rouge2_recall": 0.15513416232789282, "rouge2_recall_stderr": 0.0036756383667251643, "rougeL_fmeasure": 0.14225435798782274, "rougeL_fmeasure_stderr": 0.003390325244859444, "rougeL_precision": 0.12606437747394972, "rougeL_precision_stderr": 0.0042092536118418875, "rougeL_recall": 0.2816255516577338, "rougeL_recall_stderr": 0.004831045808569158, "rougeLsum_fmeasure": 0.14488507758956776, "rougeLsum_fmeasure_stderr": 0.0034779543816347985, "rougeLsum_precision": 0.12893805693705215, "rougeLsum_precision_stderr": 0.004317494742815136, "rougeLsum_recall": 0.2844061587949683, "rougeLsum_recall_stderr": 0.004869606097749162}}, "2": {"PALM_prompt": {"bleu": 0.9301933610868552, "bleu_stderr": 0.053485998140167926, "rouge1_fmeasure": 0.2047000917610419, "rouge1_fmeasure_stderr": 0.004631485720235752, "rouge1_precision": 0.18698649109774956, "rouge1_precision_stderr": 0.005491375971794517, "rouge1_recall": 0.3549167062140789, "rouge1_recall_stderr": 0.0050410643565384486, "rouge2_fmeasure": 0.11096714396611368, "rouge2_fmeasure_stderr": 0.003255925538473875, "rouge2_precision": 0.10372298958925724, "rouge2_precision_stderr": 0.0037780995467863385, "rouge2_recall": 0.19156481617514662, "rouge2_recall_stderr": 0.0038870076404307315, "rougeL_fmeasure": 0.1827665265757646, "rougeL_fmeasure_stderr": 0.003976346126182768, "rougeL_precision": 0.16451926764953234, "rougeL_precision_stderr": 0.004742921857172842, "rougeL_recall": 0.3289126081433448, "rougeL_recall_stderr": 0.004612183545153906, "rougeLsum_fmeasure": 0.18817541014456196, "rougeLsum_fmeasure_stderr": 0.00413341944039265, "rougeLsum_precision": 0.17045782427847134, "rougeLsum_precision_stderr": 0.004942769483842779, "rougeLsum_recall": 0.33427289847612396, "rougeLsum_recall_stderr": 0.0046918082247746195}}, "3": {"PALM_prompt": {"bleu": 1.119109356484705, "bleu_stderr": 0.04962455635561596, "rouge1_fmeasure": 0.2225302880447419, "rouge1_fmeasure_stderr": 0.0048695766878976265, "rouge1_precision": 0.2113366298073185, "rouge1_precision_stderr": 0.00602072613554832, "rouge1_recall": 0.3684338025148, "rouge1_recall_stderr": 0.0050090299676620745, "rouge2_fmeasure": 0.1237969599257094, "rouge2_fmeasure_stderr": 0.0035213373391811205, "rouge2_precision": 0.12118698500474166, "rouge2_precision_stderr": 0.004253808878018297, "rouge2_recall": 0.20324128030280655, "rouge2_recall_stderr": 0.004025508089204376, "rougeL_fmeasure": 0.19696471466521626, "rougeL_fmeasure_stderr": 0.004159272971540424, "rougeL_precision": 0.18453258517066995, "rougeL_precision_stderr": 0.005187830911977816, "rougeL_recall": 0.33877730437228865, "rougeL_recall_stderr": 0.004568116437867222, "rougeLsum_fmeasure": 0.20272897644508167, "rougeLsum_fmeasure_stderr": 0.0043201113166836756, "rougeLsum_precision": 0.19149813311149133, "rougeLsum_precision_stderr": 0.005414442320894715, "rougeLsum_recall": 0.3445724000767204, "rougeLsum_recall_stderr": 0.004647698525765314}}, "4": {"PALM_prompt": {"bleu": 1.261013189588581, "bleu_stderr": 0.05322833493770805, "rouge1_fmeasure": 0.23021977804800875, "rouge1_fmeasure_stderr": 0.004805017843578889, "rouge1_precision": 0.21559449684372597, "rouge1_precision_stderr": 0.005938155784608134, "rouge1_recall": 0.3902757566349068, "rouge1_recall_stderr": 0.005006436575404741, "rouge2_fmeasure": 0.12704926086398677, "rouge2_fmeasure_stderr": 0.0034019200764343867, "rouge2_precision": 0.12205672520700112, "rouge2_precision_stderr": 0.004080805752927633, "rouge2_recall": 0.2148394071545166, "rouge2_recall_stderr": 0.004027375128376547, "rougeL_fmeasure": 0.20281291598340542, "rougeL_fmeasure_stderr": 0.004060804343018511, "rougeL_precision": 0.186592457900143, "rougeL_precision_stderr": 0.0050396902586261885, "rougeL_recall": 0.3574010140832356, "rougeL_recall_stderr": 0.004512119144197919, "rougeLsum_fmeasure": 0.21054273661472223, "rougeLsum_fmeasure_stderr": 0.0042723456479503924, "rougeLsum_precision": 0.19551647097577013, "rougeLsum_precision_stderr": 0.005329618799143479, "rougeLsum_recall": 0.365076058959261, "rougeLsum_recall_stderr": 0.0046028589201870624}}, "5": {"PALM_prompt": {"bleu": 1.382222176784315, "bleu_stderr": 0.07057333675418884, "rouge1_fmeasure": 0.24587256860116596, "rouge1_fmeasure_stderr": 0.004941049800893347, "rouge1_precision": 0.23462975339068498, "rouge1_precision_stderr": 0.006207558666486256, "rouge1_recall": 0.4036527302329447, "rouge1_recall_stderr": 0.004972753683678452, "rouge2_fmeasure": 0.1379410586457312, "rouge2_fmeasure_stderr": 0.00354934847481156, "rouge2_precision": 0.135588287099774, "rouge2_precision_stderr": 0.004347173326246466, "rouge2_recall": 0.22623705206407865, "rouge2_recall_stderr": 0.004094013041743802, "rougeL_fmeasure": 0.21637303242146647, "rougeL_fmeasure_stderr": 0.0041569059177393106, "rougeL_precision": 0.2032261792267289, "rougeL_precision_stderr": 0.005270512254733833, "rougeL_recall": 0.3697141521231861, "rougeL_recall_stderr": 0.004491607957314641, "rougeLsum_fmeasure": 0.2245199059654717, "rougeLsum_fmeasure_stderr": 0.004390712369956832, "rougeLsum_precision": 0.21298951998949425, "rougeLsum_precision_stderr": 0.005590550686832627, "rougeLsum_recall": 0.37767970587330746, "rougeLsum_recall_stderr": 0.0045974467831160935}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.110745836930975, "bleu_stderr": 0.07064768665407693, "rouge1_fmeasure": 0.18391283765498312, "rouge1_fmeasure_stderr": 0.0020989680758685095, "rouge1_precision": 0.1622328097487295, "rouge1_precision_stderr": 0.0022006678942036004, "rouge1_recall": 0.2593501445501777, "rouge1_recall_stderr": 0.002979185903395691, "rouge2_fmeasure": 0.04006088372076606, "rouge2_fmeasure_stderr": 0.000953260340177019, "rouge2_precision": 0.034780325238359784, "rouge2_precision_stderr": 0.0008768109603226238, "rouge2_recall": 0.05930223667848994, "rouge2_recall_stderr": 0.001579605371883735, "rougeL_fmeasure": 0.14164532674216926, "rougeL_fmeasure_stderr": 0.0015137092346997794, "rougeL_precision": 0.12375204682982781, "rougeL_precision_stderr": 0.0015766613380100679, "rougeL_recall": 0.20454500638861178, "rougeL_recall_stderr": 0.002406647868246692, "rougeLsum_fmeasure": 0.1700189081143506, "rougeLsum_fmeasure_stderr": 0.0019429203849904086, "rougeLsum_precision": 0.14987931736685706, "rougeLsum_precision_stderr": 0.0020425047605301056, "rougeLsum_recall": 0.24059690190209113, "rougeLsum_recall_stderr": 0.0027893524858174706}}, "1": {"tldr_en": {"bleu": 3.023357351392093, "bleu_stderr": 0.11052320078944201, "rouge1_fmeasure": 0.2119754604069829, "rouge1_fmeasure_stderr": 0.0021782519083976153, "rouge1_precision": 0.3202481727430097, "rouge1_precision_stderr": 0.004264467627952037, "rouge1_recall": 0.2207962470049762, "rouge1_recall_stderr": 0.0028801491805220264, "rouge2_fmeasure": 0.0563656735247714, "rouge2_fmeasure_stderr": 0.0013284740367869353, "rouge2_precision": 0.0950682913953417, "rouge2_precision_stderr": 0.0027338223457833164, "rouge2_recall": 0.05763523272789191, "rouge2_recall_stderr": 0.0015051452622768476, "rougeL_fmeasure": 0.1619539966954325, "rougeL_fmeasure_stderr": 0.001682779518229553, "rougeL_precision": 0.252396656765881, "rougeL_precision_stderr": 0.0036788013120410477, "rougeL_recall": 0.16757172008949361, "rougeL_recall_stderr": 0.0022092850048167803, "rougeLsum_fmeasure": 0.198727672806483, "rougeLsum_fmeasure_stderr": 0.0020483089866878148, "rougeLsum_precision": 0.3016125071930562, "rougeLsum_precision_stderr": 0.00409496014253931, "rougeLsum_recall": 0.2069248240311855, "rougeLsum_recall_stderr": 0.0026981347577400188}}, "2": {"tldr_en": {"bleu": 3.6376939186549806, "bleu_stderr": 0.0847116802328539, "rouge1_fmeasure": 0.23731543258134605, "rouge1_fmeasure_stderr": 0.002181814977988499, "rouge1_precision": 0.36224625080917877, "rouge1_precision_stderr": 0.004268522066364884, "rouge1_recall": 0.23894150798627864, "rouge1_recall_stderr": 0.00287751614854991, "rouge2_fmeasure": 0.0694369521302777, "rouge2_fmeasure_stderr": 0.001451340290895489, "rouge2_precision": 0.11641566052506848, "rouge2_precision_stderr": 0.0029013500551178704, "rouge2_recall": 0.06855371632614782, "rouge2_recall_stderr": 0.001623314787498748, "rougeL_fmeasure": 0.18362158068544487, "rougeL_fmeasure_stderr": 0.0017594777053893203, "rougeL_precision": 0.2884586734883985, "rougeL_precision_stderr": 0.0037559342865190744, "rougeL_recall": 0.1828827212614525, "rougeL_recall_stderr": 0.002237692045406322, "rougeLsum_fmeasure": 0.2232809171801349, "rougeLsum_fmeasure_stderr": 0.002067558645695987, "rougeLsum_precision": 0.3429661634104491, "rougeLsum_precision_stderr": 0.0041458713432634365, "rougeLsum_recall": 0.22433930943775152, "rougeLsum_recall_stderr": 0.0027008082388441947}}, "3": {"tldr_en": {"bleu": 2.55145882385057, "bleu_stderr": 0.07101970072988376, "rouge1_fmeasure": 0.1988863772399344, "rouge1_fmeasure_stderr": 0.0026030390612248305, "rouge1_precision": 0.31060237609641006, "rouge1_precision_stderr": 0.004574907476929346, "rouge1_recall": 0.19457815683796006, "rouge1_recall_stderr": 0.003058377988481469, "rouge2_fmeasure": 0.05874455584632557, "rouge2_fmeasure_stderr": 0.0014421360291722958, "rouge2_precision": 0.09798346069589672, "rouge2_precision_stderr": 0.0027362211692342543, "rouge2_recall": 0.05653416860175655, "rouge2_recall_stderr": 0.0015263012561591658, "rougeL_fmeasure": 0.15600398127765575, "rougeL_fmeasure_stderr": 0.0020914175231427435, "rougeL_precision": 0.25000993696767837, "rougeL_precision_stderr": 0.003921044548290687, "rougeL_recall": 0.15127868847161555, "rougeL_recall_stderr": 0.00241265194449905, "rougeLsum_fmeasure": 0.18818778802744168, "rougeLsum_fmeasure_stderr": 0.00247201631180488, "rougeLsum_precision": 0.2951646008016156, "rougeLsum_precision_stderr": 0.0044047605714275665, "rougeLsum_recall": 0.18392754507983425, "rougeLsum_recall_stderr": 0.0028926410080899532}}, "4": {"tldr_en": {"bleu": 0.04876682262004958, "bleu_stderr": 0.007388216078076325, "rouge1_fmeasure": 0.06449655570225347, "rouge1_fmeasure_stderr": 0.002279359909802756, "rouge1_precision": 0.10237758965502898, "rouge1_precision_stderr": 0.0038082028079637707, "rouge1_recall": 0.06325406600566759, "rouge1_recall_stderr": 0.00244885822737541, "rouge2_fmeasure": 0.019086213209709144, "rouge2_fmeasure_stderr": 0.0010313879291181905, "rouge2_precision": 0.03313408946575443, "rouge2_precision_stderr": 0.001971552602522673, "rouge2_recall": 0.01873153070181728, "rouge2_recall_stderr": 0.0011188172332047676, "rougeL_fmeasure": 0.05111677540649122, "rougeL_fmeasure_stderr": 0.0018373594855618225, "rougeL_precision": 0.08330767573513635, "rougeL_precision_stderr": 0.0032229367947723703, "rougeL_recall": 0.049567641460076216, "rougeL_recall_stderr": 0.0019370210262382797, "rougeLsum_fmeasure": 0.06042226404875415, "rougeLsum_fmeasure_stderr": 0.0021401981057287767, "rougeLsum_precision": 0.09693167971670034, "rougeLsum_precision_stderr": 0.003643885871279085, "rougeLsum_recall": 0.059085237057582875, "rougeLsum_recall_stderr": 0.0022884093427356037}}, "5": {"tldr_en": {"bleu": 1.4976000723748304e-15, "bleu_stderr": 1.1059832294968636e-13, "rouge1_fmeasure": 0.010118014498166504, "rouge1_fmeasure_stderr": 0.0010113736545779898, "rouge1_precision": 0.016235121940252128, "rouge1_precision_stderr": 0.0016686434368938423, "rouge1_recall": 0.010183814258076275, "rouge1_recall_stderr": 0.001097667574193706, "rouge2_fmeasure": 0.0031377384402291207, "rouge2_fmeasure_stderr": 0.0004476520277228893, "rouge2_precision": 0.005590929108231266, "rouge2_precision_stderr": 0.000937988804117156, "rouge2_recall": 0.003129173374617729, "rouge2_recall_stderr": 0.00046850380728453, "rougeL_fmeasure": 0.008131697170035954, "rougeL_fmeasure_stderr": 0.0008247655086624107, "rougeL_precision": 0.01352266256791126, "rougeL_precision_stderr": 0.0014515230893845552, "rougeL_recall": 0.008025523871576613, "rougeL_recall_stderr": 0.000860076761469635, "rougeLsum_fmeasure": 0.009571072256019286, "rougeLsum_fmeasure_stderr": 0.0009617988003308719, "rougeLsum_precision": 0.01548330614926552, "rougeLsum_precision_stderr": 0.001611589733322966, "rougeLsum_recall": 0.009642698052240453, "rougeLsum_recall_stderr": 0.0010455429701617517}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.5852012244463308, "bleu_stderr": 0.026361027339458533, "rouge1_fmeasure": 0.14458919556827698, "rouge1_fmeasure_stderr": 0.0015532976494931187, "rouge1_precision": 0.1590141471297934, "rouge1_precision_stderr": 0.0014552899258277647, "rouge1_recall": 0.14197505590399634, "rouge1_recall_stderr": 0.002072363162364809, "rouge2_fmeasure": 0.017935100054319638, "rouge2_fmeasure_stderr": 0.0009632081918137612, "rouge2_precision": 0.018166989758767575, "rouge2_precision_stderr": 0.0008838695112435292, "rouge2_recall": 0.019810806358443775, "rouge2_recall_stderr": 0.00117585945466284, "rougeL_fmeasure": 0.11352665448181354, "rougeL_fmeasure_stderr": 0.0010961058265376489, "rougeL_precision": 0.1252634030724975, "rougeL_precision_stderr": 0.0010541325911481722, "rougeL_recall": 0.11084406976385312, "rougeL_recall_stderr": 0.001451614593329274, "rougeLsum_fmeasure": 0.13516732592132347, "rougeLsum_fmeasure_stderr": 0.0014776264399771696, "rougeLsum_precision": 0.14863259520829622, "rougeLsum_precision_stderr": 0.0013705583227301076, "rougeLsum_recall": 0.13286378093275208, "rougeLsum_recall_stderr": 0.0019874811518630986}}, "1": {"generate_text_restaurant": {"bleu": 13.507277195006653, "bleu_stderr": 0.16483547827335596, "rouge1_fmeasure": 0.5093521713616069, "rouge1_fmeasure_stderr": 0.0023997232757279057, "rouge1_precision": 0.6201851114252256, "rouge1_precision_stderr": 0.0030820025911735206, "rouge1_recall": 0.4690321202679439, "rouge1_recall_stderr": 0.003068769543571945, "rouge2_fmeasure": 0.24926963692397494, "rouge2_fmeasure_stderr": 0.002176003390187075, "rouge2_precision": 0.3073153288765937, "rouge2_precision_stderr": 0.0027529160822288828, "rouge2_recall": 0.2290062918365036, "rouge2_recall_stderr": 0.002283581117206086, "rougeL_fmeasure": 0.3643527171825137, "rougeL_fmeasure_stderr": 0.0022059644667924714, "rougeL_precision": 0.44708616232730114, "rougeL_precision_stderr": 0.002985934919462904, "rougeL_recall": 0.3342911836548333, "rougeL_recall_stderr": 0.002523892540835498, "rougeLsum_fmeasure": 0.41201423773876833, "rougeLsum_fmeasure_stderr": 0.00245063535885353, "rougeLsum_precision": 0.5034589363779699, "rougeLsum_precision_stderr": 0.003192957855851322, "rougeLsum_recall": 0.3785009643908683, "rougeLsum_recall_stderr": 0.0028181096908259035}}, "2": {"generate_text_restaurant": {"bleu": 16.481995381899633, "bleu_stderr": 0.15459753428389664, "rouge1_fmeasure": 0.5506597383073797, "rouge1_fmeasure_stderr": 0.002321540785820989, "rouge1_precision": 0.6444938902196201, "rouge1_precision_stderr": 0.002947588857328497, "rouge1_recall": 0.5129375627218103, "rouge1_recall_stderr": 0.0029659013100575885, "rouge2_fmeasure": 0.28377360056353806, "rouge2_fmeasure_stderr": 0.0023119772628562006, "rouge2_precision": 0.3344287852055559, "rouge2_precision_stderr": 0.002811288765537161, "rouge2_recall": 0.26446540440191596, "rouge2_recall_stderr": 0.002437621985064321, "rougeL_fmeasure": 0.39697613633354006, "rougeL_fmeasure_stderr": 0.0022993329597404658, "rougeL_precision": 0.466089444973472, "rougeL_precision_stderr": 0.002920225812448154, "rougeL_recall": 0.3693289721204083, "rougeL_recall_stderr": 0.002606982684965207, "rougeLsum_fmeasure": 0.4546121524741158, "rougeLsum_fmeasure_stderr": 0.002476903442286951, "rougeLsum_precision": 0.5322185064753675, "rougeLsum_precision_stderr": 0.0030727166950812556, "rougeLsum_recall": 0.4233023155134429, "rougeLsum_recall_stderr": 0.0028567132575660105}}, "3": {"generate_text_restaurant": {"bleu": 17.308450089705314, "bleu_stderr": 0.22086009360666295, "rouge1_fmeasure": 0.5567530202145615, "rouge1_fmeasure_stderr": 0.0022894627101959436, "rouge1_precision": 0.6431777215006504, "rouge1_precision_stderr": 0.002909343117628468, "rouge1_recall": 0.5224188094825261, "rouge1_recall_stderr": 0.002938502170763304, "rouge2_fmeasure": 0.2914955959372599, "rouge2_fmeasure_stderr": 0.002322927357655023, "rouge2_precision": 0.3384419249334283, "rouge2_precision_stderr": 0.002742792950498583, "rouge2_recall": 0.27392910888878425, "rouge2_recall_stderr": 0.0024887948729642026, "rougeL_fmeasure": 0.40222559726561513, "rougeL_fmeasure_stderr": 0.0023215626762372083, "rougeL_precision": 0.4655119630180568, "rougeL_precision_stderr": 0.0028772700896125296, "rougeL_recall": 0.3772066163124021, "rougeL_recall_stderr": 0.0026415829750796697, "rougeLsum_fmeasure": 0.461741585929262, "rougeLsum_fmeasure_stderr": 0.002503396697259892, "rougeLsum_precision": 0.5332125905228263, "rougeLsum_precision_stderr": 0.0030585004302424172, "rougeLsum_recall": 0.4332271251742262, "rougeLsum_recall_stderr": 0.0028824918758452683}}, "4": {"generate_text_restaurant": {"bleu": 17.495980616097135, "bleu_stderr": 0.17307149350143702, "rouge1_fmeasure": 0.5613161366361096, "rouge1_fmeasure_stderr": 0.0023171455286369763, "rouge1_precision": 0.6465369286737681, "rouge1_precision_stderr": 0.0029278310769372726, "rouge1_recall": 0.5260683563196515, "rouge1_recall_stderr": 0.002916828859151506, "rouge2_fmeasure": 0.29520676971635557, "rouge2_fmeasure_stderr": 0.002330515147148159, "rouge2_precision": 0.3421050999354562, "rouge2_precision_stderr": 0.0027552714259740396, "rouge2_recall": 0.27668219477331424, "rouge2_recall_stderr": 0.002469924371598272, "rougeL_fmeasure": 0.4027622295170213, "rougeL_fmeasure_stderr": 0.0023055504801453076, "rougeL_precision": 0.464776524456291, "rougeL_precision_stderr": 0.0028527686674565873, "rougeL_recall": 0.37716849465743735, "rougeL_recall_stderr": 0.0025916179057099017, "rougeLsum_fmeasure": 0.4660844647086055, "rougeLsum_fmeasure_stderr": 0.002514865422037328, "rougeLsum_precision": 0.5366861528228768, "rougeLsum_precision_stderr": 0.0030587155217260326, "rougeLsum_recall": 0.4367050144160445, "rougeLsum_recall_stderr": 0.00286635251824686}}, "5": {"generate_text_restaurant": {"bleu": 17.40280539828124, "bleu_stderr": 0.28202555102963817, "rouge1_fmeasure": 0.5616575775191787, "rouge1_fmeasure_stderr": 0.0022515349809343866, "rouge1_precision": 0.646995029439956, "rouge1_precision_stderr": 0.0029516950054299758, "rouge1_recall": 0.5253339359357596, "rouge1_recall_stderr": 0.0028094332205203746, "rouge2_fmeasure": 0.2962576738633393, "rouge2_fmeasure_stderr": 0.002279902179418181, "rouge2_precision": 0.3441558225622812, "rouge2_precision_stderr": 0.0027584401176844377, "rouge2_recall": 0.27662310263048556, "rouge2_recall_stderr": 0.002387047171386173, "rougeL_fmeasure": 0.40535164515562533, "rougeL_fmeasure_stderr": 0.0023211728697138507, "rougeL_precision": 0.46729152631416565, "rougeL_precision_stderr": 0.0029015066363383035, "rougeL_recall": 0.3791657172884169, "rougeL_recall_stderr": 0.002577249268209316, "rougeLsum_fmeasure": 0.46603328453805426, "rougeLsum_fmeasure_stderr": 0.0024810183729863256, "rougeLsum_precision": 0.5365825818684676, "rougeLsum_precision_stderr": 0.003084280912566546, "rougeLsum_recall": 0.4359071526525408, "rougeLsum_recall_stderr": 0.002799203068063538}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8763328831796027, "bleu_stderr": 0.0923827870325087, "rouge1_fmeasure": 0.20883442387891565, "rouge1_fmeasure_stderr": 0.002472005674573301, "rouge1_precision": 0.15909731432027147, "rouge1_precision_stderr": 0.002132759078497151, "rouge1_recall": 0.33975045112193397, "rouge1_recall_stderr": 0.0042287490792687265, "rouge2_fmeasure": 0.04680369312820704, "rouge2_fmeasure_stderr": 0.0015081103668458638, "rouge2_precision": 0.03459104558061762, "rouge2_precision_stderr": 0.0011298465379970643, "rouge2_recall": 0.07960025507169356, "rouge2_recall_stderr": 0.002673203279710482, "rougeL_fmeasure": 0.15700559398159578, "rougeL_fmeasure_stderr": 0.001840556826184235, "rougeL_precision": 0.1194089989583174, "rougeL_precision_stderr": 0.0015829660627589574, "rougeL_recall": 0.2570328994949185, "rougeL_recall_stderr": 0.0032985395811482776, "rougeLsum_fmeasure": 0.1646164153318214, "rougeLsum_fmeasure_stderr": 0.002053284091119862, "rougeLsum_precision": 0.12474361087892741, "rougeLsum_precision_stderr": 0.0016926059996416518, "rougeLsum_recall": 0.27063275162447814, "rougeLsum_recall_stderr": 0.003721601014570599}}, "1": {"article_DOC_summary": {"bleu": 1.62331392395729, "bleu_stderr": 0.13957719900990057, "rouge1_fmeasure": 0.19854683173474055, "rouge1_fmeasure_stderr": 0.003138904476839458, "rouge1_precision": 0.18476284734106926, "rouge1_precision_stderr": 0.003853949796022425, "rouge1_recall": 0.2636176040720579, "rouge1_recall_stderr": 0.004034355018649378, "rouge2_fmeasure": 0.0417008337343324, "rouge2_fmeasure_stderr": 0.001844056511669052, "rouge2_precision": 0.04013226103600592, "rouge2_precision_stderr": 0.002008900370209435, "rouge2_recall": 0.05468674823017407, "rouge2_recall_stderr": 0.0024161059976638417, "rougeL_fmeasure": 0.1526044357261457, "rougeL_fmeasure_stderr": 0.002430440574683936, "rougeL_precision": 0.14171333532444522, "rougeL_precision_stderr": 0.002985043877004314, "rougeL_recall": 0.2036738453976186, "rougeL_recall_stderr": 0.003165404030943361, "rougeLsum_fmeasure": 0.15507063400534885, "rougeLsum_fmeasure_stderr": 0.002494715563067228, "rougeLsum_precision": 0.14345673879227663, "rougeLsum_precision_stderr": 0.003002408421060159, "rougeLsum_recall": 0.20826830513702577, "rougeLsum_recall_stderr": 0.0034123171532269105}}, "2": {"article_DOC_summary": {"bleu": 2.264207726040316, "bleu_stderr": 0.1519372512140114, "rouge1_fmeasure": 0.22343466846613724, "rouge1_fmeasure_stderr": 0.0034294449616031216, "rouge1_precision": 0.22697147352921718, "rouge1_precision_stderr": 0.004135220441947088, "rouge1_recall": 0.2498602210924615, "rouge1_recall_stderr": 0.0037616618590036598, "rouge2_fmeasure": 0.04850591809798841, "rouge2_fmeasure_stderr": 0.0020532350545423056, "rouge2_precision": 0.050380892887394634, "rouge2_precision_stderr": 0.002269569961463128, "rouge2_recall": 0.052730357803667374, "rouge2_recall_stderr": 0.0022412228845386517, "rougeL_fmeasure": 0.16927761493532015, "rougeL_fmeasure_stderr": 0.0027160080990334553, "rougeL_precision": 0.17183853925167758, "rougeL_precision_stderr": 0.003273790335792938, "rougeL_recall": 0.1899140385833666, "rougeL_recall_stderr": 0.0029486077822403725, "rougeLsum_fmeasure": 0.17171028839217173, "rougeLsum_fmeasure_stderr": 0.0027519063561672985, "rougeLsum_precision": 0.173764260813844, "rougeLsum_precision_stderr": 0.0032801152695190023, "rougeLsum_recall": 0.19384428497014153, "rougeLsum_recall_stderr": 0.0031252549546704417}}, "3": {"article_DOC_summary": {"bleu": 2.7167462852460504, "bleu_stderr": 0.21202877464141628, "rouge1_fmeasure": 0.2201273905205651, "rouge1_fmeasure_stderr": 0.0037277736586615417, "rouge1_precision": 0.22976406694092735, "rouge1_precision_stderr": 0.004334778979956723, "rouge1_recall": 0.23539663630459298, "rouge1_recall_stderr": 0.0040658645001539, "rouge2_fmeasure": 0.04967459047773307, "rouge2_fmeasure_stderr": 0.0021617116986141554, "rouge2_precision": 0.05272713733903644, "rouge2_precision_stderr": 0.002437993294757084, "rouge2_recall": 0.05267467868770681, "rouge2_recall_stderr": 0.00232438412756398, "rougeL_fmeasure": 0.16488733019725074, "rougeL_fmeasure_stderr": 0.0029374277063209616, "rougeL_precision": 0.17233137155710748, "rougeL_precision_stderr": 0.00343068320220773, "rougeL_recall": 0.17671653344810714, "rougeL_recall_stderr": 0.003210034725965065, "rougeLsum_fmeasure": 0.16688283569428947, "rougeLsum_fmeasure_stderr": 0.0029463438546390693, "rougeLsum_precision": 0.17413520183712747, "rougeLsum_precision_stderr": 0.003429680687086958, "rougeLsum_recall": 0.17944378765773264, "rougeLsum_recall_stderr": 0.0032699248091540645}}, "4": {"article_DOC_summary": {"bleu": 0.09461470173164335, "bleu_stderr": 0.0329945643639599, "rouge1_fmeasure": 0.05257310234945274, "rouge1_fmeasure_stderr": 0.003202133641550312, "rouge1_precision": 0.0639762171944909, "rouge1_precision_stderr": 0.004112766009623441, "rouge1_recall": 0.05230451397205071, "rouge1_recall_stderr": 0.003300778272428403, "rouge2_fmeasure": 0.010664105243991476, "rouge2_fmeasure_stderr": 0.0011334684079076297, "rouge2_precision": 0.014084650794812208, "rouge2_precision_stderr": 0.0018021649472101588, "rouge2_recall": 0.010513947821728008, "rouge2_recall_stderr": 0.0011479600025205774, "rougeL_fmeasure": 0.03993936398500153, "rougeL_fmeasure_stderr": 0.002480210616396321, "rougeL_precision": 0.04969603393842411, "rougeL_precision_stderr": 0.0033642171169700583, "rougeL_recall": 0.03953134685954738, "rougeL_recall_stderr": 0.0025110554007188516, "rougeLsum_fmeasure": 0.040565926112939486, "rougeLsum_fmeasure_stderr": 0.00251132295406633, "rougeLsum_precision": 0.050311330553243515, "rougeLsum_precision_stderr": 0.003384236397405843, "rougeLsum_recall": 0.04034040163201439, "rougeLsum_recall_stderr": 0.002593851439782915}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}}
4b284b84b50c4py/evaluation/rankeval/4b284b84b50c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.333,0.014910846164229863,0
3
+ anli_r2,acc,0.338,0.014965960710224489,0
4
+ anli_r3,acc,0.32916666666666666,0.013570806258433618,0
5
+ arc_challenge,acc,0.2627986348122867,0.012862523175351331,0
6
+ arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0
7
+ arc_easy,acc,0.5989057239057239,0.010057051106534372,0
8
+ arc_easy,acc_norm,0.5425084175084175,0.010222638127749496,0
9
+ boolq,acc,0.5648318042813456,0.00867122958058212,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.45634706712873824,,1
12
+ copa,acc,0.75,0.04351941398892446,0
13
+ hellaswag,acc,0.4470225054769966,0.004961693567208817,0
14
+ hellaswag,acc_norm,0.5853415654252141,0.004916561213591294,0
15
+ piqa,acc,0.736126224156692,0.010282996367695564,0
16
+ piqa,acc_norm,0.7475516866158868,0.010135665547362348,0
17
+ rte,acc,0.5595667870036101,0.029882123363118705,0
18
+ sciq,acc,0.839,0.011628164696727195,0
19
+ sciq,acc_norm,0.755,0.01360735683959812,0
20
+ storycloze_2016,acc,0.6985569214323891,0.010611646032767584,0
21
+ winogrande,acc,0.5753749013417522,0.013891893150264215,0
4b284b84b50c4py/evaluation/rankeval/4b284b84b50c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.323,0.014794927843348635,0
3
+ anli_r2,acc,0.319,0.014746404865473475,0
4
+ anli_r3,acc,0.33416666666666667,0.01362243481313678,0
5
+ arc_challenge,acc,0.27559726962457337,0.01305716965576184,0
6
+ arc_challenge,acc_norm,0.30716723549488056,0.013481034054980945,0
7
+ arc_easy,acc,0.6140572390572391,0.009989277329503953,0
8
+ arc_easy,acc_norm,0.5862794612794613,0.010105878530238126,0
9
+ boolq,acc,0.5743119266055046,0.008647930658219415,1
10
+ cb,acc,0.5,0.06741998624632421,1
11
+ cb,f1,0.35412081193445827,,1
12
+ copa,acc,0.7,0.046056618647183814,0
13
+ hellaswag,acc,0.4482174865564629,0.004962949784236046,0
14
+ hellaswag,acc_norm,0.5867357100179247,0.00491413085543178,0
15
+ piqa,acc,0.7306855277475517,0.010350004070588758,0
16
+ piqa,acc_norm,0.7393906420021763,0.010241826155811639,0
17
+ rte,acc,0.5415162454873647,0.02999253538537331,0
18
+ sciq,acc,0.891,0.009859828407037188,0
19
+ sciq,acc_norm,0.894,0.009739551265785138,0
20
+ storycloze_2016,acc,0.6889363976483164,0.010705164869803167,0
21
+ winogrande,acc,0.5619573796369376,0.013944181296470804,0
4b284b84b50c4py/evaluation/rankeval/4b284b84b50c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.319,0.014746404865473496,0
3
+ anli_r2,acc,0.324,0.01480686473373886,0
4
+ anli_r3,acc,0.3383333333333333,0.013664144006618263,0
5
+ arc_challenge,acc,0.2858361774744027,0.013203196088537369,0
6
+ arc_challenge,acc_norm,0.3242320819112628,0.01367881039951882,0
7
+ arc_easy,acc,0.6220538720538721,0.009949405744045467,0
8
+ arc_easy,acc_norm,0.6052188552188552,0.010030038935883601,0
9
+ boolq,acc,0.5844036697247706,0.008619555273337567,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.25326797385620914,,1
12
+ copa,acc,0.69,0.04648231987117316,0
13
+ hellaswag,acc,0.44692292372037445,0.0049615875742756235,0
14
+ hellaswag,acc_norm,0.5881298546106354,0.00491165988450615,0
15
+ piqa,acc,0.7301414581066377,0.010356595421852202,0
16
+ piqa,acc_norm,0.7464635473340587,0.010150090834551791,0
17
+ rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.908,0.009144376393151108,0
19
+ sciq,acc_norm,0.895,0.009698921026024975,0
20
+ storycloze_2016,acc,0.6990913949759487,0.010606289538707339,0
21
+ winogrande,acc,0.5666929755327546,0.013926915052757343,0
4b284b84b50c4py/evaluation/rankeval/4b284b84b50c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.311,0.014645596385722692,0
3
+ anli_r2,acc,0.341,0.014998131348402709,0
4
+ anli_r3,acc,0.3433333333333333,0.01371263383046586,0
5
+ arc_challenge,acc,0.2841296928327645,0.013179442447653886,0
6
+ arc_challenge,acc_norm,0.3199658703071672,0.013631345807016196,0
7
+ arc_easy,acc,0.6287878787878788,0.009913599001845743,0
8
+ arc_easy,acc_norm,0.6199494949494949,0.009960175831493131,0
9
+ boolq,acc,0.5969418960244648,0.008579113210566461,1
10
+ cb,acc,0.6071428571428571,0.0658538889806635,1
11
+ cb,f1,0.39585039585039583,,1
12
+ copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.4494124676359291,0.004964177035221422,0
14
+ hellaswag,acc_norm,0.5891256721768572,0.004909870006388833,0
15
+ piqa,acc,0.7437431991294886,0.010185787831565067,0
16
+ piqa,acc_norm,0.7480957562568009,0.010128421335088688,0
17
+ rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.913,0.008916866630745918,0
19
+ sciq,acc_norm,0.912,0.00896305396259208,0
20
+ storycloze_2016,acc,0.6969535008017104,0.01062761307337672,0
21
+ winogrande,acc,0.5737963693764798,0.013898585965412338,0
4b284b84b50c4py/evaluation/rankeval/4b284b84b50c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.35,0.015090650341444233,0
3
+ anli_r2,acc,0.34,0.014987482264363937,0
4
+ anli_r3,acc,0.33416666666666667,0.013622434813136772,0
5
+ arc_challenge,acc,0.29266211604095566,0.013295916103619406,0
6
+ arc_challenge,acc_norm,0.3250853242320819,0.013688147309729122,0
7
+ arc_easy,acc,0.625,0.009933992677987828,0
8
+ arc_easy,acc_norm,0.6224747474747475,0.009947227833469421,0
9
+ boolq,acc,0.5938837920489297,0.008589510943787407,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.35643298415256514,,1
12
+ copa,acc,0.71,0.04560480215720684,0
13
+ hellaswag,acc,0.448814977096196,0.00496356702912905,0
14
+ hellaswag,acc_norm,0.5899223262298346,0.004908423147162029,0
15
+ piqa,acc,0.7377584330794341,0.010262502565172449,0
16
+ piqa,acc_norm,0.7442872687704026,0.010178690109459872,0
17
+ rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.919,0.008632121032139985,0
19
+ sciq,acc_norm,0.919,0.008632121032139978,0
20
+ storycloze_2016,acc,0.7028327097808659,0.01056831334579161,0
21
+ winogrande,acc,0.5572217837411207,0.013960157350784994,0
4b284b84b50c4py/evaluation/rankeval/4b284b84b50c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.341,0.014998131348402699,0
3
+ anli_r2,acc,0.324,0.01480686473373886,0
4
+ anli_r3,acc,0.34,0.013680495725767792,0
5
+ arc_challenge,acc,0.302901023890785,0.013428241573185349,0
6
+ arc_challenge,acc_norm,0.32081911262798635,0.013640943091946522,0
7
+ arc_easy,acc,0.6338383838383839,0.009885391390947724,0
8
+ arc_easy,acc_norm,0.6220538720538721,0.009949405744045478,0
9
+ boolq,acc,0.5782874617737003,0.008637194202160975,1
10
+ cb,acc,0.5892857142857143,0.06633634150359538,1
11
+ cb,f1,0.38386243386243385,,1
12
+ copa,acc,0.72,0.04512608598542127,0
13
+ hellaswag,acc,0.4480183230432185,0.004962742426849888,0
14
+ hellaswag,acc_norm,0.5910177255526787,0.00490641198447679,0
15
+ piqa,acc,0.733949945593036,0.010310039263352831,0
16
+ piqa,acc_norm,0.7475516866158868,0.010135665547362355,0
17
+ rte,acc,0.5740072202166066,0.02976495674177765,0
18
+ sciq,acc,0.914,0.008870325962594766,0
19
+ sciq,acc_norm,0.919,0.008632121032139993,0
20
+ storycloze_2016,acc,0.709246392303581,0.01050123362521308,0
21
+ winogrande,acc,0.5777426992896606,0.013881582030658556,0
4b284b84b90c4py/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,2.4041585445094217e-05
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,2.4041585445094217e-05
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.20123613248710673
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.20123613248710673
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22626470896988504
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.22626470896988504
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23737804451490846
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23737804451490846
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.24074082590052123
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.24074082590052123
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24269152018910306
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24269152018910306
14
+ e2e_nlg_cleaned,5,average,multiple,0.19138921227449493
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04953978855224528
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04953978855224528
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.05319563291201607
18
+ gem_xsum,1,median,rouge2_fmeasure,0.05319563291201607
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.06406681848626028
20
+ gem_xsum,2,median,rouge2_fmeasure,0.06406681848626028
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.060726575070014825
22
+ gem_xsum,3,median,rouge2_fmeasure,0.060726575070014825
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014905541100548116
24
+ gem_xsum,4,median,rouge2_fmeasure,0.014905541100548116
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0
27
+ gem_xsum,5,average,multiple,0.04040572602018076
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0474168734057962
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.0474168734057962
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07715129615847176
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.07715129615847176
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09725568781400727
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.09725568781400727
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10009879345207362
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.10009879345207362
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10768923599178877
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.10768923599178877
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1166710535574247
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.1166710535574247
40
+ web_nlg_en,5,average,multiple,0.09104715672992705
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03196515454564458
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03196515454564458
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05711975562569963
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05711975562569963
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06947167396458416
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06947167396458416
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05830834942902882
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05830834942902882
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01814711808646534
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01814711808646534
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0029039552430087483
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0029039552430087483
53
+ wiki_lingua_en,5,average,multiple,0.03965266781573855
4b284b84b90c4py/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32769578493596396, "bleu_stderr": 0.03975594383595088, "rouge1_fmeasure": 0.10326176051858367, "rouge1_fmeasure_stderr": 0.001904944369287576, "rouge1_precision": 0.06692913386438858, "rouge1_precision_stderr": 0.0014929273207800753, "rouge1_recall": 0.30820653322004204, "rouge1_recall_stderr": 0.005177311857766762, "rouge2_fmeasure": 0.0474168734057962, "rouge2_fmeasure_stderr": 0.001185060228700246, "rouge2_precision": 0.03097570444381075, "rouge2_precision_stderr": 0.0010529803914309842, "rouge2_recall": 0.14822279757135365, "rouge2_recall_stderr": 0.003542602599361053, "rougeL_fmeasure": 0.09890244399665504, "rougeL_fmeasure_stderr": 0.0017757054937035596, "rougeL_precision": 0.06394295556979562, "rougeL_precision_stderr": 0.0014006358263887765, "rougeL_recall": 0.29855471539555173, "rougeL_recall_stderr": 0.005065807844017526, "rougeLsum_fmeasure": 0.09674304693311835, "rougeLsum_fmeasure_stderr": 0.0017873491443061993, "rougeLsum_precision": 0.0627981850154084, "rougeLsum_precision_stderr": 0.0014227622677173192, "rougeLsum_recall": 0.28896601455023163, "rougeLsum_recall_stderr": 0.0048173128169931275}}, "1": {"PALM_prompt": {"bleu": 0.5282730452896494, "bleu_stderr": 0.046658174629329835, "rouge1_fmeasure": 0.15453085732939212, "rouge1_fmeasure_stderr": 0.0037903004845717416, "rouge1_precision": 0.13492258154401704, "rouge1_precision_stderr": 0.004562341607217017, "rouge1_recall": 0.3083277617781391, "rouge1_recall_stderr": 0.005224242469589791, "rouge2_fmeasure": 0.07715129615847176, "rouge2_fmeasure_stderr": 0.0024953579026924877, "rouge2_precision": 0.0684914406475082, "rouge2_precision_stderr": 0.003138379441292843, "rouge2_recall": 0.15722468206768184, "rouge2_recall_stderr": 0.003642104936521507, "rougeL_fmeasure": 0.14047759853951516, "rougeL_fmeasure_stderr": 0.0032693658440114716, "rougeL_precision": 0.12182454902305588, "rougeL_precision_stderr": 0.004065319582844083, "rougeL_recall": 0.2884358880405701, "rougeL_recall_stderr": 0.004804007323913862, "rougeLsum_fmeasure": 0.1424777265304541, "rougeLsum_fmeasure_stderr": 0.0033271666317625133, "rougeLsum_precision": 0.12377843411968241, "rougeLsum_precision_stderr": 0.0041213993900085415, "rougeLsum_recall": 0.29087030685214843, "rougeLsum_recall_stderr": 0.004838562119626229}}, "2": {"PALM_prompt": {"bleu": 0.7087784769126132, "bleu_stderr": 0.033226802281645235, "rouge1_fmeasure": 0.18222591860939175, "rouge1_fmeasure_stderr": 0.004455148716981206, "rouge1_precision": 0.16142125578637487, "rouge1_precision_stderr": 0.00521948725155917, "rouge1_recall": 0.34149328495842945, "rouge1_recall_stderr": 0.005217003211960785, "rouge2_fmeasure": 0.09725568781400727, "rouge2_fmeasure_stderr": 0.0030924514288714934, "rouge2_precision": 0.08805537075933455, "rouge2_precision_stderr": 0.0036093882354670584, "rouge2_recall": 0.1829986510420906, "rouge2_recall_stderr": 0.003848198593789938, "rougeL_fmeasure": 0.16055322492077787, "rougeL_fmeasure_stderr": 0.003649385022398045, "rougeL_precision": 0.13958209810018274, "rougeL_precision_stderr": 0.004362399156331258, "rougeL_recall": 0.3153653930676925, "rougeL_recall_stderr": 0.004684824202191565, "rougeLsum_fmeasure": 0.1655387624687358, "rougeLsum_fmeasure_stderr": 0.0038291820607917926, "rougeLsum_precision": 0.144942026594572, "rougeLsum_precision_stderr": 0.0045689172044934, "rougeLsum_recall": 0.3205594730262639, "rougeLsum_recall_stderr": 0.004775953727795681}}, "3": {"PALM_prompt": {"bleu": 0.7714700261545828, "bleu_stderr": 0.038047357558188985, "rouge1_fmeasure": 0.18498068640385423, "rouge1_fmeasure_stderr": 0.004425768922114508, "rouge1_precision": 0.16107165063634774, "rouge1_precision_stderr": 0.00513681627205544, "rouge1_recall": 0.34932733859068815, "rouge1_recall_stderr": 0.0051329389904585035, "rouge2_fmeasure": 0.10009879345207362, "rouge2_fmeasure_stderr": 0.0031002562536374613, "rouge2_precision": 0.09019151735929883, "rouge2_precision_stderr": 0.0036306127475140846, "rouge2_recall": 0.1893033757457288, "rouge2_recall_stderr": 0.003926313984491448, "rougeL_fmeasure": 0.16433759459960015, "rougeL_fmeasure_stderr": 0.003713294544026085, "rougeL_precision": 0.14108149248005866, "rougeL_precision_stderr": 0.004377817120513084, "rougeL_recall": 0.32273874091603144, "rougeL_recall_stderr": 0.004636002507553704, "rougeLsum_fmeasure": 0.16891400390861658, "rougeLsum_fmeasure_stderr": 0.003867386295421696, "rougeLsum_precision": 0.14607155562281043, "rougeLsum_precision_stderr": 0.004575794466703003, "rougeLsum_recall": 0.32795629669192083, "rougeLsum_recall_stderr": 0.004717488507933725}}, "4": {"PALM_prompt": {"bleu": 0.9123438894304744, "bleu_stderr": 0.054886987568219775, "rouge1_fmeasure": 0.19646071564379733, "rouge1_fmeasure_stderr": 0.004584526858332184, "rouge1_precision": 0.17626934804442085, "rouge1_precision_stderr": 0.00548550937990432, "rouge1_recall": 0.3614704617922227, "rouge1_recall_stderr": 0.00519347079179478, "rouge2_fmeasure": 0.10768923599178877, "rouge2_fmeasure_stderr": 0.0032756191329392894, "rouge2_precision": 0.10020640346061736, "rouge2_precision_stderr": 0.003929477023073327, "rouge2_recall": 0.19816630032192067, "rouge2_recall_stderr": 0.004051203015277309, "rougeL_fmeasure": 0.17467184419120016, "rougeL_fmeasure_stderr": 0.0038819978483099313, "rougeL_precision": 0.15477559598701404, "rougeL_precision_stderr": 0.004722210624078396, "rougeL_recall": 0.333984023643314, "rougeL_recall_stderr": 0.004718700604899889, "rougeLsum_fmeasure": 0.18068519840080996, "rougeLsum_fmeasure_stderr": 0.004094888184813557, "rougeLsum_precision": 0.16126020980382802, "rougeLsum_precision_stderr": 0.004982567296016139, "rougeLsum_recall": 0.34047396885990955, "rougeLsum_recall_stderr": 0.004836737279204789}}, "5": {"PALM_prompt": {"bleu": 0.9788070872039196, "bleu_stderr": 0.061618196933129275, "rouge1_fmeasure": 0.20916748818592323, "rouge1_fmeasure_stderr": 0.00479171506562394, "rouge1_precision": 0.19273610758260953, "rouge1_precision_stderr": 0.005791238405778257, "rouge1_recall": 0.3676993300353655, "rouge1_recall_stderr": 0.005195679902322131, "rouge2_fmeasure": 0.1166710535574247, "rouge2_fmeasure_stderr": 0.0034103295858190466, "rouge2_precision": 0.11117635869373572, "rouge2_precision_stderr": 0.004096557178116057, "rouge2_recall": 0.2032058726566931, "rouge2_recall_stderr": 0.004118860263780741, "rougeL_fmeasure": 0.18544692143357658, "rougeL_fmeasure_stderr": 0.004050968469228267, "rougeL_precision": 0.16866145946287572, "rougeL_precision_stderr": 0.004966384394692544, "rougeL_recall": 0.33851323929520605, "rougeL_recall_stderr": 0.0046652983566289365, "rougeLsum_fmeasure": 0.1917055004501852, "rougeLsum_fmeasure_stderr": 0.004244961928468395, "rougeLsum_precision": 0.17545981366520674, "rougeLsum_precision_stderr": 0.005208319044227642, "rougeLsum_recall": 0.34549545700511236, "rougeLsum_recall_stderr": 0.004789083612922212}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.548981362509075, "bleu_stderr": 0.05489475020872184, "rouge1_fmeasure": 0.16038016919847042, "rouge1_fmeasure_stderr": 0.0019990640656002926, "rouge1_precision": 0.13820498211813353, "rouge1_precision_stderr": 0.002027158902079377, "rouge1_recall": 0.23211349713507806, "rouge1_recall_stderr": 0.00291460042679572, "rouge2_fmeasure": 0.03196515454564458, "rouge2_fmeasure_stderr": 0.0008384244311491337, "rouge2_precision": 0.027279734431356378, "rouge2_precision_stderr": 0.000808566127900892, "rouge2_recall": 0.04795859925947281, "rouge2_recall_stderr": 0.0013595400764566943, "rougeL_fmeasure": 0.12632096335907966, "rougeL_fmeasure_stderr": 0.0014653061463928971, "rougeL_precision": 0.10795465944704448, "rougeL_precision_stderr": 0.0015014665828846125, "rougeL_recall": 0.18653065431668062, "rougeL_recall_stderr": 0.0023311124084016413, "rougeLsum_fmeasure": 0.14775355696399795, "rougeLsum_fmeasure_stderr": 0.0018320947179453926, "rougeLsum_precision": 0.1272721140058725, "rougeLsum_precision_stderr": 0.0018695016021840376, "rougeLsum_recall": 0.21461003410413587, "rougeLsum_recall_stderr": 0.0027041822162299084}}, "1": {"tldr_en": {"bleu": 3.2149586203454135, "bleu_stderr": 0.08601336162907185, "rouge1_fmeasure": 0.20840915734426707, "rouge1_fmeasure_stderr": 0.0022474515177039, "rouge1_precision": 0.30976486099115014, "rouge1_precision_stderr": 0.004280572867004042, "rouge1_recall": 0.22171250687165736, "rouge1_recall_stderr": 0.002903356892688996, "rouge2_fmeasure": 0.05711975562569963, "rouge2_fmeasure_stderr": 0.0013517696390472863, "rouge2_precision": 0.09367814264741255, "rouge2_precision_stderr": 0.002682234882491779, "rouge2_recall": 0.05954696797555463, "rouge2_recall_stderr": 0.001573219011990061, "rougeL_fmeasure": 0.15980831128944248, "rougeL_fmeasure_stderr": 0.001731250860574063, "rougeL_precision": 0.24421651361727906, "rougeL_precision_stderr": 0.003653271074825019, "rougeL_recall": 0.169440745496233, "rougeL_recall_stderr": 0.0022449539637247034, "rougeLsum_fmeasure": 0.19542648771469187, "rougeLsum_fmeasure_stderr": 0.0021136593261581434, "rougeLsum_precision": 0.2915503548810816, "rougeLsum_precision_stderr": 0.004083572955218742, "rougeLsum_recall": 0.2076969626960914, "rougeLsum_recall_stderr": 0.002708340900126643}}, "2": {"tldr_en": {"bleu": 3.450240228422556, "bleu_stderr": 0.08572604048780053, "rouge1_fmeasure": 0.22867607912859314, "rouge1_fmeasure_stderr": 0.002288367779523059, "rouge1_precision": 0.37743042291482304, "rouge1_precision_stderr": 0.0044111013067848785, "rouge1_recall": 0.21457578593260976, "rouge1_recall_stderr": 0.0027147930891358064, "rouge2_fmeasure": 0.06947167396458416, "rouge2_fmeasure_stderr": 0.0014897828516901572, "rouge2_precision": 0.12606752980846056, "rouge2_precision_stderr": 0.003127297599114837, "rouge2_recall": 0.06341659021916847, "rouge2_recall_stderr": 0.001502226590979293, "rougeL_fmeasure": 0.18031282936121834, "rougeL_fmeasure_stderr": 0.0018354363289438068, "rougeL_precision": 0.30368513453736445, "rougeL_precision_stderr": 0.0038282082282305656, "rougeL_recall": 0.16859324773152481, "rougeL_recall_stderr": 0.002156520389396142, "rougeLsum_fmeasure": 0.21627023407211837, "rougeLsum_fmeasure_stderr": 0.0021753218409006274, "rougeLsum_precision": 0.358876036916817, "rougeLsum_precision_stderr": 0.004280469123137574, "rougeLsum_recall": 0.20264673443606565, "rougeLsum_recall_stderr": 0.002565365753084454}}, "3": {"tldr_en": {"bleu": 2.250994414839275, "bleu_stderr": 0.091897965299116, "rouge1_fmeasure": 0.1915901133021701, "rouge1_fmeasure_stderr": 0.0026542144927023453, "rouge1_precision": 0.3231052965784643, "rouge1_precision_stderr": 0.0048288512639913074, "rouge1_recall": 0.17602695575238506, "rouge1_recall_stderr": 0.0029126180026860333, "rouge2_fmeasure": 0.05830834942902882, "rouge2_fmeasure_stderr": 0.0014606581695085121, "rouge2_precision": 0.10718649475080308, "rouge2_precision_stderr": 0.003043489981867081, "rouge2_recall": 0.05259908512606004, "rouge2_recall_stderr": 0.0014779719324052925, "rougeL_fmeasure": 0.152042153519013, "rougeL_fmeasure_stderr": 0.0021243131526260018, "rougeL_precision": 0.26117167740111175, "rougeL_precision_stderr": 0.004114644011547143, "rougeL_recall": 0.13928386336050552, "rougeL_recall_stderr": 0.002323273382780931, "rougeLsum_fmeasure": 0.18069243349904934, "rougeLsum_fmeasure_stderr": 0.002513289684272712, "rougeLsum_precision": 0.30617830685256786, "rougeLsum_precision_stderr": 0.004645505843401255, "rougeLsum_recall": 0.1657464800681317, "rougeLsum_recall_stderr": 0.002746195520087434}}, "4": {"tldr_en": {"bleu": 0.025258960430548195, "bleu_stderr": 0.003948485052929433, "rouge1_fmeasure": 0.06237842951906701, "rouge1_fmeasure_stderr": 0.0022256374062476782, "rouge1_precision": 0.10502195172934231, "rouge1_precision_stderr": 0.0038274788621714575, "rouge1_recall": 0.05836198681843306, "rouge1_recall_stderr": 0.0022963137368669753, "rouge2_fmeasure": 0.01814711808646534, "rouge2_fmeasure_stderr": 0.0010091475220200094, "rouge2_precision": 0.032670588360068864, "rouge2_precision_stderr": 0.0019318159917030553, "rouge2_recall": 0.017077976305335606, "rouge2_recall_stderr": 0.0010116890806308784, "rougeL_fmeasure": 0.05013981007134034, "rougeL_fmeasure_stderr": 0.0018048231168335442, "rougeL_precision": 0.08626382373130143, "rougeL_precision_stderr": 0.003250461668985866, "rougeL_recall": 0.04676735388183072, "rougeL_recall_stderr": 0.0018490100679613992, "rougeLsum_fmeasure": 0.05887899666019905, "rougeLsum_fmeasure_stderr": 0.0021069624318686394, "rougeLsum_precision": 0.09971295777485246, "rougeLsum_precision_stderr": 0.0036578794541110163, "rougeLsum_recall": 0.054970129884443705, "rougeLsum_recall_stderr": 0.0021671633765488305}}, "5": {"tldr_en": {"bleu": 2.8847118757086147e-18, "bleu_stderr": 1.0881174740524596e-15, "rouge1_fmeasure": 0.00943834706966938, "rouge1_fmeasure_stderr": 0.0010003766763685682, "rouge1_precision": 0.016445607431009014, "rouge1_precision_stderr": 0.001764697223961224, "rouge1_recall": 0.008728512502978002, "rouge1_recall_stderr": 0.0009971058384574234, "rouge2_fmeasure": 0.0029039552430087483, "rouge2_fmeasure_stderr": 0.0004774675700448816, "rouge2_precision": 0.005181625191057042, "rouge2_precision_stderr": 0.0009188200417373838, "rouge2_recall": 0.002739189067258707, "rouge2_recall_stderr": 0.00047489173993906024, "rougeL_fmeasure": 0.007694935215955847, "rougeL_fmeasure_stderr": 0.0008377932935747717, "rougeL_precision": 0.01363309513555986, "rougeL_precision_stderr": 0.0015221201519944403, "rougeL_recall": 0.00716147585442943, "rougeL_recall_stderr": 0.0008491876319702115, "rougeLsum_fmeasure": 0.008930798210612606, "rougeLsum_fmeasure_stderr": 0.0009513347353657132, "rougeLsum_precision": 0.015630380418189347, "rougeLsum_precision_stderr": 0.0016879031425052684, "rougeLsum_recall": 0.00827872354781895, "rougeLsum_recall_stderr": 0.0009578709726846888}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.004589892921634932, "bleu_stderr": 0.0008286496328566937, "rouge1_fmeasure": 0.0007262029514504785, "rouge1_fmeasure_stderr": 0.00011515442861700407, "rouge1_precision": 0.000674118130389787, "rouge1_precision_stderr": 0.00011005799955189828, "rouge1_recall": 0.0009005819263409739, "rouge1_recall_stderr": 0.0001549446494541127, "rouge2_fmeasure": 2.4041585445094217e-05, "rouge2_fmeasure_stderr": 1.7003344460924265e-05, "rouge2_precision": 2.5904203323558163e-05, "rouge2_precision_stderr": 1.857633077557422e-05, "rouge2_recall": 2.3237179487179488e-05, "rouge2_recall_stderr": 1.6516156707881496e-05, "rougeL_fmeasure": 0.0007262029514504785, "rougeL_fmeasure_stderr": 0.00011515442861700407, "rougeL_precision": 0.000674118130389787, "rougeL_precision_stderr": 0.00011005799955189828, "rougeL_recall": 0.0009005819263409739, "rougeL_recall_stderr": 0.0001549446494541127, "rougeLsum_fmeasure": 0.0007131310560256419, "rougeLsum_fmeasure_stderr": 0.00011293370554354721, "rougeLsum_precision": 0.0006596253767665984, "rougeLsum_precision_stderr": 0.00010718618747988786, "rougeLsum_recall": 0.000888677164436212, "rougeLsum_recall_stderr": 0.00015358917656475344}}, "1": {"generate_text_restaurant": {"bleu": 11.034930993533054, "bleu_stderr": 0.13332964665842936, "rouge1_fmeasure": 0.4458394159496228, "rouge1_fmeasure_stderr": 0.0024214839651912675, "rouge1_precision": 0.5251197249455601, "rouge1_precision_stderr": 0.0031643681100212966, "rouge1_recall": 0.42654376151095885, "rouge1_recall_stderr": 0.0030811943169649428, "rouge2_fmeasure": 0.20123613248710673, "rouge2_fmeasure_stderr": 0.0019806207597444993, "rouge2_precision": 0.2391526828357595, "rouge2_precision_stderr": 0.0024766972503500713, "rouge2_recall": 0.19319012655283635, "rouge2_recall_stderr": 0.002175302414490571, "rougeL_fmeasure": 0.3185974977824894, "rougeL_fmeasure_stderr": 0.002055319096748302, "rougeL_precision": 0.3779508668413123, "rougeL_precision_stderr": 0.002790322431049451, "rougeL_recall": 0.30415773624077147, "rougeL_recall_stderr": 0.002463566717606618, "rougeLsum_fmeasure": 0.360317469964398, "rougeLsum_fmeasure_stderr": 0.0023334281921691844, "rougeLsum_precision": 0.42582636928947853, "rougeLsum_precision_stderr": 0.003051090729659719, "rougeLsum_recall": 0.3441788135347828, "rougeLsum_recall_stderr": 0.0027718947951392726}}, "2": {"generate_text_restaurant": {"bleu": 12.771640570906062, "bleu_stderr": 0.2060419894477191, "rouge1_fmeasure": 0.4788825727434599, "rouge1_fmeasure_stderr": 0.002387862746351256, "rouge1_precision": 0.5595284794955765, "rouge1_precision_stderr": 0.003192743656984621, "rouge1_recall": 0.458999460931844, "rouge1_recall_stderr": 0.003076244916781993, "rouge2_fmeasure": 0.22626470896988504, "rouge2_fmeasure_stderr": 0.002084055008532813, "rouge2_precision": 0.2675881136128828, "rouge2_precision_stderr": 0.0026609913920924073, "rouge2_recall": 0.21728767743885152, "rouge2_recall_stderr": 0.0022849533099495727, "rougeL_fmeasure": 0.340239223174204, "rougeL_fmeasure_stderr": 0.0020685288000278786, "rougeL_precision": 0.3998385653380442, "rougeL_precision_stderr": 0.0028340803275566304, "rougeL_recall": 0.3257042920642795, "rougeL_recall_stderr": 0.0024878781480618837, "rougeLsum_fmeasure": 0.38519550509628453, "rougeLsum_fmeasure_stderr": 0.002326104998323997, "rougeLsum_precision": 0.4514174896117289, "rougeLsum_precision_stderr": 0.0030819147896655806, "rougeLsum_recall": 0.3687371333732566, "rougeLsum_recall_stderr": 0.0027813273526875975}}, "3": {"generate_text_restaurant": {"bleu": 14.005406815184266, "bleu_stderr": 0.12271870090441088, "rouge1_fmeasure": 0.48759778574772905, "rouge1_fmeasure_stderr": 0.0023941945598986414, "rouge1_precision": 0.5660598857135201, "rouge1_precision_stderr": 0.0031766953242821708, "rouge1_recall": 0.4683648243640382, "rouge1_recall_stderr": 0.003046743761683525, "rouge2_fmeasure": 0.23737804451490846, "rouge2_fmeasure_stderr": 0.0021768013720999536, "rouge2_precision": 0.27833050766140727, "rouge2_precision_stderr": 0.0027216290949920656, "rouge2_recall": 0.22887591136774074, "rouge2_recall_stderr": 0.0023944080948235095, "rougeL_fmeasure": 0.3530640227325825, "rougeL_fmeasure_stderr": 0.0021445484656071687, "rougeL_precision": 0.4127381563853338, "rougeL_precision_stderr": 0.0029088192021074766, "rougeL_recall": 0.3384881982279927, "rougeL_recall_stderr": 0.0025316448107753856, "rougeLsum_fmeasure": 0.40008555890862296, "rougeLsum_fmeasure_stderr": 0.0023777692215410707, "rougeLsum_precision": 0.4652893223977917, "rougeLsum_precision_stderr": 0.0030828851728351317, "rougeLsum_recall": 0.3841728151825094, "rougeLsum_recall_stderr": 0.0028300146058369154}}, "4": {"generate_text_restaurant": {"bleu": 14.419275020848124, "bleu_stderr": 0.13828607091454695, "rouge1_fmeasure": 0.4907147307696341, "rouge1_fmeasure_stderr": 0.0023943344615635193, "rouge1_precision": 0.567279637993312, "rouge1_precision_stderr": 0.003215450041263448, "rouge1_recall": 0.47322059188493276, "rouge1_recall_stderr": 0.003096913731962911, "rouge2_fmeasure": 0.24074082590052123, "rouge2_fmeasure_stderr": 0.002205806958099008, "rouge2_precision": 0.28086260963596926, "rouge2_precision_stderr": 0.002753918734671383, "rouge2_recall": 0.2329491601936547, "rouge2_recall_stderr": 0.0024249531394830446, "rougeL_fmeasure": 0.35761932102833915, "rougeL_fmeasure_stderr": 0.0021687049416861674, "rougeL_precision": 0.41581156770758565, "rougeL_precision_stderr": 0.0029300907267918886, "rougeL_recall": 0.34419546765174536, "rougeL_recall_stderr": 0.0025865639304949286, "rougeLsum_fmeasure": 0.4048851308640318, "rougeLsum_fmeasure_stderr": 0.002417900533606593, "rougeLsum_precision": 0.46864410689082053, "rougeLsum_precision_stderr": 0.0031375768366613157, "rougeLsum_recall": 0.39033480310668645, "rougeLsum_recall_stderr": 0.0029093860331955578}}, "5": {"generate_text_restaurant": {"bleu": 14.50557398220931, "bleu_stderr": 0.1788730940951381, "rouge1_fmeasure": 0.493149445854328, "rouge1_fmeasure_stderr": 0.0023877314453735143, "rouge1_precision": 0.5680595308920772, "rouge1_precision_stderr": 0.0032174760758321916, "rouge1_recall": 0.4734480859781366, "rouge1_recall_stderr": 0.0030010973081521465, "rouge2_fmeasure": 0.24269152018910306, "rouge2_fmeasure_stderr": 0.0021894051692507408, "rouge2_precision": 0.28277797869514065, "rouge2_precision_stderr": 0.0027644432806077637, "rouge2_recall": 0.2331052458115948, "rouge2_recall_stderr": 0.0023560037879423864, "rougeL_fmeasure": 0.3599150458563419, "rougeL_fmeasure_stderr": 0.0021576313923635662, "rougeL_precision": 0.4170399993874809, "rougeL_precision_stderr": 0.002935905553436331, "rougeL_recall": 0.3447943392851923, "rougeL_recall_stderr": 0.0025007260373378615, "rougeLsum_fmeasure": 0.4072211668883497, "rougeLsum_fmeasure_stderr": 0.002413496070492698, "rougeLsum_precision": 0.46973654677811527, "rougeLsum_precision_stderr": 0.003144147743035595, "rougeLsum_recall": 0.39070735300171605, "rougeLsum_recall_stderr": 0.0028240962693331585}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.1091283399596295, "bleu_stderr": 0.078543455169586, "rouge1_fmeasure": 0.20676726913729868, "rouge1_fmeasure_stderr": 0.0028128208972934264, "rouge1_precision": 0.14956679968116302, "rouge1_precision_stderr": 0.002146954239004557, "rouge1_recall": 0.35445434242448814, "rouge1_recall_stderr": 0.00485002734824426, "rouge2_fmeasure": 0.04953978855224528, "rouge2_fmeasure_stderr": 0.0017486414475509453, "rouge2_precision": 0.03553527160604039, "rouge2_precision_stderr": 0.001311441571283853, "rouge2_recall": 0.08702493538242996, "rouge2_recall_stderr": 0.0030706539468616156, "rougeL_fmeasure": 0.15357963795708082, "rougeL_fmeasure_stderr": 0.0021553752703554457, "rougeL_precision": 0.11114776123890578, "rougeL_precision_stderr": 0.0016629639825015393, "rougeL_recall": 0.26405184275588883, "rougeL_recall_stderr": 0.0037991071996669114, "rougeLsum_fmeasure": 0.1647091116646791, "rougeLsum_fmeasure_stderr": 0.0024004425542895258, "rougeLsum_precision": 0.11891432446586195, "rougeLsum_precision_stderr": 0.0018178483160269225, "rougeLsum_recall": 0.28419149687872164, "rougeLsum_recall_stderr": 0.004269480851205538}}, "1": {"article_DOC_summary": {"bleu": 2.2375381157763674, "bleu_stderr": 0.12123852057793609, "rouge1_fmeasure": 0.22104054737159648, "rouge1_fmeasure_stderr": 0.0032907972020246494, "rouge1_precision": 0.19678864571385832, "rouge1_precision_stderr": 0.0037906362311771564, "rouge1_recall": 0.3039859207374779, "rouge1_recall_stderr": 0.00439258710871469, "rouge2_fmeasure": 0.05319563291201607, "rouge2_fmeasure_stderr": 0.0021295141568621087, "rouge2_precision": 0.04842406103962097, "rouge2_precision_stderr": 0.0021862035481226946, "rouge2_recall": 0.0722475298009662, "rouge2_recall_stderr": 0.0027400510140456642, "rougeL_fmeasure": 0.16826239668532597, "rougeL_fmeasure_stderr": 0.002656761272224791, "rougeL_precision": 0.14967987964463805, "rougeL_precision_stderr": 0.0030146826154706413, "rougeL_recall": 0.23224842098838183, "rougeL_recall_stderr": 0.0035555765488067674, "rougeLsum_fmeasure": 0.17313917199316306, "rougeLsum_fmeasure_stderr": 0.00272933804741195, "rougeLsum_precision": 0.1532367039936427, "rougeLsum_precision_stderr": 0.003030843593504894, "rougeLsum_recall": 0.2406307845673897, "rougeLsum_recall_stderr": 0.003811431525949725}}, "2": {"article_DOC_summary": {"bleu": 3.1564017175438543, "bleu_stderr": 0.20202237805230427, "rouge1_fmeasure": 0.2517163916441416, "rouge1_fmeasure_stderr": 0.00360847892275996, "rouge1_precision": 0.25577905035887855, "rouge1_precision_stderr": 0.004268216306752809, "rouge1_recall": 0.2741070186064677, "rouge1_recall_stderr": 0.003941197148791585, "rouge2_fmeasure": 0.06406681848626028, "rouge2_fmeasure_stderr": 0.0024707364356220275, "rouge2_precision": 0.0664225638210701, "rouge2_precision_stderr": 0.002697049630231203, "rouge2_recall": 0.0685373477177349, "rouge2_recall_stderr": 0.002625382941485827, "rougeL_fmeasure": 0.1915729280830488, "rougeL_fmeasure_stderr": 0.0029982437440479724, "rougeL_precision": 0.19462592227597153, "rougeL_precision_stderr": 0.003495756192452141, "rougeL_recall": 0.20884518327376556, "rougeL_recall_stderr": 0.003251905110702449, "rougeLsum_fmeasure": 0.19401389777862615, "rougeLsum_fmeasure_stderr": 0.003016343542150355, "rougeLsum_precision": 0.1965437439590985, "rougeLsum_precision_stderr": 0.0034881044404459064, "rougeLsum_recall": 0.21259927587702904, "rougeLsum_recall_stderr": 0.003380889372214054}}, "3": {"article_DOC_summary": {"bleu": 3.5382306117386153, "bleu_stderr": 0.31745084670660484, "rouge1_fmeasure": 0.240575608667714, "rouge1_fmeasure_stderr": 0.0037903793504037815, "rouge1_precision": 0.25563261095337336, "rouge1_precision_stderr": 0.004374147181810261, "rouge1_recall": 0.24766843802128052, "rouge1_recall_stderr": 0.0041445698386913124, "rouge2_fmeasure": 0.060726575070014825, "rouge2_fmeasure_stderr": 0.0025154116557184864, "rouge2_precision": 0.06563577841957732, "rouge2_precision_stderr": 0.0028634098751176454, "rouge2_recall": 0.062190069448469706, "rouge2_recall_stderr": 0.002642115242478322, "rougeL_fmeasure": 0.18176415294970483, "rougeL_fmeasure_stderr": 0.00317468180314014, "rougeL_precision": 0.19390542270671096, "rougeL_precision_stderr": 0.0036994894774715677, "rougeL_recall": 0.18688162460703145, "rougeL_recall_stderr": 0.003423686246194091, "rougeLsum_fmeasure": 0.18406043174817502, "rougeLsum_fmeasure_stderr": 0.0031906015608140384, "rougeLsum_precision": 0.19600044830616276, "rougeLsum_precision_stderr": 0.0037043261867812064, "rougeLsum_recall": 0.18991178246390195, "rougeLsum_recall_stderr": 0.0035036972092426545}}, "4": {"article_DOC_summary": {"bleu": 0.1108812214260021, "bleu_stderr": 0.03680831967269514, "rouge1_fmeasure": 0.05989026106361492, "rouge1_fmeasure_stderr": 0.003666344934736649, "rouge1_precision": 0.0697850373479175, "rouge1_precision_stderr": 0.004393288282974084, "rouge1_recall": 0.058560632380087935, "rouge1_recall_stderr": 0.0036844830469558955, "rouge2_fmeasure": 0.014905541100548116, "rouge2_fmeasure_stderr": 0.0015264648162996788, "rouge2_precision": 0.0167894222918787, "rouge2_precision_stderr": 0.0017050142026830716, "rouge2_recall": 0.014504048913324238, "rouge2_recall_stderr": 0.0015361907981547417, "rougeL_fmeasure": 0.045481577598383005, "rougeL_fmeasure_stderr": 0.002861716037932508, "rougeL_precision": 0.054339720606664, "rougeL_precision_stderr": 0.0036392597246897116, "rougeL_recall": 0.0442694737141457, "rougeL_recall_stderr": 0.002863997458668099, "rougeLsum_fmeasure": 0.045789978239963544, "rougeLsum_fmeasure_stderr": 0.002874197774307689, "rougeLsum_precision": 0.05463959232502368, "rougeLsum_precision_stderr": 0.003648053619297217, "rougeLsum_recall": 0.04462759119900641, "rougeLsum_recall_stderr": 0.00288346658669232}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}}
4b284b84b90c4py/evaluation/rankeval/4b284b84b90c4py_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.338,0.014965960710224482,0
3
+ anli_r2,acc,0.333,0.014910846164229859,0
4
+ anli_r3,acc,0.33,0.013579531277800923,0
5
+ arc_challenge,acc,0.2781569965870307,0.013094469919538809,0
6
+ arc_challenge,acc_norm,0.29692832764505117,0.013352025976725223,0
7
+ arc_easy,acc,0.6123737373737373,0.009997307914447612,0
8
+ arc_easy,acc_norm,0.5517676767676768,0.01020464512685695,0
9
+ boolq,acc,0.5363914373088685,0.008721861424877866,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.29078164450800714,,1
12
+ copa,acc,0.76,0.04292346959909282,0
13
+ hellaswag,acc,0.4658434574785899,0.004978124945759852,0
14
+ hellaswag,acc_norm,0.6105357498506274,0.004866322258335979,0
15
+ piqa,acc,0.7589771490750816,0.009979042717267314,0
16
+ piqa,acc_norm,0.7665941240478781,0.009869247889520986,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.831,0.011856625977890129,0
19
+ sciq,acc_norm,0.746,0.013772206565168537,0
20
+ storycloze_2016,acc,0.7081774452164618,0.01051258861619963,0
21
+ winogrande,acc,0.6108918705603789,0.013702520871485949,0
4b284b84b90c4py/evaluation/rankeval/4b284b84b90c4py_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.325,0.014818724459095526,0
3
+ anli_r2,acc,0.303,0.014539683710535246,0
4
+ anli_r3,acc,0.3458333333333333,0.013736245342311014,0
5
+ arc_challenge,acc,0.3148464163822526,0.01357265770308495,0
6
+ arc_challenge,acc_norm,0.32764505119453924,0.013715847940719348,0
7
+ arc_easy,acc,0.6553030303030303,0.009752321586569784,0
8
+ arc_easy,acc_norm,0.6435185185185185,0.009828046544504438,0
9
+ boolq,acc,0.5474006116207951,0.008705669190431184,1
10
+ cb,acc,0.5,0.06741998624632421,1
11
+ cb,f1,0.3554421768707483,,1
12
+ copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.462158932483569,0.0049754706908671535,0
14
+ hellaswag,acc_norm,0.6109340768771161,0.0048654194682138914,0
15
+ piqa,acc,0.7535364526659413,0.010054810789671822,0
16
+ piqa,acc_norm,0.7622415669205659,0.009932525779525492,0
17
+ rte,acc,0.5451263537906137,0.029973636495415252,0
18
+ sciq,acc,0.904,0.009320454434783219,0
19
+ sciq,acc_norm,0.901,0.009449248027662737,0
20
+ storycloze_2016,acc,0.7006948156066275,0.010590117252248798,0
21
+ winogrande,acc,0.5982636148382005,0.0137784392666495,0
4b284b84b90c4py/evaluation/rankeval/4b284b84b90c4py_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.313,0.014671272822977892,0
3
+ anli_r2,acc,0.341,0.014998131348402707,0
4
+ anli_r3,acc,0.3283333333333333,0.013562032919529019,0
5
+ arc_challenge,acc,0.318259385665529,0.013611993916971451,0
6
+ arc_challenge,acc_norm,0.3378839590443686,0.013822047922283517,0
7
+ arc_easy,acc,0.6691919191919192,0.009654540125986119,0
8
+ arc_easy,acc_norm,0.6637205387205387,0.009694178072725204,0
9
+ boolq,acc,0.5391437308868502,0.008718214887614912,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.3035294117647059,,1
12
+ copa,acc,0.8,0.04020151261036845,0
13
+ hellaswag,acc,0.4645488946425015,0.0049772234853420255,0
14
+ hellaswag,acc_norm,0.6142202748456482,0.004857840934549179,0
15
+ piqa,acc,0.7589771490750816,0.009979042717267315,0
16
+ piqa,acc_norm,0.7704026115342764,0.009812682950815181,0
17
+ rte,acc,0.51985559566787,0.030072723167317184,0
18
+ sciq,acc,0.92,0.008583336977753655,0
19
+ sciq,acc_norm,0.918,0.008680515615523724,0
20
+ storycloze_2016,acc,0.7076429716729022,0.010518239729787741,0
21
+ winogrande,acc,0.6085240726124704,0.013717487071290856,0
4b284b84b90c4py/evaluation/rankeval/4b284b84b90c4py_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.31,0.014632638658632903,0
3
+ anli_r2,acc,0.357,0.015158521721486769,0
4
+ anli_r3,acc,0.3475,0.013751753243291854,0
5
+ arc_challenge,acc,0.3122866894197952,0.013542598541688065,0
6
+ arc_challenge,acc_norm,0.3378839590443686,0.013822047922283516,0
7
+ arc_easy,acc,0.6662457912457912,0.009676065683575473,0
8
+ arc_easy,acc_norm,0.6696127946127947,0.009651430216428194,0
9
+ boolq,acc,0.5278287461773701,0.008731499445069577,1
10
+ cb,acc,0.375,0.06527912098338669,1
11
+ cb,f1,0.33730158730158727,,1
12
+ copa,acc,0.83,0.03775251680686371,0
13
+ hellaswag,acc,0.468034256124278,0.004979573765575858,0
14
+ hellaswag,acc_norm,0.6170085640310695,0.004851227527070881,0
15
+ piqa,acc,0.7568008705114254,0.010009611953858919,0
16
+ piqa,acc_norm,0.7704026115342764,0.009812682950815183,0
17
+ rte,acc,0.5451263537906137,0.029973636495415252,0
18
+ sciq,acc,0.916,0.008776162089491129,0
19
+ sciq,acc_norm,0.921,0.00853415677333344,0
20
+ storycloze_2016,acc,0.7194013896312133,0.010389809647288825,0
21
+ winogrande,acc,0.5974743488555643,0.013782866831703046,0
4b284b84b90c4py/evaluation/rankeval/4b284b84b90c4py_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.334,0.014922019523732965,0
3
+ anli_r2,acc,0.332,0.014899597242811482,0
4
+ anli_r3,acc,0.3408333333333333,0.013688600793296939,0
5
+ arc_challenge,acc,0.3174061433447099,0.013602239088038169,0
6
+ arc_challenge,acc_norm,0.3447098976109215,0.01388881628678211,0
7
+ arc_easy,acc,0.6696127946127947,0.009651430216428183,0
8
+ arc_easy,acc_norm,0.6670875420875421,0.009669958978395324,0
9
+ boolq,acc,0.5284403669724771,0.008730896561344785,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.4541975308641975,,1
12
+ copa,acc,0.8,0.04020151261036845,0
13
+ hellaswag,acc,0.4691296554471221,0.004980262025472477,0
14
+ hellaswag,acc_norm,0.6192989444333798,0.004845668799108535,0
15
+ piqa,acc,0.7606093579978237,0.009955884250291678,0
16
+ piqa,acc_norm,0.7725788900979326,0.009779850767847225,0
17
+ rte,acc,0.49097472924187724,0.030091559826331334,0
18
+ sciq,acc,0.931,0.008018934050315146,0
19
+ sciq,acc_norm,0.931,0.008018934050315155,0
20
+ storycloze_2016,acc,0.7145911277391769,0.010443395884062115,0
21
+ winogrande,acc,0.611681136543015,0.01369745665845723,0
4b284b84b90c4py/evaluation/rankeval/4b284b84b90c4py_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.34,0.014987482264363937,0
3
+ anli_r2,acc,0.327,0.014842213153411245,0
4
+ anli_r3,acc,0.345,0.013728421539454876,0
5
+ arc_challenge,acc,0.3242320819112628,0.013678810399518815,0
6
+ arc_challenge,acc_norm,0.34215017064846415,0.013864152159177278,0
7
+ arc_easy,acc,0.6687710437710438,0.009657641311350912,0
8
+ arc_easy,acc_norm,0.6734006734006734,0.009623047038267656,0
9
+ boolq,acc,0.5302752293577981,0.008729009003964295,1
10
+ cb,acc,0.5178571428571429,0.06737697508644648,1
11
+ cb,f1,0.3647798742138364,,1
12
+ copa,acc,0.83,0.037752516806863715,0
13
+ hellaswag,acc,0.4731129257120096,0.004982561815214123,0
14
+ hellaswag,acc_norm,0.6240788687512447,0.004833699243292339,0
15
+ piqa,acc,0.7600652883569097,0.009963625892809544,0
16
+ piqa,acc_norm,0.7709466811751904,0.009804509865175504,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.925,0.008333333333333356,0
19
+ sciq,acc_norm,0.932,0.007964887911291605,0
20
+ storycloze_2016,acc,0.7226082308925709,0.010353267472010765,0
21
+ winogrande,acc,0.5982636148382005,0.013778439266649492,0