diff --git "a/4b284b42bc4/eval/merged.json" "b/4b284b42bc4/eval/merged.json" new file mode 100644--- /dev/null +++ "b/4b284b42bc4/eval/merged.json" @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4273206525263921, "bleu_stderr": 0.05364575256139351, "rouge1_fmeasure": 0.11230037179769856, "rouge1_fmeasure_stderr": 0.0021352190368007454, "rouge1_precision": 0.07516332695488044, "rouge1_precision_stderr": 0.0017494053619516534, "rouge1_recall": 0.3000002487080154, "rouge1_recall_stderr": 0.004693559983294075, "rouge2_fmeasure": 0.05344453588119793, "rouge2_fmeasure_stderr": 0.0013515518560690634, "rouge2_precision": 0.03590952594711019, "rouge2_precision_stderr": 0.00111479086151588, "rouge2_recall": 0.14620648751654972, "rouge2_recall_stderr": 0.003234756290211622, "rougeL_fmeasure": 0.10771019147913463, "rougeL_fmeasure_stderr": 0.0019671591552995436, "rougeL_precision": 0.07168459834878929, "rougeL_precision_stderr": 0.0015668817269601622, "rougeL_recall": 0.29090649446634975, "rougeL_recall_stderr": 0.004555120095447256, "rougeLsum_fmeasure": 0.10717155022915739, "rougeLsum_fmeasure_stderr": 0.0019927955387855968, "rougeLsum_precision": 0.07152558346509422, "rougeLsum_precision_stderr": 0.001609325107400611, "rougeLsum_recall": 0.28799432197952096, "rougeLsum_recall_stderr": 0.004451937577964067}, "explicit-graph-description2": {"bleu": 0.33659811457161004, "bleu_stderr": 0.07179334370346768, "rouge1_fmeasure": 0.13734703076547916, "rouge1_fmeasure_stderr": 0.0020799156371082523, "rouge1_precision": 0.17179568666388068, "rouge1_precision_stderr": 0.0027588098305790145, "rouge1_recall": 0.14967776591519644, "rouge1_recall_stderr": 0.0023741585126890443, "rouge2_fmeasure": 0.011184314741657642, "rouge2_fmeasure_stderr": 0.0009686746779174165, "rouge2_precision": 0.012844648594530184, "rouge2_precision_stderr": 0.000953695060736493, "rouge2_recall": 0.01366618248229483, "rouge2_recall_stderr": 0.0014161977749096548, "rougeL_fmeasure": 0.11556695658268057, "rougeL_fmeasure_stderr": 0.001653530715898047, "rougeL_precision": 0.14267159823314632, "rougeL_precision_stderr": 0.0021803516732606496, "rougeL_recall": 0.13040116723987202, "rougeL_recall_stderr": 0.0020787261762492937, "rougeLsum_fmeasure": 0.11016382456596313, "rougeLsum_fmeasure_stderr": 0.0018129599146147113, "rougeLsum_precision": 0.14166916768123394, "rougeLsum_precision_stderr": 0.002522392923923697, "rougeLsum_recall": 0.11682503514042476, "rougeLsum_recall_stderr": 0.002049865434311089}, "implicit-graph-description": {"bleu": 0.15369360248720013, "bleu_stderr": 0.01910359117683677, "rouge1_fmeasure": 0.050674302559418065, "rouge1_fmeasure_stderr": 0.0011850389149770726, "rouge1_precision": 0.030770028240182067, "rouge1_precision_stderr": 0.0009226075888278027, "rouge1_recall": 0.230051193857904, "rouge1_recall_stderr": 0.0030536564547568117, "rouge2_fmeasure": 0.008207247449935154, "rouge2_fmeasure_stderr": 0.0007072767342602839, "rouge2_precision": 0.005257320096320131, "rouge2_precision_stderr": 0.0005237467608889611, "rouge2_recall": 0.031456692666470484, "rouge2_recall_stderr": 0.0019898714460596075, "rougeL_fmeasure": 0.049014322701212684, "rougeL_fmeasure_stderr": 0.0010406088468448477, "rougeL_precision": 0.02948342463609005, "rougeL_precision_stderr": 0.0007987803660190042, "rougeL_recall": 0.22664687188671498, "rougeL_recall_stderr": 0.0028881333653311973, "rougeLsum_fmeasure": 0.0361766232091386, "rougeLsum_fmeasure_stderr": 0.001070618326400763, "rougeLsum_precision": 0.022110816162663957, "rougeLsum_precision_stderr": 0.0008133150658395728, "rougeLsum_recall": 0.16487262586384596, "rougeLsum_recall_stderr": 0.0027329966443987454}, "non-explicit-description": {"bleu": 0.4008552013254772, "bleu_stderr": 0.05150329890936936, "rouge1_fmeasure": 0.06697983816620594, "rouge1_fmeasure_stderr": 0.002031025478858438, "rouge1_precision": 0.041187412322809, "rouge1_precision_stderr": 0.0015212733720666, "rouge1_recall": 0.2722316696175966, "rouge1_recall_stderr": 0.0046313887753788, "rouge2_fmeasure": 0.020385877678818807, "rouge2_fmeasure_stderr": 0.0012767680070777197, "rouge2_precision": 0.013070747205979933, "rouge2_precision_stderr": 0.0009299883930244878, "rouge2_recall": 0.0784073545551269, "rouge2_recall_stderr": 0.0034778014934171812, "rougeL_fmeasure": 0.062415738929841806, "rougeL_fmeasure_stderr": 0.001651359195900617, "rougeL_precision": 0.037903415422310224, "rougeL_precision_stderr": 0.0012109396727457977, "rougeL_recall": 0.26240382375978827, "rougeL_recall_stderr": 0.004287520636463648, "rougeLsum_fmeasure": 0.056710032049994176, "rougeLsum_fmeasure_stderr": 0.0017539781237973047, "rougeLsum_precision": 0.034753268390260716, "rougeLsum_precision_stderr": 0.0012974419802824465, "rougeLsum_recall": 0.2358071803161779, "rougeLsum_recall_stderr": 0.004143971381818325}, "very-explicit-description": {"bleu": 0.0009597827582021916, "bleu_stderr": 2.787222007252637e-06, "rouge1_fmeasure": 0.028025138472397872, "rouge1_fmeasure_stderr": 0.00044637295791461953, "rouge1_precision": 0.01704208739894692, "rouge1_precision_stderr": 0.00038134288013957874, "rouge1_recall": 0.11377181108508606, "rouge1_recall_stderr": 0.001221955325919493, "rouge2_fmeasure": 3.843511432044316e-06, "rouge2_fmeasure_stderr": 3.84351143204421e-06, "rouge2_precision": 2.243966255235454e-06, "rouge2_precision_stderr": 2.24396625523544e-06, "rouge2_recall": 1.3383655879440027e-05, "rouge2_recall_stderr": 1.3383655879440093e-05, "rougeL_fmeasure": 0.028021943208707357, "rougeL_fmeasure_stderr": 0.0004460769556010694, "rougeL_precision": 0.01704208739894692, "rougeL_precision_stderr": 0.00038134288013957874, "rougeL_recall": 0.11376898775808511, "rougeL_recall_stderr": 0.0012220146777960862, "rougeLsum_fmeasure": 0.019321658838726875, "rougeLsum_fmeasure_stderr": 0.00028489753193219213, "rougeLsum_precision": 0.011845314198379468, "rougeLsum_precision_stderr": 0.0003148449817594768, "rougeLsum_recall": 0.08190788638445848, "rougeLsum_recall_stderr": 0.0007545756315167773}}, "1": {"PALM_prompt": {"bleu": 0.5663724921835591, "bleu_stderr": 0.03793270967595185, "rouge1_fmeasure": 0.12756709737220784, "rouge1_fmeasure_stderr": 0.002030468381881967, "rouge1_precision": 0.08217512985928499, "rouge1_precision_stderr": 0.0015069260165856669, "rouge1_recall": 0.3920117360421087, "rouge1_recall_stderr": 0.005374085547490371, "rouge2_fmeasure": 0.0604895960614538, "rouge2_fmeasure_stderr": 0.0013006193696080512, "rouge2_precision": 0.03886435424085205, "rouge2_precision_stderr": 0.0009357648884166175, "rouge2_recall": 0.1952944224324245, "rouge2_recall_stderr": 0.003818475633981706, "rougeL_fmeasure": 0.12035361435811785, "rougeL_fmeasure_stderr": 0.00181560137606144, "rougeL_precision": 0.07729829270693146, "rougeL_precision_stderr": 0.0013303444161253287, "rougeL_recall": 0.3723170212800364, "rougeL_recall_stderr": 0.005024550440352146, "rougeLsum_fmeasure": 0.1206781467869011, "rougeLsum_fmeasure_stderr": 0.0018904175388126346, "rougeLsum_precision": 0.07779518642109406, "rougeLsum_precision_stderr": 0.0014090714637609509, "rougeLsum_recall": 0.3702218835789373, "rougeLsum_recall_stderr": 0.004905029541255394}, "explicit-graph-description2": {"bleu": 4.108172938460409, "bleu_stderr": 0.15815575820636524, "rouge1_fmeasure": 0.3274178122458887, "rouge1_fmeasure_stderr": 0.0049286280426292985, "rouge1_precision": 0.2979124156767924, "rouge1_precision_stderr": 0.0054955941631360024, "rouge1_recall": 0.4891593464751427, "rouge1_recall_stderr": 0.007011942792981163, "rouge2_fmeasure": 0.15888598086244612, "rouge2_fmeasure_stderr": 0.003356976364238643, "rouge2_precision": 0.14521802379162835, "rouge2_precision_stderr": 0.0037085106298636638, "rouge2_recall": 0.2447744707424197, "rouge2_recall_stderr": 0.004896941862479997, "rougeL_fmeasure": 0.24956913127077388, "rougeL_fmeasure_stderr": 0.003890488336637862, "rougeL_precision": 0.2295708148490674, "rougeL_precision_stderr": 0.0046150064407066435, "rougeL_recall": 0.38128693273071246, "rougeL_recall_stderr": 0.005688750994881648, "rougeLsum_fmeasure": 0.29017150876669406, "rougeLsum_fmeasure_stderr": 0.004434001029770126, "rougeLsum_precision": 0.26398378931320715, "rougeLsum_precision_stderr": 0.004954506205875008, "rougeLsum_recall": 0.43612713385035845, "rougeLsum_recall_stderr": 0.006362125054891411}, "implicit-graph-description": {"bleu": 1.2681634787853089, "bleu_stderr": 0.04998595810874735, "rouge1_fmeasure": 0.15190412398303874, "rouge1_fmeasure_stderr": 0.002523824368707439, "rouge1_precision": 0.09550865383429513, "rouge1_precision_stderr": 0.0019686712411162346, "rouge1_recall": 0.5542119160082642, "rouge1_recall_stderr": 0.004657384009324578, "rouge2_fmeasure": 0.06685938813989681, "rouge2_fmeasure_stderr": 0.0015427103067306345, "rouge2_precision": 0.042165060237563064, "rouge2_precision_stderr": 0.00115469645580311, "rouge2_recall": 0.25868024994419325, "rouge2_recall_stderr": 0.004235643426537448, "rougeL_fmeasure": 0.12871999313910717, "rougeL_fmeasure_stderr": 0.0018578488402400592, "rougeL_precision": 0.07997084816197671, "rougeL_precision_stderr": 0.0014665506771249248, "rougeL_recall": 0.49629192822486257, "rougeL_recall_stderr": 0.004293241239132906, "rougeLsum_fmeasure": 0.1333199688923633, "rougeLsum_fmeasure_stderr": 0.002328093617925018, "rougeLsum_precision": 0.08397023684370912, "rougeLsum_precision_stderr": 0.0018152970797848846, "rougeLsum_recall": 0.48869658983291286, "rougeLsum_recall_stderr": 0.004444469008391446}, "non-explicit-description": {"bleu": 1.9140039012454733, "bleu_stderr": 0.0890847222319399, "rouge1_fmeasure": 0.24260640014350202, "rouge1_fmeasure_stderr": 0.0027768727339828343, "rouge1_precision": 0.15982099926698054, "rouge1_precision_stderr": 0.0023693100913810337, "rouge1_recall": 0.6818447127780973, "rouge1_recall_stderr": 0.0039531831866198575, "rouge2_fmeasure": 0.10697454965926212, "rouge2_fmeasure_stderr": 0.0018050822143224363, "rouge2_precision": 0.06975853570163917, "rouge2_precision_stderr": 0.0014283398957334861, "rouge2_recall": 0.32360264018596285, "rouge2_recall_stderr": 0.004059465670996164, "rougeL_fmeasure": 0.18681341391416414, "rougeL_fmeasure_stderr": 0.001991917246485152, "rougeL_precision": 0.1216236757997378, "rougeL_precision_stderr": 0.0017202747181496215, "rougeL_recall": 0.5577387527759773, "rougeL_recall_stderr": 0.004272666872975094, "rougeLsum_fmeasure": 0.2056812857329856, "rougeLsum_fmeasure_stderr": 0.0023946587191271823, "rougeLsum_precision": 0.13510402818298697, "rougeLsum_precision_stderr": 0.002022793072384455, "rougeLsum_recall": 0.5884796713341609, "rougeLsum_recall_stderr": 0.0038365814028463647}, "very-explicit-description": {"bleu": 1.6165943378598948, "bleu_stderr": 0.06490921111069572, "rouge1_fmeasure": 0.1889848661578333, "rouge1_fmeasure_stderr": 0.0026135009154337114, "rouge1_precision": 0.12338574759046983, "rouge1_precision_stderr": 0.002589887344409682, "rouge1_recall": 0.6456549100912315, "rouge1_recall_stderr": 0.00481962570332347, "rouge2_fmeasure": 0.08423671425300502, "rouge2_fmeasure_stderr": 0.0016535014543450363, "rouge2_precision": 0.055329645026635225, "rouge2_precision_stderr": 0.0015917418388580041, "rouge2_recall": 0.313132985326238, "rouge2_recall_stderr": 0.004251686217898649, "rougeL_fmeasure": 0.15317825327978427, "rougeL_fmeasure_stderr": 0.0019859291190526406, "rougeL_precision": 0.0994133930281887, "rougeL_precision_stderr": 0.0020701300317446116, "rougeL_recall": 0.5475096620248264, "rougeL_recall_stderr": 0.0044971516232100136, "rougeLsum_fmeasure": 0.16658141546860505, "rougeLsum_fmeasure_stderr": 0.0023468522672181577, "rougeLsum_precision": 0.10855678843630082, "rougeLsum_precision_stderr": 0.0023039457528804285, "rougeLsum_recall": 0.5770399349468458, "rougeLsum_recall_stderr": 0.0046936823206671025}}, "2": {"PALM_prompt": {"bleu": 0.6313130510239234, "bleu_stderr": 0.02727704631142144, "rouge1_fmeasure": 0.12940251519432414, "rouge1_fmeasure_stderr": 0.0018563230360622849, "rouge1_precision": 0.08219107059651697, "rouge1_precision_stderr": 0.0013422258523153536, "rouge1_recall": 0.4151352004015792, "rouge1_recall_stderr": 0.005332508949374494, "rouge2_fmeasure": 0.06086364336249341, "rouge2_fmeasure_stderr": 0.0011655118907789416, "rouge2_precision": 0.03845495568474405, "rouge2_precision_stderr": 0.0008134611533833349, "rouge2_recall": 0.20877882384960775, "rouge2_recall_stderr": 0.003907086651198888, "rougeL_fmeasure": 0.12014889628673367, "rougeL_fmeasure_stderr": 0.0016294871735385334, "rougeL_precision": 0.07626492600480415, "rougeL_precision_stderr": 0.0011790764283155961, "rougeL_recall": 0.38715675983971815, "rougeL_recall_stderr": 0.004836346504741043, "rougeLsum_fmeasure": 0.12261763705509457, "rougeLsum_fmeasure_stderr": 0.0017345722248776798, "rougeLsum_precision": 0.07790995067901116, "rougeLsum_precision_stderr": 0.0012592690527861467, "rougeLsum_recall": 0.3934974316637387, "rougeLsum_recall_stderr": 0.004930425413445172}, "explicit-graph-description2": {"bleu": 5.777777933818976, "bleu_stderr": 0.14012798168135124, "rouge1_fmeasure": 0.43240476392568056, "rouge1_fmeasure_stderr": 0.003817266189092974, "rouge1_precision": 0.4187592773209161, "rouge1_precision_stderr": 0.005626717003828703, "rouge1_recall": 0.5928704577888301, "rouge1_recall_stderr": 0.005099101322054699, "rouge2_fmeasure": 0.2342525950120652, "rouge2_fmeasure_stderr": 0.003402274339943465, "rouge2_precision": 0.23096493796741716, "rouge2_precision_stderr": 0.004461915637136568, "rouge2_recall": 0.33273997806822503, "rouge2_recall_stderr": 0.0045935501380386535, "rougeL_fmeasure": 0.3391427202561202, "rougeL_fmeasure_stderr": 0.003536978206670861, "rougeL_precision": 0.3309444597203636, "rougeL_precision_stderr": 0.0051028624371683194, "rougeL_recall": 0.47140835027491623, "rougeL_recall_stderr": 0.004716481756787479, "rougeLsum_fmeasure": 0.3827941384172546, "rougeLsum_fmeasure_stderr": 0.0035523699230254864, "rougeLsum_precision": 0.37021425910295114, "rougeLsum_precision_stderr": 0.005128361770056663, "rougeLsum_recall": 0.529932604416665, "rougeLsum_recall_stderr": 0.004898567623492216}, "implicit-graph-description": {"bleu": 1.8205397209702776, "bleu_stderr": 0.0673137206688154, "rouge1_fmeasure": 0.1894275171118333, "rouge1_fmeasure_stderr": 0.002566906047548628, "rouge1_precision": 0.1291177089470445, "rouge1_precision_stderr": 0.0027994673830584586, "rouge1_recall": 0.5858999609001735, "rouge1_recall_stderr": 0.004575569928066966, "rouge2_fmeasure": 0.09546217670504362, "rouge2_fmeasure_stderr": 0.0017012398458140122, "rouge2_precision": 0.06520674856202793, "rouge2_precision_stderr": 0.0017537810536778107, "rouge2_recall": 0.319077314103273, "rouge2_recall_stderr": 0.004289568372222579, "rougeL_fmeasure": 0.15348751527744423, "rougeL_fmeasure_stderr": 0.002000176462271699, "rougeL_precision": 0.10404670276255262, "rougeL_precision_stderr": 0.002280876939330266, "rougeL_recall": 0.4958646402293731, "rougeL_recall_stderr": 0.004496077189792039, "rougeLsum_fmeasure": 0.16985387548623407, "rougeLsum_fmeasure_stderr": 0.0023697702811214943, "rougeLsum_precision": 0.1158438971486494, "rougeLsum_precision_stderr": 0.0025491115811417464, "rougeLsum_recall": 0.5272294008125672, "rougeLsum_recall_stderr": 0.00439142331254594}, "non-explicit-description": {"bleu": 2.0435403008123805, "bleu_stderr": 0.05599898561038878, "rouge1_fmeasure": 0.22138147276109751, "rouge1_fmeasure_stderr": 0.0027276614342971843, "rouge1_precision": 0.14341756746441994, "rouge1_precision_stderr": 0.0021572189793251386, "rouge1_recall": 0.6692546744897384, "rouge1_recall_stderr": 0.0042151038244085, "rouge2_fmeasure": 0.10198081937332562, "rouge2_fmeasure_stderr": 0.0016820967709998602, "rouge2_precision": 0.06513694044684888, "rouge2_precision_stderr": 0.0012285955010832561, "rouge2_recall": 0.340485606672825, "rouge2_recall_stderr": 0.004440352246050117, "rougeL_fmeasure": 0.1686362992157118, "rougeL_fmeasure_stderr": 0.0019210388579455546, "rougeL_precision": 0.10787155464448445, "rougeL_precision_stderr": 0.0014894882606878578, "rougeL_recall": 0.5414697804415199, "rougeL_recall_stderr": 0.004443875807815742, "rougeLsum_fmeasure": 0.19080393600333767, "rougeLsum_fmeasure_stderr": 0.002379667991105986, "rougeLsum_precision": 0.12338580056305985, "rougeLsum_precision_stderr": 0.0018753197047131456, "rougeLsum_recall": 0.5852144480947122, "rougeLsum_recall_stderr": 0.004054572409003386}, "very-explicit-description": {"bleu": 2.321779562659134, "bleu_stderr": 0.09538695554747774, "rouge1_fmeasure": 0.2217914175746633, "rouge1_fmeasure_stderr": 0.0025405101764270433, "rouge1_precision": 0.1436735682332211, "rouge1_precision_stderr": 0.0026579719365293812, "rouge1_recall": 0.7218682078589462, "rouge1_recall_stderr": 0.004052459422366133, "rouge2_fmeasure": 0.10461100159118264, "rouge2_fmeasure_stderr": 0.0016599147829125997, "rouge2_precision": 0.06775722832824736, "rouge2_precision_stderr": 0.0016552029301074905, "rouge2_recall": 0.37598071720158144, "rouge2_recall_stderr": 0.00439962213590862, "rougeL_fmeasure": 0.16606908225684747, "rougeL_fmeasure_stderr": 0.0018185139704651083, "rougeL_precision": 0.10707337342923572, "rougeL_precision_stderr": 0.002042276382879633, "rougeL_recall": 0.5690619918916721, "rougeL_recall_stderr": 0.004240683333110691, "rougeLsum_fmeasure": 0.19923289796510618, "rougeLsum_fmeasure_stderr": 0.002225786554401813, "rougeLsum_precision": 0.12837715690106088, "rougeLsum_precision_stderr": 0.0022626789530070107, "rougeLsum_recall": 0.6569143874299237, "rougeLsum_recall_stderr": 0.004022385130191463}}, "3": {"PALM_prompt": {"bleu": 0.6636681020720647, "bleu_stderr": 0.03135011211987113, "rouge1_fmeasure": 0.13179598729950448, "rouge1_fmeasure_stderr": 0.0018608653117238654, "rouge1_precision": 0.08335165133186997, "rouge1_precision_stderr": 0.0013566594758271408, "rouge1_recall": 0.4363611958728497, "rouge1_recall_stderr": 0.005466144892220358, "rouge2_fmeasure": 0.06172653863702163, "rouge2_fmeasure_stderr": 0.0011849569250187196, "rouge2_precision": 0.038821951607757095, "rouge2_precision_stderr": 0.0008309945118711307, "rouge2_recall": 0.21925719264824975, "rouge2_recall_stderr": 0.003975211126463013, "rougeL_fmeasure": 0.1203852422828686, "rougeL_fmeasure_stderr": 0.0015949578691764172, "rougeL_precision": 0.07607458853736868, "rougeL_precision_stderr": 0.0011617831771301823, "rougeL_recall": 0.3989579329521947, "rougeL_recall_stderr": 0.004797365715247612, "rougeLsum_fmeasure": 0.12430003630857168, "rougeLsum_fmeasure_stderr": 0.0017268968711559307, "rougeLsum_precision": 0.07866244698502353, "rougeLsum_precision_stderr": 0.0012631336576740743, "rougeLsum_recall": 0.41007264952780736, "rougeLsum_recall_stderr": 0.004984839846868383}, "explicit-graph-description2": {"bleu": 5.50955888983536, "bleu_stderr": 0.19407341974434633, "rouge1_fmeasure": 0.41628159920481106, "rouge1_fmeasure_stderr": 0.003759423443598214, "rouge1_precision": 0.39649918511817167, "rouge1_precision_stderr": 0.005575303775752109, "rouge1_recall": 0.5868851616945063, "rouge1_recall_stderr": 0.004874648360119763, "rouge2_fmeasure": 0.2270035578776477, "rouge2_fmeasure_stderr": 0.003326641885288619, "rouge2_precision": 0.21990121411366634, "rouge2_precision_stderr": 0.0043685198653270335, "rouge2_recall": 0.3314774836213164, "rouge2_recall_stderr": 0.004469810663401372, "rougeL_fmeasure": 0.32821478779416957, "rougeL_fmeasure_stderr": 0.0034635730509386077, "rougeL_precision": 0.31489793141614947, "rougeL_precision_stderr": 0.00499101931869006, "rougeL_recall": 0.46789790447333446, "rougeL_recall_stderr": 0.004497579692650062, "rougeLsum_fmeasure": 0.3686101872995796, "rougeLsum_fmeasure_stderr": 0.0035306120812830873, "rougeLsum_precision": 0.3510123362567013, "rougeLsum_precision_stderr": 0.005090798191984392, "rougeLsum_recall": 0.5229838686288929, "rougeLsum_recall_stderr": 0.004678146069876245}, "implicit-graph-description": {"bleu": 1.8545076876817848, "bleu_stderr": 0.07724809572451992, "rouge1_fmeasure": 0.19184315592400922, "rouge1_fmeasure_stderr": 0.0026570113202673156, "rouge1_precision": 0.13600857468800703, "rouge1_precision_stderr": 0.0032198416630618205, "rouge1_recall": 0.5594347319631547, "rouge1_recall_stderr": 0.004736205368048093, "rouge2_fmeasure": 0.10072973751222168, "rouge2_fmeasure_stderr": 0.0019070871002960905, "rouge2_precision": 0.0723944116238141, "rouge2_precision_stderr": 0.0022142669018531003, "rouge2_recall": 0.3143589440307445, "rouge2_recall_stderr": 0.004352566272275511, "rougeL_fmeasure": 0.15619428997394516, "rougeL_fmeasure_stderr": 0.0022182380723355204, "rougeL_precision": 0.11104879023945975, "rougeL_precision_stderr": 0.0027919267110680514, "rougeL_recall": 0.468994364527055, "rougeL_recall_stderr": 0.004573707755828136, "rougeLsum_fmeasure": 0.17258810787266, "rougeLsum_fmeasure_stderr": 0.0024527387178268248, "rougeLsum_precision": 0.12268948388502472, "rougeLsum_precision_stderr": 0.0029619856495348173, "rougeLsum_recall": 0.5051081506672119, "rougeLsum_recall_stderr": 0.00451999806892129}, "non-explicit-description": {"bleu": 2.1672251632248507, "bleu_stderr": 0.085436950676859, "rouge1_fmeasure": 0.2190831997552054, "rouge1_fmeasure_stderr": 0.0027341558417022524, "rouge1_precision": 0.14272679805162844, "rouge1_precision_stderr": 0.002164859751576681, "rouge1_recall": 0.6487556417334954, "rouge1_recall_stderr": 0.004293557109036889, "rouge2_fmeasure": 0.10398704909605484, "rouge2_fmeasure_stderr": 0.0017830520112251, "rouge2_precision": 0.0668120918836395, "rouge2_precision_stderr": 0.0012982874018482465, "rouge2_recall": 0.3372622132047622, "rouge2_recall_stderr": 0.004484332483533757, "rougeL_fmeasure": 0.1684031326291484, "rougeL_fmeasure_stderr": 0.0020124658427430444, "rougeL_precision": 0.10846308776944723, "rougeL_precision_stderr": 0.0015579592414390658, "rougeL_recall": 0.5276090858620072, "rougeL_recall_stderr": 0.004477426871140599, "rougeLsum_fmeasure": 0.19074419384356142, "rougeLsum_fmeasure_stderr": 0.002403668787230532, "rougeLsum_precision": 0.12406050272782151, "rougeLsum_precision_stderr": 0.001899638347295273, "rougeLsum_recall": 0.5733012952773476, "rougeLsum_recall_stderr": 0.004139080844482678}, "very-explicit-description": {"bleu": 2.36020603763585, "bleu_stderr": 0.08996406440773841, "rouge1_fmeasure": 0.20833619042301474, "rouge1_fmeasure_stderr": 0.002066573477323779, "rouge1_precision": 0.12704646175613926, "rouge1_precision_stderr": 0.0014965307460908653, "rouge1_recall": 0.7128153505808225, "rouge1_recall_stderr": 0.003930554377011592, "rouge2_fmeasure": 0.0979814053469226, "rouge2_fmeasure_stderr": 0.0013226281454679096, "rouge2_precision": 0.05894052317262764, "rouge2_precision_stderr": 0.000893949418071198, "rouge2_recall": 0.3735214163364516, "rouge2_recall_stderr": 0.004317456593719202, "rougeL_fmeasure": 0.15603166938331334, "rougeL_fmeasure_stderr": 0.0013588402692121038, "rougeL_precision": 0.09418094825751869, "rougeL_precision_stderr": 0.0009581252383436105, "rougeL_recall": 0.5641669222725268, "rougeL_recall_stderr": 0.004154206174788438, "rougeLsum_fmeasure": 0.18866870584363496, "rougeLsum_fmeasure_stderr": 0.001917270059093571, "rougeLsum_precision": 0.11490360635669425, "rougeLsum_precision_stderr": 0.0013800805146195233, "rougeLsum_recall": 0.651781997216155, "rougeLsum_recall_stderr": 0.003952076475725171}}, "4": {"PALM_prompt": {"bleu": 0.7445914925255956, "bleu_stderr": 0.04471373508927592, "rouge1_fmeasure": 0.13191715691415712, "rouge1_fmeasure_stderr": 0.001848451971487058, "rouge1_precision": 0.08335134599012016, "rouge1_precision_stderr": 0.0013601845542222193, "rouge1_recall": 0.4369343435538318, "rouge1_recall_stderr": 0.00545111390697167, "rouge2_fmeasure": 0.061883789388597316, "rouge2_fmeasure_stderr": 0.0011527436830992247, "rouge2_precision": 0.03875848226958462, "rouge2_precision_stderr": 0.0008040225568103941, "rouge2_recall": 0.22226421540491542, "rouge2_recall_stderr": 0.004032909658463521, "rougeL_fmeasure": 0.11940663728897792, "rougeL_fmeasure_stderr": 0.0015860574558582763, "rougeL_precision": 0.0754476375575414, "rougeL_precision_stderr": 0.001174153361661276, "rougeL_recall": 0.3964743307100715, "rougeL_recall_stderr": 0.004828941243739242, "rougeLsum_fmeasure": 0.12423777278329896, "rougeLsum_fmeasure_stderr": 0.0017238674224559544, "rougeLsum_precision": 0.07858636401791905, "rougeLsum_precision_stderr": 0.0012777097204336968, "rougeLsum_recall": 0.41038517941804836, "rougeLsum_recall_stderr": 0.005000143163857465}, "explicit-graph-description2": {"bleu": 5.135430012754724, "bleu_stderr": 0.10515298831784857, "rouge1_fmeasure": 0.39437568045135746, "rouge1_fmeasure_stderr": 0.0036213014676998953, "rouge1_precision": 0.35672902435293663, "rouge1_precision_stderr": 0.005099748319399167, "rouge1_recall": 0.5855419342513171, "rouge1_recall_stderr": 0.0047751903798185485, "rouge2_fmeasure": 0.21128493584064525, "rouge2_fmeasure_stderr": 0.003094624637133662, "rouge2_precision": 0.19346061280634166, "rouge2_precision_stderr": 0.0038984749084759063, "rouge2_recall": 0.32689522978214663, "rouge2_recall_stderr": 0.004424054434433312, "rougeL_fmeasure": 0.30988671994943884, "rougeL_fmeasure_stderr": 0.003261405607838547, "rougeL_precision": 0.2814695831840839, "rougeL_precision_stderr": 0.004516835082055319, "rougeL_recall": 0.46812215627206843, "rougeL_recall_stderr": 0.0044823471544487535, "rougeLsum_fmeasure": 0.35042208286660137, "rougeLsum_fmeasure_stderr": 0.0033912455978920217, "rougeLsum_precision": 0.3168249874123057, "rougeLsum_precision_stderr": 0.004673199828417057, "rougeLsum_recall": 0.5242590494964836, "rougeLsum_recall_stderr": 0.004606375146707341}, "implicit-graph-description": {"bleu": 1.8462916594144025, "bleu_stderr": 0.05355432178643776, "rouge1_fmeasure": 0.1860265502144425, "rouge1_fmeasure_stderr": 0.002448379226213876, "rouge1_precision": 0.1278346302045783, "rouge1_precision_stderr": 0.002830129642681041, "rouge1_recall": 0.5523726285950913, "rouge1_recall_stderr": 0.004673512208644458, "rouge2_fmeasure": 0.09678025171714386, "rouge2_fmeasure_stderr": 0.001657069001811618, "rouge2_precision": 0.06637198877707197, "rouge2_precision_stderr": 0.0018025537365178354, "rouge2_recall": 0.3123595828710712, "rouge2_recall_stderr": 0.004281424928475089, "rougeL_fmeasure": 0.15098496922208457, "rougeL_fmeasure_stderr": 0.0019252783341197905, "rougeL_precision": 0.10317713408980647, "rougeL_precision_stderr": 0.002276858012143727, "rougeL_recall": 0.46461791835294375, "rougeL_recall_stderr": 0.004513282830193284, "rougeLsum_fmeasure": 0.16714030208262487, "rougeLsum_fmeasure_stderr": 0.00222548922751469, "rougeLsum_precision": 0.11465363747249216, "rougeLsum_precision_stderr": 0.002512595497125225, "rougeLsum_recall": 0.5001173935227217, "rougeLsum_recall_stderr": 0.004512311417204082}, "non-explicit-description": {"bleu": 2.11735852420537, "bleu_stderr": 0.05771649184585253, "rouge1_fmeasure": 0.21502708033318635, "rouge1_fmeasure_stderr": 0.0026834110246252534, "rouge1_precision": 0.14145240880709448, "rouge1_precision_stderr": 0.0021670886996082153, "rouge1_recall": 0.6252122931764315, "rouge1_recall_stderr": 0.004362943212513885, "rouge2_fmeasure": 0.10086843076345688, "rouge2_fmeasure_stderr": 0.0017128031184130191, "rouge2_precision": 0.06554379446551205, "rouge2_precision_stderr": 0.0012763015030346867, "rouge2_recall": 0.3230079259138109, "rouge2_recall_stderr": 0.004400185110309783, "rougeL_fmeasure": 0.1644008070500405, "rougeL_fmeasure_stderr": 0.001977392152542587, "rougeL_precision": 0.10695926781407243, "rougeL_precision_stderr": 0.0015582110755479384, "rougeL_recall": 0.5039167223908304, "rougeL_recall_stderr": 0.004437183394069731, "rougeLsum_fmeasure": 0.18738101319708167, "rougeLsum_fmeasure_stderr": 0.002358418447535664, "rougeLsum_precision": 0.12299929966219343, "rougeLsum_precision_stderr": 0.0018989567005332623, "rougeLsum_recall": 0.5541344307335275, "rougeLsum_recall_stderr": 0.004167377796893304}, "very-explicit-description": {"bleu": 2.2779805846158383, "bleu_stderr": 0.07170204680218034, "rouge1_fmeasure": 0.2028260190998294, "rouge1_fmeasure_stderr": 0.002051576187767249, "rouge1_precision": 0.12364000397771846, "rouge1_precision_stderr": 0.0014761152355166514, "rouge1_recall": 0.7008467587934072, "rouge1_recall_stderr": 0.004059296791116624, "rouge2_fmeasure": 0.09599216387839389, "rouge2_fmeasure_stderr": 0.0012765491377424094, "rouge2_precision": 0.057702782901645155, "rouge2_precision_stderr": 0.0008602811108256085, "rouge2_recall": 0.37151697900109665, "rouge2_recall_stderr": 0.004354887109733919, "rougeL_fmeasure": 0.15237152706065096, "rougeL_fmeasure_stderr": 0.0013583120131860718, "rougeL_precision": 0.09198221409882565, "rougeL_precision_stderr": 0.0009571046005731393, "rougeL_recall": 0.5557672590175016, "rougeL_recall_stderr": 0.004224535855348862, "rougeLsum_fmeasure": 0.18385704643499215, "rougeLsum_fmeasure_stderr": 0.0018931085048969525, "rougeLsum_precision": 0.1119488799103195, "rougeLsum_precision_stderr": 0.0013559824797201349, "rougeLsum_recall": 0.6409089457417603, "rougeLsum_recall_stderr": 0.004037392997113196}}, "5": {"PALM_prompt": {"bleu": 0.8005310739494581, "bleu_stderr": 0.033724293174082695, "rouge1_fmeasure": 0.13259315662099147, "rouge1_fmeasure_stderr": 0.0017356987872365276, "rouge1_precision": 0.08338013300594908, "rouge1_precision_stderr": 0.001260130864731573, "rouge1_recall": 0.44613389749037924, "rouge1_recall_stderr": 0.0053462691621041685, "rouge2_fmeasure": 0.06197974009288303, "rouge2_fmeasure_stderr": 0.001091686612001906, "rouge2_precision": 0.03871058510533021, "rouge2_precision_stderr": 0.0007593773025926396, "rouge2_recall": 0.22607554732619956, "rouge2_recall_stderr": 0.004027973657557073, "rougeL_fmeasure": 0.11956303513601427, "rougeL_fmeasure_stderr": 0.001512266881380029, "rougeL_precision": 0.07523239671078998, "rougeL_precision_stderr": 0.0011063220739786807, "rougeL_recall": 0.40282994585844645, "rougeL_recall_stderr": 0.0046937441735825924, "rougeLsum_fmeasure": 0.12469867862475097, "rougeLsum_fmeasure_stderr": 0.0016220089487202947, "rougeLsum_precision": 0.07851137617969797, "rougeLsum_precision_stderr": 0.001188072848467214, "rougeLsum_recall": 0.41817890606333163, "rougeLsum_recall_stderr": 0.004881701441444398}, "explicit-graph-description2": {"bleu": 4.852629241722782, "bleu_stderr": 0.14804180093942532, "rouge1_fmeasure": 0.3764579585424841, "rouge1_fmeasure_stderr": 0.0035986469316141526, "rouge1_precision": 0.3346764742386238, "rouge1_precision_stderr": 0.004843721993493443, "rouge1_recall": 0.5735003817601863, "rouge1_recall_stderr": 0.004776228919481996, "rouge2_fmeasure": 0.20065053485376905, "rouge2_fmeasure_stderr": 0.002995889807999223, "rouge2_precision": 0.17835412437343545, "rouge2_precision_stderr": 0.0035647689800114026, "rouge2_recall": 0.32281685329216003, "rouge2_recall_stderr": 0.004493079326800901, "rougeL_fmeasure": 0.29763435336229693, "rougeL_fmeasure_stderr": 0.00319184324141869, "rougeL_precision": 0.264408350747097, "rougeL_precision_stderr": 0.004225202781211489, "rougeL_recall": 0.4629069290187871, "rougeL_recall_stderr": 0.004517741150892403, "rougeLsum_fmeasure": 0.3356555748821186, "rougeLsum_fmeasure_stderr": 0.003377968614381854, "rougeLsum_precision": 0.297705821504551, "rougeLsum_precision_stderr": 0.00441559661770478, "rougeLsum_recall": 0.5151776462027118, "rougeLsum_recall_stderr": 0.004625313103403918}, "implicit-graph-description": {"bleu": 1.8681612789604296, "bleu_stderr": 0.06823634030163615, "rouge1_fmeasure": 0.180067960659294, "rouge1_fmeasure_stderr": 0.002349653336178239, "rouge1_precision": 0.12245978780125266, "rouge1_precision_stderr": 0.002687499326593076, "rouge1_recall": 0.5431558597151445, "rouge1_recall_stderr": 0.004799564920689805, "rouge2_fmeasure": 0.09493898136075185, "rouge2_fmeasure_stderr": 0.0016863474758624926, "rouge2_precision": 0.06531606229983922, "rouge2_precision_stderr": 0.0019418038810328894, "rouge2_recall": 0.3098803783037246, "rouge2_recall_stderr": 0.004421579698743007, "rougeL_fmeasure": 0.14667555612439104, "rougeL_fmeasure_stderr": 0.0018835128549316987, "rougeL_precision": 0.09968277865275298, "rougeL_precision_stderr": 0.0022756173885421344, "rougeL_recall": 0.45735672460251464, "rougeL_recall_stderr": 0.004628702328621956, "rougeLsum_fmeasure": 0.16241611578249593, "rougeLsum_fmeasure_stderr": 0.0021493351407070567, "rougeLsum_precision": 0.11042759700784825, "rougeLsum_precision_stderr": 0.0024350506323870297, "rougeLsum_recall": 0.49324760825101704, "rougeLsum_recall_stderr": 0.0046636387400625905}, "non-explicit-description": {"bleu": 2.0204780863218708, "bleu_stderr": 0.06557727097019353, "rouge1_fmeasure": 0.20234254736965623, "rouge1_fmeasure_stderr": 0.002550195794881746, "rouge1_precision": 0.13156331057584722, "rouge1_precision_stderr": 0.00203140416680327, "rouge1_recall": 0.6063200915125521, "rouge1_recall_stderr": 0.004442245861735415, "rouge2_fmeasure": 0.09425690323781327, "rouge2_fmeasure_stderr": 0.001647787208632052, "rouge2_precision": 0.060544387595184664, "rouge2_precision_stderr": 0.0012069679093256386, "rouge2_recall": 0.3113139243596638, "rouge2_recall_stderr": 0.004491647148840445, "rougeL_fmeasure": 0.15605859797727822, "rougeL_fmeasure_stderr": 0.0019039344200042776, "rougeL_precision": 0.1005384257823588, "rougeL_precision_stderr": 0.0014914329572150095, "rougeL_recall": 0.4915833625433358, "rougeL_recall_stderr": 0.004427689382576967, "rougeLsum_fmeasure": 0.17736851108434767, "rougeLsum_fmeasure_stderr": 0.002250984265206207, "rougeLsum_precision": 0.11521013016106826, "rougeLsum_precision_stderr": 0.0017980094224657457, "rougeLsum_recall": 0.539127179437604, "rougeLsum_recall_stderr": 0.00423506881193511}, "very-explicit-description": {"bleu": 2.3867607776529205, "bleu_stderr": 0.07410446411824859, "rouge1_fmeasure": 0.20011806334917795, "rouge1_fmeasure_stderr": 0.0020113768045419067, "rouge1_precision": 0.12201036458298717, "rouge1_precision_stderr": 0.0014806840791434343, "rouge1_recall": 0.6991506268276161, "rouge1_recall_stderr": 0.00406189988321713, "rouge2_fmeasure": 0.0949140921236195, "rouge2_fmeasure_stderr": 0.0012601273301899127, "rouge2_precision": 0.05693913792040207, "rouge2_precision_stderr": 0.0008440989695321365, "rouge2_recall": 0.37256288020219525, "rouge2_recall_stderr": 0.004406895757444443, "rougeL_fmeasure": 0.1497544344956808, "rougeL_fmeasure_stderr": 0.00131856546208141, "rougeL_precision": 0.09037011023468404, "rougeL_precision_stderr": 0.0009499539634538327, "rougeL_recall": 0.5528331022129467, "rougeL_recall_stderr": 0.004235035612235216, "rougeLsum_fmeasure": 0.1812738472805107, "rougeLsum_fmeasure_stderr": 0.0018629632662416327, "rougeLsum_precision": 0.11036350095750773, "rougeLsum_precision_stderr": 0.0013509318154878004, "rougeLsum_recall": 0.6389934623997965, "rougeLsum_recall_stderr": 0.004055163370934182}}}, "GEM/wiki_lingua_en": {"0": {"article_summary_en": {"bleu": 1.8179420568974458, "bleu_stderr": 0.07211632534304736, "rouge1_fmeasure": 0.19362563894365822, "rouge1_fmeasure_stderr": 0.0019089814975787524, "rouge1_precision": 0.16426814795441422, "rouge1_precision_stderr": 0.001963248697761681, "rouge1_recall": 0.2844248414233626, "rouge1_recall_stderr": 0.0027186243949292458, "rouge2_fmeasure": 0.0397636720249358, "rouge2_fmeasure_stderr": 0.0008958485875661834, "rouge2_precision": 0.03345718350937418, "rouge2_precision_stderr": 0.0007942980850551589, "rouge2_recall": 0.06070606915521197, "rouge2_recall_stderr": 0.0015158618040175397, "rougeL_fmeasure": 0.1392443163337954, "rougeL_fmeasure_stderr": 0.0012458799774916409, "rougeL_precision": 0.11667958865167334, "rougeL_precision_stderr": 0.001269157880124843, "rougeL_recall": 0.2109918938185312, "rougeL_recall_stderr": 0.002085058620844339, "rougeLsum_fmeasure": 0.1795453015126057, "rougeLsum_fmeasure_stderr": 0.001766867622602647, "rougeLsum_precision": 0.15221459831627412, "rougeLsum_precision_stderr": 0.0018171164600694356, "rougeLsum_recall": 0.2643817230667675, "rougeLsum_recall_stderr": 0.0025380641829612694}, "rephrase_en": {"bleu": 0.5431527952918177, "bleu_stderr": 0.032104868441593624, "rouge1_fmeasure": 0.10088674481147626, "rouge1_fmeasure_stderr": 0.0014869970606006355, "rouge1_precision": 0.08776029171301661, "rouge1_precision_stderr": 0.0014617422175453763, "rouge1_recall": 0.14351167622490646, "rouge1_recall_stderr": 0.0021269699450294326, "rouge2_fmeasure": 0.012721191178839145, "rouge2_fmeasure_stderr": 0.0005085457582114725, "rouge2_precision": 0.01087054186081413, "rouge2_precision_stderr": 0.0004430740620268589, "rouge2_recall": 0.018812282252899392, "rouge2_recall_stderr": 0.000824752401096914, "rougeL_fmeasure": 0.08932496723516803, "rougeL_fmeasure_stderr": 0.0012301685164588575, "rougeL_precision": 0.07683211106766431, "rougeL_precision_stderr": 0.0011750417491257104, "rougeL_recall": 0.12945760989081237, "rougeL_recall_stderr": 0.001895510928096148, "rougeLsum_fmeasure": 0.09415637820011638, "rougeLsum_fmeasure_stderr": 0.0013706067956967248, "rougeLsum_precision": 0.08173215113769984, "rougeLsum_precision_stderr": 0.0013420824668087022, "rougeLsum_recall": 0.13446123935161725, "rougeLsum_recall_stderr": 0.0019889224163928713}, "summarize_above_en": {"bleu": 0.9508280856877127, "bleu_stderr": 0.03110425637842823, "rouge1_fmeasure": 0.1545325646812761, "rouge1_fmeasure_stderr": 0.0017491504584820998, "rouge1_precision": 0.13940638469106723, "rouge1_precision_stderr": 0.0020766596739565735, "rouge1_recall": 0.22027868660038527, "rouge1_recall_stderr": 0.0024555989238055657, "rouge2_fmeasure": 0.024434310470910135, "rouge2_fmeasure_stderr": 0.0007015086906141807, "rouge2_precision": 0.02295354423344447, "rouge2_precision_stderr": 0.0008596166860623005, "rouge2_recall": 0.03516541724082877, "rouge2_recall_stderr": 0.0011192218045994813, "rougeL_fmeasure": 0.12236419999758073, "rougeL_fmeasure_stderr": 0.0012480669383126359, "rougeL_precision": 0.1093757017485915, "rougeL_precision_stderr": 0.0015570564427566638, "rougeL_recall": 0.1788578203459584, "rougeL_recall_stderr": 0.0019850983591823343, "rougeLsum_fmeasure": 0.14268473186817657, "rougeLsum_fmeasure_stderr": 0.0016026623127596423, "rougeLsum_precision": 0.12869616248408597, "rougeLsum_precision_stderr": 0.0019249850326154927, "rougeLsum_recall": 0.20402125079021746, "rougeLsum_recall_stderr": 0.002281736333276174}, "tldr_en": {"bleu": 1.414984862410896, "bleu_stderr": 0.05940234677879655, "rouge1_fmeasure": 0.172029086258773, "rouge1_fmeasure_stderr": 0.0018269882294786313, "rouge1_precision": 0.14692825480864516, "rouge1_precision_stderr": 0.0018425931958625118, "rouge1_recall": 0.2512970070836738, "rouge1_recall_stderr": 0.002648136861575683, "rouge2_fmeasure": 0.03327297097578151, "rouge2_fmeasure_stderr": 0.0008291280040639999, "rouge2_precision": 0.027966624066931015, "rouge2_precision_stderr": 0.0007198127679157396, "rouge2_recall": 0.050572163826476904, "rouge2_recall_stderr": 0.00137135380680017, "rougeL_fmeasure": 0.13597766469494962, "rougeL_fmeasure_stderr": 0.0013133399966305693, "rougeL_precision": 0.11468189898145331, "rougeL_precision_stderr": 0.0012927912945671872, "rougeL_recall": 0.20361426191617446, "rougeL_recall_stderr": 0.002161893564032071, "rougeLsum_fmeasure": 0.15820053127675784, "rougeLsum_fmeasure_stderr": 0.0016635461401018714, "rougeLsum_precision": 0.1348221046642956, "rougeLsum_precision_stderr": 0.0016706708007563362, "rougeLsum_recall": 0.23199038101436165, "rougeLsum_recall_stderr": 0.0024449378448124903}, "write_abstract_en": {"bleu": 0.6124556246903006, "bleu_stderr": 0.038254977576635665, "rouge1_fmeasure": 0.1005296177122325, "rouge1_fmeasure_stderr": 0.0015389563787558595, "rouge1_precision": 0.08966310598899173, "rouge1_precision_stderr": 0.0016283594159426443, "rouge1_recall": 0.14086823338303966, "rouge1_recall_stderr": 0.002174573651185083, "rouge2_fmeasure": 0.011216041205494898, "rouge2_fmeasure_stderr": 0.00047851956788785876, "rouge2_precision": 0.009747347404058394, "rouge2_precision_stderr": 0.00042465671299048196, "rouge2_recall": 0.01702445557173677, "rouge2_recall_stderr": 0.0008431303828342194, "rougeL_fmeasure": 0.09330112527692144, "rougeL_fmeasure_stderr": 0.001337771293407071, "rougeL_precision": 0.08266300329867055, "rougeL_precision_stderr": 0.001411373389832569, "rougeL_recall": 0.13215740335454515, "rougeL_recall_stderr": 0.0019780686878669556, "rougeLsum_fmeasure": 0.09281985594753377, "rougeLsum_fmeasure_stderr": 0.0013961291968416943, "rougeLsum_precision": 0.08265536975136227, "rougeLsum_precision_stderr": 0.0014906774813849123, "rougeLsum_recall": 0.13084094141254124, "rougeLsum_recall_stderr": 0.002017106703749903}}, "1": {"article_summary_en": {"bleu": 2.0675085561659943, "bleu_stderr": 0.05768833984906303, "rouge1_fmeasure": 0.19183391025210358, "rouge1_fmeasure_stderr": 0.002001582947927051, "rouge1_precision": 0.1640591875496648, "rouge1_precision_stderr": 0.002043851896737318, "rouge1_recall": 0.27910878442257525, "rouge1_recall_stderr": 0.002898171924167552, "rouge2_fmeasure": 0.042828281320032364, "rouge2_fmeasure_stderr": 0.0009516028511268984, "rouge2_precision": 0.036188393072658635, "rouge2_precision_stderr": 0.0008561684568777711, "rouge2_recall": 0.06424101549232061, "rouge2_recall_stderr": 0.0015449018680878575, "rougeL_fmeasure": 0.14451746994762324, "rougeL_fmeasure_stderr": 0.0014033068919906031, "rougeL_precision": 0.12217744504962826, "rougeL_precision_stderr": 0.0014008167708105845, "rougeL_recall": 0.215360249932899, "rougeL_recall_stderr": 0.002288377671801132, "rougeLsum_fmeasure": 0.1777996809470896, "rougeLsum_fmeasure_stderr": 0.001845289272394589, "rougeLsum_precision": 0.15179552758825512, "rougeLsum_precision_stderr": 0.001877771950198993, "rougeLsum_recall": 0.259675418847687, "rougeLsum_recall_stderr": 0.0027198844155353788}, "rephrase_en": {"bleu": 1.5569271440681214, "bleu_stderr": 0.06403146716065812, "rouge1_fmeasure": 0.14631930516448613, "rouge1_fmeasure_stderr": 0.0019879191160377323, "rouge1_precision": 0.12905529214461575, "rouge1_precision_stderr": 0.001979491067372841, "rouge1_recall": 0.20839846358975436, "rouge1_recall_stderr": 0.002894759247759541, "rouge2_fmeasure": 0.02711335795670395, "rouge2_fmeasure_stderr": 0.000808976754077905, "rouge2_precision": 0.02375449052867389, "rouge2_precision_stderr": 0.0007917088515716603, "rouge2_recall": 0.04007317722615956, "rouge2_recall_stderr": 0.0013050110520617926, "rougeL_fmeasure": 0.10882491551274338, "rougeL_fmeasure_stderr": 0.0013993629759518366, "rougeL_precision": 0.09534534429433016, "rougeL_precision_stderr": 0.001414874985280666, "rougeL_recall": 0.15863360428057022, "rougeL_recall_stderr": 0.002229164497978125, "rougeLsum_fmeasure": 0.13618798520877723, "rougeLsum_fmeasure_stderr": 0.0018366248198839138, "rougeLsum_precision": 0.12009612909743919, "rougeLsum_precision_stderr": 0.0018400178492989732, "rougeLsum_recall": 0.19446607740504704, "rougeLsum_recall_stderr": 0.0027009360475646477}, "summarize_above_en": {"bleu": 1.7665424794124964, "bleu_stderr": 0.08195279766239584, "rouge1_fmeasure": 0.1670563070726708, "rouge1_fmeasure_stderr": 0.0019178617354935641, "rouge1_precision": 0.1443105260666999, "rouge1_precision_stderr": 0.001976066139534122, "rouge1_recall": 0.24132371040856623, "rouge1_recall_stderr": 0.002737769238603288, "rouge2_fmeasure": 0.03141526988382421, "rouge2_fmeasure_stderr": 0.0008646679643430015, "rouge2_precision": 0.027398570547821213, "rouge2_precision_stderr": 0.000827158978199139, "rouge2_recall": 0.04638771664106042, "rouge2_recall_stderr": 0.0014224849717481463, "rougeL_fmeasure": 0.12809102133791261, "rougeL_fmeasure_stderr": 0.0013663009524185852, "rougeL_precision": 0.10951134631545736, "rougeL_precision_stderr": 0.0013991751146943878, "rougeL_recall": 0.18961514474322305, "rougeL_recall_stderr": 0.0021768119549567286, "rougeLsum_fmeasure": 0.1556006928965094, "rougeLsum_fmeasure_stderr": 0.0017716161324128888, "rougeLsum_precision": 0.13436615559009527, "rougeLsum_precision_stderr": 0.00183327155604958, "rougeLsum_recall": 0.22552793868897747, "rougeLsum_recall_stderr": 0.00256618921407864}, "tldr_en": {"bleu": 2.836407401710481, "bleu_stderr": 0.04043755617370114, "rouge1_fmeasure": 0.2221915237870407, "rouge1_fmeasure_stderr": 0.0019942889407287214, "rouge1_precision": 0.20096070137159358, "rouge1_precision_stderr": 0.0024183404012715405, "rouge1_recall": 0.31553844384377067, "rouge1_recall_stderr": 0.002899492197080727, "rouge2_fmeasure": 0.056766090400891124, "rouge2_fmeasure_stderr": 0.0010805710237233974, "rouge2_precision": 0.05245104495325915, "rouge2_precision_stderr": 0.0012835508781176563, "rouge2_recall": 0.0826631709797398, "rouge2_recall_stderr": 0.0017188300283128646, "rougeL_fmeasure": 0.15912396957362257, "rougeL_fmeasure_stderr": 0.0013601645244304275, "rougeL_precision": 0.14393356016704015, "rougeL_precision_stderr": 0.001799791687854001, "rougeL_recall": 0.23074840768589486, "rougeL_recall_stderr": 0.0022312022411283892, "rougeLsum_fmeasure": 0.20821150035815492, "rougeLsum_fmeasure_stderr": 0.0018771242477053024, "rougeLsum_precision": 0.18819591770649413, "rougeLsum_precision_stderr": 0.002281534419296507, "rougeLsum_recall": 0.2963346702924994, "rougeLsum_recall_stderr": 0.0027562668344971878}, "write_abstract_en": {"bleu": 1.0361019783677474, "bleu_stderr": 0.06745079097957657, "rouge1_fmeasure": 0.14247720198956396, "rouge1_fmeasure_stderr": 0.0018061896391544346, "rouge1_precision": 0.12619327348032797, "rouge1_precision_stderr": 0.0017890629958271637, "rouge1_recall": 0.20045647375518888, "rouge1_recall_stderr": 0.0026946167754429164, "rouge2_fmeasure": 0.021015379090322476, "rouge2_fmeasure_stderr": 0.0007116134026935783, "rouge2_precision": 0.018133516802832303, "rouge2_precision_stderr": 0.000645611933600182, "rouge2_recall": 0.031827347668963016, "rouge2_recall_stderr": 0.0012365216961313032, "rougeL_fmeasure": 0.10275833507189983, "rougeL_fmeasure_stderr": 0.0011818702954760711, "rougeL_precision": 0.09050018965284994, "rougeL_precision_stderr": 0.0011814640560731773, "rougeL_recall": 0.14784088101449194, "rougeL_recall_stderr": 0.001990541308747493, "rougeLsum_fmeasure": 0.13395201808210386, "rougeLsum_fmeasure_stderr": 0.0016761350624605734, "rougeLsum_precision": 0.11860200033066597, "rougeLsum_precision_stderr": 0.001670916163668385, "rougeLsum_recall": 0.1888009611952507, "rougeLsum_recall_stderr": 0.002505695512304149}}, "2": {"article_summary_en": {"bleu": 2.5483385377460994, "bleu_stderr": 0.09290482394504518, "rouge1_fmeasure": 0.2094263617487459, "rouge1_fmeasure_stderr": 0.001995618924716392, "rouge1_precision": 0.1797105018003507, "rouge1_precision_stderr": 0.002084859264572737, "rouge1_recall": 0.3033184413058786, "rouge1_recall_stderr": 0.002905580486984753, "rouge2_fmeasure": 0.051091351936466585, "rouge2_fmeasure_stderr": 0.0010020514723555068, "rouge2_precision": 0.04358087823681392, "rouge2_precision_stderr": 0.0009101606204514657, "rouge2_recall": 0.07652593936670125, "rouge2_recall_stderr": 0.0016877876980321972, "rougeL_fmeasure": 0.15636202509062477, "rougeL_fmeasure_stderr": 0.001396569835369375, "rougeL_precision": 0.1327550835742571, "rougeL_precision_stderr": 0.0014273687313135473, "rougeL_recall": 0.2317387220648072, "rougeL_recall_stderr": 0.0023325790748031853, "rougeLsum_fmeasure": 0.19393075382172492, "rougeLsum_fmeasure_stderr": 0.0018522616086280982, "rougeLsum_precision": 0.16605597026357832, "rougeLsum_precision_stderr": 0.001925105257047686, "rougeLsum_recall": 0.2820819142889093, "rougeLsum_recall_stderr": 0.002744826890973276}, "rephrase_en": {"bleu": 2.588542343303483, "bleu_stderr": 0.07430586558531979, "rouge1_fmeasure": 0.18306178424233208, "rouge1_fmeasure_stderr": 0.0020567981461233308, "rouge1_precision": 0.15985727072408007, "rouge1_precision_stderr": 0.0021342704211066905, "rouge1_recall": 0.26171420796191186, "rouge1_recall_stderr": 0.0030128510679455042, "rouge2_fmeasure": 0.04454347983688272, "rouge2_fmeasure_stderr": 0.000997348114141261, "rouge2_precision": 0.03897138221781143, "rouge2_precision_stderr": 0.0009589153266609357, "rouge2_recall": 0.06532718295828605, "rouge2_recall_stderr": 0.0015904932736581785, "rougeL_fmeasure": 0.14055026158657788, "rougeL_fmeasure_stderr": 0.001469411947880256, "rougeL_precision": 0.12182921478329327, "rougeL_precision_stderr": 0.0015428600740151884, "rougeL_recall": 0.20506194499128635, "rougeL_recall_stderr": 0.002398145917032908, "rougeLsum_fmeasure": 0.17048750507518073, "rougeLsum_fmeasure_stderr": 0.0019155865863537292, "rougeLsum_precision": 0.14876297631779306, "rougeLsum_precision_stderr": 0.0019916419469703283, "rougeLsum_recall": 0.24429220231662616, "rougeLsum_recall_stderr": 0.0028317427638311745}, "summarize_above_en": {"bleu": 2.803596179721391, "bleu_stderr": 0.06743147958318904, "rouge1_fmeasure": 0.201225172676659, "rouge1_fmeasure_stderr": 0.0019811525577731063, "rouge1_precision": 0.18040456283641296, "rouge1_precision_stderr": 0.0023668115610486376, "rouge1_recall": 0.2870648086297508, "rouge1_recall_stderr": 0.0027667857701435265, "rouge2_fmeasure": 0.048234719869698274, "rouge2_fmeasure_stderr": 0.0010310275655512609, "rouge2_precision": 0.044638474881729014, "rouge2_precision_stderr": 0.001221684957935622, "rouge2_recall": 0.06999975662618645, "rouge2_recall_stderr": 0.0016116340703400012, "rougeL_fmeasure": 0.1566809756684086, "rougeL_fmeasure_stderr": 0.001423439082962154, "rougeL_precision": 0.13940596176892595, "rougeL_precision_stderr": 0.0017847331831893683, "rougeL_recall": 0.22920159148286207, "rougeL_recall_stderr": 0.002280659464388438, "rougeLsum_fmeasure": 0.18694734716033373, "rougeLsum_fmeasure_stderr": 0.0018516981957565392, "rougeLsum_precision": 0.16770892114701086, "rougeLsum_precision_stderr": 0.0022385377077080216, "rougeLsum_recall": 0.26750883501904393, "rougeLsum_recall_stderr": 0.002616055257421549}, "tldr_en": {"bleu": 3.0672692499706633, "bleu_stderr": 0.049106354646547744, "rouge1_fmeasure": 0.22254451379263313, "rouge1_fmeasure_stderr": 0.001915395486050023, "rouge1_precision": 0.20680907589782577, "rouge1_precision_stderr": 0.002488502986523018, "rouge1_recall": 0.30989444650796616, "rouge1_recall_stderr": 0.002753786143548591, "rouge2_fmeasure": 0.057748452491246806, "rouge2_fmeasure_stderr": 0.001071605656207478, "rouge2_precision": 0.05499744970071221, "rouge2_precision_stderr": 0.001313828428933835, "rouge2_recall": 0.08212190745009494, "rouge2_recall_stderr": 0.0016893036964833342, "rougeL_fmeasure": 0.15964814622161871, "rougeL_fmeasure_stderr": 0.0013347717397906025, "rougeL_precision": 0.14843146397136125, "rougeL_precision_stderr": 0.0018718282405222096, "rougeL_recall": 0.22714968032490238, "rougeL_recall_stderr": 0.0022095657686829933, "rougeLsum_fmeasure": 0.20960062484582434, "rougeLsum_fmeasure_stderr": 0.0018046035863626521, "rougeLsum_precision": 0.19471899528471098, "rougeLsum_precision_stderr": 0.002355009451192118, "rougeLsum_recall": 0.292336561157753, "rougeLsum_recall_stderr": 0.0026262120649372398}, "write_abstract_en": {"bleu": 1.0062083415777778, "bleu_stderr": 0.04711946186027571, "rouge1_fmeasure": 0.11404319152029233, "rouge1_fmeasure_stderr": 0.0019424688335624655, "rouge1_precision": 0.10326783555477309, "rouge1_precision_stderr": 0.001845941711723819, "rouge1_recall": 0.15774264721944228, "rouge1_recall_stderr": 0.0028918138893316418, "rouge2_fmeasure": 0.01856227196277119, "rouge2_fmeasure_stderr": 0.000747759698991048, "rouge2_precision": 0.015686648273678768, "rouge2_precision_stderr": 0.0006566075701044747, "rouge2_recall": 0.02846742132686831, "rouge2_recall_stderr": 0.0012449112557856106, "rougeL_fmeasure": 0.08879104640279128, "rougeL_fmeasure_stderr": 0.0013350757685037958, "rougeL_precision": 0.08074967691674281, "rougeL_precision_stderr": 0.0012838590605038145, "rougeL_recall": 0.1238571271772768, "rougeL_recall_stderr": 0.002138262266070014, "rougeLsum_fmeasure": 0.10533375181568398, "rougeLsum_fmeasure_stderr": 0.0018091025815919538, "rougeLsum_precision": 0.09519149494947343, "rougeLsum_precision_stderr": 0.0017162671701933705, "rougeLsum_recall": 0.1465542733502144, "rougeLsum_recall_stderr": 0.002715845708109963}}, "3": {"article_summary_en": {"bleu": 2.8242304451124287, "bleu_stderr": 0.11538371340257694, "rouge1_fmeasure": 0.1781210798922743, "rouge1_fmeasure_stderr": 0.002324914677271187, "rouge1_precision": 0.15943864465828597, "rouge1_precision_stderr": 0.002433310645340882, "rouge1_recall": 0.2568625432098374, "rouge1_recall_stderr": 0.003461904472445565, "rouge2_fmeasure": 0.04482124580351551, "rouge2_fmeasure_stderr": 0.0010283219957219916, "rouge2_precision": 0.03958355651727602, "rouge2_precision_stderr": 0.0010187263362724825, "rouge2_recall": 0.06658917213497038, "rouge2_recall_stderr": 0.0016962168841314238, "rougeL_fmeasure": 0.13242278894615536, "rougeL_fmeasure_stderr": 0.0016618685155073802, "rougeL_precision": 0.11796879235762016, "rougeL_precision_stderr": 0.0017626613778932401, "rougeL_recall": 0.19527007389438916, "rougeL_recall_stderr": 0.002725370936396766, "rougeLsum_fmeasure": 0.16564510211599384, "rougeLsum_fmeasure_stderr": 0.0021627154072008667, "rougeLsum_precision": 0.14827800010457695, "rougeLsum_precision_stderr": 0.0022750345989955767, "rougeLsum_recall": 0.23948635929233136, "rougeLsum_recall_stderr": 0.003251371946293944}, "rephrase_en": {"bleu": 2.731309122566432, "bleu_stderr": 0.12856901986784722, "rouge1_fmeasure": 0.15392307578099246, "rouge1_fmeasure_stderr": 0.0022338320016352573, "rouge1_precision": 0.1414603636700371, "rouge1_precision_stderr": 0.0023891742509933684, "rouge1_recall": 0.21860881858220305, "rouge1_recall_stderr": 0.0033065287240296643, "rouge2_fmeasure": 0.03895725296191616, "rouge2_fmeasure_stderr": 0.0009703362893100186, "rouge2_precision": 0.03540228251986772, "rouge2_precision_stderr": 0.001012362770647821, "rouge2_recall": 0.05680577059138669, "rouge2_recall_stderr": 0.0015537958394163272, "rougeL_fmeasure": 0.1193487893330064, "rougeL_fmeasure_stderr": 0.0016609278652029178, "rougeL_precision": 0.10910133183002717, "rougeL_precision_stderr": 0.0018029976378664805, "rougeL_recall": 0.1734226719612005, "rougeL_recall_stderr": 0.0026958607966266557, "rougeLsum_fmeasure": 0.14319992212601101, "rougeLsum_fmeasure_stderr": 0.002078026313828863, "rougeLsum_precision": 0.1316568373218448, "rougeLsum_precision_stderr": 0.0022375099093730815, "rougeLsum_recall": 0.20372540420257393, "rougeLsum_recall_stderr": 0.00310337674849783}, "summarize_above_en": {"bleu": 3.062227755689224, "bleu_stderr": 0.10516467661969281, "rouge1_fmeasure": 0.1703204755658047, "rouge1_fmeasure_stderr": 0.0023437253279040265, "rouge1_precision": 0.16137937728841734, "rouge1_precision_stderr": 0.002729028979629109, "rouge1_recall": 0.2387837109438451, "rouge1_recall_stderr": 0.0033294587186186273, "rouge2_fmeasure": 0.04242364396606276, "rouge2_fmeasure_stderr": 0.0010396516497661902, "rouge2_precision": 0.04090526043528262, "rouge2_precision_stderr": 0.001235838484277452, "rouge2_recall": 0.06101732506395295, "rouge2_recall_stderr": 0.00160920272003809, "rougeL_fmeasure": 0.1308133731843072, "rougeL_fmeasure_stderr": 0.001724287665682434, "rougeL_precision": 0.12349766223247734, "rougeL_precision_stderr": 0.002064619072302476, "rougeL_recall": 0.188129807231652, "rougeL_recall_stderr": 0.0026955200484454715, "rougeLsum_fmeasure": 0.158427617468449, "rougeLsum_fmeasure_stderr": 0.0021925425639389923, "rougeLsum_precision": 0.1502826430324195, "rougeLsum_precision_stderr": 0.002566907720178576, "rougeLsum_recall": 0.2228164439816456, "rougeLsum_recall_stderr": 0.003150083408193586}, "tldr_en": {"bleu": 3.0771001660724235, "bleu_stderr": 0.10166627860233955, "rouge1_fmeasure": 0.1820115847047415, "rouge1_fmeasure_stderr": 0.002231556986150065, "rouge1_precision": 0.17474019948410954, "rouge1_precision_stderr": 0.0027474429834987804, "rouge1_recall": 0.25521371457329683, "rouge1_recall_stderr": 0.0033152648455079072, "rouge2_fmeasure": 0.04689779702656875, "rouge2_fmeasure_stderr": 0.001034952296288115, "rouge2_precision": 0.045362687902130126, "rouge2_precision_stderr": 0.0012564421439368422, "rouge2_recall": 0.06804888392817302, "rouge2_recall_stderr": 0.0016992089106080323, "rougeL_fmeasure": 0.13086000378258256, "rougeL_fmeasure_stderr": 0.0015951624748139174, "rougeL_precision": 0.1265226291981258, "rougeL_precision_stderr": 0.002108936479136444, "rougeL_recall": 0.18709591094229805, "rougeL_recall_stderr": 0.002591170589040495, "rougeLsum_fmeasure": 0.17171382413552935, "rougeLsum_fmeasure_stderr": 0.0021029154857551075, "rougeLsum_precision": 0.1650854923795236, "rougeLsum_precision_stderr": 0.0026195222440337307, "rougeLsum_recall": 0.24111486429370707, "rougeLsum_recall_stderr": 0.003150424029915772}, "write_abstract_en": {"bleu": 1.0058220356103125, "bleu_stderr": 0.048299933628742284, "rouge1_fmeasure": 0.0826320712636738, "rouge1_fmeasure_stderr": 0.0018381499864311686, "rouge1_precision": 0.07857613907066331, "rouge1_precision_stderr": 0.0018325767254077753, "rouge1_recall": 0.11231507755062209, "rouge1_recall_stderr": 0.002709125328853898, "rouge2_fmeasure": 0.013461430147774255, "rouge2_fmeasure_stderr": 0.0006594173038785609, "rouge2_precision": 0.011896740680175497, "rouge2_precision_stderr": 0.000601828187026642, "rouge2_recall": 0.02057079275702888, "rouge2_recall_stderr": 0.0011498890895523576, "rougeL_fmeasure": 0.065544556988537, "rougeL_fmeasure_stderr": 0.0013299047040813018, "rougeL_precision": 0.06274112802109241, "rougeL_precision_stderr": 0.0013639990133955174, "rougeL_recall": 0.0899022034344876, "rougeL_recall_stderr": 0.0020795974250259583, "rougeLsum_fmeasure": 0.07624199938761006, "rougeLsum_fmeasure_stderr": 0.0017069202628844525, "rougeLsum_precision": 0.07257907609294534, "rougeLsum_precision_stderr": 0.0017086622186232854, "rougeLsum_recall": 0.10409821801585775, "rougeLsum_recall_stderr": 0.002535862979074873}}, "4": {"article_summary_en": {"bleu": 0.5910509935292934, "bleu_stderr": 0.04934236043571074, "rouge1_fmeasure": 0.057482791279950166, "rouge1_fmeasure_stderr": 0.0019817479295706616, "rouge1_precision": 0.053042765456545536, "rouge1_precision_stderr": 0.0019869824388181845, "rouge1_recall": 0.08545779450760739, "rouge1_recall_stderr": 0.003005122083966226, "rouge2_fmeasure": 0.014133038738676246, "rouge2_fmeasure_stderr": 0.000694610933915733, "rouge2_precision": 0.012522243430672486, "rouge2_precision_stderr": 0.0006528832775101719, "rouge2_recall": 0.02241566754614324, "rouge2_recall_stderr": 0.0012219610814187573, "rougeL_fmeasure": 0.043580733989129754, "rougeL_fmeasure_stderr": 0.0014829182403567982, "rougeL_precision": 0.04003813076428063, "rougeL_precision_stderr": 0.0014800060863990802, "rougeL_recall": 0.06635053217761794, "rougeL_recall_stderr": 0.0023888257381984475, "rougeLsum_fmeasure": 0.05286546828976805, "rougeLsum_fmeasure_stderr": 0.0018276666697922153, "rougeLsum_precision": 0.048851682561389824, "rougeLsum_precision_stderr": 0.001836524393749679, "rougeLsum_recall": 0.07862148294811239, "rougeLsum_recall_stderr": 0.002785921294419508}, "rephrase_en": {"bleu": 0.4824783196971761, "bleu_stderr": 0.04463252271552567, "rouge1_fmeasure": 0.047383213342466105, "rouge1_fmeasure_stderr": 0.0017667540426735513, "rouge1_precision": 0.04529026039810705, "rouge1_precision_stderr": 0.0018683347017330464, "rouge1_recall": 0.06985563534380705, "rouge1_recall_stderr": 0.0026707350721197195, "rouge2_fmeasure": 0.012100846520590037, "rouge2_fmeasure_stderr": 0.0006347499136665145, "rouge2_precision": 0.010911973549852157, "rouge2_precision_stderr": 0.000610311148352081, "rouge2_recall": 0.018743643702495896, "rouge2_recall_stderr": 0.001066843466304922, "rougeL_fmeasure": 0.03713795436852138, "rougeL_fmeasure_stderr": 0.0013568378761987527, "rougeL_precision": 0.03563453127314495, "rougeL_precision_stderr": 0.00147733951442123, "rougeL_recall": 0.05588201065370463, "rougeL_recall_stderr": 0.002163744865002187, "rougeLsum_fmeasure": 0.04386313053982596, "rougeLsum_fmeasure_stderr": 0.00163848796865913, "rougeLsum_precision": 0.041985085768643404, "rougeLsum_precision_stderr": 0.0017408930957524085, "rougeLsum_recall": 0.06471195868753603, "rougeLsum_recall_stderr": 0.002482277880366169}, "summarize_above_en": {"bleu": 0.416385920099641, "bleu_stderr": 0.040112901840196856, "rouge1_fmeasure": 0.048444404337984054, "rouge1_fmeasure_stderr": 0.0018220046283176402, "rouge1_precision": 0.04779731174149245, "rouge1_precision_stderr": 0.0020548178719198367, "rouge1_recall": 0.07123791494122411, "rouge1_recall_stderr": 0.00272031115150723, "rouge2_fmeasure": 0.012171454699532493, "rouge2_fmeasure_stderr": 0.0006624651441077691, "rouge2_precision": 0.012493623204348362, "rouge2_precision_stderr": 0.0009264858341368517, "rouge2_recall": 0.01866679464180306, "rouge2_recall_stderr": 0.001097487245025396, "rougeL_fmeasure": 0.038101293212841725, "rougeL_fmeasure_stderr": 0.0014065239228148929, "rougeL_precision": 0.03785063497182873, "rougeL_precision_stderr": 0.0016606407010599609, "rougeL_recall": 0.05742149931165803, "rougeL_recall_stderr": 0.0022331279191766247, "rougeLsum_fmeasure": 0.044795627250137825, "rougeLsum_fmeasure_stderr": 0.0016806124326569428, "rougeLsum_precision": 0.04434190566928883, "rougeLsum_precision_stderr": 0.0019234474448849314, "rougeLsum_recall": 0.06599904255640211, "rougeLsum_recall_stderr": 0.0025261730166920423}, "tldr_en": {"bleu": 0.6074700254086013, "bleu_stderr": 0.05423743996990111, "rouge1_fmeasure": 0.05660115935642753, "rouge1_fmeasure_stderr": 0.0019128121580337133, "rouge1_precision": 0.05686654081896349, "rouge1_precision_stderr": 0.002233440497229162, "rouge1_recall": 0.08353923265042008, "rouge1_recall_stderr": 0.0029077074042461065, "rouge2_fmeasure": 0.013741746630537094, "rouge2_fmeasure_stderr": 0.0006742800088300104, "rouge2_precision": 0.012648534117560613, "rouge2_precision_stderr": 0.000714480304383431, "rouge2_recall": 0.02199374005627044, "rouge2_recall_stderr": 0.0012054249033507543, "rougeL_fmeasure": 0.04194122086488744, "rougeL_fmeasure_stderr": 0.0014016615434499398, "rougeL_precision": 0.04243723633575883, "rougeL_precision_stderr": 0.0017020140807939923, "rougeL_recall": 0.0631413785577323, "rougeL_recall_stderr": 0.002247650978929538, "rougeLsum_fmeasure": 0.05334942672891278, "rougeLsum_fmeasure_stderr": 0.0018012620317959791, "rougeLsum_precision": 0.05323619772247561, "rougeLsum_precision_stderr": 0.0020619540523883185, "rougeLsum_recall": 0.07914684184702428, "rougeLsum_recall_stderr": 0.0027725917227984414}, "write_abstract_en": {"bleu": 0.07242667655921821, "bleu_stderr": 0.008932967771763006, "rouge1_fmeasure": 0.02010596120483132, "rouge1_fmeasure_stderr": 0.0010917797901789433, "rouge1_precision": 0.019168420675696286, "rouge1_precision_stderr": 0.0010795160831791957, "rouge1_recall": 0.02879908909496583, "rouge1_recall_stderr": 0.0016458129256000573, "rouge2_fmeasure": 0.003266077788148105, "rouge2_fmeasure_stderr": 0.0003273226353783014, "rouge2_precision": 0.0028556430231600335, "rouge2_precision_stderr": 0.000302994114231941, "rouge2_recall": 0.005469741300282359, "rouge2_recall_stderr": 0.000651898645432401, "rougeL_fmeasure": 0.015989909215756735, "rougeL_fmeasure_stderr": 0.0008105738923510461, "rougeL_precision": 0.015404523683234762, "rougeL_precision_stderr": 0.0008241756698203975, "rougeL_recall": 0.023150586278677637, "rougeL_recall_stderr": 0.0012827505595113496, "rougeLsum_fmeasure": 0.01839170907812179, "rougeLsum_fmeasure_stderr": 0.001003818069703428, "rougeLsum_precision": 0.01763922638803302, "rougeLsum_precision_stderr": 0.0010041223908314812, "rougeLsum_recall": 0.026296995107134465, "rougeLsum_recall_stderr": 0.0015071738271990678}}, "5": {"article_summary_en": {"bleu": 4.297671653759199e-07, "bleu_stderr": 7.785583349989124e-07, "rouge1_fmeasure": 0.008704060149453078, "rouge1_fmeasure_stderr": 0.0008478567145710981, "rouge1_precision": 0.008816748761467577, "rouge1_precision_stderr": 0.001006265857562293, "rouge1_recall": 0.012997624430774934, "rouge1_recall_stderr": 0.001263899892629013, "rouge2_fmeasure": 0.002017912016348425, "rouge2_fmeasure_stderr": 0.00025853872404257736, "rouge2_precision": 0.0017435776785760806, "rouge2_precision_stderr": 0.00023533716916175653, "rouge2_recall": 0.00314320442872562, "rouge2_recall_stderr": 0.00042253899620955503, "rougeL_fmeasure": 0.006723293959394975, "rougeL_fmeasure_stderr": 0.0006442903081428328, "rougeL_precision": 0.006962484807564813, "rougeL_precision_stderr": 0.000844699745065237, "rougeL_recall": 0.010341885117026819, "rougeL_recall_stderr": 0.0010188524693231964, "rougeLsum_fmeasure": 0.008149504005547098, "rougeLsum_fmeasure_stderr": 0.0007911718409998427, "rougeLsum_precision": 0.008338960824215367, "rougeLsum_precision_stderr": 0.0009673737935465667, "rougeLsum_recall": 0.012211796140355531, "rougeLsum_recall_stderr": 0.001190300176358689}, "rephrase_en": {"bleu": 1.1023797520667392e-07, "bleu_stderr": 2.3615613872249627e-07, "rouge1_fmeasure": 0.006986757193006964, "rouge1_fmeasure_stderr": 0.0007246985354434017, "rouge1_precision": 0.0070764690349056645, "rouge1_precision_stderr": 0.0008075324699720278, "rouge1_recall": 0.010036534764679986, "rouge1_recall_stderr": 0.0010720722462779057, "rouge2_fmeasure": 0.0018127250079443033, "rouge2_fmeasure_stderr": 0.000246838125245964, "rouge2_precision": 0.001854426650392226, "rouge2_precision_stderr": 0.00029520732928741403, "rouge2_recall": 0.0026116976110932373, "rouge2_recall_stderr": 0.0003696132049834325, "rougeL_fmeasure": 0.0054709515272040035, "rougeL_fmeasure_stderr": 0.0005631353830893648, "rougeL_precision": 0.00560973325227454, "rougeL_precision_stderr": 0.0006578100417105013, "rougeL_recall": 0.007965123538597381, "rougeL_recall_stderr": 0.0008557620706126542, "rougeLsum_fmeasure": 0.006566961010689614, "rougeLsum_fmeasure_stderr": 0.0006818902290682459, "rougeLsum_precision": 0.006686937698828636, "rougeLsum_precision_stderr": 0.0007687710652974154, "rougeLsum_recall": 0.00941357446197627, "rougeLsum_recall_stderr": 0.0010074978588991074}, "summarize_above_en": {"bleu": 1.5592181183403974e-09, "bleu_stderr": 3.9017186369415516e-09, "rouge1_fmeasure": 0.006175541531128815, "rouge1_fmeasure_stderr": 0.0006856911987577207, "rouge1_precision": 0.005887065759378087, "rouge1_precision_stderr": 0.0007186584994339659, "rouge1_recall": 0.009025529640709563, "rouge1_recall_stderr": 0.0010109891028141976, "rouge2_fmeasure": 0.0013784838984500552, "rouge2_fmeasure_stderr": 0.00021928864199571185, "rouge2_precision": 0.0013791403401351644, "rouge2_precision_stderr": 0.00025334887590687766, "rouge2_recall": 0.0019778725350326183, "rouge2_recall_stderr": 0.0003002459206151824, "rougeL_fmeasure": 0.004937632375856857, "rougeL_fmeasure_stderr": 0.0005388614915141283, "rougeL_precision": 0.004586578522696645, "rougeL_precision_stderr": 0.0005392063119391484, "rougeL_recall": 0.00746110009096202, "rougeL_recall_stderr": 0.0008469892635174075, "rougeLsum_fmeasure": 0.005738594434124072, "rougeLsum_fmeasure_stderr": 0.0006340338087020388, "rougeLsum_precision": 0.0055058688026879445, "rougeLsum_precision_stderr": 0.0006742675837988185, "rougeLsum_recall": 0.008392335211224188, "rougeLsum_recall_stderr": 0.000940745680937788}, "tldr_en": {"bleu": 1.0033020706801269e-06, "bleu_stderr": 1.9077024795694913e-06, "rouge1_fmeasure": 0.009252067005496314, "rouge1_fmeasure_stderr": 0.0008857404763954463, "rouge1_precision": 0.008781294870051274, "rouge1_precision_stderr": 0.0009050499890841822, "rouge1_recall": 0.013677582922976033, "rouge1_recall_stderr": 0.0013193791852402054, "rouge2_fmeasure": 0.0023512305693387013, "rouge2_fmeasure_stderr": 0.0002931024418143357, "rouge2_precision": 0.002216585419720102, "rouge2_precision_stderr": 0.0003047584752236144, "rouge2_recall": 0.0035060227812251585, "rouge2_recall_stderr": 0.0004475043348074056, "rougeL_fmeasure": 0.007007677157867705, "rougeL_fmeasure_stderr": 0.0006708462523675694, "rougeL_precision": 0.00679769925058833, "rougeL_precision_stderr": 0.000718459101307871, "rougeL_recall": 0.010367182111106583, "rougeL_recall_stderr": 0.0010118399277303067, "rougeLsum_fmeasure": 0.008671543071872297, "rougeLsum_fmeasure_stderr": 0.0008282868408056053, "rougeLsum_precision": 0.0082009891177148, "rougeLsum_precision_stderr": 0.0008441371445186326, "rougeLsum_recall": 0.012916195361676434, "rougeLsum_recall_stderr": 0.0012503160451136067}, "write_abstract_en": {"bleu": 3.936862342904368e-14, "bleu_stderr": 1.4388487630942075e-13, "rouge1_fmeasure": 0.001794950456581436, "rouge1_fmeasure_stderr": 0.00030728853559277316, "rouge1_precision": 0.0015780841392649505, "rouge1_precision_stderr": 0.00030145052547005737, "rouge1_recall": 0.0027069709166605916, "rouge1_recall_stderr": 0.0004523943421663273, "rouge2_fmeasure": 0.0001630519530024726, "rouge2_fmeasure_stderr": 5.716887646946743e-05, "rouge2_precision": 0.00016207296070587727, "rouge2_precision_stderr": 6.383944014856069e-05, "rouge2_recall": 0.00022581084966854161, "rouge2_recall_stderr": 7.606359517652543e-05, "rougeL_fmeasure": 0.0014936361251205509, "rougeL_fmeasure_stderr": 0.00023979795330628012, "rougeL_precision": 0.0012686454818619593, "rougeL_precision_stderr": 0.0002161999716520848, "rougeL_recall": 0.0023860171187140397, "rougeL_recall_stderr": 0.0004079367273708954, "rougeLsum_fmeasure": 0.0016051321840952642, "rougeLsum_fmeasure_stderr": 0.00027664878910365937, "rougeLsum_precision": 0.0013993374329327448, "rougeLsum_precision_stderr": 0.00026598516241118476, "rougeLsum_recall": 0.002450141316005954, "rougeLsum_recall_stderr": 0.00042236431455605384}}}, "anli_r1": {"0": {"GPT-3 style": {"acc": 0.323, "acc_norm": 0.335, "acc_norm_stderr": 0.014933117490932573, "acc_stderr": 0.014794927843348632, "subset": 1}, "MNLI crowdsource": {"acc": 0.334, "acc_norm": 0.34, "acc_norm_stderr": 0.014987482264363937, "acc_stderr": 0.014922019523732954, "subset": 1}, "can we infer": {"acc": 0.342, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456732, "acc_stderr": 0.01500870618212173, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.334, "acc_norm": 0.335, "acc_norm_stderr": 0.014933117490932575, "acc_stderr": 0.014922019523732968, "subset": 1}, "justified in saying": {"acc": 0.342, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456732, "acc_stderr": 0.015008706182121731, "subset": 1}}, "1": {"GPT-3 style": {"acc": 0.324, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811482, "acc_stderr": 0.014806864733738857, "subset": 1}, "MNLI crowdsource": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "can we infer": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.331, "acc_norm": 0.339, "acc_norm_stderr": 0.014976758771620342, "acc_stderr": 0.014888272588203933, "subset": 1}, "justified in saying": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}}, "2": {"GPT-3 style": {"acc": 0.346, "acc_norm": 0.349, "acc_norm_stderr": 0.015080663991563098, "acc_stderr": 0.015050266127564438, "subset": 1}, "MNLI crowdsource": {"acc": 0.359, "acc_norm": 0.362, "acc_norm_stderr": 0.0152048409129195, "acc_stderr": 0.015177264224798596, "subset": 1}, "can we infer": {"acc": 0.341, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811488, "acc_stderr": 0.014998131348402707, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.331, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456732, "acc_stderr": 0.014888272588203933, "subset": 1}, "justified in saying": {"acc": 0.337, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014955087918653593, "subset": 1}}, "3": {"GPT-3 style": {"acc": 0.348, "acc_norm": 0.342, "acc_norm_stderr": 0.01500870618212173, "acc_stderr": 0.01507060460376841, "subset": 1}, "MNLI crowdsource": {"acc": 0.356, "acc_norm": 0.348, "acc_norm_stderr": 0.015070604603768408, "acc_stderr": 0.015149042659306625, "subset": 1}, "can we infer": {"acc": 0.366, "acc_norm": 0.336, "acc_norm_stderr": 0.014944140233795027, "acc_stderr": 0.015240612726405749, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.334, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203928, "acc_stderr": 0.014922019523732956, "subset": 1}, "justified in saying": {"acc": 0.356, "acc_norm": 0.342, "acc_norm_stderr": 0.015008706182121728, "acc_stderr": 0.015149042659306623, "subset": 1}}, "4": {"GPT-3 style": {"acc": 0.325, "acc_norm": 0.318, "acc_norm_stderr": 0.014734079309311901, "acc_stderr": 0.014818724459095526, "subset": 1}, "MNLI crowdsource": {"acc": 0.345, "acc_norm": 0.346, "acc_norm_stderr": 0.015050266127564445, "acc_stderr": 0.015039986742055238, "subset": 1}, "can we infer": {"acc": 0.335, "acc_norm": 0.338, "acc_norm_stderr": 0.014965960710224473, "acc_stderr": 0.014933117490932572, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.341, "acc_norm": 0.338, "acc_norm_stderr": 0.014965960710224466, "acc_stderr": 0.014998131348402714, "subset": 1}, "justified in saying": {"acc": 0.343, "acc_norm": 0.325, "acc_norm_stderr": 0.014818724459095524, "acc_stderr": 0.015019206922356953, "subset": 1}}, "5": {"GPT-3 style": {"acc": 0.314, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509012, "acc_stderr": 0.01468399195108797, "subset": 1}, "MNLI crowdsource": {"acc": 0.353, "acc_norm": 0.352, "acc_norm_stderr": 0.015110404505648664, "acc_stderr": 0.015120172605483697, "subset": 1}, "can we infer": {"acc": 0.328, "acc_norm": 0.327, "acc_norm_stderr": 0.014842213153411242, "acc_stderr": 0.014853842487270333, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.327, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732958, "acc_stderr": 0.014842213153411239, "subset": 1}, "justified in saying": {"acc": 0.326, "acc_norm": 0.327, "acc_norm_stderr": 0.01484221315341124, "acc_stderr": 0.014830507204541031, "subset": 1}}}, "anli_r2": {"0": {"GPT-3 style": {"acc": 0.327, "acc_norm": 0.351, "acc_norm_stderr": 0.015100563798316405, "acc_stderr": 0.014842213153411245, "subset": 2}, "MNLI crowdsource": {"acc": 0.334, "acc_norm": 0.323, "acc_norm_stderr": 0.014794927843348632, "acc_stderr": 0.014922019523732958, "subset": 2}, "can we infer": {"acc": 0.348, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732965, "acc_stderr": 0.01507060460376841, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.336, "acc_norm": 0.338, "acc_norm_stderr": 0.0149659607102245, "acc_stderr": 0.014944140233795018, "subset": 2}, "justified in saying": {"acc": 0.34, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811483, "acc_stderr": 0.014987482264363937, "subset": 2}}, "1": {"GPT-3 style": {"acc": 0.309, "acc_norm": 0.313, "acc_norm_stderr": 0.014671272822977885, "acc_stderr": 0.01461960097720649, "subset": 2}, "MNLI crowdsource": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "can we infer": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.308, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792508, "acc_stderr": 0.01460648312734276, "subset": 2}, "justified in saying": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}}, "2": {"GPT-3 style": {"acc": 0.317, "acc_norm": 0.314, "acc_norm_stderr": 0.01468399195108798, "acc_stderr": 0.014721675438880215, "subset": 2}, "MNLI crowdsource": {"acc": 0.312, "acc_norm": 0.309, "acc_norm_stderr": 0.014619600977206493, "acc_stderr": 0.014658474370509008, "subset": 2}, "can we infer": {"acc": 0.316, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574891, "acc_stderr": 0.014709193056057139, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.324, "acc_norm": 0.331, "acc_norm_stderr": 0.01488827258820392, "acc_stderr": 0.014806864733738864, "subset": 2}, "justified in saying": {"acc": 0.322, "acc_norm": 0.321, "acc_norm_stderr": 0.014770821817934652, "acc_stderr": 0.014782913600996693, "subset": 2}}, "3": {"GPT-3 style": {"acc": 0.333, "acc_norm": 0.328, "acc_norm_stderr": 0.014853842487270333, "acc_stderr": 0.014910846164229873, "subset": 2}, "MNLI crowdsource": {"acc": 0.305, "acc_norm": 0.305, "acc_norm_stderr": 0.014566646394664396, "acc_stderr": 0.014566646394664396, "subset": 2}, "can we infer": {"acc": 0.32, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574891, "acc_stderr": 0.014758652303574888, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.33, "acc_norm": 0.321, "acc_norm_stderr": 0.014770821817934652, "acc_stderr": 0.01487687202745673, "subset": 2}, "justified in saying": {"acc": 0.315, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574886, "acc_stderr": 0.01469663196079251, "subset": 2}}, "4": {"GPT-3 style": {"acc": 0.323, "acc_norm": 0.3, "acc_norm_stderr": 0.014498627873361427, "acc_stderr": 0.014794927843348628, "subset": 2}, "MNLI crowdsource": {"acc": 0.306, "acc_norm": 0.295, "acc_norm_stderr": 0.014428554438445526, "acc_stderr": 0.014580006055436972, "subset": 2}, "can we infer": {"acc": 0.308, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509012, "acc_stderr": 0.014606483127342758, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.311, "acc_norm": 0.316, "acc_norm_stderr": 0.014709193056057128, "acc_stderr": 0.01464559638572269, "subset": 2}, "justified in saying": {"acc": 0.306, "acc_norm": 0.305, "acc_norm_stderr": 0.014566646394664378, "acc_stderr": 0.014580006055436969, "subset": 2}}, "5": {"GPT-3 style": {"acc": 0.327, "acc_norm": 0.316, "acc_norm_stderr": 0.014709193056057121, "acc_stderr": 0.014842213153411244, "subset": 2}, "MNLI crowdsource": {"acc": 0.315, "acc_norm": 0.314, "acc_norm_stderr": 0.01468399195108796, "acc_stderr": 0.014696631960792498, "subset": 2}, "can we infer": {"acc": 0.326, "acc_norm": 0.319, "acc_norm_stderr": 0.014746404865473477, "acc_stderr": 0.014830507204541038, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.319, "acc_norm": 0.323, "acc_norm_stderr": 0.01479492784334863, "acc_stderr": 0.014746404865473484, "subset": 2}, "justified in saying": {"acc": 0.319, "acc_norm": 0.312, "acc_norm_stderr": 0.01465847437050901, "acc_stderr": 0.014746404865473477, "subset": 2}}}, "anli_r3": {"0": {"GPT-3 style": {"acc": 0.35083333333333333, "acc_norm": 0.3475, "acc_norm_stderr": 0.013751753243291852, "acc_stderr": 0.013782212417178193, "subset": 3}, "MNLI crowdsource": {"acc": 0.33416666666666667, "acc_norm": 0.3175, "acc_norm_stderr": 0.01344353868134805, "acc_stderr": 0.013622434813136774, "subset": 3}, "can we infer": {"acc": 0.3325, "acc_norm": 0.33416666666666667, "acc_norm_stderr": 0.013622434813136774, "acc_stderr": 0.013605417345710526, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3275, "acc_norm": 0.32, "acc_norm_stderr": 0.01347162092976914, "acc_stderr": 0.013553211167251956, "subset": 3}, "justified in saying": {"acc": 0.33916666666666667, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.013613950010225608, "acc_stderr": 0.013672343491681819, "subset": 3}}, "1": {"GPT-3 style": {"acc": 0.3441666666666667, "acc_norm": 0.3416666666666667, "acc_norm_stderr": 0.013696658778002515, "acc_stderr": 0.013720551062295756, "subset": 3}, "MNLI crowdsource": {"acc": 0.33666666666666667, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406393, "acc_stderr": 0.013647602942406393, "subset": 3}, "can we infer": {"acc": 0.33666666666666667, "acc_norm": 0.3375, "acc_norm_stderr": 0.013655897185463653, "acc_stderr": 0.013647602942406393, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3283333333333333, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.013613950010225606, "acc_stderr": 0.013562032919529019, "subset": 3}, "justified in saying": {"acc": 0.33666666666666667, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406393, "acc_stderr": 0.013647602942406393, "subset": 3}}, "2": {"GPT-3 style": {"acc": 0.3275, "acc_norm": 0.3125, "acc_norm_stderr": 0.013386029277441229, "acc_stderr": 0.013553211167251951, "subset": 3}, "MNLI crowdsource": {"acc": 0.31916666666666665, "acc_norm": 0.325, "acc_norm_stderr": 0.013526454480351021, "acc_stderr": 0.013462309712005136, "subset": 3}, "can we infer": {"acc": 0.32, "acc_norm": 0.305, "acc_norm_stderr": 0.013296358936471105, "acc_stderr": 0.01347162092976915, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3125, "acc_norm": 0.30916666666666665, "acc_norm_stderr": 0.013346684134591945, "acc_stderr": 0.013386029277441229, "subset": 3}, "justified in saying": {"acc": 0.3275, "acc_norm": 0.30833333333333335, "acc_norm_stderr": 0.01333672114313647, "acc_stderr": 0.013553211167251953, "subset": 3}}, "3": {"GPT-3 style": {"acc": 0.33666666666666667, "acc_norm": 0.325, "acc_norm_stderr": 0.013526454480351018, "acc_stderr": 0.01364760294240639, "subset": 3}, "MNLI crowdsource": {"acc": 0.32916666666666666, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003665, "acc_stderr": 0.013570806258433626, "subset": 3}, "can we infer": {"acc": 0.335, "acc_norm": 0.32, "acc_norm_stderr": 0.013471620929769149, "acc_stderr": 0.013630871843821474, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3258333333333333, "acc_norm": 0.33, "acc_norm_stderr": 0.013579531277800925, "acc_stderr": 0.01353542204341745, "subset": 3}, "justified in saying": {"acc": 0.3383333333333333, "acc_norm": 0.3275, "acc_norm_stderr": 0.013553211167251951, "acc_stderr": 0.013664144006618266, "subset": 3}}, "4": {"GPT-3 style": {"acc": 0.30666666666666664, "acc_norm": 0.3175, "acc_norm_stderr": 0.013443538681348052, "acc_stderr": 0.013316642319070699, "subset": 3}, "MNLI crowdsource": {"acc": 0.3275, "acc_norm": 0.325, "acc_norm_stderr": 0.013526454480351016, "acc_stderr": 0.013553211167251947, "subset": 3}, "can we infer": {"acc": 0.3233333333333333, "acc_norm": 0.3358333333333333, "acc_norm_stderr": 0.013639261190932887, "acc_stderr": 0.013508372867300215, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.31583333333333335, "acc_norm": 0.31166666666666665, "acc_norm_stderr": 0.013376268790982105, "acc_stderr": 0.013424568830356446, "subset": 3}, "justified in saying": {"acc": 0.32166666666666666, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406398, "acc_stderr": 0.013490095282989521, "subset": 3}}, "5": {"GPT-3 style": {"acc": 0.31166666666666665, "acc_norm": 0.30416666666666664, "acc_norm_stderr": 0.013286140243317441, "acc_stderr": 0.013376268790982098, "subset": 3}, "MNLI crowdsource": {"acc": 0.30833333333333335, "acc_norm": 0.32166666666666666, "acc_norm_stderr": 0.013490095282989521, "acc_stderr": 0.013336721143136464, "subset": 3}, "can we infer": {"acc": 0.32666666666666666, "acc_norm": 0.3233333333333333, "acc_norm_stderr": 0.013508372867300217, "acc_stderr": 0.013544340907003663, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.31416666666666665, "acc_norm": 0.3075, "acc_norm_stderr": 0.013326707242912057, "acc_stderr": 0.0134053993149841, "subset": 3}, "justified in saying": {"acc": 0.3233333333333333, "acc_norm": 0.3325, "acc_norm_stderr": 0.01360541734571053, "acc_stderr": 0.01350837286730022, "subset": 3}}}, "arc_easy": {"0": {"heres_a_problem": {"acc": 0.24494949494949494, "acc_norm": 0.24494949494949494, "acc_norm_stderr": 0.00882458861121907, "acc_stderr": 0.00882458861121907}, "i_am_hesitating": {"acc": 0.25170648464163825, "acc_norm": 0.2841296928327645, "acc_norm_stderr": 0.01317944244765389, "acc_stderr": 0.012682496334042961}, "multiple_choice": {"acc": 0.24488054607508533, "acc_norm": 0.27047781569965873, "acc_norm_stderr": 0.012980954547659556, "acc_stderr": 0.012566273985131358}, "pick_the_most_correct_option": {"acc": 0.23947811447811448, "acc_norm": 0.23947811447811448, "acc_norm_stderr": 0.008757032594354022, "acc_stderr": 0.008757032594354022}, "qa_options": {"acc": 0.2619453924914676, "acc_norm": 0.2832764505119454, "acc_norm_stderr": 0.013167478735134576, "acc_stderr": 0.012849054826858114}}, "1": {"heres_a_problem": {"acc": 0.2380546075085324, "acc_norm": 0.2380546075085324, "acc_norm_stderr": 0.0124457700280262, "acc_stderr": 0.0124457700280262}, "i_am_hesitating": {"acc": 0.3560606060606061, "acc_norm": 0.31607744107744107, "acc_norm_stderr": 0.009540440071928285, "acc_stderr": 0.009825454608416303}, "multiple_choice": {"acc": 0.24914675767918087, "acc_norm": 0.2696245733788396, "acc_norm_stderr": 0.01296804068686916, "acc_stderr": 0.012639407111926433}, "pick_the_most_correct_option": {"acc": 0.23526936026936027, "acc_norm": 0.23526936026936027, "acc_norm_stderr": 0.008703724269718638, "acc_stderr": 0.008703724269718638}, "qa_options": {"acc": 0.26791808873720135, "acc_norm": 0.29436860068259385, "acc_norm_stderr": 0.013318528460539427, "acc_stderr": 0.012942030195136425}}, "2": {"heres_a_problem": {"acc": 0.242003367003367, "acc_norm": 0.242003367003367, "acc_norm_stderr": 0.008788455043255566, "acc_stderr": 0.008788455043255566}, "i_am_hesitating": {"acc": 0.3480639730639731, "acc_norm": 0.3143939393939394, "acc_norm_stderr": 0.009526702423162909, "acc_stderr": 0.009774627600259012}, "multiple_choice": {"acc": 0.35353535353535354, "acc_norm": 0.3581649831649832, "acc_norm_stderr": 0.009838331651451844, "acc_stderr": 0.00980972894815149}, "pick_the_most_correct_option": {"acc": 0.2431740614334471, "acc_norm": 0.2431740614334471, "acc_norm_stderr": 0.012536554144587087, "acc_stderr": 0.012536554144587087}, "qa_options": {"acc": 0.34553872053872053, "acc_norm": 0.30765993265993263, "acc_norm_stderr": 0.009470292575831181, "acc_stderr": 0.00975794873067031}}, "3": {"heres_a_problem": {"acc": 0.2478956228956229, "acc_norm": 0.2478956228956229, "acc_norm_stderr": 0.008860162361464025, "acc_stderr": 0.008860162361464025}, "i_am_hesitating": {"acc": 0.26535836177474403, "acc_norm": 0.27047781569965873, "acc_norm_stderr": 0.012980954547659556, "acc_stderr": 0.012902554762313969}, "multiple_choice": {"acc": 0.34553872053872053, "acc_norm": 0.3531144781144781, "acc_norm_stderr": 0.009807078935467613, "acc_stderr": 0.0097579487306703}, "pick_the_most_correct_option": {"acc": 0.2474747474747475, "acc_norm": 0.2474747474747475, "acc_norm_stderr": 0.008855114414834709, "acc_stderr": 0.008855114414834709}, "qa_options": {"acc": 0.257679180887372, "acc_norm": 0.2713310580204778, "acc_norm_stderr": 0.012993807727545777, "acc_stderr": 0.012780770562768409}}, "4": {"heres_a_problem": {"acc": 0.25341296928327645, "acc_norm": 0.25341296928327645, "acc_norm_stderr": 0.012710896778378606, "acc_stderr": 0.012710896778378606}, "i_am_hesitating": {"acc": 0.2593856655290102, "acc_norm": 0.28242320819112626, "acc_norm_stderr": 0.013155456884097218, "acc_stderr": 0.012808273573927104}, "multiple_choice": {"acc": 0.2525597269624573, "acc_norm": 0.2790102389078498, "acc_norm_stderr": 0.01310678488360134, "acc_stderr": 0.012696728980207704}, "pick_the_most_correct_option": {"acc": 0.2551194539249147, "acc_norm": 0.2551194539249147, "acc_norm_stderr": 0.012739038695202104, "acc_stderr": 0.012739038695202104}, "qa_options": {"acc": 0.3341750841750842, "acc_norm": 0.29503367003367004, "acc_norm_stderr": 0.009358110551087425, "acc_stderr": 0.00967910603291906}}, "5": {"heres_a_problem": {"acc": 0.2354948805460751, "acc_norm": 0.2354948805460751, "acc_norm_stderr": 0.012399451855004759, "acc_stderr": 0.012399451855004759}, "i_am_hesitating": {"acc": 0.32365319865319864, "acc_norm": 0.3047138047138047, "acc_norm_stderr": 0.009444871667360211, "acc_stderr": 0.009600478182273787}, "multiple_choice": {"acc": 0.2508532423208191, "acc_norm": 0.2627986348122867, "acc_norm_stderr": 0.012862523175351333, "acc_stderr": 0.01266819862131543}, "pick_the_most_correct_option": {"acc": 0.25252525252525254, "acc_norm": 0.25252525252525254, "acc_norm_stderr": 0.008914948991495718, "acc_stderr": 0.008914948991495718}, "qa_options": {"acc": 0.3261784511784512, "acc_norm": 0.3021885521885522, "acc_norm_stderr": 0.009422719042483192, "acc_stderr": 0.009619849417035172}}}, "boolq": {"0": {"GPT-3 Style": {"acc": 0.538, "acc_norm": 0.6383333333333333, "acc_norm_stderr": 0.008773841218429196, "acc_stderr": 0.00910382483037647}, "after_reading": {"acc": 0.6233333333333333, "acc_norm": 0.43366666666666664, "acc_norm_stderr": 0.009049526374650795, "acc_stderr": 0.00884811049411477}, "exercise": {"acc": 0.623, "acc_norm": 0.5946666666666667, "acc_norm_stderr": 0.008965091467970754, "acc_stderr": 0.00884965755342756}, "valid_binary": {"acc": 0.5896666666666667, "acc_norm": 0.402, "acc_norm_stderr": 0.008953140207390567, "acc_stderr": 0.008982215188519143}, "yes_no_question": {"acc": 0.5293333333333333, "acc_norm": 0.6236666666666667, "acc_norm_stderr": 0.008846558976258922, "acc_stderr": 0.009114505467759737}}, "1": {"GPT-3 Style": {"acc": 0.5356666666666666, "acc_norm": 0.6116666666666667, "acc_norm_stderr": 0.008899620943397685, "acc_stderr": 0.009106972161130879}, "after_reading": {"acc": 0.5406666666666666, "acc_norm": 0.5406666666666666, "acc_norm_stderr": 0.009099982269204863, "acc_stderr": 0.009099982269204863}, "exercise": {"acc": 0.5566666666666666, "acc_norm": 0.5456666666666666, "acc_norm_stderr": 0.009092070195065412, "acc_stderr": 0.009071405243621038}, "valid_binary": {"acc": 0.5423333333333333, "acc_norm": 0.541, "acc_norm_stderr": 0.009099483512819305, "acc_stderr": 0.009097447488896774}, "yes_no_question": {"acc": 0.5406666666666666, "acc_norm": 0.5406666666666666, "acc_norm_stderr": 0.009099982269204863, "acc_stderr": 0.009099982269204863}}, "2": {"GPT-3 Style": {"acc": 0.5443333333333333, "acc_norm": 0.6156666666666667, "acc_norm_stderr": 0.008882569490543052, "acc_stderr": 0.009094270381387362}, "after_reading": {"acc": 0.5396666666666666, "acc_norm": 0.53, "acc_norm_stderr": 0.009113781890088811, "acc_stderr": 0.00910145395014027}, "exercise": {"acc": 0.5536666666666666, "acc_norm": 0.5426666666666666, "acc_norm_stderr": 0.009096928229880426, "acc_stderr": 0.009077486613450291}, "valid_binary": {"acc": 0.5706666666666667, "acc_norm": 0.561, "acc_norm_stderr": 0.009062029213030572, "acc_stderr": 0.009038582451449426}, "yes_no_question": {"acc": 0.48233333333333334, "acc_norm": 0.496, "acc_norm_stderr": 0.009129938951699208, "acc_stderr": 0.009124530050684579}}, "3": {"GPT-3 Style": {"acc": 0.5566666666666666, "acc_norm": 0.617, "acc_norm_stderr": 0.00887674483503323, "acc_stderr": 0.00907140524362105}, "after_reading": {"acc": 0.539, "acc_norm": 0.5166666666666667, "acc_norm_stderr": 0.009125157363376123, "acc_stderr": 0.009102414587191052}, "exercise": {"acc": 0.5583333333333333, "acc_norm": 0.5496666666666666, "acc_norm_stderr": 0.009085074954912703, "acc_stderr": 0.009067881941319675}, "valid_binary": {"acc": 0.5633333333333334, "acc_norm": 0.5506666666666666, "acc_norm_stderr": 0.009083233528874798, "acc_stderr": 0.009056690207178128}, "yes_no_question": {"acc": 0.4676666666666667, "acc_norm": 0.5126666666666667, "acc_norm_stderr": 0.009127300863830172, "acc_stderr": 0.0091111208252746}}, "4": {"GPT-3 Style": {"acc": 0.5656666666666667, "acc_norm": 0.6216666666666667, "acc_norm_stderr": 0.008855801251873009, "acc_stderr": 0.009051147480837464}, "after_reading": {"acc": 0.527, "acc_norm": 0.5076666666666667, "acc_norm_stderr": 0.009129157751283581, "acc_stderr": 0.009116909528258622}, "exercise": {"acc": 0.57, "acc_norm": 0.556, "acc_norm_stderr": 0.009072785596468857, "acc_stderr": 0.00904031207504128}, "valid_binary": {"acc": 0.5543333333333333, "acc_norm": 0.5446666666666666, "acc_norm_stderr": 0.009093726495969151, "acc_stderr": 0.009076164124491365}, "yes_no_question": {"acc": 0.481, "acc_norm": 0.513, "acc_norm_stderr": 0.009127144583936549, "acc_stderr": 0.00912363671545717}}, "5": {"GPT-3 Style": {"acc": 0.5716666666666667, "acc_norm": 0.6206666666666667, "acc_norm_stderr": 0.008860362324722518, "acc_stderr": 0.00903595664371605}, "after_reading": {"acc": 0.5133333333333333, "acc_norm": 0.49533333333333335, "acc_norm_stderr": 0.009129833442820515, "acc_stderr": 0.009126984242044514}, "exercise": {"acc": 0.567, "acc_norm": 0.556, "acc_norm_stderr": 0.009072785596468855, "acc_stderr": 0.009047888598785733}, "valid_binary": {"acc": 0.561, "acc_norm": 0.5476666666666666, "acc_norm_stderr": 0.009088646624339615, "acc_stderr": 0.009062029213030572}, "yes_no_question": {"acc": 0.47733333333333333, "acc_norm": 0.505, "acc_norm_stderr": 0.009129774600800656, "acc_stderr": 0.009120844478925309}}}, "cb": {"0": {"GPT-3 style": {"acc": 0.35714285714285715, "acc_stderr": 0.06460957383809221, "f1": 0.1754385964912281}, "MNLI crowdsource": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.1940928270042194}, "can we infer": {"acc": 0.4642857142857143, "acc_stderr": 0.06724777654937658, "f1": 0.3398692810457516}, "guaranteed/possible/impossible": {"acc": 0.14285714285714285, "acc_stderr": 0.0471841613625583, "f1": 0.11887125220458554}, "justified in saying": {"acc": 0.35714285714285715, "acc_stderr": 0.0646095738380922, "f1": 0.27465986394557823}}, "1": {"GPT-3 style": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "MNLI crowdsource": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "guaranteed/possible/impossible": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.27314814814814814}, "justified in saying": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}}, "2": {"GPT-3 style": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.26666666666666666}, "MNLI crowdsource": {"acc": 0.4642857142857143, "acc_stderr": 0.0672477765493766, "f1": 0.32236227824463115}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "guaranteed/possible/impossible": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.2567567567567568}, "justified in saying": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359541, "f1": 0.3006032601719933}}, "3": {"GPT-3 style": {"acc": 0.35714285714285715, "acc_stderr": 0.0646095738380922, "f1": 0.24857881136950902}, "MNLI crowdsource": {"acc": 0.5357142857142857, "acc_stderr": 0.06724777654937658, "f1": 0.3757011576560449}, "can we infer": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.31761006289308175}, "guaranteed/possible/impossible": {"acc": 0.35714285714285715, "acc_stderr": 0.0646095738380922, "f1": 0.24074074074074078}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3143399810066477}}, "4": {"GPT-3 style": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.22990271377368152}, "MNLI crowdsource": {"acc": 0.4642857142857143, "acc_stderr": 0.0672477765493766, "f1": 0.3196798493408663}, "can we infer": {"acc": 0.5357142857142857, "acc_stderr": 0.06724777654937658, "f1": 0.37694592988710635}, "guaranteed/possible/impossible": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.28992628992628994}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.31636363636363635}}, "5": {"GPT-3 style": {"acc": 0.30357142857142855, "acc_stderr": 0.06199938655510755, "f1": 0.22028985507246376}, "MNLI crowdsource": {"acc": 0.5, "acc_stderr": 0.06741998624632421, "f1": 0.3416488477072939}, "can we infer": {"acc": 0.5, "acc_stderr": 0.06741998624632421, "f1": 0.3548587781510513}, "guaranteed/possible/impossible": {"acc": 0.35714285714285715, "acc_stderr": 0.06460957383809221, "f1": 0.23318250377073904}, "justified in saying": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.3073128622518189}}}, "copa": {"0": {"best_option": {"acc": 0.54, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.05009082659620333}, "cause_effect": {"acc": 0.6, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956911, "acc_stderr": 0.04923659639173309}, "choose": {"acc": 0.61, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956911, "acc_stderr": 0.04902071300001975}, "i_am_hesitating": {"acc": 0.62, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.04878317312145633}, "plausible_alternatives": {"acc": 0.61, "acc_norm": 0.55, "acc_norm_stderr": 0.049999999999999996, "acc_stderr": 0.04902071300001975}}, "1": {"best_option": {"acc": 0.57, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.04975698519562428}, "cause_effect": {"acc": 0.47, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.05016135580465919}, "choose": {"acc": 0.47, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.05016135580465919}, "i_am_hesitating": {"acc": 0.49, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.05024183937956912}, "plausible_alternatives": {"acc": 0.42, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084, "acc_stderr": 0.049604496374885836}}, "2": {"best_option": {"acc": 0.57, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.04975698519562428}, "cause_effect": {"acc": 0.44, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.04988876515698589}, "choose": {"acc": 0.48, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025, "acc_stderr": 0.05021167315686779}, "i_am_hesitating": {"acc": 0.45, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.04999999999999999}, "plausible_alternatives": {"acc": 0.43, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.04975698519562428}}, "3": {"best_option": {"acc": 0.57, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309, "acc_stderr": 0.04975698519562428}, "cause_effect": {"acc": 0.48, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.05021167315686779}, "choose": {"acc": 0.49, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.05024183937956912}, "i_am_hesitating": {"acc": 0.44, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.04988876515698589}, "plausible_alternatives": {"acc": 0.43, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.049756985195624284}}, "4": {"best_option": {"acc": 0.59, "acc_norm": 0.57, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.04943110704237102}, "cause_effect": {"acc": 0.48, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.050211673156867795}, "choose": {"acc": 0.47, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.05016135580465919}, "i_am_hesitating": {"acc": 0.48, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.050211673156867795}, "plausible_alternatives": {"acc": 0.45, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.04999999999999999}}, "5": {"best_option": {"acc": 0.55, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.04999999999999999}, "cause_effect": {"acc": 0.45, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084, "acc_stderr": 0.04999999999999999}, "choose": {"acc": 0.45, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.04999999999999999}, "i_am_hesitating": {"acc": 0.44, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.04988876515698589}, "plausible_alternatives": {"acc": 0.45, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.04999999999999999}}}, "e2e_nlg_cleaned": {"0": {"coherent_text": {"bleu": 3.498486545620417, "bleu_stderr": 0.028599955391489566, "rouge1_fmeasure": 0.22495159490910463, "rouge1_fmeasure_stderr": 0.0019053847340024682, "rouge1_precision": 0.17483459362877837, "rouge1_precision_stderr": 0.001705017286322628, "rouge1_recall": 0.33493639354854016, "rouge1_recall_stderr": 0.0024663398758924945, "rouge2_fmeasure": 0.0768086035487303, "rouge2_fmeasure_stderr": 0.0012111613779042028, "rouge2_precision": 0.05964515275724504, "rouge2_precision_stderr": 0.0009755619175694137, "rouge2_recall": 0.1142906263856143, "rouge2_recall_stderr": 0.001814336091779065, "rougeL_fmeasure": 0.19419308478997838, "rougeL_fmeasure_stderr": 0.0015170582220134538, "rougeL_precision": 0.15005963762480484, "rougeL_precision_stderr": 0.0013171712778281752, "rougeL_recall": 0.2919690872282272, "rougeL_recall_stderr": 0.0021251924476385774, "rougeLsum_fmeasure": 0.1917621810854489, "rougeLsum_fmeasure_stderr": 0.0016819035833037649, "rougeLsum_precision": 0.14893285461116787, "rougeLsum_precision_stderr": 0.0014850614932496661, "rougeLsum_recall": 0.2860286356508758, "rougeLsum_recall_stderr": 0.0022430688988452152}, "create_text_for_me": {"bleu": 1.1558513353609208, "bleu_stderr": 0.04074903284344169, "rouge1_fmeasure": 0.1493845282584616, "rouge1_fmeasure_stderr": 0.0017612814690175535, "rouge1_precision": 0.11516247787644035, "rouge1_precision_stderr": 0.001524841446986784, "rouge1_recall": 0.22747710284617287, "rouge1_recall_stderr": 0.0024176480638780427, "rouge2_fmeasure": 0.02963754911980315, "rouge2_fmeasure_stderr": 0.0008583825054508186, "rouge2_precision": 0.023214286463636956, "rouge2_precision_stderr": 0.0006999863181351649, "rouge2_recall": 0.044410334346547925, "rouge2_recall_stderr": 0.0012895264732825, "rougeL_fmeasure": 0.12196365835192624, "rougeL_fmeasure_stderr": 0.001376970711141067, "rougeL_precision": 0.09351085926051252, "rougeL_precision_stderr": 0.0011626517440736233, "rougeL_recall": 0.1875911020135083, "rougeL_recall_stderr": 0.001982311694030516, "rougeLsum_fmeasure": 0.12556472090225665, "rougeLsum_fmeasure_stderr": 0.0014533252243555674, "rougeLsum_precision": 0.09651005871721619, "rougeLsum_precision_stderr": 0.001243153944300717, "rougeLsum_recall": 0.19223482953366502, "rougeLsum_recall_stderr": 0.002042794088449503}, "generate_gramatically_correct_text": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 7.474747474747475e-05, "rouge1_fmeasure_stderr": 5.378367894229668e-05, "rouge1_precision": 0.0006666666666666666, "rouge1_precision_stderr": 0.00047132592062028273, "rouge1_recall": 3.968253968253968e-05, "rouge1_recall_stderr": 2.8611082123824066e-05, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 7.474747474747475e-05, "rougeL_fmeasure_stderr": 5.378367894229668e-05, "rougeL_precision": 0.0006666666666666666, "rougeL_precision_stderr": 0.00047132592062028273, "rougeL_recall": 3.968253968253968e-05, "rougeL_recall_stderr": 2.8611082123824066e-05, "rougeLsum_fmeasure": 7.474747474747475e-05, "rougeLsum_fmeasure_stderr": 5.378367894229668e-05, "rougeLsum_precision": 0.0006666666666666666, "rougeLsum_precision_stderr": 0.00047132592062028273, "rougeLsum_recall": 3.968253968253968e-05, "rougeLsum_recall_stderr": 2.8611082123824066e-05}, "generate_text_restaurant": {"bleu": 0.5451634050753023, "bleu_stderr": 0.03476122561593657, "rouge1_fmeasure": 0.04310290552413315, "rouge1_fmeasure_stderr": 0.0013354170612674893, "rouge1_precision": 0.046590041682019065, "rouge1_precision_stderr": 0.0018113877733116784, "rouge1_recall": 0.05509845941495712, "rouge1_recall_stderr": 0.0017154015403554534, "rouge2_fmeasure": 0.0063826724183375155, "rouge2_fmeasure_stderr": 0.0004428905323251431, "rouge2_precision": 0.0061050451406738674, "rouge2_precision_stderr": 0.0004913565515769247, "rouge2_recall": 0.009015504658999519, "rouge2_recall_stderr": 0.0006494884515366231, "rougeL_fmeasure": 0.041469788976882346, "rougeL_fmeasure_stderr": 0.0012468639228984321, "rougeL_precision": 0.04442283664669869, "rougeL_precision_stderr": 0.0016857127584432392, "rougeL_recall": 0.053364978373077035, "rougeL_recall_stderr": 0.0016220526842467798, "rougeLsum_fmeasure": 0.03867299387707548, "rougeLsum_fmeasure_stderr": 0.0011657405780310776, "rougeLsum_precision": 0.04212798093112617, "rougeLsum_precision_stderr": 0.00163957458202675, "rougeLsum_recall": 0.04928251726552432, "rougeLsum_recall_stderr": 0.0014753429112820023}, "text": {"bleu": 5.202591268830205, "bleu_stderr": 0.054922717477209206, "rouge1_fmeasure": 0.366483471533585, "rouge1_fmeasure_stderr": 0.002386281579917487, "rouge1_precision": 0.2917166159957922, "rouge1_precision_stderr": 0.002219184062207322, "rouge1_recall": 0.5216732757880385, "rouge1_recall_stderr": 0.003067680737337117, "rouge2_fmeasure": 0.16205539672246214, "rouge2_fmeasure_stderr": 0.0015773137142455621, "rouge2_precision": 0.12771487727214673, "rouge2_precision_stderr": 0.001331726066831988, "rouge2_recall": 0.2362911624934372, "rouge2_recall_stderr": 0.0023048686809464653, "rougeL_fmeasure": 0.2763256240716396, "rougeL_fmeasure_stderr": 0.0016051756708536963, "rougeL_precision": 0.2181802591620801, "rougeL_precision_stderr": 0.0014357601973536051, "rougeL_recall": 0.3992804600446685, "rougeL_recall_stderr": 0.002414052410394236, "rougeLsum_fmeasure": 0.30348032953012855, "rougeLsum_fmeasure_stderr": 0.0020484158677831262, "rougeLsum_precision": 0.24125596921065487, "rougeLsum_precision_stderr": 0.0018822466477212397, "rougeLsum_recall": 0.43323632221522257, "rougeLsum_recall_stderr": 0.0026995804538005315}}, "1": {"coherent_text": {"bleu": 5.552260199704303, "bleu_stderr": 0.057401553819660216, "rouge1_fmeasure": 0.40600971218219595, "rouge1_fmeasure_stderr": 0.0019372178068667871, "rouge1_precision": 0.3259195028808269, "rouge1_precision_stderr": 0.001929323968999145, "rouge1_recall": 0.572157323196931, "rouge1_recall_stderr": 0.0026426040314961857, "rouge2_fmeasure": 0.1695476123074274, "rouge2_fmeasure_stderr": 0.0014000703195505763, "rouge2_precision": 0.13492928471559051, "rouge2_precision_stderr": 0.0011996866679482643, "rouge2_recall": 0.24424415700293647, "rouge2_recall_stderr": 0.002141030333559227, "rougeL_fmeasure": 0.27934677067641567, "rougeL_fmeasure_stderr": 0.0014140720869577354, "rougeL_precision": 0.22280765716264103, "rougeL_precision_stderr": 0.0012944708889095894, "rougeL_recall": 0.39909766220453646, "rougeL_recall_stderr": 0.0023618899336078415, "rougeLsum_fmeasure": 0.3357354638134433, "rougeLsum_fmeasure_stderr": 0.0017725485372938805, "rougeLsum_precision": 0.26925744025419523, "rougeLsum_precision_stderr": 0.0016959347663601083, "rougeLsum_recall": 0.4741109359252275, "rougeLsum_recall_stderr": 0.0025232454404190492}, "create_text_for_me": {"bleu": 5.632510593428182, "bleu_stderr": 0.06797359125001885, "rouge1_fmeasure": 0.3950199600478533, "rouge1_fmeasure_stderr": 0.0016709891545967043, "rouge1_precision": 0.31416640095368, "rouge1_precision_stderr": 0.001676578347273029, "rouge1_recall": 0.5652729230635114, "rouge1_recall_stderr": 0.0024927456901652216, "rouge2_fmeasure": 0.16266540713305758, "rouge2_fmeasure_stderr": 0.0013365594771830832, "rouge2_precision": 0.12825863146984678, "rouge2_precision_stderr": 0.0011505311700009703, "rouge2_recall": 0.2387490220507428, "rouge2_recall_stderr": 0.002140418284139839, "rougeL_fmeasure": 0.26463608485476275, "rougeL_fmeasure_stderr": 0.0013170335870528055, "rougeL_precision": 0.20916346047637208, "rougeL_precision_stderr": 0.0011837672533766093, "rougeL_recall": 0.3840042932400857, "rougeL_recall_stderr": 0.002349227665339613, "rougeLsum_fmeasure": 0.32581485051561293, "rougeLsum_fmeasure_stderr": 0.0016289808132726072, "rougeLsum_precision": 0.25910749063743543, "rougeLsum_precision_stderr": 0.0015532175334562622, "rougeLsum_recall": 0.466741053856193, "rougeLsum_recall_stderr": 0.0024637010906840514}, "generate_gramatically_correct_text": {"bleu": 4.659796079611401, "bleu_stderr": 0.1368931758688134, "rouge1_fmeasure": 0.17808584653315795, "rouge1_fmeasure_stderr": 0.0036047878484278886, "rouge1_precision": 0.17644909352160107, "rouge1_precision_stderr": 0.0038149287890057383, "rouge1_recall": 0.21230940827718442, "rouge1_recall_stderr": 0.00454974484640084, "rouge2_fmeasure": 0.06654255399675307, "rouge2_fmeasure_stderr": 0.0016555414433965217, "rouge2_precision": 0.06341138057720765, "rouge2_precision_stderr": 0.0017418196750443745, "rouge2_recall": 0.08183158513848933, "rouge2_recall_stderr": 0.0021584079306890677, "rougeL_fmeasure": 0.12177858627685004, "rougeL_fmeasure_stderr": 0.0025003205364259196, "rougeL_precision": 0.1238963811146196, "rougeL_precision_stderr": 0.0029398927249347016, "rougeL_recall": 0.14507709522146778, "rougeL_recall_stderr": 0.0031870533189973928, "rougeLsum_fmeasure": 0.1509777229111984, "rougeLsum_fmeasure_stderr": 0.00309077855012835, "rougeLsum_precision": 0.1513576373029321, "rougeLsum_precision_stderr": 0.003401957689671663, "rougeLsum_recall": 0.17927286088246633, "rougeLsum_recall_stderr": 0.0038765489000090123}, "generate_text_restaurant": {"bleu": 11.396112958955289, "bleu_stderr": 0.12774001717020914, "rouge1_fmeasure": 0.4479642771715969, "rouge1_fmeasure_stderr": 0.0023079960050490524, "rouge1_precision": 0.5339579313432855, "rouge1_precision_stderr": 0.0031227264254700595, "rouge1_recall": 0.42523020862127436, "rouge1_recall_stderr": 0.002981115684302531, "rouge2_fmeasure": 0.2056915755809246, "rouge2_fmeasure_stderr": 0.0019163170514045066, "rouge2_precision": 0.248732624279182, "rouge2_precision_stderr": 0.002509203740513705, "rouge2_recall": 0.1952491238459977, "rouge2_recall_stderr": 0.0020864674211335237, "rougeL_fmeasure": 0.3204968358680575, "rougeL_fmeasure_stderr": 0.001985520070286022, "rougeL_precision": 0.3850344789575973, "rougeL_precision_stderr": 0.00278033592883375, "rougeL_recall": 0.3033612056952134, "rougeL_recall_stderr": 0.002388739694886294, "rougeLsum_fmeasure": 0.36313254630951347, "rougeLsum_fmeasure_stderr": 0.0022360796971689707, "rougeLsum_precision": 0.4342921156865786, "rougeLsum_precision_stderr": 0.003003575710929277, "rougeLsum_recall": 0.34421205733760346, "rougeLsum_recall_stderr": 0.0026942002219059206}, "text": {"bleu": 6.510370404578311, "bleu_stderr": 0.07017652112732815, "rouge1_fmeasure": 0.45209118860105924, "rouge1_fmeasure_stderr": 0.0020375402593377735, "rouge1_precision": 0.37679150052732646, "rouge1_precision_stderr": 0.0021690658868186385, "rouge1_recall": 0.5994842792127021, "rouge1_recall_stderr": 0.00255563453868782, "rouge2_fmeasure": 0.20190477038644114, "rouge2_fmeasure_stderr": 0.0015696316292779203, "rouge2_precision": 0.16696448229806954, "rouge2_precision_stderr": 0.0014179634395889713, "rouge2_recall": 0.27258591423013007, "rouge2_recall_stderr": 0.002211872159153144, "rougeL_fmeasure": 0.31497173633802084, "rougeL_fmeasure_stderr": 0.0015604606735793588, "rougeL_precision": 0.2606246077766141, "rougeL_precision_stderr": 0.00149866573783182, "rougeL_recall": 0.4234162076398136, "rougeL_recall_stderr": 0.0024102400017687316, "rougeLsum_fmeasure": 0.3738662228300479, "rougeLsum_fmeasure_stderr": 0.0019441791456726448, "rougeLsum_precision": 0.311329828827097, "rougeLsum_precision_stderr": 0.0019564900632477846, "rougeLsum_recall": 0.4967568002020189, "rougeLsum_recall_stderr": 0.002564011157133322}}, "2": {"coherent_text": {"bleu": 6.320043236974254, "bleu_stderr": 0.0703674127483568, "rouge1_fmeasure": 0.41458290792353053, "rouge1_fmeasure_stderr": 0.0017754408662774433, "rouge1_precision": 0.3311240341080181, "rouge1_precision_stderr": 0.0017853276326339404, "rouge1_recall": 0.5884284298124635, "rouge1_recall_stderr": 0.0025141637691686656, "rouge2_fmeasure": 0.18139573177296225, "rouge2_fmeasure_stderr": 0.0014279319814940361, "rouge2_precision": 0.14343528305064893, "rouge2_precision_stderr": 0.0012055773188276925, "rouge2_recall": 0.2640886894644106, "rouge2_recall_stderr": 0.0022528239016002664, "rougeL_fmeasure": 0.28718354333461665, "rougeL_fmeasure_stderr": 0.0013718939798951697, "rougeL_precision": 0.22789410567174723, "rougeL_precision_stderr": 0.001239923319192286, "rougeL_recall": 0.4134133885978249, "rougeL_recall_stderr": 0.002390777095443192, "rougeLsum_fmeasure": 0.3456449838143285, "rougeLsum_fmeasure_stderr": 0.001679248842503913, "rougeLsum_precision": 0.2757280227521462, "rougeLsum_precision_stderr": 0.0016014662456065143, "rougeLsum_recall": 0.4919426398639349, "rougeLsum_recall_stderr": 0.0024913442742156007}, "create_text_for_me": {"bleu": 6.392142200831627, "bleu_stderr": 0.10315877163155605, "rouge1_fmeasure": 0.4015506196777932, "rouge1_fmeasure_stderr": 0.0016768573292908987, "rouge1_precision": 0.3180209569566378, "rouge1_precision_stderr": 0.0016737440721134676, "rouge1_recall": 0.5778453320981264, "rouge1_recall_stderr": 0.00246975929714272, "rouge2_fmeasure": 0.174575698484694, "rouge2_fmeasure_stderr": 0.0014096927083128974, "rouge2_precision": 0.1368729144877671, "rouge2_precision_stderr": 0.0011691195535521908, "rouge2_recall": 0.257752868294542, "rouge2_recall_stderr": 0.0022701334543955894, "rougeL_fmeasure": 0.2761053811033139, "rougeL_fmeasure_stderr": 0.001353268307874384, "rougeL_precision": 0.21739223338213798, "rougeL_precision_stderr": 0.0011987485605256046, "rougeL_recall": 0.4023377833196794, "rougeL_recall_stderr": 0.002376723574094262, "rougeLsum_fmeasure": 0.33602429370388687, "rougeLsum_fmeasure_stderr": 0.0016395104225531447, "rougeLsum_precision": 0.26596256244660715, "rougeLsum_precision_stderr": 0.0015443527415276346, "rougeLsum_recall": 0.4842569652904595, "rougeLsum_recall_stderr": 0.002471682356356687}, "generate_gramatically_correct_text": {"bleu": 8.349660138830153, "bleu_stderr": 0.1098318499392546, "rouge1_fmeasure": 0.2653185459339885, "rouge1_fmeasure_stderr": 0.003718960309470489, "rouge1_precision": 0.24305067824681437, "rouge1_precision_stderr": 0.0036097560022010406, "rouge1_recall": 0.32723288456222843, "rouge1_recall_stderr": 0.0048182868947828535, "rouge2_fmeasure": 0.10730240032895362, "rouge2_fmeasure_stderr": 0.0018845525107885157, "rouge2_precision": 0.09330398651801074, "rouge2_precision_stderr": 0.0016570405561187557, "rouge2_recall": 0.13658467403886668, "rouge2_recall_stderr": 0.0025405278204551044, "rougeL_fmeasure": 0.18130203037790546, "rougeL_fmeasure_stderr": 0.0025961043557325117, "rougeL_precision": 0.16814754590628309, "rougeL_precision_stderr": 0.0027282671712716595, "rougeL_recall": 0.22485624238227028, "rougeL_recall_stderr": 0.0034527275025707706, "rougeLsum_fmeasure": 0.22128162666490145, "rougeLsum_fmeasure_stderr": 0.0031723992786646523, "rougeLsum_precision": 0.2041811673945807, "rougeLsum_precision_stderr": 0.0031846062911353203, "rougeLsum_recall": 0.27268640990499865, "rougeLsum_recall_stderr": 0.004106397617602531}, "generate_text_restaurant": {"bleu": 12.976405649449786, "bleu_stderr": 0.20113725542092992, "rouge1_fmeasure": 0.47007019927530463, "rouge1_fmeasure_stderr": 0.0022920087841303556, "rouge1_precision": 0.558994227405181, "rouge1_precision_stderr": 0.0032295866736836163, "rouge1_recall": 0.44495262240037453, "rouge1_recall_stderr": 0.0029490508361979437, "rouge2_fmeasure": 0.22591032128288588, "rouge2_fmeasure_stderr": 0.0020560357969047136, "rouge2_precision": 0.27217728286113113, "rouge2_precision_stderr": 0.0026898706377774815, "rouge2_recall": 0.21411207509510502, "rouge2_recall_stderr": 0.002230097528851014, "rougeL_fmeasure": 0.3425232001087342, "rougeL_fmeasure_stderr": 0.0020806737697165846, "rougeL_precision": 0.40975809640656674, "rougeL_precision_stderr": 0.00294221380121017, "rougeL_recall": 0.3236212819558878, "rougeL_recall_stderr": 0.0024673142471829217, "rougeLsum_fmeasure": 0.3859835303051329, "rougeLsum_fmeasure_stderr": 0.0023029475516390383, "rougeLsum_precision": 0.46033224287141866, "rougeLsum_precision_stderr": 0.0031762512099836643, "rougeLsum_recall": 0.3647714098177991, "rougeLsum_recall_stderr": 0.0027210820647532467}, "text": {"bleu": 6.694055179072492, "bleu_stderr": 0.07704898802651756, "rouge1_fmeasure": 0.44487573960989946, "rouge1_fmeasure_stderr": 0.001977816470537129, "rouge1_precision": 0.36979779913491, "rouge1_precision_stderr": 0.0020832359668947016, "rouge1_recall": 0.5914089255959427, "rouge1_recall_stderr": 0.0025404798791399787, "rouge2_fmeasure": 0.20141804290762905, "rouge2_fmeasure_stderr": 0.0015588769877815645, "rouge2_precision": 0.16613522297283992, "rouge2_precision_stderr": 0.001397693878296302, "rouge2_recall": 0.27310008704696526, "rouge2_recall_stderr": 0.0022423121530451896, "rougeL_fmeasure": 0.313184127205453, "rougeL_fmeasure_stderr": 0.0015466479745984572, "rougeL_precision": 0.25859076070964365, "rougeL_precision_stderr": 0.0014689767050739151, "rougeL_recall": 0.42183833274839183, "rougeL_recall_stderr": 0.002417511355707716, "rougeLsum_fmeasure": 0.3704488980543351, "rougeLsum_fmeasure_stderr": 0.0019175726004970364, "rougeLsum_precision": 0.3076440801284151, "rougeLsum_precision_stderr": 0.0019053103567047945, "rougeLsum_recall": 0.4935606984408204, "rougeLsum_recall_stderr": 0.0025806815753750124}}, "3": {"coherent_text": {"bleu": 6.622404462604137, "bleu_stderr": 0.0600956886865709, "rouge1_fmeasure": 0.4139372581461456, "rouge1_fmeasure_stderr": 0.0017028503243186983, "rouge1_precision": 0.32978578163106964, "rouge1_precision_stderr": 0.001728792250109152, "rouge1_recall": 0.58993990670276, "rouge1_recall_stderr": 0.0024371452756877373, "rouge2_fmeasure": 0.18499822465619842, "rouge2_fmeasure_stderr": 0.0014157929728201822, "rouge2_precision": 0.14600046958938082, "rouge2_precision_stderr": 0.0011984602947182365, "rouge2_recall": 0.2703010278791222, "rouge2_recall_stderr": 0.0022446737679045724, "rougeL_fmeasure": 0.28955171897620546, "rougeL_fmeasure_stderr": 0.0013460315368352376, "rougeL_precision": 0.22926603301976495, "rougeL_precision_stderr": 0.0012193510568444229, "rougeL_recall": 0.4182116954475941, "rougeL_recall_stderr": 0.002363070228365806, "rougeLsum_fmeasure": 0.34678316488645144, "rougeLsum_fmeasure_stderr": 0.0016548796761139582, "rougeLsum_precision": 0.2760388688353806, "rougeLsum_precision_stderr": 0.001581361548630609, "rougeLsum_recall": 0.49527056164943506, "rougeLsum_recall_stderr": 0.0024657921238186126}, "create_text_for_me": {"bleu": 6.63349544243743, "bleu_stderr": 0.05281657466901086, "rouge1_fmeasure": 0.3974550338517387, "rouge1_fmeasure_stderr": 0.0016482540144611944, "rouge1_precision": 0.31357525152538196, "rouge1_precision_stderr": 0.001623161380165631, "rouge1_recall": 0.5751723085823159, "rouge1_recall_stderr": 0.002495778990632278, "rouge2_fmeasure": 0.17547831360286914, "rouge2_fmeasure_stderr": 0.0014288067294337741, "rouge2_precision": 0.1371291641511403, "rouge2_precision_stderr": 0.0011818735703140412, "rouge2_recall": 0.260356557510865, "rouge2_recall_stderr": 0.0023088013029268985, "rougeL_fmeasure": 0.2754925401248682, "rougeL_fmeasure_stderr": 0.0013545543541219737, "rougeL_precision": 0.216247455101675, "rougeL_precision_stderr": 0.0011928912670333205, "rougeL_recall": 0.4032048757502036, "rougeL_recall_stderr": 0.0023797635332692417, "rougeLsum_fmeasure": 0.33527731886149553, "rougeLsum_fmeasure_stderr": 0.001640163433964876, "rougeLsum_precision": 0.26435133382219206, "rougeLsum_precision_stderr": 0.0015244603278307882, "rougeLsum_recall": 0.48612921079402044, "rougeLsum_recall_stderr": 0.0025405746329231096}, "generate_gramatically_correct_text": {"bleu": 9.100604672627616, "bleu_stderr": 0.11628331252327472, "rouge1_fmeasure": 0.3057154871720569, "rouge1_fmeasure_stderr": 0.0035668673332201113, "rouge1_precision": 0.2744732005419624, "rouge1_precision_stderr": 0.0034020795182056205, "rouge1_recall": 0.3792037077764579, "rouge1_recall_stderr": 0.00465258042529242, "rouge2_fmeasure": 0.12907057263921567, "rouge2_fmeasure_stderr": 0.0019101957633704694, "rouge2_precision": 0.11192035685967208, "rouge2_precision_stderr": 0.0016787135403730127, "rouge2_recall": 0.16449012589807802, "rouge2_recall_stderr": 0.0025920539576727714, "rougeL_fmeasure": 0.21199121837713777, "rougeL_fmeasure_stderr": 0.0025587269926346133, "rougeL_precision": 0.19158493144962355, "rougeL_precision_stderr": 0.0025745366569725494, "rougeL_recall": 0.2642632272554717, "rougeL_recall_stderr": 0.0034193238088634525, "rougeLsum_fmeasure": 0.2557399896778223, "rougeLsum_fmeasure_stderr": 0.0030869457481539284, "rougeLsum_precision": 0.23056997456483158, "rougeLsum_precision_stderr": 0.0030071550428457374, "rougeLsum_recall": 0.3171677356585434, "rougeLsum_recall_stderr": 0.00402997199179447}, "generate_text_restaurant": {"bleu": 13.841780368990428, "bleu_stderr": 0.16004525418920557, "rouge1_fmeasure": 0.4775062117887383, "rouge1_fmeasure_stderr": 0.0022520658159749783, "rouge1_precision": 0.5672114023663154, "rouge1_precision_stderr": 0.0031836037058964135, "rouge1_recall": 0.4506315681499688, "rouge1_recall_stderr": 0.0028976257859202057, "rouge2_fmeasure": 0.23547797340215765, "rouge2_fmeasure_stderr": 0.002080191282609909, "rouge2_precision": 0.2831489435622744, "rouge2_precision_stderr": 0.0026849363746315716, "rouge2_recall": 0.2224359793863677, "rouge2_recall_stderr": 0.0022585552678834257, "rougeL_fmeasure": 0.3522099816274127, "rougeL_fmeasure_stderr": 0.0021360587201332483, "rougeL_precision": 0.42052513955243626, "rougeL_precision_stderr": 0.0029794181672521524, "rougeL_recall": 0.3316609702212396, "rougeL_recall_stderr": 0.002482260811969057, "rougeLsum_fmeasure": 0.39783266320253785, "rougeLsum_fmeasure_stderr": 0.0023487903214587246, "rougeLsum_precision": 0.4731322251491655, "rougeLsum_precision_stderr": 0.0031830112351273762, "rougeLsum_recall": 0.3751305124701987, "rougeLsum_recall_stderr": 0.0027537781991579034}, "text": {"bleu": 6.6514653763342375, "bleu_stderr": 0.06251939465471733, "rouge1_fmeasure": 0.4380843233846489, "rouge1_fmeasure_stderr": 0.0019929244812592456, "rouge1_precision": 0.36420647303972614, "rouge1_precision_stderr": 0.0020915518131283885, "rouge1_recall": 0.5822439340272905, "rouge1_recall_stderr": 0.002574821849509054, "rouge2_fmeasure": 0.19894653788829594, "rouge2_fmeasure_stderr": 0.0016174170448261663, "rouge2_precision": 0.16409417489732006, "rouge2_precision_stderr": 0.001445153868542824, "rouge2_recall": 0.27002111151104974, "rouge2_recall_stderr": 0.0023305847933292184, "rougeL_fmeasure": 0.3101723423743742, "rougeL_fmeasure_stderr": 0.00157850445644739, "rougeL_precision": 0.25631927067186866, "rougeL_precision_stderr": 0.0015047882308515217, "rougeL_recall": 0.4172204517599229, "rougeL_recall_stderr": 0.002430745636023191, "rougeLsum_fmeasure": 0.3650855458428154, "rougeLsum_fmeasure_stderr": 0.0019418658394657236, "rougeLsum_precision": 0.3033028389713666, "rougeLsum_precision_stderr": 0.0019232661628729548, "rougeLsum_recall": 0.48618027753856774, "rougeLsum_recall_stderr": 0.00262308382372153}}, "4": {"coherent_text": {"bleu": 6.726984471838634, "bleu_stderr": 0.0796088492574431, "rouge1_fmeasure": 0.4106941328054044, "rouge1_fmeasure_stderr": 0.0017229230075679713, "rouge1_precision": 0.32648518891836104, "rouge1_precision_stderr": 0.0017271914729613752, "rouge1_recall": 0.586257235035879, "rouge1_recall_stderr": 0.0024423096252639558, "rouge2_fmeasure": 0.18449634822804548, "rouge2_fmeasure_stderr": 0.001477498111348285, "rouge2_precision": 0.1453440606347176, "rouge2_precision_stderr": 0.0012436842104140615, "rouge2_recall": 0.26971984634892004, "rouge2_recall_stderr": 0.002313107941237128, "rougeL_fmeasure": 0.28803275605668655, "rougeL_fmeasure_stderr": 0.001367927628256527, "rougeL_precision": 0.22765173104030403, "rougeL_precision_stderr": 0.001238329714231677, "rougeL_recall": 0.41649689481485414, "rougeL_recall_stderr": 0.002350472856644654, "rougeLsum_fmeasure": 0.3458744644841339, "rougeLsum_fmeasure_stderr": 0.0016762712143040775, "rougeLsum_precision": 0.274783630094611, "rougeLsum_precision_stderr": 0.0015950198452968627, "rougeLsum_recall": 0.4945541982831927, "rougeLsum_recall_stderr": 0.002451202764511733}, "create_text_for_me": {"bleu": 6.709765572579541, "bleu_stderr": 0.06261403260610318, "rouge1_fmeasure": 0.38937605226208516, "rouge1_fmeasure_stderr": 0.0017014154240876697, "rouge1_precision": 0.30664438468283994, "rouge1_precision_stderr": 0.0016595572641215095, "rouge1_recall": 0.5650188060391115, "rouge1_recall_stderr": 0.0024946022217113522, "rouge2_fmeasure": 0.1739020471143616, "rouge2_fmeasure_stderr": 0.001452734859135155, "rouge2_precision": 0.13579088054388685, "rouge2_precision_stderr": 0.0012067203935791926, "rouge2_recall": 0.25817000306049603, "rouge2_recall_stderr": 0.0023009577668858558, "rougeL_fmeasure": 0.27171532019877925, "rougeL_fmeasure_stderr": 0.0013724928070502846, "rougeL_precision": 0.21289213132253543, "rougeL_precision_stderr": 0.0012160818190789682, "rougeL_recall": 0.3987146480229239, "rougeL_recall_stderr": 0.002331544671296655, "rougeLsum_fmeasure": 0.3302419538023624, "rougeLsum_fmeasure_stderr": 0.0016726242236806516, "rougeLsum_precision": 0.26005394613535154, "rougeLsum_precision_stderr": 0.001558252869531713, "rougeLsum_recall": 0.4795521973701686, "rougeLsum_recall_stderr": 0.0024908487929554543}, "generate_gramatically_correct_text": {"bleu": 9.460262600466185, "bleu_stderr": 0.06690235758953499, "rouge1_fmeasure": 0.33535988312538023, "rouge1_fmeasure_stderr": 0.00333242597817113, "rouge1_precision": 0.29909310953486523, "rouge1_precision_stderr": 0.003198137138338973, "rouge1_recall": 0.4171947659672954, "rouge1_recall_stderr": 0.004380236874977217, "rouge2_fmeasure": 0.14413942196532828, "rouge2_fmeasure_stderr": 0.0018940385351166674, "rouge2_precision": 0.12487041668274895, "rouge2_precision_stderr": 0.0016702178366713861, "rouge2_recall": 0.18358213761587733, "rouge2_recall_stderr": 0.0025750805740997595, "rougeL_fmeasure": 0.23407187738369561, "rougeL_fmeasure_stderr": 0.0024485182965088507, "rougeL_precision": 0.2095831569760216, "rougeL_precision_stderr": 0.002448524558694762, "rougeL_recall": 0.2928182365204402, "rougeL_recall_stderr": 0.0033071812049612808, "rougeLsum_fmeasure": 0.2812675916988234, "rougeLsum_fmeasure_stderr": 0.0029431631516787177, "rougeLsum_precision": 0.2514610077689454, "rougeLsum_precision_stderr": 0.002860099081408974, "rougeLsum_recall": 0.3501029527885422, "rougeLsum_recall_stderr": 0.0038717879646786944}, "generate_text_restaurant": {"bleu": 14.347043577937871, "bleu_stderr": 0.13183200117809515, "rouge1_fmeasure": 0.4791803139648653, "rouge1_fmeasure_stderr": 0.0022359999842246512, "rouge1_precision": 0.5657141271378652, "rouge1_precision_stderr": 0.0032494806378829547, "rouge1_recall": 0.4543139798474618, "rouge1_recall_stderr": 0.002865992018425755, "rouge2_fmeasure": 0.23765394178309218, "rouge2_fmeasure_stderr": 0.0021422406093072697, "rouge2_precision": 0.2840374640470508, "rouge2_precision_stderr": 0.0028091760925762137, "rouge2_recall": 0.22548495306647312, "rouge2_recall_stderr": 0.002307333846171272, "rougeL_fmeasure": 0.3551586826634256, "rougeL_fmeasure_stderr": 0.0021591742079548407, "rougeL_precision": 0.4208287732411973, "rougeL_precision_stderr": 0.003031588432467615, "rougeL_recall": 0.33645734099493246, "rougeL_recall_stderr": 0.0025176571940632108, "rougeLsum_fmeasure": 0.40243539061119715, "rougeLsum_fmeasure_stderr": 0.002367520270615011, "rougeLsum_precision": 0.47480321305856177, "rougeLsum_precision_stderr": 0.003220379872286072, "rougeLsum_recall": 0.38184109614706496, "rougeLsum_recall_stderr": 0.002799589796872638}, "text": {"bleu": 6.608484443770737, "bleu_stderr": 0.09177253914044482, "rouge1_fmeasure": 0.433038880083634, "rouge1_fmeasure_stderr": 0.001976429894642913, "rouge1_precision": 0.36000067737683655, "rouge1_precision_stderr": 0.002059158783528559, "rouge1_recall": 0.5751430196884769, "rouge1_recall_stderr": 0.0025553447543118276, "rouge2_fmeasure": 0.19618559845779804, "rouge2_fmeasure_stderr": 0.0016525158117788473, "rouge2_precision": 0.16179960223783285, "rouge2_precision_stderr": 0.001465512561186318, "rouge2_recall": 0.26598167030517494, "rouge2_recall_stderr": 0.0023601948549819467, "rougeL_fmeasure": 0.3073988661453901, "rougeL_fmeasure_stderr": 0.0016004229019968986, "rougeL_precision": 0.25407783683621693, "rougeL_precision_stderr": 0.0015117553852618231, "rougeL_recall": 0.4129708872099754, "rougeL_recall_stderr": 0.0024312767233502217, "rougeLsum_fmeasure": 0.3634810794306642, "rougeLsum_fmeasure_stderr": 0.0019587560708400013, "rougeLsum_precision": 0.3020096150583601, "rougeLsum_precision_stderr": 0.00192602224235909, "rougeLsum_recall": 0.4835110037230676, "rougeLsum_recall_stderr": 0.0026281093270445572}}, "5": {"coherent_text": {"bleu": 6.5756845375883355, "bleu_stderr": 0.08897150277070985, "rouge1_fmeasure": 0.4044662434309372, "rouge1_fmeasure_stderr": 0.0017458799577526741, "rouge1_precision": 0.32116519599671667, "rouge1_precision_stderr": 0.0017400612729082726, "rouge1_recall": 0.5784532325776863, "rouge1_recall_stderr": 0.0024510408288361423, "rouge2_fmeasure": 0.180205137590262, "rouge2_fmeasure_stderr": 0.0014510894649712618, "rouge2_precision": 0.14182020295749317, "rouge2_precision_stderr": 0.0012237648320547914, "rouge2_recall": 0.26385422416008314, "rouge2_recall_stderr": 0.0022588904922048233, "rougeL_fmeasure": 0.2844408397217688, "rougeL_fmeasure_stderr": 0.001371012610736257, "rougeL_precision": 0.22455155493389567, "rougeL_precision_stderr": 0.0012390887238885092, "rougeL_recall": 0.411817587585727, "rougeL_recall_stderr": 0.0023153768885770987, "rougeLsum_fmeasure": 0.3423573003846329, "rougeLsum_fmeasure_stderr": 0.0017004223296000738, "rougeLsum_precision": 0.27161546746024895, "rougeLsum_precision_stderr": 0.0016061149766465918, "rougeLsum_recall": 0.490609400586177, "rougeLsum_recall_stderr": 0.0024821818387890946}, "create_text_for_me": {"bleu": 6.575403777061748, "bleu_stderr": 0.09823540673154174, "rouge1_fmeasure": 0.3858772280160323, "rouge1_fmeasure_stderr": 0.0016981938784725468, "rouge1_precision": 0.30336615193836675, "rouge1_precision_stderr": 0.001651812386890844, "rouge1_recall": 0.5612874601960536, "rouge1_recall_stderr": 0.002474965171767822, "rouge2_fmeasure": 0.17145057785135936, "rouge2_fmeasure_stderr": 0.0014287502471356415, "rouge2_precision": 0.1336565960148854, "rouge2_precision_stderr": 0.0011883817497889528, "rouge2_recall": 0.25509786184825395, "rouge2_recall_stderr": 0.0022469620904438923, "rougeL_fmeasure": 0.26945646442741866, "rougeL_fmeasure_stderr": 0.0013738953482368282, "rougeL_precision": 0.21071104756711098, "rougeL_precision_stderr": 0.0012104628563537965, "rougeL_recall": 0.396382791753183, "rougeL_recall_stderr": 0.002322655929385875, "rougeLsum_fmeasure": 0.32763137597619785, "rougeLsum_fmeasure_stderr": 0.0016696428766529263, "rougeLsum_precision": 0.2575546053753626, "rougeLsum_precision_stderr": 0.0015516497103719832, "rougeLsum_recall": 0.47687647145147444, "rougeLsum_recall_stderr": 0.0024725571404500667}, "generate_gramatically_correct_text": {"bleu": 9.58290843334344, "bleu_stderr": 0.09940670882099904, "rouge1_fmeasure": 0.35170201207032026, "rouge1_fmeasure_stderr": 0.0031635095609000353, "rouge1_precision": 0.3125044962383802, "rouge1_precision_stderr": 0.003072058347749594, "rouge1_recall": 0.43832279193619333, "rouge1_recall_stderr": 0.004161083245905112, "rouge2_fmeasure": 0.15257686050529207, "rouge2_fmeasure_stderr": 0.0018607982697786416, "rouge2_precision": 0.13168237302502842, "rouge2_precision_stderr": 0.0016416394703174114, "rouge2_recall": 0.19471607478958905, "rouge2_recall_stderr": 0.0025429321562742252, "rougeL_fmeasure": 0.24599327220157718, "rougeL_fmeasure_stderr": 0.0023449351106501785, "rougeL_precision": 0.2193444318712155, "rougeL_precision_stderr": 0.0023735293014745776, "rougeL_recall": 0.308468298974907, "rougeL_recall_stderr": 0.003191423466850776, "rougeLsum_fmeasure": 0.2949470778370446, "rougeLsum_fmeasure_stderr": 0.0027957905211735317, "rougeLsum_precision": 0.26273018537823156, "rougeLsum_precision_stderr": 0.0027544779870844888, "rougeLsum_recall": 0.3678867286427804, "rougeLsum_recall_stderr": 0.0036888444777911644}, "generate_text_restaurant": {"bleu": 14.305213942543, "bleu_stderr": 0.1483607819215848, "rouge1_fmeasure": 0.4804680058354203, "rouge1_fmeasure_stderr": 0.0021741397703150594, "rouge1_precision": 0.5597772799865962, "rouge1_precision_stderr": 0.0031779357829373007, "rouge1_recall": 0.45931387100423793, "rouge1_recall_stderr": 0.002804556836326018, "rouge2_fmeasure": 0.2366049201616526, "rouge2_fmeasure_stderr": 0.0020482405764989274, "rouge2_precision": 0.27941286210601324, "rouge2_precision_stderr": 0.002685923767684149, "rouge2_recall": 0.2261024234770814, "rouge2_recall_stderr": 0.002220825905817483, "rougeL_fmeasure": 0.3555529772285316, "rougeL_fmeasure_stderr": 0.0020868456031009723, "rougeL_precision": 0.4153388685887487, "rougeL_precision_stderr": 0.002922026669760074, "rougeL_recall": 0.33991925444128, "rougeL_recall_stderr": 0.0024671212347911007, "rougeLsum_fmeasure": 0.403868227449958, "rougeLsum_fmeasure_stderr": 0.0023025847428200684, "rougeLsum_precision": 0.470187398074112, "rougeLsum_precision_stderr": 0.0031325933820836164, "rougeLsum_recall": 0.3864435015330604, "rougeLsum_recall_stderr": 0.0027449279832689228}, "text": {"bleu": 6.335359360908907, "bleu_stderr": 0.0880259547967717, "rouge1_fmeasure": 0.4267199017854688, "rouge1_fmeasure_stderr": 0.0019587924811470827, "rouge1_precision": 0.3544672605326427, "rouge1_precision_stderr": 0.0020400084083317866, "rouge1_recall": 0.5675350811323949, "rouge1_recall_stderr": 0.0025580342690609322, "rouge2_fmeasure": 0.19065995281551984, "rouge2_fmeasure_stderr": 0.001637747580731308, "rouge2_precision": 0.15730682660007175, "rouge2_precision_stderr": 0.0014601294548965934, "rouge2_recall": 0.25829361144949015, "rouge2_recall_stderr": 0.00230882843770772, "rougeL_fmeasure": 0.30167860222723625, "rougeL_fmeasure_stderr": 0.0015893036860766903, "rougeL_precision": 0.24923159048984417, "rougeL_precision_stderr": 0.0015024102603759335, "rougeL_recall": 0.40544016089116924, "rougeL_recall_stderr": 0.002402265187758597, "rougeLsum_fmeasure": 0.35740425194488684, "rougeLsum_fmeasure_stderr": 0.001944256853237914, "rougeLsum_precision": 0.2968667042793682, "rougeLsum_precision_stderr": 0.0019183519029005254, "rougeLsum_recall": 0.47565074316708567, "rougeLsum_recall_stderr": 0.0025968713342409276}}}, "gem_xsum": {"0": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.8720915413287463, "bleu_stderr": 0.06977921896956246, "rouge1_fmeasure": 0.1404701926071358, "rouge1_fmeasure_stderr": 0.0023545854909488343, "rouge1_precision": 0.09962536145007753, "rouge1_precision_stderr": 0.001740659618979375, "rouge1_recall": 0.2495957789912658, "rouge1_recall_stderr": 0.004084832622173279, "rouge2_fmeasure": 0.021410321262745457, "rouge2_fmeasure_stderr": 0.0010461597649610937, "rouge2_precision": 0.015032914044550117, "rouge2_precision_stderr": 0.000742400789969203, "rouge2_recall": 0.03926943996615448, "rouge2_recall_stderr": 0.001959252805398743, "rougeL_fmeasure": 0.11379782676552355, "rougeL_fmeasure_stderr": 0.001698506173428402, "rougeL_precision": 0.08055461541255139, "rougeL_precision_stderr": 0.001249774505808259, "rougeL_recall": 0.20337457259832462, "rougeL_recall_stderr": 0.003060417391791486, "rougeLsum_fmeasure": 0.11421613258385542, "rougeLsum_fmeasure_stderr": 0.001935031419720415, "rougeLsum_precision": 0.08079598752449389, "rougeLsum_precision_stderr": 0.0014172566441327129, "rougeLsum_recall": 0.20443087549335434, "rougeLsum_recall_stderr": 0.0034517764908789015}, "DOC_tldr": {"bleu": 1.969669748694084, "bleu_stderr": 0.11920564417930218, "rouge1_fmeasure": 0.20129747803416173, "rouge1_fmeasure_stderr": 0.0027922294615110735, "rouge1_precision": 0.15468558801825094, "rouge1_precision_stderr": 0.0027906195490837874, "rouge1_recall": 0.3291046037990046, "rouge1_recall_stderr": 0.004388092575953328, "rouge2_fmeasure": 0.05123793702073981, "rouge2_fmeasure_stderr": 0.0018214840639693597, "rouge2_precision": 0.0400294063524096, "rouge2_precision_stderr": 0.001724105483571795, "rouge2_recall": 0.08497447416593046, "rouge2_recall_stderr": 0.0029266165872910183, "rougeL_fmeasure": 0.16254032673358534, "rougeL_fmeasure_stderr": 0.002238751342864787, "rougeL_precision": 0.12465526978296572, "rougeL_precision_stderr": 0.002299651183476469, "rougeL_recall": 0.26725068507569655, "rougeL_recall_stderr": 0.0035760579918268356, "rougeLsum_fmeasure": 0.15699398066183545, "rougeLsum_fmeasure_stderr": 0.002413809068769271, "rougeLsum_precision": 0.12064651566409819, "rougeLsum_precision_stderr": 0.002398515107357595, "rougeLsum_recall": 0.25801606284267453, "rougeLsum_recall_stderr": 0.0038925350228375816}, "article_DOC_summary": {"bleu": 2.103288067343989, "bleu_stderr": 0.12415880355418442, "rouge1_fmeasure": 0.2135984788737725, "rouge1_fmeasure_stderr": 0.0026714992621488937, "rouge1_precision": 0.1691487054500622, "rouge1_precision_stderr": 0.0024250397934530475, "rouge1_recall": 0.3329424577074328, "rouge1_recall_stderr": 0.004657379753879945, "rouge2_fmeasure": 0.04790575968435739, "rouge2_fmeasure_stderr": 0.0017255007150536288, "rouge2_precision": 0.03660197341061794, "rouge2_precision_stderr": 0.0013876615412254558, "rouge2_recall": 0.07882441796533844, "rouge2_recall_stderr": 0.0029121244310315966, "rougeL_fmeasure": 0.15810158263668517, "rougeL_fmeasure_stderr": 0.0020195259778767218, "rougeL_precision": 0.12480506419752084, "rougeL_precision_stderr": 0.0018165017981600132, "rougeL_recall": 0.24849031836840368, "rougeL_recall_stderr": 0.0036491823702079194, "rougeLsum_fmeasure": 0.16520124922802173, "rougeLsum_fmeasure_stderr": 0.0022185800515451995, "rougeLsum_precision": 0.12993544368406393, "rougeLsum_precision_stderr": 0.0019145615600976785, "rougeLsum_recall": 0.26077607018509813, "rougeLsum_recall_stderr": 0.004054503415661984}, "summarize_DOC": {"bleu": 1.575373380176573, "bleu_stderr": 0.0743995294897499, "rouge1_fmeasure": 0.2065385624658632, "rouge1_fmeasure_stderr": 0.002495181752673378, "rouge1_precision": 0.15951620968916577, "rouge1_precision_stderr": 0.002506196733818455, "rouge1_recall": 0.3380965213762154, "rouge1_recall_stderr": 0.004239971963662302, "rouge2_fmeasure": 0.041608603555378855, "rouge2_fmeasure_stderr": 0.0015173762355031157, "rouge2_precision": 0.03267382741321086, "rouge2_precision_stderr": 0.0015503006643301331, "rouge2_recall": 0.06995212128557793, "rouge2_recall_stderr": 0.002538369711070906, "rougeL_fmeasure": 0.1492749985320237, "rougeL_fmeasure_stderr": 0.0018430993892411397, "rougeL_precision": 0.1158742599512755, "rougeL_precision_stderr": 0.0019653714280750737, "rougeL_recall": 0.2450321311305678, "rougeL_recall_stderr": 0.0032031104916126017, "rougeLsum_fmeasure": 0.16300942364929533, "rougeLsum_fmeasure_stderr": 0.002098349740204003, "rougeLsum_precision": 0.12563518916564906, "rougeLsum_precision_stderr": 0.0020584621317715304, "rougeLsum_recall": 0.26886059673342566, "rougeLsum_recall_stderr": 0.0037413258402390244}, "summarize_this_DOC_summary": {"bleu": 2.064984135458196, "bleu_stderr": 0.11276853324640117, "rouge1_fmeasure": 0.21463729419325275, "rouge1_fmeasure_stderr": 0.002671429409363269, "rouge1_precision": 0.1656184917860495, "rouge1_precision_stderr": 0.002465318130849214, "rouge1_recall": 0.3451271695394911, "rouge1_recall_stderr": 0.004485695093538501, "rouge2_fmeasure": 0.04915332059409096, "rouge2_fmeasure_stderr": 0.0017122102302659385, "rouge2_precision": 0.037345942132846435, "rouge2_precision_stderr": 0.0014462619260980324, "rouge2_recall": 0.08165828005081678, "rouge2_recall_stderr": 0.0028400731706742888, "rougeL_fmeasure": 0.159314857725847, "rougeL_fmeasure_stderr": 0.0020081165474175313, "rougeL_precision": 0.12269872075535913, "rougeL_precision_stderr": 0.0018643694649009327, "rougeL_recall": 0.25782978020360103, "rougeL_recall_stderr": 0.0034687018620499803, "rougeLsum_fmeasure": 0.1659189540151564, "rougeLsum_fmeasure_stderr": 0.0022284342453661927, "rougeLsum_precision": 0.12736067944965968, "rougeLsum_precision_stderr": 0.001970254989936668, "rougeLsum_recall": 0.26933208119177504, "rougeLsum_recall_stderr": 0.003908747660119136}}, "1": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.7071340922221964, "bleu_stderr": 0.06567476459760405, "rouge1_fmeasure": 0.1568991797418592, "rouge1_fmeasure_stderr": 0.002216712754065909, "rouge1_precision": 0.11166497670488186, "rouge1_precision_stderr": 0.001646093728324672, "rouge1_recall": 0.2745822551275572, "rouge1_recall_stderr": 0.0037626527798719244, "rouge2_fmeasure": 0.0187210253001416, "rouge2_fmeasure_stderr": 0.0009208759050088938, "rouge2_precision": 0.013225636476818368, "rouge2_precision_stderr": 0.0006474952763639561, "rouge2_recall": 0.03329408008418082, "rouge2_recall_stderr": 0.0017080640073784744, "rougeL_fmeasure": 0.11103657989201658, "rougeL_fmeasure_stderr": 0.0014320021304524589, "rougeL_precision": 0.07882774952520541, "rougeL_precision_stderr": 0.0010561315770534385, "rougeL_recall": 0.1958952699537899, "rougeL_recall_stderr": 0.0025491385981828486, "rougeLsum_fmeasure": 0.126287265188546, "rougeLsum_fmeasure_stderr": 0.00174254258134218, "rougeLsum_precision": 0.0896933686749845, "rougeLsum_precision_stderr": 0.0012859991795038399, "rougeLsum_recall": 0.2224374288611004, "rougeLsum_recall_stderr": 0.0030478633438990885}, "DOC_tldr": {"bleu": 1.9843126589812214, "bleu_stderr": 0.07803286264393723, "rouge1_fmeasure": 0.19916839431146724, "rouge1_fmeasure_stderr": 0.0025060831527181906, "rouge1_precision": 0.14151483177458335, "rouge1_precision_stderr": 0.0018666664224144868, "rouge1_recall": 0.350507712921647, "rouge1_recall_stderr": 0.004347527687226855, "rouge2_fmeasure": 0.050711870640867504, "rouge2_fmeasure_stderr": 0.0016273936819634284, "rouge2_precision": 0.03563953027984095, "rouge2_precision_stderr": 0.0011469030277248067, "rouge2_recall": 0.09161704225190265, "rouge2_recall_stderr": 0.003025312844144268, "rougeL_fmeasure": 0.1584759141966615, "rougeL_fmeasure_stderr": 0.0019769396812308743, "rougeL_precision": 0.11241775738220008, "rougeL_precision_stderr": 0.0014552290073169463, "rougeL_recall": 0.28016025506550185, "rougeL_recall_stderr": 0.0035605767069095784, "rougeLsum_fmeasure": 0.15757176490392727, "rougeLsum_fmeasure_stderr": 0.0021232145053781655, "rougeLsum_precision": 0.11164690738304506, "rougeLsum_precision_stderr": 0.001554937658585751, "rougeLsum_recall": 0.27937348602219997, "rougeLsum_recall_stderr": 0.003826151047457801}, "article_DOC_summary": {"bleu": 1.7714825425476433, "bleu_stderr": 0.10905675609895228, "rouge1_fmeasure": 0.19244278875333556, "rouge1_fmeasure_stderr": 0.002639391462026543, "rouge1_precision": 0.1369839651831878, "rouge1_precision_stderr": 0.0019614322408207006, "rouge1_recall": 0.3369949215642884, "rouge1_recall_stderr": 0.004522579510481593, "rouge2_fmeasure": 0.04413377405232099, "rouge2_fmeasure_stderr": 0.0015985481401232032, "rouge2_precision": 0.031094803643512716, "rouge2_precision_stderr": 0.0011365997631264868, "rouge2_recall": 0.07935713496492708, "rouge2_recall_stderr": 0.002884276945599807, "rougeL_fmeasure": 0.1500620628581487, "rougeL_fmeasure_stderr": 0.0020207447957827454, "rougeL_precision": 0.10657563716596334, "rougeL_precision_stderr": 0.0014875062360428326, "rougeL_recall": 0.26467978376956175, "rougeL_recall_stderr": 0.0036163241442920165, "rougeLsum_fmeasure": 0.15187454343375792, "rougeLsum_fmeasure_stderr": 0.0021906040942089064, "rougeLsum_precision": 0.1078060619083588, "rougeLsum_precision_stderr": 0.0016045831813046104, "rougeLsum_recall": 0.2680535986457037, "rougeLsum_recall_stderr": 0.0038953175735036904}, "summarize_DOC": {"bleu": 1.8684098635173505, "bleu_stderr": 0.06338992961884797, "rouge1_fmeasure": 0.20332653496456884, "rouge1_fmeasure_stderr": 0.002535349618927259, "rouge1_precision": 0.14477662439175204, "rouge1_precision_stderr": 0.0018795188514419876, "rouge1_recall": 0.3557260266989316, "rouge1_recall_stderr": 0.004456910041921236, "rouge2_fmeasure": 0.04729852367510724, "rouge2_fmeasure_stderr": 0.0015420776527216781, "rouge2_precision": 0.0332459231860444, "rouge2_precision_stderr": 0.0010850655973791109, "rouge2_recall": 0.08572120065468666, "rouge2_recall_stderr": 0.002895224082192127, "rougeL_fmeasure": 0.15359088976552104, "rougeL_fmeasure_stderr": 0.0019473489961876593, "rougeL_precision": 0.10920770628719932, "rougeL_precision_stderr": 0.0014291037457289909, "rougeL_recall": 0.27011776305596435, "rougeL_recall_stderr": 0.0035598997301664764, "rougeLsum_fmeasure": 0.16157946667427311, "rougeLsum_fmeasure_stderr": 0.002148423192266251, "rougeLsum_precision": 0.11478861556780529, "rougeLsum_precision_stderr": 0.0015689281670261022, "rougeLsum_recall": 0.2847152787941595, "rougeLsum_recall_stderr": 0.0039331867569525865}, "summarize_this_DOC_summary": {"bleu": 1.558306696837244, "bleu_stderr": 0.09981504570416164, "rouge1_fmeasure": 0.17624612929896505, "rouge1_fmeasure_stderr": 0.0025718000739001897, "rouge1_precision": 0.1260899791574131, "rouge1_precision_stderr": 0.001905161571513694, "rouge1_recall": 0.30488764933726087, "rouge1_recall_stderr": 0.004406152742466401, "rouge2_fmeasure": 0.038642466121148654, "rouge2_fmeasure_stderr": 0.0015244525321813124, "rouge2_precision": 0.02734160155747233, "rouge2_precision_stderr": 0.0010866592288641356, "rouge2_recall": 0.06868815407030224, "rouge2_recall_stderr": 0.0027368160750971225, "rougeL_fmeasure": 0.14308653397520427, "rougeL_fmeasure_stderr": 0.002030967495914565, "rougeL_precision": 0.10218411344219641, "rougeL_precision_stderr": 0.0014948765149499935, "rougeL_recall": 0.24887472602828783, "rougeL_recall_stderr": 0.0035843610201584087, "rougeLsum_fmeasure": 0.13746530371970767, "rougeLsum_fmeasure_stderr": 0.0021096421026204895, "rougeLsum_precision": 0.09807478367610013, "rougeLsum_precision_stderr": 0.0015430046746426423, "rougeLsum_recall": 0.23944847065255154, "rougeLsum_recall_stderr": 0.003722025149898182}}, "2": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.2055699356745069, "bleu_stderr": 0.05452685939114017, "rouge1_fmeasure": 0.17473465066618604, "rouge1_fmeasure_stderr": 0.00245708961708968, "rouge1_precision": 0.124060562800013, "rouge1_precision_stderr": 0.0018136218300640054, "rouge1_recall": 0.3073301314994108, "rouge1_recall_stderr": 0.0042524341650730015, "rouge2_fmeasure": 0.03174503623834507, "rouge2_fmeasure_stderr": 0.0012950949882147274, "rouge2_precision": 0.02227233591062227, "rouge2_precision_stderr": 0.0009171326429615212, "rouge2_recall": 0.05765394909576481, "rouge2_recall_stderr": 0.002379587733761179, "rougeL_fmeasure": 0.13112441731732927, "rougeL_fmeasure_stderr": 0.0017687033339916917, "rougeL_precision": 0.09291237590423006, "rougeL_precision_stderr": 0.001297408097673992, "rougeL_recall": 0.23207281546402703, "rougeL_recall_stderr": 0.0031750505176512617, "rougeLsum_fmeasure": 0.13779052503552564, "rougeLsum_fmeasure_stderr": 0.001976539742066777, "rougeLsum_precision": 0.09759594898910186, "rougeLsum_precision_stderr": 0.0014440054482869897, "rougeLsum_recall": 0.24407902417613644, "rougeLsum_recall_stderr": 0.003545475320336588}, "DOC_tldr": {"bleu": 1.9960960514942592, "bleu_stderr": 0.07344519144130342, "rouge1_fmeasure": 0.20413697486271104, "rouge1_fmeasure_stderr": 0.002376457670474134, "rouge1_precision": 0.14505365849514526, "rouge1_precision_stderr": 0.0017717206403909573, "rouge1_recall": 0.3590688289676267, "rouge1_recall_stderr": 0.004194349096882626, "rouge2_fmeasure": 0.05192590462289213, "rouge2_fmeasure_stderr": 0.0016190417513229905, "rouge2_precision": 0.03641466698412616, "rouge2_precision_stderr": 0.0011418577722541036, "rouge2_recall": 0.09475841284636158, "rouge2_recall_stderr": 0.003043438432012329, "rougeL_fmeasure": 0.16427512906528735, "rougeL_fmeasure_stderr": 0.0019258406494987107, "rougeL_precision": 0.1165115860707238, "rougeL_precision_stderr": 0.0014128544079030998, "rougeL_recall": 0.29053341231755914, "rougeL_recall_stderr": 0.0035601421682268683, "rougeLsum_fmeasure": 0.16004314278537282, "rougeLsum_fmeasure_stderr": 0.002051543734547068, "rougeLsum_precision": 0.11344382044719216, "rougeLsum_precision_stderr": 0.0014987095085429382, "rougeLsum_recall": 0.2835275825737783, "rougeLsum_recall_stderr": 0.003772811693926147}, "article_DOC_summary": {"bleu": 1.8719539285791582, "bleu_stderr": 0.12363679141275902, "rouge1_fmeasure": 0.19861514735566918, "rouge1_fmeasure_stderr": 0.002536869900552261, "rouge1_precision": 0.1416725332039807, "rouge1_precision_stderr": 0.0019003755973271465, "rouge1_recall": 0.3459660596871756, "rouge1_recall_stderr": 0.004306340084752496, "rouge2_fmeasure": 0.046170973346933354, "rouge2_fmeasure_stderr": 0.0016039666517335285, "rouge2_precision": 0.032589396300244954, "rouge2_precision_stderr": 0.001135411316234186, "rouge2_recall": 0.08273970858446308, "rouge2_recall_stderr": 0.0029435229941216076, "rougeL_fmeasure": 0.1563808799657367, "rougeL_fmeasure_stderr": 0.0019620345863827896, "rougeL_precision": 0.11131065526290164, "rougeL_precision_stderr": 0.0014510877883445626, "rougeL_recall": 0.2741645486545069, "rougeL_recall_stderr": 0.003497544935564585, "rougeLsum_fmeasure": 0.15581654576519338, "rougeLsum_fmeasure_stderr": 0.0021586761800846295, "rougeLsum_precision": 0.11082674371408649, "rougeLsum_precision_stderr": 0.0015845483453646742, "rougeLsum_recall": 0.2737340240682109, "rougeLsum_recall_stderr": 0.003854876130312793}, "summarize_DOC": {"bleu": 1.9593269560207813, "bleu_stderr": 0.10392450231035287, "rouge1_fmeasure": 0.21082155347220088, "rouge1_fmeasure_stderr": 0.0024569832954891837, "rouge1_precision": 0.15082144789712365, "rouge1_precision_stderr": 0.0018424832486209342, "rouge1_recall": 0.3651698865935423, "rouge1_recall_stderr": 0.004302647367894026, "rouge2_fmeasure": 0.04973150539674991, "rouge2_fmeasure_stderr": 0.0016131300737703802, "rouge2_precision": 0.03513129846275268, "rouge2_precision_stderr": 0.0011451237082588754, "rouge2_recall": 0.08918749690012981, "rouge2_recall_stderr": 0.0029682699425890064, "rougeL_fmeasure": 0.15960845606925958, "rougeL_fmeasure_stderr": 0.0018776627339659353, "rougeL_precision": 0.11407141265154397, "rougeL_precision_stderr": 0.0013994053119914564, "rougeL_recall": 0.2776724573795989, "rougeL_recall_stderr": 0.003411155443688218, "rougeLsum_fmeasure": 0.16564364312727078, "rougeLsum_fmeasure_stderr": 0.002071661209778157, "rougeLsum_precision": 0.11828100442855599, "rougeLsum_precision_stderr": 0.0015337129412005281, "rougeLsum_recall": 0.28863758587492094, "rougeLsum_recall_stderr": 0.0037557189858194494}, "summarize_this_DOC_summary": {"bleu": 1.7326078380754275, "bleu_stderr": 0.11082657511901861, "rouge1_fmeasure": 0.18476307451897173, "rouge1_fmeasure_stderr": 0.0024826342866030933, "rouge1_precision": 0.13253960760217687, "rouge1_precision_stderr": 0.0018631836737003167, "rouge1_recall": 0.3178043987357393, "rouge1_recall_stderr": 0.004209555582114566, "rouge2_fmeasure": 0.04211259064384283, "rouge2_fmeasure_stderr": 0.0015475034261702306, "rouge2_precision": 0.029929646843642313, "rouge2_precision_stderr": 0.0011076561079256734, "rouge2_recall": 0.07427687891626805, "rouge2_recall_stderr": 0.0027863929510018536, "rougeL_fmeasure": 0.1521149622660914, "rougeL_fmeasure_stderr": 0.0020254138607920464, "rougeL_precision": 0.10895910761984066, "rougeL_precision_stderr": 0.001506797437434206, "rougeL_recall": 0.2627359450729775, "rougeL_recall_stderr": 0.0035428896385583442, "rougeLsum_fmeasure": 0.14364115654972823, "rougeLsum_fmeasure_stderr": 0.0021055633428788308, "rougeLsum_precision": 0.10274775436240437, "rougeLsum_precision_stderr": 0.0015507057264651516, "rougeLsum_recall": 0.2489222546895533, "rougeLsum_recall_stderr": 0.00370928368654382}}, "3": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.3690976879754526, "bleu_stderr": 0.07157486729365321, "rouge1_fmeasure": 0.17754481617140364, "rouge1_fmeasure_stderr": 0.0027411678148972474, "rouge1_precision": 0.12858513884438236, "rouge1_precision_stderr": 0.002120442853077002, "rouge1_recall": 0.3058467203905133, "rouge1_recall_stderr": 0.0046548186919951684, "rouge2_fmeasure": 0.03592964212545872, "rouge2_fmeasure_stderr": 0.001337319847086293, "rouge2_precision": 0.02559840737779659, "rouge2_precision_stderr": 0.0009669040664741145, "rouge2_recall": 0.06396559494568956, "rouge2_recall_stderr": 0.0024066314342877596, "rougeL_fmeasure": 0.13344572346235084, "rougeL_fmeasure_stderr": 0.001985028910537152, "rougeL_precision": 0.09637012660269542, "rougeL_precision_stderr": 0.0015081518881888312, "rougeL_recall": 0.23129631390107847, "rougeL_recall_stderr": 0.0034855936093635756, "rougeLsum_fmeasure": 0.14060704133134494, "rougeLsum_fmeasure_stderr": 0.002274916550088745, "rougeLsum_precision": 0.10149076059236951, "rougeLsum_precision_stderr": 0.0017109531080025527, "rougeLsum_recall": 0.24358222998073842, "rougeLsum_recall_stderr": 0.003957993939468868}, "DOC_tldr": {"bleu": 1.9836229338335558, "bleu_stderr": 0.09808438341855263, "rouge1_fmeasure": 0.2022841747965752, "rouge1_fmeasure_stderr": 0.0026384886657312224, "rouge1_precision": 0.14713365585983412, "rouge1_precision_stderr": 0.0020824125761163305, "rouge1_recall": 0.34916556365895823, "rouge1_recall_stderr": 0.004704775790168902, "rouge2_fmeasure": 0.0504745368841509, "rouge2_fmeasure_stderr": 0.00163370097686754, "rouge2_precision": 0.03592889088406201, "rouge2_precision_stderr": 0.0011712459443672357, "rouge2_recall": 0.09050705948348292, "rouge2_recall_stderr": 0.0030325027339432697, "rougeL_fmeasure": 0.15906261276123296, "rougeL_fmeasure_stderr": 0.0021562437238946204, "rougeL_precision": 0.11559690666144418, "rougeL_precision_stderr": 0.001671096800205579, "rougeL_recall": 0.2754667184343775, "rougeL_recall_stderr": 0.003934807057494367, "rougeLsum_fmeasure": 0.15926265578186155, "rougeLsum_fmeasure_stderr": 0.0022530285725081956, "rougeLsum_precision": 0.11557269164532327, "rougeLsum_precision_stderr": 0.001725315585914204, "rougeLsum_recall": 0.2766392842578445, "rougeLsum_recall_stderr": 0.004134898630416766}, "article_DOC_summary": {"bleu": 2.0369871247788307, "bleu_stderr": 0.11889285675918113, "rouge1_fmeasure": 0.19191359742761793, "rouge1_fmeasure_stderr": 0.00277051626651925, "rouge1_precision": 0.13956778448576387, "rouge1_precision_stderr": 0.0021737270808076243, "rouge1_recall": 0.329102550985314, "rouge1_recall_stderr": 0.004775853265191935, "rouge2_fmeasure": 0.04715930396420784, "rouge2_fmeasure_stderr": 0.001624147693037141, "rouge2_precision": 0.034143138087541, "rouge2_precision_stderr": 0.0012197975952686348, "rouge2_recall": 0.08270285997486158, "rouge2_recall_stderr": 0.0029002084266991856, "rougeL_fmeasure": 0.1528761624914166, "rougeL_fmeasure_stderr": 0.002169630539318251, "rougeL_precision": 0.11089339313344634, "rougeL_precision_stderr": 0.0016751091885532099, "rougeL_recall": 0.26375167528125915, "rougeL_recall_stderr": 0.0038713870137101424, "rougeLsum_fmeasure": 0.15380687866772882, "rougeLsum_fmeasure_stderr": 0.002388364659924707, "rougeLsum_precision": 0.11161318605392968, "rougeLsum_precision_stderr": 0.0018444411300991288, "rougeLsum_recall": 0.26547328847718554, "rougeLsum_recall_stderr": 0.004239107141632789}, "summarize_DOC": {"bleu": 1.9760625459105288, "bleu_stderr": 0.10026358022338139, "rouge1_fmeasure": 0.204398760358889, "rouge1_fmeasure_stderr": 0.00276366410673389, "rouge1_precision": 0.1496531280766477, "rouge1_precision_stderr": 0.002170811665161093, "rouge1_recall": 0.3453250604427468, "rouge1_recall_stderr": 0.0047401463995383085, "rouge2_fmeasure": 0.04868798327580798, "rouge2_fmeasure_stderr": 0.0016054863551996723, "rouge2_precision": 0.03495998710112079, "rouge2_precision_stderr": 0.0011734323689905663, "rouge2_recall": 0.0853256030786561, "rouge2_recall_stderr": 0.00288948347840057, "rougeL_fmeasure": 0.15352602365435797, "rougeL_fmeasure_stderr": 0.0021112441534149396, "rougeL_precision": 0.11215486999034506, "rougeL_precision_stderr": 0.001620815089476965, "rougeL_recall": 0.26057916777844126, "rougeL_recall_stderr": 0.0037363611572360737, "rougeLsum_fmeasure": 0.16206552227978172, "rougeLsum_fmeasure_stderr": 0.0023555539018775963, "rougeLsum_precision": 0.1182804937364553, "rougeLsum_precision_stderr": 0.0018148963331361236, "rougeLsum_recall": 0.27561134068552073, "rougeLsum_recall_stderr": 0.0041549575197084255}, "summarize_this_DOC_summary": {"bleu": 1.7305491538270004, "bleu_stderr": 0.08408232834719435, "rouge1_fmeasure": 0.1752477037036964, "rouge1_fmeasure_stderr": 0.002823759912213769, "rouge1_precision": 0.12889199801472273, "rouge1_precision_stderr": 0.002243455154850897, "rouge1_recall": 0.2927286723681437, "rouge1_recall_stderr": 0.004638755474625006, "rouge2_fmeasure": 0.0405702032936913, "rouge2_fmeasure_stderr": 0.0016059537675197902, "rouge2_precision": 0.029515166535948576, "rouge2_precision_stderr": 0.0012016618969078606, "rouge2_recall": 0.06911038852478063, "rouge2_recall_stderr": 0.002746574682775168, "rougeL_fmeasure": 0.14276741278375094, "rougeL_fmeasure_stderr": 0.0022683257050729088, "rougeL_precision": 0.10464484098581038, "rougeL_precision_stderr": 0.001773802303200482, "rougeL_recall": 0.24008506670200253, "rougeL_recall_stderr": 0.0038295614084253612, "rougeLsum_fmeasure": 0.13584003612122558, "rougeLsum_fmeasure_stderr": 0.002348292605866413, "rougeLsum_precision": 0.09960601532293903, "rougeLsum_precision_stderr": 0.0018313506067734666, "rougeLsum_recall": 0.22860772378561103, "rougeLsum_recall_stderr": 0.0039645397857025364}}, "4": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.7131409325442915, "bleu_stderr": 0.16045020297035031, "rouge1_fmeasure": 0.04580987420726127, "rouge1_fmeasure_stderr": 0.002686434828650492, "rouge1_precision": 0.03769871977195407, "rouge1_precision_stderr": 0.002336453416963925, "rouge1_recall": 0.07311282171723119, "rouge1_recall_stderr": 0.004454298183792852, "rouge2_fmeasure": 0.009837365340636202, "rouge2_fmeasure_stderr": 0.0008814198827318251, "rouge2_precision": 0.007625557776701037, "rouge2_precision_stderr": 0.0007866165856765972, "rouge2_recall": 0.01686591602432881, "rouge2_recall_stderr": 0.0015442682087128132, "rougeL_fmeasure": 0.03390847923190842, "rougeL_fmeasure_stderr": 0.0019941253411488963, "rougeL_precision": 0.02814473364569592, "rougeL_precision_stderr": 0.0018141347746336734, "rougeL_recall": 0.05436185626976168, "rougeL_recall_stderr": 0.0033426299994124293, "rougeLsum_fmeasure": 0.03619213971219476, "rougeLsum_fmeasure_stderr": 0.002146702755941942, "rougeLsum_precision": 0.029868454600151712, "rougeLsum_precision_stderr": 0.0019063509322727447, "rougeLsum_recall": 0.05818680411207516, "rougeLsum_recall_stderr": 0.003622236236410205}, "DOC_tldr": {"bleu": 1.078602667040749, "bleu_stderr": 0.18381444266538938, "rouge1_fmeasure": 0.05505531576181226, "rouge1_fmeasure_stderr": 0.0030265646206736638, "rouge1_precision": 0.04765862333877057, "rouge1_precision_stderr": 0.003034035842078532, "rouge1_recall": 0.08668561522891197, "rouge1_recall_stderr": 0.004843273307719953, "rouge2_fmeasure": 0.013418856497160158, "rouge2_fmeasure_stderr": 0.0011417937757972204, "rouge2_precision": 0.010462634958970572, "rouge2_precision_stderr": 0.0009728648293141028, "rouge2_recall": 0.021800656752831386, "rouge2_recall_stderr": 0.0018115640007053421, "rougeL_fmeasure": 0.04273395830712555, "rougeL_fmeasure_stderr": 0.0023651800321026328, "rougeL_precision": 0.037309800551862775, "rougeL_precision_stderr": 0.0024441508269820006, "rougeL_recall": 0.06722349707153805, "rougeL_recall_stderr": 0.0037810947177969894, "rougeLsum_fmeasure": 0.04406961555377166, "rougeLsum_fmeasure_stderr": 0.0024510630458409125, "rougeLsum_precision": 0.038418088250945504, "rougeLsum_precision_stderr": 0.0024990780458393853, "rougeLsum_recall": 0.06943623456802712, "rougeLsum_recall_stderr": 0.003937161605823187}, "article_DOC_summary": {"bleu": 0.9532704572175751, "bleu_stderr": 0.1334974714932564, "rouge1_fmeasure": 0.053658040802239085, "rouge1_fmeasure_stderr": 0.0029774150017601803, "rouge1_precision": 0.045437643598581885, "rouge1_precision_stderr": 0.002731741164998445, "rouge1_recall": 0.0834633850546414, "rouge1_recall_stderr": 0.004709515538170699, "rouge2_fmeasure": 0.012335864733508397, "rouge2_fmeasure_stderr": 0.0010450675751901856, "rouge2_precision": 0.009714659363066126, "rouge2_precision_stderr": 0.0009061043015811388, "rouge2_recall": 0.020204993259341004, "rouge2_recall_stderr": 0.0017173474943290197, "rougeL_fmeasure": 0.041486821630655535, "rougeL_fmeasure_stderr": 0.0022764116642440576, "rougeL_precision": 0.03525079234225019, "rougeL_precision_stderr": 0.0021438053902596605, "rougeL_recall": 0.06521180915166398, "rougeL_recall_stderr": 0.003703751644624955, "rougeLsum_fmeasure": 0.04288192765446807, "rougeLsum_fmeasure_stderr": 0.0023977202282267356, "rougeLsum_precision": 0.03647650806323336, "rougeLsum_precision_stderr": 0.002248078408967042, "rougeLsum_recall": 0.06711329846733026, "rougeLsum_recall_stderr": 0.003863144394106958}, "summarize_DOC": {"bleu": 0.9987646889959115, "bleu_stderr": 0.1912402463323597, "rouge1_fmeasure": 0.05625513613265757, "rouge1_fmeasure_stderr": 0.0030874961718411237, "rouge1_precision": 0.04792518752207993, "rouge1_precision_stderr": 0.002929855540885626, "rouge1_recall": 0.08695740365895663, "rouge1_recall_stderr": 0.004905955060606447, "rouge2_fmeasure": 0.012561323055777309, "rouge2_fmeasure_stderr": 0.0011221605993979012, "rouge2_precision": 0.01000387771406212, "rouge2_precision_stderr": 0.0009949695824421575, "rouge2_recall": 0.020457759459906272, "rouge2_recall_stderr": 0.001799928617720977, "rougeL_fmeasure": 0.042368809782355486, "rougeL_fmeasure_stderr": 0.0023587611382963657, "rougeL_precision": 0.03654303187169094, "rougeL_precision_stderr": 0.0023333023205796763, "rougeL_recall": 0.06544765513897487, "rougeL_recall_stderr": 0.0037490143344574268, "rougeLsum_fmeasure": 0.045014902928945284, "rougeLsum_fmeasure_stderr": 0.002517391175655878, "rougeLsum_precision": 0.038524011442170915, "rougeLsum_precision_stderr": 0.0024337355610465307, "rougeLsum_recall": 0.07009896492118943, "rougeLsum_recall_stderr": 0.004064534959138374}, "summarize_this_DOC_summary": {"bleu": 0.6780236081531866, "bleu_stderr": 0.08801012234752068, "rouge1_fmeasure": 0.047528176132410734, "rouge1_fmeasure_stderr": 0.002823749720620929, "rouge1_precision": 0.04123247889915294, "rouge1_precision_stderr": 0.0027362423249375554, "rouge1_recall": 0.07120572993230662, "rouge1_recall_stderr": 0.004274377455153015, "rouge2_fmeasure": 0.010936982876670076, "rouge2_fmeasure_stderr": 0.0010227818352712905, "rouge2_precision": 0.009497011127313206, "rouge2_precision_stderr": 0.0010718639651327534, "rouge2_recall": 0.016786374564059875, "rouge2_recall_stderr": 0.0015228476295663846, "rougeL_fmeasure": 0.03751611271132232, "rougeL_fmeasure_stderr": 0.0022506689999874908, "rougeL_precision": 0.03244728409674943, "rougeL_precision_stderr": 0.0021591440394866995, "rougeL_recall": 0.05663645241508702, "rougeL_recall_stderr": 0.0034535293095162792, "rougeLsum_fmeasure": 0.03670784975038728, "rougeLsum_fmeasure_stderr": 0.0022077354874054825, "rougeLsum_precision": 0.031933246077056165, "rougeLsum_precision_stderr": 0.002143169453256064, "rougeLsum_recall": 0.055391705623806235, "rougeLsum_recall_stderr": 0.003398556473468678}}, "5": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0001231472929586137, "rouge1_fmeasure_stderr": 8.72630464321695e-05, "rouge1_precision": 0.0017152658662092624, "rouge1_precision_stderr": 0.0012123554660875486, "rouge1_recall": 6.387886674158631e-05, "rouge1_recall_stderr": 4.527370030666761e-05, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0001231472929586137, "rougeL_fmeasure_stderr": 8.72630464321695e-05, "rougeL_precision": 0.0017152658662092624, "rougeL_precision_stderr": 0.0012123554660875486, "rougeL_recall": 6.387886674158631e-05, "rougeL_recall_stderr": 4.527370030666761e-05, "rougeLsum_fmeasure": 0.0001231472929586137, "rougeLsum_fmeasure_stderr": 8.72630464321695e-05, "rougeLsum_precision": 0.0017152658662092624, "rougeLsum_precision_stderr": 0.0012123554660875486, "rougeLsum_recall": 6.387886674158631e-05, "rougeLsum_recall_stderr": 4.527370030666761e-05}, "DOC_tldr": {"bleu": 2.8932273089650456e-42, "bleu_stderr": 3.552979377433657e-36, "rouge1_fmeasure": 0.0022213194298558382, "rouge1_fmeasure_stderr": 0.0006417999440798693, "rouge1_precision": 0.0025698165750884954, "rouge1_precision_stderr": 0.0007513867047594584, "rouge1_recall": 0.0020265023378529864, "rouge1_recall_stderr": 0.0005860337220124529, "rouge2_fmeasure": 9.617082045566528e-05, "rouge2_fmeasure_stderr": 6.874253211859863e-05, "rouge2_precision": 0.00011170849128673766, "rouge2_precision_stderr": 7.932541467693499e-05, "rouge2_recall": 8.475431338916355e-05, "rouge2_recall_stderr": 6.0983480512422104e-05, "rougeL_fmeasure": 0.0015697328565003042, "rougeL_fmeasure_stderr": 0.0004515600693647235, "rougeL_precision": 0.001814865674466118, "rougeL_precision_stderr": 0.0005255278900735306, "rougeL_recall": 0.0014201789051984832, "rougeL_recall_stderr": 0.00040633641993546393, "rougeLsum_fmeasure": 0.0017855275703933101, "rougeLsum_fmeasure_stderr": 0.0005066133842215296, "rougeLsum_precision": 0.00203326778659742, "rougeLsum_precision_stderr": 0.0005789262577812531, "rougeLsum_recall": 0.0016561300609609598, "rougeLsum_recall_stderr": 0.00047805234847829935}, "article_DOC_summary": {"bleu": 3.120169018915429e-39, "bleu_stderr": 2.777103521919484e-32, "rouge1_fmeasure": 0.002997767517780879, "rouge1_fmeasure_stderr": 0.0008334506629000642, "rouge1_precision": 0.003263719197769084, "rouge1_precision_stderr": 0.0008887617448902996, "rouge1_recall": 0.0028345456514645845, "rouge1_recall_stderr": 0.0008116977743173961, "rouge2_fmeasure": 0.0004337191943913522, "rouge2_fmeasure_stderr": 0.00021214389566854186, "rouge2_precision": 0.0004590858641913026, "rouge2_precision_stderr": 0.00021377485443811557, "rouge2_recall": 0.00041953329689178745, "rouge2_recall_stderr": 0.00021411305439143847, "rougeL_fmeasure": 0.002106529594577242, "rougeL_fmeasure_stderr": 0.0005845294157065285, "rougeL_precision": 0.0023145025817495005, "rougeL_precision_stderr": 0.0006312757980824667, "rougeL_recall": 0.0019717134578344517, "rougeL_recall_stderr": 0.0005600175515270516, "rougeLsum_fmeasure": 0.0024033045589119204, "rougeLsum_fmeasure_stderr": 0.0006766571604709825, "rougeLsum_precision": 0.0026347083023874887, "rougeLsum_precision_stderr": 0.0007273791404483311, "rougeLsum_recall": 0.00225801889401816, "rougeLsum_recall_stderr": 0.000653191190142516}, "summarize_DOC": {"bleu": 1.3330277651810904e-39, "bleu_stderr": 6.795417937992427e-33, "rouge1_fmeasure": 0.002203951765543492, "rouge1_fmeasure_stderr": 0.0006780578514746854, "rouge1_precision": 0.0025400865271768946, "rouge1_precision_stderr": 0.0008365321705968738, "rouge1_recall": 0.0020522199947177865, "rouge1_recall_stderr": 0.0006198953753349982, "rouge2_fmeasure": 0.00025865120204742847, "rouge2_fmeasure_stderr": 0.00017713704426631428, "rouge2_precision": 0.0003593890386343216, "rouge2_precision_stderr": 0.0002587217800602705, "rouge2_recall": 0.0002044705818290724, "rouge2_recall_stderr": 0.0001357861630560493, "rougeL_fmeasure": 0.001513916054366738, "rougeL_fmeasure_stderr": 0.00042616321835822437, "rougeL_precision": 0.001746686503389582, "rougeL_precision_stderr": 0.0005159950965839201, "rougeL_recall": 0.0013959158984171599, "rougeL_recall_stderr": 0.00038826984967891984, "rougeLsum_fmeasure": 0.0018556868062698107, "rougeLsum_fmeasure_stderr": 0.0005514893920061358, "rougeLsum_precision": 0.0021182020561365147, "rougeLsum_precision_stderr": 0.000660664849160224, "rougeLsum_recall": 0.0017475858050803443, "rougeLsum_recall_stderr": 0.0005209596862265322}, "summarize_this_DOC_summary": {"bleu": 4.142453217449745e-247, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0005723341987999615, "rouge1_fmeasure_stderr": 0.00024516446983701974, "rouge1_precision": 0.0025728987993138934, "rouge1_precision_stderr": 0.0011767976878626356, "rouge1_recall": 0.0003240120371493934, "rouge1_recall_stderr": 0.00013796006385495373, "rouge2_fmeasure": 0.00011435105774728416, "rouge2_fmeasure_stderr": 0.00011435105774728473, "rouge2_precision": 0.0008576329331046312, "rouge2_precision_stderr": 0.0008576329331046333, "rouge2_recall": 6.125949522175937e-05, "rouge2_recall_stderr": 6.125949522176059e-05, "rougeL_fmeasure": 0.0005723341987999615, "rougeL_fmeasure_stderr": 0.00024516446983701974, "rougeL_precision": 0.0025728987993138934, "rougeL_precision_stderr": 0.0011767976878626356, "rougeL_recall": 0.0003240120371493934, "rougeL_recall_stderr": 0.00013796006385495373, "rougeLsum_fmeasure": 0.0005723341987999615, "rougeLsum_fmeasure_stderr": 0.00024516446983701974, "rougeLsum_precision": 0.0025728987993138934, "rougeLsum_precision_stderr": 0.0011767976878626356, "rougeLsum_recall": 0.0003240120371493934, "rougeLsum_recall_stderr": 0.00013796006385495373}}}, "piqa": {"0": {"Correct the solution": {"bleu": 10.02681772761729, "bleu_stderr": 0.4785770088213236, "rouge1_fmeasure": 0.26893866283043205, "rouge1_fmeasure_stderr": 0.006128132638012782, "rouge1_precision": 0.21326442624004158, "rouge1_precision_stderr": 0.005569959071404712, "rouge1_recall": 0.5796067902091783, "rouge1_recall_stderr": 0.008493584491820132, "rouge2_fmeasure": 0.20120770859481335, "rouge2_fmeasure_stderr": 0.005573115844107413, "rouge2_precision": 0.1569774762608679, "rouge2_precision_stderr": 0.004918509910414435, "rouge2_recall": 0.4493611433994356, "rouge2_recall_stderr": 0.008641465811853976, "rougeL_fmeasure": 0.26037256266092274, "rougeL_fmeasure_stderr": 0.006063020148608195, "rougeL_precision": 0.20594687601926343, "rougeL_precision_stderr": 0.005476023109804243, "rougeL_recall": 0.5638381191556961, "rougeL_recall_stderr": 0.008539182187801923, "rougeLsum_fmeasure": 0.26266697026347924, "rougeLsum_fmeasure_stderr": 0.006101223479297265, "rougeLsum_precision": 0.20814801304375385, "rougeLsum_precision_stderr": 0.005524999926052949, "rougeLsum_recall": 0.5660905777243646, "rougeLsum_recall_stderr": 0.008533556999477658}, "choose the most appropriate solution": {"acc": 0.48639825897714906, "acc_norm": 0.48639825897714906, "acc_norm_stderr": 0.011661506839823789, "acc_stderr": 0.011661506839823789}, "no prompt needed": {"bleu": 0.17916585022658107, "bleu_stderr": 0.007303477837541812, "rouge1_fmeasure": 0.036054964953355036, "rouge1_fmeasure_stderr": 0.0008984262282200936, "rouge1_precision": 0.021356332010434582, "rouge1_precision_stderr": 0.0007105713714095752, "rouge1_recall": 0.2286180773139028, "rouge1_recall_stderr": 0.004259684658467588, "rouge2_fmeasure": 0.00581609356366873, "rouge2_fmeasure_stderr": 0.00028059938593242515, "rouge2_precision": 0.003342865314527443, "rouge2_precision_stderr": 0.00017849853167599175, "rouge2_recall": 0.04156016756116742, "rouge2_recall_stderr": 0.002190482133962692, "rougeL_fmeasure": 0.032951139104863636, "rougeL_fmeasure_stderr": 0.0007476528694649519, "rougeL_precision": 0.019312824968880614, "rougeL_precision_stderr": 0.0005557873015899917, "rougeL_recall": 0.21416319877579273, "rougeL_recall_stderr": 0.00397919084314672, "rougeLsum_fmeasure": 0.02990474554699068, "rougeLsum_fmeasure_stderr": 0.0007607771602225413, "rougeLsum_precision": 0.017643439425447376, "rougeLsum_precision_stderr": 0.0005894989170566875, "rougeLsum_recall": 0.19731432042647498, "rougeLsum_recall_stderr": 0.0038630321229470993}, "pick_correct_choice_index": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "what_is_the_correct_ending": {"acc": 0.5663764961915125, "acc_norm": 0.5669205658324266, "acc_norm_stderr": 0.011560864423151372, "acc_stderr": 0.011562571737707337}}, "1": {"Correct the solution": {"bleu": 12.405128080356619, "bleu_stderr": 0.6282911341436567, "rouge1_fmeasure": 0.43052575441974283, "rouge1_fmeasure_stderr": 0.007565036202436803, "rouge1_precision": 0.4297880540568289, "rouge1_precision_stderr": 0.008220917619398086, "rouge1_recall": 0.6505221227794611, "rouge1_recall_stderr": 0.007032948083628127, "rouge2_fmeasure": 0.33159324549421026, "rouge2_fmeasure_stderr": 0.00747564127341564, "rouge2_precision": 0.3277678584275725, "rouge2_precision_stderr": 0.00787877500339806, "rouge2_recall": 0.5012746456886409, "rouge2_recall_stderr": 0.008142123538414653, "rougeL_fmeasure": 0.41915608188207465, "rougeL_fmeasure_stderr": 0.0076028128010187926, "rougeL_precision": 0.4165981868791939, "rougeL_precision_stderr": 0.00815637476891333, "rougeL_recall": 0.6336843792535592, "rougeL_recall_stderr": 0.007239228173826263, "rougeLsum_fmeasure": 0.42133006102653786, "rougeLsum_fmeasure_stderr": 0.0076023574833303156, "rougeLsum_precision": 0.41983019583164943, "rougeLsum_precision_stderr": 0.008186753775907333, "rougeLsum_recall": 0.6355954746439701, "rougeLsum_recall_stderr": 0.0072092086784890964}, "choose the most appropriate solution": {"acc": 0.5032644178454843, "acc_norm": 0.5032644178454843, "acc_norm_stderr": 0.01166557553076037, "acc_stderr": 0.01166557553076037}, "no prompt needed": {"bleu": 0.14849173039066274, "bleu_stderr": 0.006780886130825962, "rouge1_fmeasure": 0.032381757450450424, "rouge1_fmeasure_stderr": 0.000864749614587369, "rouge1_precision": 0.020251968434917507, "rouge1_precision_stderr": 0.001004184999269241, "rouge1_recall": 0.201417622459911, "rouge1_recall_stderr": 0.004027970741634812, "rouge2_fmeasure": 0.005210530351647924, "rouge2_fmeasure_stderr": 0.00032295371471857974, "rouge2_precision": 0.003495343473522734, "rouge2_precision_stderr": 0.00039644904227136035, "rouge2_recall": 0.03504591595228045, "rouge2_recall_stderr": 0.0020381015294354335, "rougeL_fmeasure": 0.03028754869084763, "rougeL_fmeasure_stderr": 0.0007880477659311635, "rougeL_precision": 0.018798741988949746, "rougeL_precision_stderr": 0.000866081508450852, "rougeL_recall": 0.19048063687209987, "rougeL_recall_stderr": 0.0037923827162263503, "rougeLsum_fmeasure": 0.026758462015252938, "rougeLsum_fmeasure_stderr": 0.0007341044406760539, "rougeLsum_precision": 0.01682276801642798, "rougeLsum_precision_stderr": 0.0008795517871256883, "rougeLsum_recall": 0.172585092925651, "rougeLsum_recall_stderr": 0.0035987297019058986}, "pick_correct_choice_index": {"acc": 0.49347116430903154, "acc_norm": 0.49347116430903154, "acc_norm_stderr": 0.011664829595210969, "acc_stderr": 0.011664829595210969}, "what_is_the_correct_ending": {"acc": 0.573993471164309, "acc_norm": 0.5745375408052231, "acc_norm_stderr": 0.011535468840824528, "acc_stderr": 0.011537375448519443}}, "2": {"Correct the solution": {"bleu": 15.396111872252607, "bleu_stderr": 0.7207759822340656, "rouge1_fmeasure": 0.49207161703177676, "rouge1_fmeasure_stderr": 0.0079131367283513, "rouge1_precision": 0.4876095593388225, "rouge1_precision_stderr": 0.008404900319386375, "rouge1_recall": 0.6784375831550562, "rouge1_recall_stderr": 0.006899269582410456, "rouge2_fmeasure": 0.38914197156300356, "rouge2_fmeasure_stderr": 0.008043239661107048, "rouge2_precision": 0.3809761461318492, "rouge2_precision_stderr": 0.008300803335151334, "rouge2_recall": 0.5339790203304655, "rouge2_recall_stderr": 0.008121721991565935, "rougeL_fmeasure": 0.48065244860067763, "rougeL_fmeasure_stderr": 0.007989865365422826, "rougeL_precision": 0.4745592667499523, "rougeL_precision_stderr": 0.008400824405327217, "rougeL_recall": 0.6625149456976438, "rougeL_recall_stderr": 0.0071233628299963376, "rougeLsum_fmeasure": 0.4826221999255758, "rougeLsum_fmeasure_stderr": 0.00797725209707895, "rougeLsum_precision": 0.4769027851776395, "rougeLsum_precision_stderr": 0.008404721247676569, "rougeLsum_recall": 0.665131716020771, "rougeLsum_recall_stderr": 0.0070890587100882405}, "choose the most appropriate solution": {"acc": 0.5103373231773667, "acc_norm": 0.5103373231773667, "acc_norm_stderr": 0.011663330673075898, "acc_stderr": 0.011663330673075898}, "no prompt needed": {"bleu": 0.14199881699588157, "bleu_stderr": 0.007298077065204388, "rouge1_fmeasure": 0.03229518293521006, "rouge1_fmeasure_stderr": 0.0008347091468006159, "rouge1_precision": 0.019700320148592156, "rouge1_precision_stderr": 0.0007325263284771759, "rouge1_recall": 0.1998389873544708, "rouge1_recall_stderr": 0.003997380487910199, "rouge2_fmeasure": 0.004949579293108938, "rouge2_fmeasure_stderr": 0.0002653281442859, "rouge2_precision": 0.002931431196844649, "rouge2_precision_stderr": 0.0001919227650150536, "rouge2_recall": 0.034428579543291746, "rouge2_recall_stderr": 0.0019427883698678984, "rougeL_fmeasure": 0.029752024739423115, "rougeL_fmeasure_stderr": 0.0007341759429352048, "rougeL_precision": 0.018031465684386884, "rougeL_precision_stderr": 0.0006335497251678613, "rougeL_recall": 0.187052546508197, "rougeL_recall_stderr": 0.0037036180117878304, "rougeLsum_fmeasure": 0.026370393722191878, "rougeLsum_fmeasure_stderr": 0.0006951231645923941, "rougeLsum_precision": 0.01617912400918, "rougeLsum_precision_stderr": 0.0006560029568093861, "rougeLsum_recall": 0.1700270228297869, "rougeLsum_recall_stderr": 0.0035543068929493704}, "pick_correct_choice_index": {"acc": 0.4885745375408052, "acc_norm": 0.4885745375408052, "acc_norm_stderr": 0.011662778026451676, "acc_stderr": 0.011662778026451676}, "what_is_the_correct_ending": {"acc": 0.5696409140369967, "acc_norm": 0.5723612622415669, "acc_norm_stderr": 0.01154300962328283, "acc_stderr": 0.011552114834700507}}, "3": {"Correct the solution": {"bleu": 16.218180922067372, "bleu_stderr": 0.8509332297059158, "rouge1_fmeasure": 0.4977472383213469, "rouge1_fmeasure_stderr": 0.007835731534849398, "rouge1_precision": 0.4890706684702515, "rouge1_precision_stderr": 0.008396913867721763, "rouge1_recall": 0.6997303374835331, "rouge1_recall_stderr": 0.00655926076210075, "rouge2_fmeasure": 0.39761438363429064, "rouge2_fmeasure_stderr": 0.007954981983937091, "rouge2_precision": 0.38688066002197424, "rouge2_precision_stderr": 0.008249491136803927, "rouge2_recall": 0.5562519826404686, "rouge2_recall_stderr": 0.007946807268147116, "rougeL_fmeasure": 0.4863555951176898, "rougeL_fmeasure_stderr": 0.00792372037118076, "rougeL_precision": 0.4760638248450606, "rougeL_precision_stderr": 0.008403042671103629, "rougeL_recall": 0.683370150325831, "rougeL_recall_stderr": 0.006819890967440377, "rougeLsum_fmeasure": 0.48895320036596807, "rougeLsum_fmeasure_stderr": 0.007901377124692735, "rougeLsum_precision": 0.47941150704079855, "rougeLsum_precision_stderr": 0.008400997890393643, "rougeLsum_recall": 0.687165768578369, "rougeLsum_recall_stderr": 0.0067675214257696}, "choose the most appropriate solution": {"acc": 0.5048966267682263, "acc_norm": 0.5048966267682263, "acc_norm_stderr": 0.01166526473007814, "acc_stderr": 0.01166526473007814}, "no prompt needed": {"bleu": 0.14611074118885167, "bleu_stderr": 0.010329545037543015, "rouge1_fmeasure": 0.032166278597507376, "rouge1_fmeasure_stderr": 0.0008641364081647094, "rouge1_precision": 0.019397395062929524, "rouge1_precision_stderr": 0.0007719276203796916, "rouge1_recall": 0.19955139137105213, "rouge1_recall_stderr": 0.004054485142263899, "rouge2_fmeasure": 0.004937686316660083, "rouge2_fmeasure_stderr": 0.0003186532275811235, "rouge2_precision": 0.0031310622057378625, "rouge2_precision_stderr": 0.00043845430276555624, "rouge2_recall": 0.03466455497519352, "rouge2_recall_stderr": 0.0020194141383628244, "rougeL_fmeasure": 0.029306952043178024, "rougeL_fmeasure_stderr": 0.000751437752875103, "rougeL_precision": 0.01770820830988742, "rougeL_precision_stderr": 0.0007194515278158072, "rougeL_recall": 0.1838484940943189, "rougeL_recall_stderr": 0.0036582750075176943, "rougeLsum_fmeasure": 0.026383518066224777, "rougeLsum_fmeasure_stderr": 0.0007264066302234951, "rougeLsum_precision": 0.016012342883103627, "rougeLsum_precision_stderr": 0.0007103602078950152, "rougeLsum_recall": 0.17027891330022135, "rougeLsum_recall_stderr": 0.003611508658664297}, "pick_correct_choice_index": {"acc": 0.5065288356909684, "acc_norm": 0.5065288356909684, "acc_norm_stderr": 0.01166482959521097, "acc_stderr": 0.01166482959521097}, "what_is_the_correct_ending": {"acc": 0.5554951033732318, "acc_norm": 0.5565832426550599, "acc_norm_stderr": 0.011590883373666854, "acc_stderr": 0.011593746871584154}}, "4": {"Correct the solution": {"bleu": 14.198451804549311, "bleu_stderr": 0.6684426734932597, "rouge1_fmeasure": 0.4624679947761788, "rouge1_fmeasure_stderr": 0.007768937193183521, "rouge1_precision": 0.4481132422374573, "rouge1_precision_stderr": 0.008389023019191415, "rouge1_recall": 0.7084211018687647, "rouge1_recall_stderr": 0.006378983318263235, "rouge2_fmeasure": 0.36764860730658894, "rouge2_fmeasure_stderr": 0.007715456641810255, "rouge2_precision": 0.3533165568819186, "rouge2_precision_stderr": 0.008050961633003766, "rouge2_recall": 0.5593924878365768, "rouge2_recall_stderr": 0.007881040514155611, "rougeL_fmeasure": 0.45164355068805134, "rougeL_fmeasure_stderr": 0.00783665587254422, "rougeL_precision": 0.43585149585760313, "rougeL_precision_stderr": 0.008369484613704422, "rougeL_recall": 0.691252031424185, "rougeL_recall_stderr": 0.006649043538442779, "rougeLsum_fmeasure": 0.4544143905392024, "rougeLsum_fmeasure_stderr": 0.007811758155493913, "rougeLsum_precision": 0.4394149652836155, "rougeLsum_precision_stderr": 0.008372880319767448, "rougeLsum_recall": 0.6958738596781262, "rougeLsum_recall_stderr": 0.0065890391738934635}, "choose the most appropriate solution": {"acc": 0.5032644178454843, "acc_norm": 0.5032644178454843, "acc_norm_stderr": 0.011665575530760367, "acc_stderr": 0.011665575530760367}, "no prompt needed": {"bleu": 0.13831086006846338, "bleu_stderr": 0.007223374215514496, "rouge1_fmeasure": 0.03155775272111882, "rouge1_fmeasure_stderr": 0.0007831075219253004, "rouge1_precision": 0.01879738324600387, "rouge1_precision_stderr": 0.0005908622270302197, "rouge1_recall": 0.19954407744415306, "rouge1_recall_stderr": 0.004053615634720824, "rouge2_fmeasure": 0.004548001185708453, "rouge2_fmeasure_stderr": 0.0002389268902228686, "rouge2_precision": 0.002591258862523811, "rouge2_precision_stderr": 0.00014237661089962314, "rouge2_recall": 0.03292368767698468, "rouge2_recall_stderr": 0.00192514110533362, "rougeL_fmeasure": 0.02877406291863521, "rougeL_fmeasure_stderr": 0.0006862843463300422, "rougeL_precision": 0.01714064421552565, "rougeL_precision_stderr": 0.0005309706560737331, "rougeL_recall": 0.18441955079753033, "rougeL_recall_stderr": 0.003735346161231745, "rougeLsum_fmeasure": 0.025839149783962134, "rougeLsum_fmeasure_stderr": 0.0006410049906716466, "rougeLsum_precision": 0.015437163218078281, "rougeLsum_precision_stderr": 0.0005145259041792498, "rougeLsum_recall": 0.17088506075667292, "rougeLsum_recall_stderr": 0.0036630999910594376}, "pick_correct_choice_index": {"acc": 0.5081610446137106, "acc_norm": 0.5081610446137106, "acc_norm_stderr": 0.011664270112244237, "acc_stderr": 0.011664270112244237}, "what_is_the_correct_ending": {"acc": 0.5544069640914037, "acc_norm": 0.5533188248095756, "acc_norm_stderr": 0.01159930504274508, "acc_stderr": 0.01159655408098765}}, "5": {"Correct the solution": {"bleu": 13.506508915229169, "bleu_stderr": 0.5116866328228713, "rouge1_fmeasure": 0.4316365583256063, "rouge1_fmeasure_stderr": 0.007604543745568956, "rouge1_precision": 0.40539001717811785, "rouge1_precision_stderr": 0.008246350607890437, "rouge1_recall": 0.720893483949784, "rouge1_recall_stderr": 0.006190339720420079, "rouge2_fmeasure": 0.3454708512028626, "rouge2_fmeasure_stderr": 0.007528073940807884, "rouge2_precision": 0.323283729079598, "rouge2_precision_stderr": 0.00784836767981782, "rouge2_recall": 0.5717527841713916, "rouge2_recall_stderr": 0.007807360491891714, "rougeL_fmeasure": 0.4222245633792425, "rougeL_fmeasure_stderr": 0.007657295802053638, "rougeL_precision": 0.3959242347319179, "rougeL_precision_stderr": 0.00822162851212014, "rougeL_recall": 0.7025263199962368, "rougeL_recall_stderr": 0.006478632818938049, "rougeLsum_fmeasure": 0.42507655541519007, "rougeLsum_fmeasure_stderr": 0.007630445420039102, "rougeLsum_precision": 0.3987034301296399, "rougeLsum_precision_stderr": 0.008213614641467533, "rougeLsum_recall": 0.7089771703621439, "rougeLsum_recall_stderr": 0.006396913155506568}, "choose the most appropriate solution": {"acc": 0.5038084874863983, "acc_norm": 0.5038084874863983, "acc_norm_stderr": 0.0116654857447468, "acc_stderr": 0.0116654857447468}, "no prompt needed": {"bleu": 0.11807145345667959, "bleu_stderr": 0.007767667145105737, "rouge1_fmeasure": 0.031432905163962875, "rouge1_fmeasure_stderr": 0.0007860008461527916, "rouge1_precision": 0.018866851198029937, "rouge1_precision_stderr": 0.0006293413700291389, "rouge1_recall": 0.19598796545291594, "rouge1_recall_stderr": 0.0039141300191215, "rouge2_fmeasure": 0.004669858309181997, "rouge2_fmeasure_stderr": 0.00024073389992590414, "rouge2_precision": 0.0027315307202555304, "rouge2_precision_stderr": 0.00015835092179141186, "rouge2_recall": 0.033330339409121036, "rouge2_recall_stderr": 0.0019094967708652559, "rougeL_fmeasure": 0.028453364453624484, "rougeL_fmeasure_stderr": 0.0006751447210313165, "rougeL_precision": 0.017052586854830805, "rougeL_precision_stderr": 0.0005424487932078866, "rougeL_recall": 0.1799650841386416, "rougeL_recall_stderr": 0.003577924759854656, "rougeLsum_fmeasure": 0.02611703752890656, "rougeLsum_fmeasure_stderr": 0.0006550883631257474, "rougeLsum_precision": 0.015736631180765175, "rougeLsum_precision_stderr": 0.0005595356316243608, "rougeLsum_recall": 0.16949142493214228, "rougeLsum_recall_stderr": 0.003578759284382382}, "pick_correct_choice_index": {"acc": 0.49020674646354734, "acc_norm": 0.49020674646354734, "acc_norm_stderr": 0.011663586263283223, "acc_stderr": 0.011663586263283223}, "what_is_the_correct_ending": {"acc": 0.5554951033732318, "acc_norm": 0.5571273122959739, "acc_norm_stderr": 0.0115894305035091, "acc_stderr": 0.011593746871584154}}}, "sciq": {"0": {"Direct Question": {"acc": 0.867, "acc_norm": 0.791, "acc_norm_stderr": 0.012864077288499351, "acc_stderr": 0.010743669132397335}, "Direct Question (Closed Book)": {"acc": 0.639, "acc_norm": 0.562, "acc_norm_stderr": 0.01569721001969469, "acc_stderr": 0.015195720118175125}, "Multiple Choice": {"acc": 0.601, "acc_norm": 0.525, "acc_norm_stderr": 0.01579951342999602, "acc_stderr": 0.015493193313162906}, "Multiple Choice (Closed Book)": {"acc": 0.5, "acc_norm": 0.453, "acc_norm_stderr": 0.015749255189977596, "acc_stderr": 0.015819299929208316}, "Multiple Choice Question First": {"acc": 0.625, "acc_norm": 0.531, "acc_norm_stderr": 0.015788865959539006, "acc_stderr": 0.015316971293620996}}, "1": {"Direct Question": {"acc": 0.892, "acc_norm": 0.876, "acc_norm_stderr": 0.01042749887234397, "acc_stderr": 0.00982000165134571}, "Direct Question (Closed Book)": {"acc": 0.679, "acc_norm": 0.665, "acc_norm_stderr": 0.014933117490932579, "acc_stderr": 0.014770821817934645}, "Multiple Choice": {"acc": 0.507, "acc_norm": 0.474, "acc_norm_stderr": 0.01579789775804276, "acc_stderr": 0.01581774956184357}, "Multiple Choice (Closed Book)": {"acc": 0.506, "acc_norm": 0.475, "acc_norm_stderr": 0.015799513429996016, "acc_stderr": 0.015818160898606715}, "Multiple Choice Question First": {"acc": 0.42, "acc_norm": 0.408, "acc_norm_stderr": 0.015549205052920676, "acc_stderr": 0.015615500115072957}}, "2": {"Direct Question": {"acc": 0.9, "acc_norm": 0.893, "acc_norm_stderr": 0.009779910359847165, "acc_stderr": 0.009491579957525044}, "Direct Question (Closed Book)": {"acc": 0.702, "acc_norm": 0.691, "acc_norm_stderr": 0.014619600977206488, "acc_stderr": 0.014470846741134715}, "Multiple Choice": {"acc": 0.559, "acc_norm": 0.507, "acc_norm_stderr": 0.015817749561843567, "acc_stderr": 0.015708779894242676}, "Multiple Choice (Closed Book)": {"acc": 0.539, "acc_norm": 0.509, "acc_norm_stderr": 0.015816736995005392, "acc_stderr": 0.015771104201283186}, "Multiple Choice Question First": {"acc": 0.477, "acc_norm": 0.452, "acc_norm_stderr": 0.01574623586588068, "acc_stderr": 0.0158025542467261}}, "3": {"Direct Question": {"acc": 0.909, "acc_norm": 0.903, "acc_norm_stderr": 0.009363689373248111, "acc_stderr": 0.00909954953840023}, "Direct Question (Closed Book)": {"acc": 0.717, "acc_norm": 0.707, "acc_norm_stderr": 0.014399942998441273, "acc_stderr": 0.014251810906481744}, "Multiple Choice": {"acc": 0.607, "acc_norm": 0.57, "acc_norm_stderr": 0.01566350361015528, "acc_stderr": 0.015452824654081496}, "Multiple Choice (Closed Book)": {"acc": 0.57, "acc_norm": 0.521, "acc_norm_stderr": 0.015805341148131296, "acc_stderr": 0.015663503610155283}, "Multiple Choice Question First": {"acc": 0.546, "acc_norm": 0.532, "acc_norm_stderr": 0.01578686875935901, "acc_stderr": 0.01575221038877184}}, "4": {"Direct Question": {"acc": 0.912, "acc_norm": 0.907, "acc_norm_stderr": 0.009188875634996662, "acc_stderr": 0.00896305396259208}, "Direct Question (Closed Book)": {"acc": 0.716, "acc_norm": 0.698, "acc_norm_stderr": 0.014526080235459543, "acc_stderr": 0.014267009061031306}, "Multiple Choice": {"acc": 0.642, "acc_norm": 0.608, "acc_norm_stderr": 0.015445859463771297, "acc_stderr": 0.01516792886540756}, "Multiple Choice (Closed Book)": {"acc": 0.565, "acc_norm": 0.554, "acc_norm_stderr": 0.015726771166750354, "acc_stderr": 0.0156850572527172}, "Multiple Choice Question First": {"acc": 0.574, "acc_norm": 0.566, "acc_norm_stderr": 0.015680876566375058, "acc_stderr": 0.01564508768811381}}, "5": {"Direct Question": {"acc": 0.918, "acc_norm": 0.912, "acc_norm_stderr": 0.00896305396259208, "acc_stderr": 0.008680515615523715}, "Direct Question (Closed Book)": {"acc": 0.716, "acc_norm": 0.703, "acc_norm_stderr": 0.014456832294801105, "acc_stderr": 0.014267009061031307}, "Multiple Choice": {"acc": 0.643, "acc_norm": 0.604, "acc_norm_stderr": 0.015473313265859406, "acc_stderr": 0.015158521721486776}, "Multiple Choice (Closed Book)": {"acc": 0.577, "acc_norm": 0.545, "acc_norm_stderr": 0.01575510149834709, "acc_stderr": 0.015630589090476345}, "Multiple Choice Question First": {"acc": 0.622, "acc_norm": 0.595, "acc_norm_stderr": 0.015531136990453049, "acc_stderr": 0.01534116525402665}}}, "story_cloze_2016": {"0": {"Answer Given options": {"acc": 0.4730090860502405, "acc_norm": 0.5024051309460181, "acc_norm_stderr": 0.011562298481438055, "acc_stderr": 0.011545573278697235}, "Choose Story Ending": {"acc": 0.4820951362907536, "acc_norm": 0.5221806520577231, "acc_norm_stderr": 0.011551049647290302, "acc_stderr": 0.011555016408505474}, "Novel Correct Ending": {"acc": 0.4820951362907536, "acc_norm": 0.5109567076429716, "acc_norm_stderr": 0.011559655791130729, "acc_stderr": 0.011555016408505476}, "Story Continuation and Options": {"acc": 0.46125066809192944, "acc_norm": 0.5104222340994121, "acc_norm_stderr": 0.011559920087347771, "acc_stderr": 0.011527657726586461}}, "1": {"Answer Given options": {"acc": 0.47140566541956175, "acc_norm": 0.5077498663816141, "acc_norm_stderr": 0.011561043278863545, "acc_stderr": 0.011543509045585206}, "Choose Story Ending": {"acc": 0.48583645109567075, "acc_norm": 0.4965259219668626, "acc_norm_stderr": 0.011562153149168298, "acc_stderr": 0.011557792331301673}, "Novel Correct Ending": {"acc": 0.4820951362907536, "acc_norm": 0.5114911811865313, "acc_norm_stderr": 0.011559378273599126, "acc_stderr": 0.011555016408505476}, "Story Continuation and Options": {"acc": 0.48850881881346875, "acc_norm": 0.5002672367717798, "acc_norm_stderr": 0.011562430600098487, "acc_stderr": 0.011559378273599123}}, "2": {"Answer Given options": {"acc": 0.47728487439871725, "acc_norm": 0.4917156600748263, "acc_norm_stderr": 0.011560845076525713, "acc_stderr": 0.011550494192008947}, "Choose Story Ending": {"acc": 0.48583645109567075, "acc_norm": 0.48957776590058794, "acc_norm_stderr": 0.011559920087347776, "acc_stderr": 0.011557792331301671}, "Novel Correct Ending": {"acc": 0.4853019775521112, "acc_norm": 0.48957776590058794, "acc_norm_stderr": 0.011559920087347776, "acc_stderr": 0.011557435464292916}, "Story Continuation and Options": {"acc": 0.47728487439871725, "acc_norm": 0.4911811865312667, "acc_norm_stderr": 0.011560633656952968, "acc_stderr": 0.011550494192008947}}, "3": {"Answer Given options": {"acc": 0.47247461250668094, "acc_norm": 0.47888829502939606, "acc_norm_stderr": 0.011552120807053817, "acc_stderr": 0.01154489847386458}, "Choose Story Ending": {"acc": 0.4751469802244789, "acc_norm": 0.4778193479422769, "acc_norm_stderr": 0.01155104964729031, "acc_stderr": 0.01154813982307477}, "Novel Correct Ending": {"acc": 0.4794227685729556, "acc_norm": 0.4863709246392304, "acc_norm_stderr": 0.011558135970599896, "acc_stderr": 0.011552636515221862}, "Story Continuation and Options": {"acc": 0.4681988241582042, "acc_norm": 0.484233030464992, "acc_norm_stderr": 0.01155668204219638, "acc_stderr": 0.011539022035111228}}, "4": {"Answer Given options": {"acc": 0.4730090860502405, "acc_norm": 0.4778193479422769, "acc_norm_stderr": 0.011551049647290312, "acc_stderr": 0.011545573278697237}, "Choose Story Ending": {"acc": 0.47247461250668094, "acc_norm": 0.48690539818278994, "acc_norm_stderr": 0.011558466383367183, "acc_stderr": 0.011544898473864586}, "Novel Correct Ending": {"acc": 0.4831640833778728, "acc_norm": 0.4906467129877071, "acc_norm_stderr": 0.011560409019420362, "acc_stderr": 0.011555875693960771}, "Story Continuation and Options": {"acc": 0.4692677712453234, "acc_norm": 0.48743987172634956, "acc_norm_stderr": 0.01155878357073797, "acc_stderr": 0.011540570846495544}}, "5": {"Answer Given options": {"acc": 0.47033671833244256, "acc_norm": 0.4767504008551577, "acc_norm_stderr": 0.011549925483927456, "acc_stderr": 0.011542066509767012}, "Choose Story Ending": {"acc": 0.4665954035275254, "acc_norm": 0.4836985569214324, "acc_norm_stderr": 0.011556285484521572, "acc_stderr": 0.011536599118298173}, "Novel Correct Ending": {"acc": 0.47888829502939606, "acc_norm": 0.48690539818278994, "acc_norm_stderr": 0.011558466383367183, "acc_stderr": 0.011552120807053812}, "Story Continuation and Options": {"acc": 0.4740780331373597, "acc_norm": 0.4826296098343132, "acc_norm_stderr": 0.011555452669106634, "acc_stderr": 0.011546883081384901}}}, "superglue_rte": {"0": {"GPT-3 style": {"acc": 0.4404332129963899, "acc_norm": 0.4729241877256318, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.029882123363118723}, "MNLI crowdsource": {"acc": 0.5523465703971119, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.02993107036293953}, "does it follow that": {"acc": 0.5451263537906137, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.029973636495415252}, "guaranteed true": {"acc": 0.48014440433212996, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.0300727231673172}, "should assume": {"acc": 0.51985559566787, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030072723167317184}}, "1": {"GPT-3 style": {"acc": 0.5018050541516246, "acc_norm": 0.5018050541516246, "acc_norm_stderr": 0.030096267148976626, "acc_stderr": 0.030096267148976626}, "MNLI crowdsource": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "does it follow that": {"acc": 0.48736462093862815, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030086851767188564}, "guaranteed true": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "should assume": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}}, "2": {"GPT-3 style": {"acc": 0.5234657039711191, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.03003973059219781, "acc_stderr": 0.030063300411902652}, "MNLI crowdsource": {"acc": 0.5054151624548736, "acc_norm": 0.5018050541516246, "acc_norm_stderr": 0.030096267148976626, "acc_stderr": 0.030094698123239966}, "does it follow that": {"acc": 0.51985559566787, "acc_norm": 0.5234657039711191, "acc_norm_stderr": 0.030063300411902652, "acc_stderr": 0.030072723167317177}, "guaranteed true": {"acc": 0.48375451263537905, "acc_norm": 0.48736462093862815, "acc_norm_stderr": 0.030086851767188564, "acc_stderr": 0.030080573208738064}, "should assume": {"acc": 0.4981949458483754, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030096267148976633}}, "3": {"GPT-3 style": {"acc": 0.555956678700361, "acc_norm": 0.5451263537906137, "acc_norm_stderr": 0.029973636495415252, "acc_stderr": 0.029907396333795987}, "MNLI crowdsource": {"acc": 0.5018050541516246, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030096267148976626}, "does it follow that": {"acc": 0.5306859205776173, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.03003973059219781}, "guaranteed true": {"acc": 0.5270758122743683, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030052303463143706}, "should assume": {"acc": 0.516245487364621, "acc_norm": 0.51985559566787, "acc_norm_stderr": 0.030072723167317177, "acc_stderr": 0.030080573208738064}}, "4": {"GPT-3 style": {"acc": 0.5631768953068592, "acc_norm": 0.51985559566787, "acc_norm_stderr": 0.030072723167317177, "acc_stderr": 0.02985524739031494}, "MNLI crowdsource": {"acc": 0.47653429602888087, "acc_norm": 0.47653429602888087, "acc_norm_stderr": 0.03006330041190266, "acc_stderr": 0.030063300411902652}, "does it follow that": {"acc": 0.516245487364621, "acc_norm": 0.5018050541516246, "acc_norm_stderr": 0.030096267148976633, "acc_stderr": 0.030080573208738064}, "guaranteed true": {"acc": 0.49097472924187724, "acc_norm": 0.5018050541516246, "acc_norm_stderr": 0.030096267148976626, "acc_stderr": 0.030091559826331327}, "should assume": {"acc": 0.47653429602888087, "acc_norm": 0.49458483754512633, "acc_norm_stderr": 0.03009469812323996, "acc_stderr": 0.03006330041190266}}, "5": {"GPT-3 style": {"acc": 0.5631768953068592, "acc_norm": 0.5415162454873647, "acc_norm_stderr": 0.029992535385373314, "acc_stderr": 0.029855247390314945}, "MNLI crowdsource": {"acc": 0.4584837545126354, "acc_norm": 0.48014440433212996, "acc_norm_stderr": 0.0300727231673172, "acc_stderr": 0.02999253538537331}, "does it follow that": {"acc": 0.5415162454873647, "acc_norm": 0.48375451263537905, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.029992535385373314}, "guaranteed true": {"acc": 0.4729241877256318, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030052303463143706}, "should assume": {"acc": 0.4584837545126354, "acc_norm": 0.48375451263537905, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.029992535385373314}}}, "winogrande": {"0": {"Replace": {"acc": 0.500394632991318, "acc_norm": 0.4940805051302289, "acc_norm_stderr": 0.014051500838485807, "acc_stderr": 0.014052481306049516}, "True or False": {"acc": 0.494869771112865, "acc_norm": 0.4956590370955012, "acc_norm_stderr": 0.014051956064076896, "acc_stderr": 0.014051745961790516}, "does underscore refer to": {"acc": 0.4696132596685083, "acc_norm": 0.47908445146014206, "acc_norm_stderr": 0.014040185494212945, "acc_stderr": 0.014026510839428732}, "stand for": {"acc": 0.49171270718232046, "acc_norm": 0.489344909234412, "acc_norm_stderr": 0.0140492945362904, "acc_stderr": 0.014050555322824194}, "underscore refer to": {"acc": 0.49171270718232046, "acc_norm": 0.48697711128650356, "acc_norm_stderr": 0.014047718393997663, "acc_stderr": 0.014050555322824194}}, "1": {"Replace": {"acc": 0.5035516969218626, "acc_norm": 0.4972375690607735, "acc_norm_stderr": 0.014052271211616445, "acc_stderr": 0.014052131146915853}, "True or False": {"acc": 0.4925019731649566, "acc_norm": 0.48697711128650356, "acc_norm_stderr": 0.01404771839399767, "acc_stderr": 0.01405090552122858}, "does underscore refer to": {"acc": 0.4909234411996843, "acc_norm": 0.4909234411996843, "acc_norm_stderr": 0.014050170094497704, "acc_stderr": 0.014050170094497704}, "stand for": {"acc": 0.4956590370955012, "acc_norm": 0.4988161010260458, "acc_norm_stderr": 0.014052446290529022, "acc_stderr": 0.014051956064076896}, "underscore refer to": {"acc": 0.47908445146014206, "acc_norm": 0.47434885556432516, "acc_norm_stderr": 0.014033980956108557, "acc_stderr": 0.014040185494212952}}, "2": {"Replace": {"acc": 0.5067087608524072, "acc_norm": 0.5074980268350434, "acc_norm_stderr": 0.014050905521228577, "acc_stderr": 0.014051220692330349}, "True or False": {"acc": 0.5074980268350434, "acc_norm": 0.5122336227308603, "acc_norm_stderr": 0.014048278820405624, "acc_stderr": 0.014050905521228577}, "does underscore refer to": {"acc": 0.48303078137332284, "acc_norm": 0.46882399368587213, "acc_norm_stderr": 0.014025142640639516, "acc_stderr": 0.014044390401612967}, "stand for": {"acc": 0.4909234411996843, "acc_norm": 0.48697711128650356, "acc_norm_stderr": 0.014047718393997663, "acc_stderr": 0.014050170094497707}, "underscore refer to": {"acc": 0.49171270718232046, "acc_norm": 0.49171270718232046, "acc_norm_stderr": 0.014050555322824192, "acc_stderr": 0.014050555322824192}}, "3": {"Replace": {"acc": 0.5217048145224941, "acc_norm": 0.5035516969218626, "acc_norm_stderr": 0.014052131146915852, "acc_stderr": 0.01403923921648463}, "True or False": {"acc": 0.5067087608524072, "acc_norm": 0.5130228887134964, "acc_norm_stderr": 0.014047718393997663, "acc_stderr": 0.014051220692330346}, "does underscore refer to": {"acc": 0.494869771112865, "acc_norm": 0.49013417521704816, "acc_norm_stderr": 0.014049749833367589, "acc_stderr": 0.014051745961790516}, "stand for": {"acc": 0.4980268350434096, "acc_norm": 0.500394632991318, "acc_norm_stderr": 0.014052481306049516, "acc_stderr": 0.01405237625922564}, "underscore refer to": {"acc": 0.5138121546961326, "acc_norm": 0.4988161010260458, "acc_norm_stderr": 0.014052446290529019, "acc_stderr": 0.014047122916440415}}, "4": {"Replace": {"acc": 0.5177584846093133, "acc_norm": 0.5043409629044988, "acc_norm_stderr": 0.014051956064076892, "acc_stderr": 0.014043619596174964}, "True or False": {"acc": 0.5059194948697711, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290396, "acc_stderr": 0.014051500838485807}, "does underscore refer to": {"acc": 0.49171270718232046, "acc_norm": 0.48224151539068666, "acc_norm_stderr": 0.01404361959617496, "acc_stderr": 0.014050555322824194}, "stand for": {"acc": 0.5059194948697711, "acc_norm": 0.5019731649565904, "acc_norm_stderr": 0.01405237625922564, "acc_stderr": 0.014051500838485807}, "underscore refer to": {"acc": 0.5177584846093133, "acc_norm": 0.5130228887134964, "acc_norm_stderr": 0.014047718393997667, "acc_stderr": 0.014043619596174962}}, "5": {"Replace": {"acc": 0.5193370165745856, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290396, "acc_stderr": 0.014041972733712976}, "True or False": {"acc": 0.5043409629044988, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290396, "acc_stderr": 0.0140519560640769}, "does underscore refer to": {"acc": 0.4996053670086819, "acc_norm": 0.4972375690607735, "acc_norm_stderr": 0.014052271211616436, "acc_stderr": 0.014052481306049512}, "stand for": {"acc": 0.4988161010260458, "acc_norm": 0.4980268350434096, "acc_norm_stderr": 0.014052376259225629, "acc_stderr": 0.014052446290529022}, "underscore refer to": {"acc": 0.5035516969218626, "acc_norm": 0.5011838989739542, "acc_norm_stderr": 0.014052446290529019, "acc_stderr": 0.01405213114691586}}}} \ No newline at end of file