Muennighoff commited on
Commit
7d75290
·
1 Parent(s): 0b13ee2
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 4b284b84b10c4pyseed1/evaluation/generation/merged.csv +53 -0
  2. 4b284b84b10c4pyseed1/evaluation/generation/merged.json +1 -0
  3. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_0.csv +21 -0
  4. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json +0 -87
  5. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_1.csv +21 -0
  6. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json +0 -87
  7. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_2.csv +21 -0
  8. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json +0 -87
  9. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_3.csv +21 -0
  10. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_3_lm-eval_global_step80108_2023-05-13-13-52-19_3shots_backup.json +0 -87
  11. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_4.csv +21 -0
  12. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_4_lm-eval_global_step80108_2023-05-13-13-52-19_4shots_backup.json +0 -87
  13. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_5.csv +21 -0
  14. 4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_5_lm-eval_global_step80108_2023-05-13-13-52-19_5shots_backup.json +0 -87
  15. 4b284b84b10c4pyseed2/evaluation/generation/merged.csv +53 -0
  16. 4b284b84b10c4pyseed2/evaluation/generation/merged.json +1 -0
  17. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_0.csv +21 -0
  18. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json +0 -87
  19. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_1.csv +21 -0
  20. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json +0 -87
  21. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_2.csv +21 -0
  22. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json +0 -87
  23. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_3.csv +21 -0
  24. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_3_lm-eval_global_step80108_2023-05-13-13-52-19_3shots_backup.json +0 -87
  25. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_4.csv +21 -0
  26. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_4_lm-eval_global_step80108_2023-05-13-13-52-19_4shots_backup.json +0 -87
  27. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_5.csv +21 -0
  28. 4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_5_lm-eval_global_step80108_2023-05-13-13-52-19_5shots_backup.json +0 -87
  29. 4b284b84b10c4pyseed3/evaluation/generation/merged.csv +53 -0
  30. 4b284b84b10c4pyseed3/evaluation/generation/merged.json +1 -0
  31. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_0.csv +21 -0
  32. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json +0 -87
  33. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_1.csv +21 -0
  34. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json +0 -87
  35. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_2.csv +21 -0
  36. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json +0 -87
  37. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_3.csv +21 -0
  38. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_3_lm-eval_global_step80108_2023-05-13-13-52-19_3shots_backup.json +0 -87
  39. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_4.csv +21 -0
  40. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_4_lm-eval_global_step80108_2023-05-13-13-52-19_4shots_backup.json +0 -87
  41. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_5.csv +21 -0
  42. 4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_5_lm-eval_global_step80108_2023-05-13-13-52-19_5shots_backup.json +0 -87
  43. 4b284b84b10c4pyseed4/evaluation/generation/merged.csv +53 -0
  44. 4b284b84b10c4pyseed4/evaluation/generation/merged.json +1 -0
  45. 4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_0.csv +21 -0
  46. 4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json +0 -87
  47. 4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_1.csv +21 -0
  48. 4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json +0 -87
  49. 4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_2.csv +21 -0
  50. 4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json +0 -87
4b284b84b10c4pyseed1/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.004592446716382532
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.004592446716382532
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.25032847605768066
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.25032847605768066
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2764918747191911
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2764918747191911
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2851915123694789
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2851915123694789
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.28884988764815905
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.28884988764815905
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2902670144861217
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2902670144861217
14
+ e2e_nlg_cleaned,5,average,multiple,0.2326202019995023
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03823209781440638
16
+ gem_xsum,0,median,rouge2_fmeasure,0.03823209781440638
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03681668414425123
18
+ gem_xsum,1,median,rouge2_fmeasure,0.03681668414425123
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04199512085291508
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04199512085291508
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04244649755795357
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04244649755795357
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009504763442679623
24
+ gem_xsum,4,median,rouge2_fmeasure,0.009504763442679623
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,7.895668273026765e-05
26
+ gem_xsum,5,median,rouge2_fmeasure,7.895668273026765e-05
27
+ gem_xsum,5,average,multiple,0.028179020082489358
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.044717276624656535
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.044717276624656535
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.09554456994272455
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.09554456994272455
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10131144981946384
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.10131144981946384
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10005533448149066
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.10005533448149066
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10592268302143613
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.10592268302143613
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11115468658056626
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.11115468658056626
40
+ web_nlg_en,5,average,multiple,0.09311766674505634
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.046716868794861754
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.046716868794861754
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05708999278032344
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05708999278032344
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.061995678538271676
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.061995678538271676
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.050854113940311155
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.050854113940311155
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.016460544041963263
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.016460544041963263
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0031577837444284164
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0031577837444284164
53
+ wiki_lingua_en,5,average,multiple,0.03937916364002662
4b284b84b10c4pyseed1/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.284309215506823, "bleu_stderr": 0.029833086509604305, "rouge1_fmeasure": 0.10244258452443039, "rouge1_fmeasure_stderr": 0.0018745218333798596, "rouge1_precision": 0.06581911668836364, "rouge1_precision_stderr": 0.001350869684012805, "rouge1_recall": 0.3172061653272691, "rouge1_recall_stderr": 0.004652086353059752, "rouge2_fmeasure": 0.044717276624656535, "rouge2_fmeasure_stderr": 0.0011638609756180738, "rouge2_precision": 0.02872547580815573, "rouge2_precision_stderr": 0.0008018335661056892, "rouge2_recall": 0.13571970653094373, "rouge2_recall_stderr": 0.003259456655314051, "rougeL_fmeasure": 0.09702667756798634, "rougeL_fmeasure_stderr": 0.0017265383378214521, "rougeL_precision": 0.0622329746960186, "rougeL_precision_stderr": 0.0012407879166700445, "rougeL_recall": 0.3025393331950721, "rougeL_recall_stderr": 0.0044598083185581104, "rougeLsum_fmeasure": 0.09491867829539585, "rougeLsum_fmeasure_stderr": 0.001775810247231079, "rougeLsum_precision": 0.061177793399579794, "rougeLsum_precision_stderr": 0.0012852655539325787, "rougeLsum_recall": 0.29018534616085484, "rougeLsum_recall_stderr": 0.004307476009615367}}, "1": {"PALM_prompt": {"bleu": 0.753805187803133, "bleu_stderr": 0.06617087138127505, "rouge1_fmeasure": 0.18109300201332623, "rouge1_fmeasure_stderr": 0.004090221106346305, "rouge1_precision": 0.16415427297886892, "rouge1_precision_stderr": 0.005101621265674676, "rouge1_recall": 0.3369859621654897, "rouge1_recall_stderr": 0.005253905893732299, "rouge2_fmeasure": 0.09554456994272455, "rouge2_fmeasure_stderr": 0.002836360406157026, "rouge2_precision": 0.08962231920980905, "rouge2_precision_stderr": 0.003652794559796275, "rouge2_recall": 0.179093576791636, "rouge2_recall_stderr": 0.0038003650408636133, "rougeL_fmeasure": 0.16243696323937595, "rougeL_fmeasure_stderr": 0.0035339999120070865, "rougeL_precision": 0.14633173241616088, "rougeL_precision_stderr": 0.004561222352471328, "rougeL_recall": 0.31098631813906064, "rougeL_recall_stderr": 0.004771152313092668, "rougeLsum_fmeasure": 0.16553233071424595, "rougeLsum_fmeasure_stderr": 0.003604801229722386, "rougeLsum_precision": 0.14939741590780758, "rougeLsum_precision_stderr": 0.00464113354592472, "rougeLsum_recall": 0.3149542980060386, "rougeLsum_recall_stderr": 0.004807984726838254}}, "2": {"PALM_prompt": {"bleu": 0.7932674060025208, "bleu_stderr": 0.030981623711671676, "rouge1_fmeasure": 0.19124926479296447, "rouge1_fmeasure_stderr": 0.004229154056037897, "rouge1_precision": 0.16979256072547352, "rouge1_precision_stderr": 0.005082164079217998, "rouge1_recall": 0.3554713624023758, "rouge1_recall_stderr": 0.004883533769915491, "rouge2_fmeasure": 0.10131144981946384, "rouge2_fmeasure_stderr": 0.0029085710229933676, "rouge2_precision": 0.09157774507207324, "rouge2_precision_stderr": 0.003436105148073816, "rouge2_recall": 0.19015999561328903, "rouge2_recall_stderr": 0.003709227050021704, "rougeL_fmeasure": 0.16901753523040366, "rougeL_fmeasure_stderr": 0.003515650361264507, "rougeL_precision": 0.1476683147887663, "rougeL_precision_stderr": 0.004295885608543962, "rougeL_recall": 0.32627871757286087, "rougeL_recall_stderr": 0.004378817143139272, "rougeLsum_fmeasure": 0.17394029235992964, "rougeLsum_fmeasure_stderr": 0.0036594961734086984, "rougeLsum_precision": 0.15294809039092577, "rougeLsum_precision_stderr": 0.004486762320273044, "rougeLsum_recall": 0.3318227977234109, "rougeLsum_recall_stderr": 0.004447615633152609}}, "3": {"PALM_prompt": {"bleu": 0.8513001578324528, "bleu_stderr": 0.026740684968996915, "rouge1_fmeasure": 0.18899031024139057, "rouge1_fmeasure_stderr": 0.004063095454932207, "rouge1_precision": 0.1682074233922619, "rouge1_precision_stderr": 0.00500858635828189, "rouge1_recall": 0.35827045347689035, "rouge1_recall_stderr": 0.0047237125785498015, "rouge2_fmeasure": 0.10005533448149066, "rouge2_fmeasure_stderr": 0.002723354331085578, "rouge2_precision": 0.09147237522532128, "rouge2_precision_stderr": 0.003348677910977491, "rouge2_recall": 0.19208203447770358, "rouge2_recall_stderr": 0.003568780102250434, "rougeL_fmeasure": 0.1676956313139649, "rougeL_fmeasure_stderr": 0.0034072621145858485, "rougeL_precision": 0.14745670611923176, "rougeL_precision_stderr": 0.004288564029853486, "rougeL_recall": 0.32945503386050456, "rougeL_recall_stderr": 0.00430176341284325, "rougeLsum_fmeasure": 0.17242524198036677, "rougeLsum_fmeasure_stderr": 0.003538649731275397, "rougeLsum_precision": 0.15247421609250705, "rougeLsum_precision_stderr": 0.004470035388946363, "rougeLsum_recall": 0.33518148762631234, "rougeLsum_recall_stderr": 0.004338973715432066}}, "4": {"PALM_prompt": {"bleu": 0.9861923448931026, "bleu_stderr": 0.06451187715847112, "rouge1_fmeasure": 0.19517152553929415, "rouge1_fmeasure_stderr": 0.004158564969560929, "rouge1_precision": 0.17551657242946475, "rouge1_precision_stderr": 0.00518436550875209, "rouge1_recall": 0.36715724181406534, "rouge1_recall_stderr": 0.004737112336780039, "rouge2_fmeasure": 0.10592268302143613, "rouge2_fmeasure_stderr": 0.002888099644857794, "rouge2_precision": 0.09748212020764499, "rouge2_precision_stderr": 0.003545806528867412, "rouge2_recall": 0.2016532698906794, "rouge2_recall_stderr": 0.0037519961144571014, "rougeL_fmeasure": 0.17334860222724158, "rougeL_fmeasure_stderr": 0.003517583711032378, "rougeL_precision": 0.1533354125519617, "rougeL_precision_stderr": 0.004409368563127759, "rougeL_recall": 0.33802875005319016, "rougeL_recall_stderr": 0.004320230574983353, "rougeLsum_fmeasure": 0.17995243917889453, "rougeLsum_fmeasure_stderr": 0.0037046640606234428, "rougeLsum_precision": 0.16074613904282228, "rougeLsum_precision_stderr": 0.0046803037433223995, "rougeLsum_recall": 0.34585417154412335, "rougeLsum_recall_stderr": 0.004400674408465862}}, "5": {"PALM_prompt": {"bleu": 1.0276495922602185, "bleu_stderr": 0.04351925208896884, "rouge1_fmeasure": 0.20344274210510788, "rouge1_fmeasure_stderr": 0.004377149753563969, "rouge1_precision": 0.1857912306879972, "rouge1_precision_stderr": 0.005456711062603829, "rouge1_recall": 0.3727786563840607, "rouge1_recall_stderr": 0.004811268133569124, "rouge2_fmeasure": 0.11115468658056626, "rouge2_fmeasure_stderr": 0.003036992688342426, "rouge2_precision": 0.10367866583006519, "rouge2_precision_stderr": 0.0037422913783832666, "rouge2_recall": 0.2048240498962582, "rouge2_recall_stderr": 0.003783202485492078, "rougeL_fmeasure": 0.1799634484068292, "rougeL_fmeasure_stderr": 0.003677070537566347, "rougeL_precision": 0.16249202574637664, "rougeL_precision_stderr": 0.004682286684786112, "rougeL_recall": 0.34092053938711586, "rougeL_recall_stderr": 0.004300994367226273, "rougeLsum_fmeasure": 0.1870872812570983, "rougeLsum_fmeasure_stderr": 0.003877065731634771, "rougeLsum_precision": 0.17041235466942184, "rougeLsum_precision_stderr": 0.004942926799192775, "rougeLsum_recall": 0.34872315434327716, "rougeLsum_recall_stderr": 0.0043703888860243964}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.2503849050585125, "bleu_stderr": 0.051507727846378235, "rouge1_fmeasure": 0.20368781100751032, "rouge1_fmeasure_stderr": 0.002068000366661119, "rouge1_precision": 0.17790091868095817, "rouge1_precision_stderr": 0.0021603958439972985, "rouge1_recall": 0.28767225722987566, "rouge1_recall_stderr": 0.0029352879041788967, "rouge2_fmeasure": 0.046716868794861754, "rouge2_fmeasure_stderr": 0.0009706566212074418, "rouge2_precision": 0.04053806715598245, "rouge2_precision_stderr": 0.0008830811951633252, "rouge2_recall": 0.06802940844958125, "rouge2_recall_stderr": 0.001590741774901635, "rougeL_fmeasure": 0.14924671186550809, "rougeL_fmeasure_stderr": 0.00141048666429078, "rougeL_precision": 0.12871570492003945, "rougeL_precision_stderr": 0.0014382222386276713, "rougeL_recall": 0.21643319804515018, "rougeL_recall_stderr": 0.002301502562645028, "rougeLsum_fmeasure": 0.18900724598178428, "rougeLsum_fmeasure_stderr": 0.001924609918596603, "rougeLsum_precision": 0.1650342719642491, "rougeLsum_precision_stderr": 0.0020126516657929527, "rougeLsum_recall": 0.2674750388003852, "rougeLsum_recall_stderr": 0.0027562633553659954}}, "1": {"tldr_en": {"bleu": 3.4982309775393885, "bleu_stderr": 0.09222704336399924, "rouge1_fmeasure": 0.21511983140208174, "rouge1_fmeasure_stderr": 0.0021698987504195433, "rouge1_precision": 0.27393078815860783, "rouge1_precision_stderr": 0.0038623203456685886, "rouge1_recall": 0.2597178454728387, "rouge1_recall_stderr": 0.003151219857377147, "rouge2_fmeasure": 0.05708999278032344, "rouge2_fmeasure_stderr": 0.0012762093671854961, "rouge2_precision": 0.08126791051957892, "rouge2_precision_stderr": 0.0024917008391937712, "rouge2_recall": 0.06812633213172745, "rouge2_recall_stderr": 0.0016323029682680788, "rougeL_fmeasure": 0.1605722253085775, "rougeL_fmeasure_stderr": 0.0016204785663363505, "rougeL_precision": 0.2116066951379208, "rougeL_precision_stderr": 0.0033271798685739064, "rougeL_recall": 0.19381965988537161, "rougeL_recall_stderr": 0.0024101880028630914, "rougeLsum_fmeasure": 0.2017459406813838, "rougeLsum_fmeasure_stderr": 0.0020465753159111125, "rougeLsum_precision": 0.25730029161753404, "rougeLsum_precision_stderr": 0.0036763324140579832, "rougeLsum_recall": 0.24378425315233523, "rougeLsum_recall_stderr": 0.002975555660811828}}, "2": {"tldr_en": {"bleu": 3.873352952970367, "bleu_stderr": 0.08602903997292226, "rouge1_fmeasure": 0.2235176724065316, "rouge1_fmeasure_stderr": 0.002197037373765131, "rouge1_precision": 0.2952493239632582, "rouge1_precision_stderr": 0.003960984019635869, "rouge1_recall": 0.26017147617808556, "rouge1_recall_stderr": 0.0030898471494932573, "rouge2_fmeasure": 0.061995678538271676, "rouge2_fmeasure_stderr": 0.0013413408864346255, "rouge2_precision": 0.09079158882750632, "rouge2_precision_stderr": 0.0025346171837202776, "rouge2_recall": 0.07118247751799445, "rouge2_recall_stderr": 0.0016684374732950987, "rougeL_fmeasure": 0.1699086382724902, "rougeL_fmeasure_stderr": 0.0016802663528347446, "rougeL_precision": 0.23067990584344947, "rougeL_precision_stderr": 0.003390251609468385, "rougeL_recall": 0.19795924007057863, "rougeL_recall_stderr": 0.0024239337694097465, "rougeLsum_fmeasure": 0.20973532628833993, "rougeLsum_fmeasure_stderr": 0.002083876431215809, "rougeLsum_precision": 0.2785874960007144, "rougeLsum_precision_stderr": 0.0038306025826887855, "rougeLsum_recall": 0.2439739751120984, "rougeLsum_recall_stderr": 0.0029217551928644968}}, "3": {"tldr_en": {"bleu": 2.9199915076563157, "bleu_stderr": 0.050718023179781004, "rouge1_fmeasure": 0.1852380650589816, "rouge1_fmeasure_stderr": 0.002453688960096756, "rouge1_precision": 0.2644793338559228, "rouge1_precision_stderr": 0.004300446011877091, "rouge1_recall": 0.20721505932505885, "rouge1_recall_stderr": 0.0032596678260916894, "rouge2_fmeasure": 0.050854113940311155, "rouge2_fmeasure_stderr": 0.001262646254201672, "rouge2_precision": 0.07962757766274792, "rouge2_precision_stderr": 0.002538837956533235, "rouge2_recall": 0.056697549727240754, "rouge2_recall_stderr": 0.0015655044967252236, "rougeL_fmeasure": 0.14274269415388388, "rougeL_fmeasure_stderr": 0.0018860937901053825, "rougeL_precision": 0.21055962380090806, "rougeL_precision_stderr": 0.0036820307774536325, "rougeL_recall": 0.1594739896843766, "rougeL_recall_stderr": 0.0025655960354455483, "rougeLsum_fmeasure": 0.17393190478618795, "rougeLsum_fmeasure_stderr": 0.0023165981237423306, "rougeLsum_precision": 0.24962571404876155, "rougeLsum_precision_stderr": 0.004132046557404798, "rougeLsum_recall": 0.19448430427679694, "rougeLsum_recall_stderr": 0.0030769199275888057}}, "4": {"tldr_en": {"bleu": 0.1067162813855955, "bleu_stderr": 0.01642553426435665, "rouge1_fmeasure": 0.06076697813953489, "rouge1_fmeasure_stderr": 0.0021300692037034092, "rouge1_precision": 0.09332602866833148, "rouge1_precision_stderr": 0.003570294101613068, "rouge1_recall": 0.06586871137800855, "rouge1_recall_stderr": 0.0025026795468805797, "rouge2_fmeasure": 0.016460544041963263, "rouge2_fmeasure_stderr": 0.0008881114872332711, "rouge2_precision": 0.02827058895658381, "rouge2_precision_stderr": 0.0018246284253883897, "rouge2_recall": 0.017969796115762948, "rouge2_recall_stderr": 0.0010669408876304014, "rougeL_fmeasure": 0.04815869214862383, "rougeL_fmeasure_stderr": 0.001697306692637545, "rougeL_precision": 0.07656431813455133, "rougeL_precision_stderr": 0.003063240688380635, "rougeL_recall": 0.052053276943651174, "rougeL_recall_stderr": 0.0019951400763600037, "rougeLsum_fmeasure": 0.05680856447838402, "rougeLsum_fmeasure_stderr": 0.0020003560937323227, "rougeLsum_precision": 0.08788804889171757, "rougeLsum_precision_stderr": 0.0034049566118768337, "rougeLsum_recall": 0.061446683378615426, "rougeLsum_recall_stderr": 0.002336121336344904}}, "5": {"tldr_en": {"bleu": 7.489712738905427e-14, "bleu_stderr": 1.1260174383292979e-12, "rouge1_fmeasure": 0.009722653883413835, "rouge1_fmeasure_stderr": 0.0009640011558515617, "rouge1_precision": 0.015805709131080613, "rouge1_precision_stderr": 0.0016507735911678808, "rouge1_recall": 0.010404485275122482, "rouge1_recall_stderr": 0.001118878971854598, "rouge2_fmeasure": 0.0031577837444284164, "rouge2_fmeasure_stderr": 0.00041844867000327934, "rouge2_precision": 0.006066305176907762, "rouge2_precision_stderr": 0.000920658460289576, "rouge2_recall": 0.0033132551609063956, "rouge2_recall_stderr": 0.0004935460868227713, "rougeL_fmeasure": 0.007849160982570056, "rougeL_fmeasure_stderr": 0.0007775176130013305, "rougeL_precision": 0.01339410530484845, "rougeL_precision_stderr": 0.001457114304279022, "rougeL_recall": 0.008254165617272078, "rougeL_recall_stderr": 0.0008698320604862179, "rougeLsum_fmeasure": 0.009124669383155015, "rougeLsum_fmeasure_stderr": 0.0009073079813537541, "rougeLsum_precision": 0.015094524794108326, "rougeLsum_precision_stderr": 0.001595743806157447, "rougeLsum_recall": 0.009685717063570557, "rougeLsum_recall_stderr": 0.0010375114498161015}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.22005998147458544, "bleu_stderr": 0.0250487067503603, "rouge1_fmeasure": 0.08614972753839216, "rouge1_fmeasure_stderr": 0.0008903909075822222, "rouge1_precision": 0.08455976142294155, "rouge1_precision_stderr": 0.0009636491454169889, "rouge1_recall": 0.09545554351114534, "rouge1_recall_stderr": 0.0010469871961522913, "rouge2_fmeasure": 0.004592446716382532, "rouge2_fmeasure_stderr": 0.0003834061963154109, "rouge2_precision": 0.0050051907680722894, "rouge2_precision_stderr": 0.00039263575137459715, "rouge2_recall": 0.004726385658481872, "rouge2_recall_stderr": 0.00044633081826916894, "rougeL_fmeasure": 0.08008359695112764, "rougeL_fmeasure_stderr": 0.0007239720189765483, "rougeL_precision": 0.0783514821082424, "rougeL_precision_stderr": 0.0008032303128996655, "rougeL_recall": 0.08915486867281326, "rougeL_recall_stderr": 0.0008718664066674497, "rougeLsum_fmeasure": 0.08044995901414072, "rougeLsum_fmeasure_stderr": 0.0008173682710957769, "rougeLsum_precision": 0.07886133099839461, "rougeLsum_precision_stderr": 0.0008836330878653256, "rougeLsum_recall": 0.08934859411128912, "rougeLsum_recall_stderr": 0.000977605386921654}}, "1": {"generate_text_restaurant": {"bleu": 13.800398344487716, "bleu_stderr": 0.15346177669430705, "rouge1_fmeasure": 0.5123702282037764, "rouge1_fmeasure_stderr": 0.002443558170198136, "rouge1_precision": 0.6135241484922318, "rouge1_precision_stderr": 0.0030630851817342406, "rouge1_recall": 0.4790322546729597, "rouge1_recall_stderr": 0.003182294502893629, "rouge2_fmeasure": 0.25032847605768066, "rouge2_fmeasure_stderr": 0.002153114612123916, "rouge2_precision": 0.30235076353444407, "rouge2_precision_stderr": 0.002646874306056597, "rouge2_recall": 0.2343703657609816, "rouge2_recall_stderr": 0.002335598188230106, "rougeL_fmeasure": 0.3639720427770328, "rougeL_fmeasure_stderr": 0.0021984061431177555, "rougeL_precision": 0.43955666108409125, "rougeL_precision_stderr": 0.002945445598769829, "rougeL_recall": 0.3389986234208399, "rougeL_recall_stderr": 0.0025731015700906183, "rougeLsum_fmeasure": 0.4140785882669693, "rougeLsum_fmeasure_stderr": 0.002467350834842922, "rougeLsum_precision": 0.49748024851751865, "rougeLsum_precision_stderr": 0.003135565492715848, "rougeLsum_recall": 0.3863857641705684, "rougeLsum_recall_stderr": 0.0029130057687118465}}, "2": {"generate_text_restaurant": {"bleu": 16.220114618030067, "bleu_stderr": 0.15289130928894384, "rouge1_fmeasure": 0.5448820108889906, "rouge1_fmeasure_stderr": 0.002396410343480538, "rouge1_precision": 0.6232512473842998, "rouge1_precision_stderr": 0.0030165334823404376, "rouge1_recall": 0.5187486242536772, "rouge1_recall_stderr": 0.0030737890642935535, "rouge2_fmeasure": 0.2764918747191911, "rouge2_fmeasure_stderr": 0.0022226866139742037, "rouge2_precision": 0.31849522619958753, "rouge2_precision_stderr": 0.0026674426625603164, "rouge2_recall": 0.2634473671770624, "rouge2_recall_stderr": 0.0024027183676086313, "rougeL_fmeasure": 0.3880623418406085, "rougeL_fmeasure_stderr": 0.002253572082462304, "rougeL_precision": 0.4460280977850625, "rougeL_precision_stderr": 0.002883420399396016, "rougeL_recall": 0.3686322388682046, "rougeL_recall_stderr": 0.0025883932571672094, "rougeLsum_fmeasure": 0.44605053351857943, "rougeLsum_fmeasure_stderr": 0.0024682627803923407, "rougeLsum_precision": 0.5111074159037895, "rougeLsum_precision_stderr": 0.003069948568226211, "rougeLsum_recall": 0.42402608322081503, "rougeLsum_recall_stderr": 0.0028710060076752785}}, "3": {"generate_text_restaurant": {"bleu": 17.026332930226687, "bleu_stderr": 0.11903167676949462, "rouge1_fmeasure": 0.5527798242897692, "rouge1_fmeasure_stderr": 0.0023609895962880094, "rouge1_precision": 0.6234800089627556, "rouge1_precision_stderr": 0.002985926892416307, "rouge1_recall": 0.5280368770938475, "rouge1_recall_stderr": 0.002970293130350025, "rouge2_fmeasure": 0.2851915123694789, "rouge2_fmeasure_stderr": 0.0022547896451545688, "rouge2_precision": 0.32294197782069145, "rouge2_precision_stderr": 0.0026384279907428437, "rouge2_recall": 0.27292001136094557, "rouge2_recall_stderr": 0.002435895120528307, "rougeL_fmeasure": 0.39286344410502555, "rougeL_fmeasure_stderr": 0.002271959169212471, "rougeL_precision": 0.4437004558544396, "rougeL_precision_stderr": 0.002791137093343086, "rougeL_recall": 0.3753688881218395, "rougeL_recall_stderr": 0.0026151098326306237, "rougeLsum_fmeasure": 0.4526273905509783, "rougeLsum_fmeasure_stderr": 0.0024727820067571224, "rougeLsum_precision": 0.5104930833963656, "rougeLsum_precision_stderr": 0.0030114185668342584, "rougeLsum_recall": 0.43245455603275523, "rougeLsum_recall_stderr": 0.0028697413631203587}}, "4": {"generate_text_restaurant": {"bleu": 17.558589903058714, "bleu_stderr": 0.19252630598122317, "rouge1_fmeasure": 0.5553921415579663, "rouge1_fmeasure_stderr": 0.002407231734357769, "rouge1_precision": 0.6273626409449821, "rouge1_precision_stderr": 0.0030713302171131197, "rouge1_recall": 0.5294494567992365, "rouge1_recall_stderr": 0.002956902303979277, "rouge2_fmeasure": 0.28884988764815905, "rouge2_fmeasure_stderr": 0.0023347534029499084, "rouge2_precision": 0.32719445044431117, "rouge2_precision_stderr": 0.0027183018349157878, "rouge2_recall": 0.2758425630167494, "rouge2_recall_stderr": 0.00248527899121024, "rougeL_fmeasure": 0.3972547497667789, "rougeL_fmeasure_stderr": 0.0023220786435570757, "rougeL_precision": 0.449533455998689, "rougeL_precision_stderr": 0.002881175546971357, "rougeL_recall": 0.3784119333044511, "rougeL_recall_stderr": 0.0025979004764086107, "rougeLsum_fmeasure": 0.4592067286169127, "rougeLsum_fmeasure_stderr": 0.0025419130908316176, "rougeLsum_precision": 0.5181230901609982, "rougeLsum_precision_stderr": 0.003092242183416676, "rougeLsum_recall": 0.43804177668765465, "rougeLsum_recall_stderr": 0.002897167034784455}}, "5": {"generate_text_restaurant": {"bleu": 17.466933689644232, "bleu_stderr": 0.18690187822599455, "rouge1_fmeasure": 0.5562560609497972, "rouge1_fmeasure_stderr": 0.002398329952347352, "rouge1_precision": 0.6269119709397524, "rouge1_precision_stderr": 0.003021400454708049, "rouge1_recall": 0.531125474770964, "rouge1_recall_stderr": 0.0029809653866374334, "rouge2_fmeasure": 0.2902670144861217, "rouge2_fmeasure_stderr": 0.0023219039559393296, "rouge2_precision": 0.32818979979206736, "rouge2_precision_stderr": 0.0026868197179766846, "rouge2_recall": 0.2776619098810402, "rouge2_recall_stderr": 0.002495991792698862, "rougeL_fmeasure": 0.39907008563787416, "rougeL_fmeasure_stderr": 0.0023368919447988804, "rougeL_precision": 0.45068802079016396, "rougeL_precision_stderr": 0.0028602873824156013, "rougeL_recall": 0.3808220315827243, "rougeL_recall_stderr": 0.0026446980259427393, "rougeLsum_fmeasure": 0.4606227892947579, "rougeLsum_fmeasure_stderr": 0.0025300476536768424, "rougeLsum_precision": 0.5193139266297329, "rougeLsum_precision_stderr": 0.0030761759468207707, "rougeLsum_recall": 0.4395600438133579, "rougeLsum_recall_stderr": 0.002883428534103374}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.3834259801017654, "bleu_stderr": 0.0740802701072243, "rouge1_fmeasure": 0.19577844561684796, "rouge1_fmeasure_stderr": 0.0023661323114514583, "rouge1_precision": 0.1429593128198406, "rouge1_precision_stderr": 0.0018591114338940178, "rouge1_recall": 0.3336991566205972, "rouge1_recall_stderr": 0.004114441229614053, "rouge2_fmeasure": 0.03823209781440638, "rouge2_fmeasure_stderr": 0.0013087881969693176, "rouge2_precision": 0.02737505019597676, "rouge2_precision_stderr": 0.0009479284145870204, "rouge2_recall": 0.06737976814912207, "rouge2_recall_stderr": 0.002358671989935531, "rougeL_fmeasure": 0.145156122303775, "rougeL_fmeasure_stderr": 0.0016826927327189658, "rougeL_precision": 0.10581308807893287, "rougeL_precision_stderr": 0.001322469794945021, "rougeL_recall": 0.24910808127348322, "rougeL_recall_stderr": 0.0030950005330837695, "rougeLsum_fmeasure": 0.1533609850544498, "rougeLsum_fmeasure_stderr": 0.0019328001233590163, "rougeLsum_precision": 0.11175582176006449, "rougeLsum_precision_stderr": 0.0014972830388992728, "rougeLsum_recall": 0.26293712421392873, "rougeLsum_recall_stderr": 0.0035011677466440213}}, "1": {"article_DOC_summary": {"bleu": 1.4133140766200658, "bleu_stderr": 0.07412977244058566, "rouge1_fmeasure": 0.18942796750148963, "rouge1_fmeasure_stderr": 0.00282635944299824, "rouge1_precision": 0.16522584334850784, "rouge1_precision_stderr": 0.003233343006630082, "rouge1_recall": 0.2738597999200149, "rouge1_recall_stderr": 0.00400710601158638, "rouge2_fmeasure": 0.03681668414425123, "rouge2_fmeasure_stderr": 0.001614740366905874, "rouge2_precision": 0.032496424466021384, "rouge2_precision_stderr": 0.0016335851701900109, "rouge2_recall": 0.05322149486917393, "rouge2_recall_stderr": 0.002261216473133809, "rougeL_fmeasure": 0.1442650387130003, "rougeL_fmeasure_stderr": 0.0021578014797468447, "rougeL_precision": 0.12598829667400124, "rougeL_precision_stderr": 0.002526015434698146, "rougeL_recall": 0.20943225292183257, "rougeL_recall_stderr": 0.0030896306306637806, "rougeLsum_fmeasure": 0.14798185794949825, "rougeLsum_fmeasure_stderr": 0.0022622214421465074, "rougeLsum_precision": 0.1286748059113306, "rougeLsum_precision_stderr": 0.002558422865540148, "rougeLsum_recall": 0.21601653600118456, "rougeLsum_recall_stderr": 0.003388378859394739}}, "2": {"article_DOC_summary": {"bleu": 1.7810250018857277, "bleu_stderr": 0.10532591862898125, "rouge1_fmeasure": 0.2128442344193703, "rouge1_fmeasure_stderr": 0.0031145638697046315, "rouge1_precision": 0.20972653107102984, "rouge1_precision_stderr": 0.003815224682299948, "rouge1_recall": 0.25750934761307187, "rouge1_recall_stderr": 0.00374716575474716, "rouge2_fmeasure": 0.04199512085291508, "rouge2_fmeasure_stderr": 0.00182532086722488, "rouge2_precision": 0.0425341860560324, "rouge2_precision_stderr": 0.0020058542793671467, "rouge2_recall": 0.04887724768581428, "rouge2_recall_stderr": 0.002065454873092856, "rougeL_fmeasure": 0.15887796321938907, "rougeL_fmeasure_stderr": 0.0023655473767136056, "rougeL_precision": 0.15687035055102957, "rougeL_precision_stderr": 0.002917332672132936, "rougeL_recall": 0.19231969888863373, "rougeL_recall_stderr": 0.0028432386523952607, "rougeLsum_fmeasure": 0.16335387015577893, "rougeLsum_fmeasure_stderr": 0.0024148406207269127, "rougeLsum_precision": 0.16018034033217715, "rougeLsum_precision_stderr": 0.00291195594553355, "rougeLsum_recall": 0.19997130764127558, "rougeLsum_recall_stderr": 0.0031005093828670336}}, "3": {"article_DOC_summary": {"bleu": 2.011211386735581, "bleu_stderr": 0.17250343585439765, "rouge1_fmeasure": 0.2071889269059653, "rouge1_fmeasure_stderr": 0.003440607121884097, "rouge1_precision": 0.21376468358942274, "rouge1_precision_stderr": 0.0041326732533511264, "rouge1_recall": 0.23417136837123367, "rouge1_recall_stderr": 0.003943124645233322, "rouge2_fmeasure": 0.04244649755795357, "rouge2_fmeasure_stderr": 0.001958678091231855, "rouge2_precision": 0.044594473304136434, "rouge2_precision_stderr": 0.0021819372373260305, "rouge2_recall": 0.0465766681204507, "rouge2_recall_stderr": 0.0020811174284958464, "rougeL_fmeasure": 0.15630878580902802, "rougeL_fmeasure_stderr": 0.002700606481726682, "rougeL_precision": 0.16128163716269617, "rougeL_precision_stderr": 0.003232919903071769, "rougeL_recall": 0.1771979928014555, "rougeL_recall_stderr": 0.003091898160632376, "rougeLsum_fmeasure": 0.15956178080004038, "rougeLsum_fmeasure_stderr": 0.0027518298470148683, "rougeLsum_precision": 0.1639173241682711, "rougeLsum_precision_stderr": 0.003246670707428661, "rougeLsum_recall": 0.18214281746105063, "rougeLsum_recall_stderr": 0.003263598548597285}}, "4": {"article_DOC_summary": {"bleu": 0.17312487940256435, "bleu_stderr": 0.029925074339360477, "rouge1_fmeasure": 0.05420653615827253, "rouge1_fmeasure_stderr": 0.0030939599681383658, "rouge1_precision": 0.0638565356976348, "rouge1_precision_stderr": 0.003877414171497479, "rouge1_recall": 0.05760213523017414, "rouge1_recall_stderr": 0.0033986796920877156, "rouge2_fmeasure": 0.009504763442679623, "rouge2_fmeasure_stderr": 0.0009300238181360828, "rouge2_precision": 0.010373881118038586, "rouge2_precision_stderr": 0.0010534404229828778, "rouge2_recall": 0.01042257701890787, "rouge2_recall_stderr": 0.0010417332851091305, "rougeL_fmeasure": 0.040027122876600374, "rougeL_fmeasure_stderr": 0.002259653076158055, "rougeL_precision": 0.048044128458970055, "rougeL_precision_stderr": 0.0029962571996274955, "rougeL_recall": 0.04255325347407478, "rougeL_recall_stderr": 0.002495080218318238, "rougeLsum_fmeasure": 0.041566263452711256, "rougeLsum_fmeasure_stderr": 0.0023485349548759424, "rougeLsum_precision": 0.04939643995328391, "rougeLsum_precision_stderr": 0.003054559286533746, "rougeLsum_recall": 0.04473517011374622, "rougeLsum_recall_stderr": 0.0026689496471047707}}, "5": {"article_DOC_summary": {"bleu": 1.4654804656090614e-40, "bleu_stderr": 8.086438720843712e-34, "rouge1_fmeasure": 0.002039646408478482, "rouge1_fmeasure_stderr": 0.0005953236798957056, "rouge1_precision": 0.002336168157769887, "rouge1_precision_stderr": 0.0006890847696356748, "rouge1_recall": 0.0018688139530059714, "rouge1_recall_stderr": 0.0005476439839497603, "rouge2_fmeasure": 7.895668273026765e-05, "rouge2_fmeasure_stderr": 5.584006489039512e-05, "rouge2_precision": 0.0001008979921299566, "rouge2_precision_stderr": 7.131502741691525e-05, "rouge2_recall": 6.493506493506492e-05, "rouge2_recall_stderr": 4.5969907005320445e-05, "rougeL_fmeasure": 0.0014470827826313995, "rougeL_fmeasure_stderr": 0.0004202041853790085, "rougeL_precision": 0.001656834704909943, "rougeL_precision_stderr": 0.000482744249190715, "rougeL_recall": 0.0013296876690360795, "rougeL_recall_stderr": 0.0003919607933712566, "rougeLsum_fmeasure": 0.0014494601434993262, "rougeLsum_fmeasure_stderr": 0.000416339406572967, "rougeLsum_precision": 0.001656834704909943, "rougeLsum_precision_stderr": 0.0004774384132496309, "rougeLsum_recall": 0.0013335860005501915, "rougeLsum_recall_stderr": 0.00038928213049580857}}}}
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.334,0.014922019523732961,0
3
+ anli_r2,acc,0.336,0.014944140233795027,0
4
+ anli_r3,acc,0.3308333333333333,0.013588208070708999,0
5
+ arc_challenge,acc,0.21245733788395904,0.011953482906582954,0
6
+ arc_challenge,acc_norm,0.25597269624573377,0.012753013241244516,0
7
+ arc_easy,acc,0.5054713804713805,0.010259169228615049,0
8
+ arc_easy,acc_norm,0.4452861952861953,0.010198171137873854,0
9
+ boolq,acc,0.5740061162079511,0.008648732832949143,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.19658119658119655,,1
12
+ copa,acc,0.66,0.04760952285695237,0
13
+ hellaswag,acc,0.3604859589723163,0.004791601975612767,0
14
+ hellaswag,acc_norm,0.4444333798048198,0.004958872288442149,0
15
+ piqa,acc,0.6855277475516867,0.01083300906510657,0
16
+ piqa,acc_norm,0.6833514689880305,0.010853160531978484,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.824,0.012048616898597509,0
19
+ sciq,acc_norm,0.717,0.014251810906481744,0
20
+ storycloze_2016,acc,0.6440406199893105,0.011072254184382844,0
21
+ winogrande,acc,0.5232833464877664,0.01403724130957364,0
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.334,
5
- "acc_stderr": 0.014922019523732961
6
- },
7
- "anli_r2": {
8
- "acc": 0.336,
9
- "acc_stderr": 0.014944140233795027
10
- },
11
- "anli_r3": {
12
- "acc": 0.3308333333333333,
13
- "acc_stderr": 0.013588208070708999
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.19658119658119655
19
- },
20
- "copa": {
21
- "acc": 0.66,
22
- "acc_stderr": 0.04760952285695237
23
- },
24
- "hellaswag": {
25
- "acc": 0.3604859589723163,
26
- "acc_stderr": 0.004791601975612767,
27
- "acc_norm": 0.4444333798048198,
28
- "acc_norm_stderr": 0.004958872288442149
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5232833464877664,
36
- "acc_stderr": 0.01403724130957364
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6440406199893105,
40
- "acc_stderr": 0.011072254184382844
41
- },
42
- "boolq": {
43
- "acc": 0.5740061162079511,
44
- "acc_stderr": 0.008648732832949143
45
- },
46
- "arc_easy": {
47
- "acc": 0.5054713804713805,
48
- "acc_stderr": 0.010259169228615049,
49
- "acc_norm": 0.4452861952861953,
50
- "acc_norm_stderr": 0.010198171137873854
51
- },
52
- "arc_challenge": {
53
- "acc": 0.21245733788395904,
54
- "acc_stderr": 0.011953482906582954,
55
- "acc_norm": 0.25597269624573377,
56
- "acc_norm_stderr": 0.012753013241244516
57
- },
58
- "sciq": {
59
- "acc": 0.824,
60
- "acc_stderr": 0.012048616898597509,
61
- "acc_norm": 0.717,
62
- "acc_norm_stderr": 0.014251810906481744
63
- },
64
- "piqa": {
65
- "acc": 0.6855277475516867,
66
- "acc_stderr": 0.01083300906510657,
67
- "acc_norm": 0.6833514689880305,
68
- "acc_norm_stderr": 0.010853160531978484
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.325,0.014818724459095526,0
3
+ anli_r2,acc,0.336,0.014944140233795028,0
4
+ anli_r3,acc,0.3416666666666667,0.013696658778002517,0
5
+ arc_challenge,acc,0.22440273037542663,0.012191404938603842,0
6
+ arc_challenge,acc_norm,0.26109215017064846,0.01283552390947385,0
7
+ arc_easy,acc,0.5248316498316499,0.010247123122159281,0
8
+ arc_easy,acc_norm,0.4831649831649832,0.010253966261288898,0
9
+ boolq,acc,0.519571865443425,0.008738352682962235,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.28461538461538466,,1
12
+ copa,acc,0.72,0.04512608598542127,0
13
+ hellaswag,acc,0.36058554072893845,0.004791890625834209,0
14
+ hellaswag,acc_norm,0.445628360884286,0.004960191341430243,0
15
+ piqa,acc,0.6920565832426551,0.01077089236746368,0
16
+ piqa,acc_norm,0.6860718171926007,0.010827928134189646,0
17
+ rte,acc,0.5703971119133574,0.02979666882912467,0
18
+ sciq,acc,0.875,0.010463483381956722,0
19
+ sciq,acc_norm,0.855,0.011139977517890143,0
20
+ storycloze_2016,acc,0.6247995724211651,0.011196472580587938,0
21
+ winogrande,acc,0.5209155485398579,0.014040185494212952,0
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095526
6
- },
7
- "anli_r2": {
8
- "acc": 0.336,
9
- "acc_stderr": 0.014944140233795028
10
- },
11
- "anli_r3": {
12
- "acc": 0.3416666666666667,
13
- "acc_stderr": 0.013696658778002517
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.28461538461538466
19
- },
20
- "copa": {
21
- "acc": 0.72,
22
- "acc_stderr": 0.04512608598542127
23
- },
24
- "hellaswag": {
25
- "acc": 0.36058554072893845,
26
- "acc_stderr": 0.004791890625834209,
27
- "acc_norm": 0.445628360884286,
28
- "acc_norm_stderr": 0.004960191341430243
29
- },
30
- "rte": {
31
- "acc": 0.5703971119133574,
32
- "acc_stderr": 0.02979666882912467
33
- },
34
- "winogrande": {
35
- "acc": 0.5209155485398579,
36
- "acc_stderr": 0.014040185494212952
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6247995724211651,
40
- "acc_stderr": 0.011196472580587938
41
- },
42
- "boolq": {
43
- "acc": 0.519571865443425,
44
- "acc_stderr": 0.008738352682962235
45
- },
46
- "arc_easy": {
47
- "acc": 0.5248316498316499,
48
- "acc_stderr": 0.010247123122159281,
49
- "acc_norm": 0.4831649831649832,
50
- "acc_norm_stderr": 0.010253966261288898
51
- },
52
- "arc_challenge": {
53
- "acc": 0.22440273037542663,
54
- "acc_stderr": 0.012191404938603842,
55
- "acc_norm": 0.26109215017064846,
56
- "acc_norm_stderr": 0.01283552390947385
57
- },
58
- "sciq": {
59
- "acc": 0.875,
60
- "acc_stderr": 0.010463483381956722,
61
- "acc_norm": 0.855,
62
- "acc_norm_stderr": 0.011139977517890143
63
- },
64
- "piqa": {
65
- "acc": 0.6920565832426551,
66
- "acc_stderr": 0.01077089236746368,
67
- "acc_norm": 0.6860718171926007,
68
- "acc_norm_stderr": 0.010827928134189646
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.328,0.014853842487270334,0
3
+ anli_r2,acc,0.343,0.015019206922356951,0
4
+ anli_r3,acc,0.3375,0.01365589718546366,0
5
+ arc_challenge,acc,0.23293515358361774,0.012352507042617396,0
6
+ arc_challenge,acc_norm,0.26535836177474403,0.012902554762313966,0
7
+ arc_easy,acc,0.5244107744107744,0.010247548905242265,0
8
+ arc_easy,acc_norm,0.5122053872053872,0.010256726235129018,0
9
+ boolq,acc,0.5415902140672783,0.008714749017709893,1
10
+ cb,acc,0.5178571428571429,0.06737697508644648,1
11
+ cb,f1,0.35651135005973716,,1
12
+ copa,acc,0.65,0.04793724854411019,0
13
+ hellaswag,acc,0.3602867954590719,0.004791024004587995,0
14
+ hellaswag,acc_norm,0.44523003385779725,0.00495975488205547,0
15
+ piqa,acc,0.6931447225244831,0.010760295070580366,0
16
+ piqa,acc_norm,0.6936887921653971,0.010754970032367323,0
17
+ rte,acc,0.51985559566787,0.030072723167317184,0
18
+ sciq,acc,0.889,0.009938701010583726,0
19
+ sciq,acc_norm,0.876,0.01042749887234396,0
20
+ storycloze_2016,acc,0.6296098343132015,0.011167209707294228,0
21
+ winogrande,acc,0.5327545382794001,0.014022300570434139,0
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.328,
5
- "acc_stderr": 0.014853842487270334
6
- },
7
- "anli_r2": {
8
- "acc": 0.343,
9
- "acc_stderr": 0.015019206922356951
10
- },
11
- "anli_r3": {
12
- "acc": 0.3375,
13
- "acc_stderr": 0.01365589718546366
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.35651135005973716
19
- },
20
- "copa": {
21
- "acc": 0.65,
22
- "acc_stderr": 0.04793724854411019
23
- },
24
- "hellaswag": {
25
- "acc": 0.3602867954590719,
26
- "acc_stderr": 0.004791024004587995,
27
- "acc_norm": 0.44523003385779725,
28
- "acc_norm_stderr": 0.00495975488205547
29
- },
30
- "rte": {
31
- "acc": 0.51985559566787,
32
- "acc_stderr": 0.030072723167317184
33
- },
34
- "winogrande": {
35
- "acc": 0.5327545382794001,
36
- "acc_stderr": 0.014022300570434139
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6296098343132015,
40
- "acc_stderr": 0.011167209707294228
41
- },
42
- "boolq": {
43
- "acc": 0.5415902140672783,
44
- "acc_stderr": 0.008714749017709893
45
- },
46
- "arc_easy": {
47
- "acc": 0.5244107744107744,
48
- "acc_stderr": 0.010247548905242265,
49
- "acc_norm": 0.5122053872053872,
50
- "acc_norm_stderr": 0.010256726235129018
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23293515358361774,
54
- "acc_stderr": 0.012352507042617396,
55
- "acc_norm": 0.26535836177474403,
56
- "acc_norm_stderr": 0.012902554762313966
57
- },
58
- "sciq": {
59
- "acc": 0.889,
60
- "acc_stderr": 0.009938701010583726,
61
- "acc_norm": 0.876,
62
- "acc_norm_stderr": 0.01042749887234396
63
- },
64
- "piqa": {
65
- "acc": 0.6931447225244831,
66
- "acc_stderr": 0.010760295070580366,
67
- "acc_norm": 0.6936887921653971,
68
- "acc_norm_stderr": 0.010754970032367323
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.304,0.01455320568795045,0
3
+ anli_r2,acc,0.342,0.015008706182121728,0
4
+ anli_r3,acc,0.3283333333333333,0.013562032919529027,0
5
+ arc_challenge,acc,0.23378839590443687,0.012368225378507148,0
6
+ arc_challenge,acc_norm,0.2593856655290102,0.012808273573927094,0
7
+ arc_easy,acc,0.5197811447811448,0.010251751199542723,0
8
+ arc_easy,acc_norm,0.5130471380471381,0.010256289925058441,0
9
+ boolq,acc,0.5440366972477064,0.008711071588226796,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.36726403823178017,,1
12
+ copa,acc,0.69,0.04648231987117316,0
13
+ hellaswag,acc,0.36128261302529374,0.004793904922401887,0
14
+ hellaswag,acc_norm,0.44811790479984065,0.004962846206125478,0
15
+ piqa,acc,0.7023939064200218,0.010667353792388212,0
16
+ piqa,acc_norm,0.6947769314472253,0.01074426704560648,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.889,0.009938701010583726,0
19
+ sciq,acc_norm,0.888,0.00997775303139724,0
20
+ storycloze_2016,acc,0.632816675574559,0.011147041781368654,0
21
+ winogrande,acc,0.5146014206787688,0.014046492383275835,0
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_3_lm-eval_global_step80108_2023-05-13-13-52-19_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.304,
5
- "acc_stderr": 0.01455320568795045
6
- },
7
- "anli_r2": {
8
- "acc": 0.342,
9
- "acc_stderr": 0.015008706182121728
10
- },
11
- "anli_r3": {
12
- "acc": 0.3283333333333333,
13
- "acc_stderr": 0.013562032919529027
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.36726403823178017
19
- },
20
- "copa": {
21
- "acc": 0.69,
22
- "acc_stderr": 0.04648231987117316
23
- },
24
- "hellaswag": {
25
- "acc": 0.36128261302529374,
26
- "acc_stderr": 0.004793904922401887,
27
- "acc_norm": 0.44811790479984065,
28
- "acc_norm_stderr": 0.004962846206125478
29
- },
30
- "rte": {
31
- "acc": 0.5306859205776173,
32
- "acc_stderr": 0.03003973059219781
33
- },
34
- "winogrande": {
35
- "acc": 0.5146014206787688,
36
- "acc_stderr": 0.014046492383275835
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.632816675574559,
40
- "acc_stderr": 0.011147041781368654
41
- },
42
- "boolq": {
43
- "acc": 0.5440366972477064,
44
- "acc_stderr": 0.008711071588226796
45
- },
46
- "arc_easy": {
47
- "acc": 0.5197811447811448,
48
- "acc_stderr": 0.010251751199542723,
49
- "acc_norm": 0.5130471380471381,
50
- "acc_norm_stderr": 0.010256289925058441
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23378839590443687,
54
- "acc_stderr": 0.012368225378507148,
55
- "acc_norm": 0.2593856655290102,
56
- "acc_norm_stderr": 0.012808273573927094
57
- },
58
- "sciq": {
59
- "acc": 0.889,
60
- "acc_stderr": 0.009938701010583726,
61
- "acc_norm": 0.888,
62
- "acc_norm_stderr": 0.00997775303139724
63
- },
64
- "piqa": {
65
- "acc": 0.7023939064200218,
66
- "acc_stderr": 0.010667353792388212,
67
- "acc_norm": 0.6947769314472253,
68
- "acc_norm_stderr": 0.01074426704560648
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.328,0.014853842487270334,0
3
+ anli_r2,acc,0.345,0.015039986742055231,0
4
+ anli_r3,acc,0.3308333333333333,0.013588208070708997,0
5
+ arc_challenge,acc,0.22866894197952217,0.0122728535825408,0
6
+ arc_challenge,acc_norm,0.26791808873720135,0.012942030195136421,0
7
+ arc_easy,acc,0.5378787878787878,0.010230299628864794,0
8
+ arc_easy,acc_norm,0.5260942760942761,0.01024580199024005,0
9
+ boolq,acc,0.5266055045871559,0.008732665775847753,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.3335679099225897,,1
12
+ copa,acc,0.68,0.04688261722621504,0
13
+ hellaswag,acc,0.3594901414060944,0.004788703173474755,0
14
+ hellaswag,acc_norm,0.4464250149372635,0.004961054589573477,0
15
+ piqa,acc,0.6974972796517954,0.01071719969808388,0
16
+ piqa,acc_norm,0.6926006528835691,0.010765602506939064,0
17
+ rte,acc,0.51985559566787,0.030072723167317184,0
18
+ sciq,acc,0.903,0.00936368937324812,0
19
+ sciq,acc_norm,0.886,0.010055103435823328,0
20
+ storycloze_2016,acc,0.6349545697487974,0.011133301783914874,0
21
+ winogrande,acc,0.5256511444356748,0.014033980956108558,0
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_4_lm-eval_global_step80108_2023-05-13-13-52-19_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.328,
5
- "acc_stderr": 0.014853842487270334
6
- },
7
- "anli_r2": {
8
- "acc": 0.345,
9
- "acc_stderr": 0.015039986742055231
10
- },
11
- "anli_r3": {
12
- "acc": 0.3308333333333333,
13
- "acc_stderr": 0.013588208070708997
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.3335679099225897
19
- },
20
- "copa": {
21
- "acc": 0.68,
22
- "acc_stderr": 0.04688261722621504
23
- },
24
- "hellaswag": {
25
- "acc": 0.3594901414060944,
26
- "acc_stderr": 0.004788703173474755,
27
- "acc_norm": 0.4464250149372635,
28
- "acc_norm_stderr": 0.004961054589573477
29
- },
30
- "rte": {
31
- "acc": 0.51985559566787,
32
- "acc_stderr": 0.030072723167317184
33
- },
34
- "winogrande": {
35
- "acc": 0.5256511444356748,
36
- "acc_stderr": 0.014033980956108558
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6349545697487974,
40
- "acc_stderr": 0.011133301783914874
41
- },
42
- "boolq": {
43
- "acc": 0.5266055045871559,
44
- "acc_stderr": 0.008732665775847753
45
- },
46
- "arc_easy": {
47
- "acc": 0.5378787878787878,
48
- "acc_stderr": 0.010230299628864794,
49
- "acc_norm": 0.5260942760942761,
50
- "acc_norm_stderr": 0.01024580199024005
51
- },
52
- "arc_challenge": {
53
- "acc": 0.22866894197952217,
54
- "acc_stderr": 0.0122728535825408,
55
- "acc_norm": 0.26791808873720135,
56
- "acc_norm_stderr": 0.012942030195136421
57
- },
58
- "sciq": {
59
- "acc": 0.903,
60
- "acc_stderr": 0.00936368937324812,
61
- "acc_norm": 0.886,
62
- "acc_norm_stderr": 0.010055103435823328
63
- },
64
- "piqa": {
65
- "acc": 0.6974972796517954,
66
- "acc_stderr": 0.01071719969808388,
67
- "acc_norm": 0.6926006528835691,
68
- "acc_norm_stderr": 0.010765602506939064
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795027,0
3
+ anli_r2,acc,0.343,0.015019206922356953,0
4
+ anli_r3,acc,0.3333333333333333,0.013613950010225608,0
5
+ arc_challenge,acc,0.23378839590443687,0.01236822537850714,0
6
+ arc_challenge,acc_norm,0.2525597269624573,0.012696728980207704,0
7
+ arc_easy,acc,0.5324074074074074,0.0102382103688019,0
8
+ arc_easy,acc_norm,0.5294612794612794,0.010241957728409684,0
9
+ boolq,acc,0.5345565749235474,0.00872414404060481,1
10
+ cb,acc,0.5892857142857143,0.0663363415035954,1
11
+ cb,f1,0.40764635603345284,,1
12
+ copa,acc,0.69,0.04648231987117316,0
13
+ hellaswag,acc,0.3615813582951603,0.004794764843685283,0
14
+ hellaswag,acc_norm,0.4477195777733519,0.0049624298819040255,0
15
+ piqa,acc,0.6887921653971708,0.010802263878045842,0
16
+ piqa,acc_norm,0.6882480957562568,0.01080743142487367,0
17
+ rte,acc,0.5379061371841155,0.030009848912529117,0
18
+ sciq,acc,0.903,0.009363689373248111,0
19
+ sciq,acc_norm,0.897,0.009616833339695796,0
20
+ storycloze_2016,acc,0.6306787814003206,0.011160545865067163,0
21
+ winogrande,acc,0.531965272296764,0.01402373922116638,0
4b284b84b10c4pyseed1/evaluation/rankeval/4b284b84b10c4pyseed1_5_lm-eval_global_step80108_2023-05-13-13-52-19_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795027
6
- },
7
- "anli_r2": {
8
- "acc": 0.343,
9
- "acc_stderr": 0.015019206922356953
10
- },
11
- "anli_r3": {
12
- "acc": 0.3333333333333333,
13
- "acc_stderr": 0.013613950010225608
14
- },
15
- "cb": {
16
- "acc": 0.5892857142857143,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.40764635603345284
19
- },
20
- "copa": {
21
- "acc": 0.69,
22
- "acc_stderr": 0.04648231987117316
23
- },
24
- "hellaswag": {
25
- "acc": 0.3615813582951603,
26
- "acc_stderr": 0.004794764843685283,
27
- "acc_norm": 0.4477195777733519,
28
- "acc_norm_stderr": 0.0049624298819040255
29
- },
30
- "rte": {
31
- "acc": 0.5379061371841155,
32
- "acc_stderr": 0.030009848912529117
33
- },
34
- "winogrande": {
35
- "acc": 0.531965272296764,
36
- "acc_stderr": 0.01402373922116638
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6306787814003206,
40
- "acc_stderr": 0.011160545865067163
41
- },
42
- "boolq": {
43
- "acc": 0.5345565749235474,
44
- "acc_stderr": 0.00872414404060481
45
- },
46
- "arc_easy": {
47
- "acc": 0.5324074074074074,
48
- "acc_stderr": 0.0102382103688019,
49
- "acc_norm": 0.5294612794612794,
50
- "acc_norm_stderr": 0.010241957728409684
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23378839590443687,
54
- "acc_stderr": 0.01236822537850714,
55
- "acc_norm": 0.2525597269624573,
56
- "acc_norm_stderr": 0.012696728980207704
57
- },
58
- "sciq": {
59
- "acc": 0.903,
60
- "acc_stderr": 0.009363689373248111,
61
- "acc_norm": 0.897,
62
- "acc_norm_stderr": 0.009616833339695796
63
- },
64
- "piqa": {
65
- "acc": 0.6887921653971708,
66
- "acc_stderr": 0.010802263878045842,
67
- "acc_norm": 0.6882480957562568,
68
- "acc_norm_stderr": 0.01080743142487367
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed2/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.030990000130207643
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.030990000130207643
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.25856091966979955
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.25856091966979955
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.27706127487413984
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.27706127487413984
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2832952296859798
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2832952296859798
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2852035973993449
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2852035973993449
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2874214398128714
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2874214398128714
14
+ e2e_nlg_cleaned,5,average,multiple,0.23708874359539053
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.0387268948293068
16
+ gem_xsum,0,median,rouge2_fmeasure,0.0387268948293068
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.035920449607511167
18
+ gem_xsum,1,median,rouge2_fmeasure,0.035920449607511167
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.040860035399920204
20
+ gem_xsum,2,median,rouge2_fmeasure,0.040860035399920204
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03981086125537371
22
+ gem_xsum,3,median,rouge2_fmeasure,0.03981086125537371
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010826553835341136
24
+ gem_xsum,4,median,rouge2_fmeasure,0.010826553835341136
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0
27
+ gem_xsum,5,average,multiple,0.027690799154575504
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04691214006430871
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.04691214006430871
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08974078313646336
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08974078313646336
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09801431861556455
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.09801431861556455
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.1000085236049077
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.1000085236049077
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10873373976256799
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.10873373976256799
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11064119365310371
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.11064119365310371
40
+ web_nlg_en,5,average,multiple,0.092341783139486
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.047249213428620485
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.047249213428620485
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04845379131309357
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.04845379131309357
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.059137197237102555
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.059137197237102555
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04850106238325285
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04850106238325285
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01750958025608534
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01750958025608534
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0029624685534656845
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0029624685534656845
53
+ wiki_lingua_en,5,average,multiple,0.03730221886193675
4b284b84b10c4pyseed2/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.34839261627209633, "bleu_stderr": 0.04189138517106381, "rouge1_fmeasure": 0.102270263379348, "rouge1_fmeasure_stderr": 0.0019923734949061694, "rouge1_precision": 0.06619671064806756, "rouge1_precision_stderr": 0.0014468929573685865, "rouge1_recall": 0.3037815234613137, "rouge1_recall_stderr": 0.004972987748656106, "rouge2_fmeasure": 0.04691214006430871, "rouge2_fmeasure_stderr": 0.0012472210585906468, "rouge2_precision": 0.030285785799413757, "rouge2_precision_stderr": 0.0008864788604965375, "rouge2_recall": 0.14186114184511642, "rouge2_recall_stderr": 0.0033821444908255366, "rougeL_fmeasure": 0.0975410547748716, "rougeL_fmeasure_stderr": 0.001828479249250883, "rougeL_precision": 0.06287801959879465, "rougeL_precision_stderr": 0.0013050273732277727, "rougeL_recall": 0.29256720244506246, "rougeL_recall_stderr": 0.00480919187309681, "rougeLsum_fmeasure": 0.0962863879535457, "rougeLsum_fmeasure_stderr": 0.0018605063291573778, "rougeLsum_precision": 0.06237581474922931, "rougeLsum_precision_stderr": 0.0013513992782218862, "rougeLsum_recall": 0.284874621954375, "rougeLsum_recall_stderr": 0.004610992963522042}}, "1": {"PALM_prompt": {"bleu": 0.6899337084028382, "bleu_stderr": 0.042333949728137366, "rouge1_fmeasure": 0.17354486269770866, "rouge1_fmeasure_stderr": 0.004186663411506336, "rouge1_precision": 0.15595538943212903, "rouge1_precision_stderr": 0.004906217502987317, "rouge1_recall": 0.3153777663242188, "rouge1_recall_stderr": 0.005218202704487343, "rouge2_fmeasure": 0.08974078313646336, "rouge2_fmeasure_stderr": 0.0028331491525215896, "rouge2_precision": 0.08245214895202854, "rouge2_precision_stderr": 0.0033601915955220707, "rouge2_recall": 0.1643456005403218, "rouge2_recall_stderr": 0.003778189712341191, "rougeL_fmeasure": 0.1548343704963851, "rougeL_fmeasure_stderr": 0.003522461923651231, "rougeL_precision": 0.13758080643383186, "rougeL_precision_stderr": 0.00423481669815462, "rougeL_recall": 0.29135180092738344, "rougeL_recall_stderr": 0.004685784946148678, "rougeLsum_fmeasure": 0.15852424891152261, "rougeLsum_fmeasure_stderr": 0.0036190101660936473, "rougeLsum_precision": 0.14140870737065203, "rougeLsum_precision_stderr": 0.004346859049662128, "rougeLsum_recall": 0.2955162155786936, "rougeLsum_recall_stderr": 0.0047365295873086025}}, "2": {"PALM_prompt": {"bleu": 0.7649798500320898, "bleu_stderr": 0.04247389832870208, "rouge1_fmeasure": 0.18540947919121153, "rouge1_fmeasure_stderr": 0.004303005806856362, "rouge1_precision": 0.16338039225133683, "rouge1_precision_stderr": 0.005096447031211365, "rouge1_recall": 0.34580305839943676, "rouge1_recall_stderr": 0.004929662488040828, "rouge2_fmeasure": 0.09801431861556455, "rouge2_fmeasure_stderr": 0.0029422952865272435, "rouge2_precision": 0.08801451623818388, "rouge2_precision_stderr": 0.0033843266607527923, "rouge2_recall": 0.1837173470500723, "rouge2_recall_stderr": 0.003764945288197601, "rougeL_fmeasure": 0.16567461495228394, "rougeL_fmeasure_stderr": 0.003634221676973485, "rougeL_precision": 0.14396563550384628, "rougeL_precision_stderr": 0.004356175681556258, "rougeL_recall": 0.3197218939310831, "rougeL_recall_stderr": 0.004430246312387707, "rougeLsum_fmeasure": 0.16974882357275767, "rougeLsum_fmeasure_stderr": 0.0037700607731101423, "rougeLsum_precision": 0.14812300541237652, "rougeLsum_precision_stderr": 0.004515918270100535, "rougeLsum_recall": 0.3246513438443792, "rougeLsum_recall_stderr": 0.00453035867512444}}, "3": {"PALM_prompt": {"bleu": 0.7880554947757874, "bleu_stderr": 0.02412583727629669, "rouge1_fmeasure": 0.19107708104932275, "rouge1_fmeasure_stderr": 0.004264264650242928, "rouge1_precision": 0.16899861478290354, "rouge1_precision_stderr": 0.0050752235139585565, "rouge1_recall": 0.353506846868985, "rouge1_recall_stderr": 0.004875035511133627, "rouge2_fmeasure": 0.1000085236049077, "rouge2_fmeasure_stderr": 0.002871935501554795, "rouge2_precision": 0.09086098296509536, "rouge2_precision_stderr": 0.0034139861745634864, "rouge2_recall": 0.18673977142734455, "rouge2_recall_stderr": 0.0036535351316784362, "rougeL_fmeasure": 0.1695071279231006, "rougeL_fmeasure_stderr": 0.003530055608292118, "rougeL_precision": 0.14753232253831605, "rougeL_precision_stderr": 0.004279960594179838, "rougeL_recall": 0.32609579733977945, "rougeL_recall_stderr": 0.004358027862174455, "rougeLsum_fmeasure": 0.17457412379526094, "rougeLsum_fmeasure_stderr": 0.003688514412417398, "rougeLsum_precision": 0.15267722208246193, "rougeLsum_precision_stderr": 0.004457966644732954, "rougeLsum_recall": 0.3323753348969973, "rougeLsum_recall_stderr": 0.004452797094353982}}, "4": {"PALM_prompt": {"bleu": 0.9540230790207443, "bleu_stderr": 0.06250242513533112, "rouge1_fmeasure": 0.20299346028455165, "rouge1_fmeasure_stderr": 0.004364718616817714, "rouge1_precision": 0.18500238861274002, "rouge1_precision_stderr": 0.005456570594699929, "rouge1_recall": 0.3715295810831499, "rouge1_recall_stderr": 0.004821842315560112, "rouge2_fmeasure": 0.10873373976256799, "rouge2_fmeasure_stderr": 0.0029913268937647374, "rouge2_precision": 0.1023629193890621, "rouge2_precision_stderr": 0.0037265074201683706, "rouge2_recall": 0.20023154074752458, "rouge2_recall_stderr": 0.0037485985411402317, "rougeL_fmeasure": 0.17886084919713702, "rougeL_fmeasure_stderr": 0.0035998894139795488, "rougeL_precision": 0.16046157893674384, "rougeL_precision_stderr": 0.004599576804749886, "rougeL_recall": 0.3410601418631107, "rougeL_recall_stderr": 0.004320929090573991, "rougeLsum_fmeasure": 0.18484867505368305, "rougeLsum_fmeasure_stderr": 0.0037882991024945, "rougeLsum_precision": 0.16708618235341705, "rougeLsum_precision_stderr": 0.004847285030413251, "rougeLsum_recall": 0.34808474406483014, "rougeLsum_recall_stderr": 0.00440716128207026}}, "5": {"PALM_prompt": {"bleu": 0.9647815511477145, "bleu_stderr": 0.0525657878771483, "rouge1_fmeasure": 0.20572563969030314, "rouge1_fmeasure_stderr": 0.004359435424168292, "rouge1_precision": 0.18871959903075744, "rouge1_precision_stderr": 0.00548278099566492, "rouge1_recall": 0.37045274798114114, "rouge1_recall_stderr": 0.004835782516299729, "rouge2_fmeasure": 0.11064119365310371, "rouge2_fmeasure_stderr": 0.0030072964859697167, "rouge2_precision": 0.10469670334588807, "rouge2_precision_stderr": 0.0037360412490884593, "rouge2_recall": 0.19913448024040037, "rouge2_recall_stderr": 0.003827076408512348, "rougeL_fmeasure": 0.18154067135527413, "rougeL_fmeasure_stderr": 0.0036190624757514284, "rougeL_precision": 0.16415536523644794, "rougeL_precision_stderr": 0.004649533532960034, "rougeL_recall": 0.33896991159333506, "rougeL_recall_stderr": 0.004314962296089556, "rougeLsum_fmeasure": 0.18803822018280075, "rougeLsum_fmeasure_stderr": 0.0038049604768632885, "rougeLsum_precision": 0.17109281745992913, "rougeLsum_precision_stderr": 0.004878084218108864, "rougeLsum_recall": 0.3470496991360511, "rougeLsum_recall_stderr": 0.0044391141331917225}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.4454034467789323, "bleu_stderr": 0.048681643300429914, "rouge1_fmeasure": 0.19834678342391596, "rouge1_fmeasure_stderr": 0.0019953866660679387, "rouge1_precision": 0.17670294058047709, "rouge1_precision_stderr": 0.002152682535692807, "rouge1_recall": 0.2730966241528598, "rouge1_recall_stderr": 0.002876978115765481, "rouge2_fmeasure": 0.047249213428620485, "rouge2_fmeasure_stderr": 0.001024842198849522, "rouge2_precision": 0.042089495973895955, "rouge2_precision_stderr": 0.0009706252891873274, "rouge2_recall": 0.06719414723840238, "rouge2_recall_stderr": 0.0016691480378475004, "rougeL_fmeasure": 0.1537451423118469, "rougeL_fmeasure_stderr": 0.0014118382552893254, "rougeL_precision": 0.13551080331326906, "rougeL_precision_stderr": 0.0014970233438145126, "rougeL_recall": 0.21623247174319551, "rougeL_recall_stderr": 0.002317064002184152, "rougeLsum_fmeasure": 0.18429256720842216, "rougeLsum_fmeasure_stderr": 0.0018711957404076596, "rougeLsum_precision": 0.16397698442357508, "rougeLsum_precision_stderr": 0.0020098740121521535, "rougeLsum_recall": 0.2543863882031728, "rougeLsum_recall_stderr": 0.0027190494143108086}}, "1": {"tldr_en": {"bleu": 2.95465203533547, "bleu_stderr": 0.11093001827696498, "rouge1_fmeasure": 0.1963876582525413, "rouge1_fmeasure_stderr": 0.0021681662634387322, "rouge1_precision": 0.27867699937334955, "rouge1_precision_stderr": 0.004063680780676404, "rouge1_recall": 0.2177221365380748, "rouge1_recall_stderr": 0.002935293168894932, "rouge2_fmeasure": 0.04845379131309357, "rouge2_fmeasure_stderr": 0.0012281835451445104, "rouge2_precision": 0.0786713577518332, "rouge2_precision_stderr": 0.002618917158531456, "rouge2_recall": 0.05307524806625627, "rouge2_recall_stderr": 0.0014833724805905315, "rougeL_fmeasure": 0.15027424005088746, "rougeL_fmeasure_stderr": 0.0016308280694806295, "rougeL_precision": 0.22083081535229637, "rougeL_precision_stderr": 0.0035092648161666627, "rougeL_recall": 0.16578892009178958, "rougeL_recall_stderr": 0.002233033614624085, "rougeLsum_fmeasure": 0.18432075461675484, "rougeLsum_fmeasure_stderr": 0.0020327320535339717, "rougeLsum_precision": 0.2625014140320661, "rougeLsum_precision_stderr": 0.003898170817322575, "rougeLsum_recall": 0.2045373118466716, "rougeLsum_recall_stderr": 0.0027628376951092893}}, "2": {"tldr_en": {"bleu": 3.6588213420433138, "bleu_stderr": 0.0869578318671135, "rouge1_fmeasure": 0.217579902419, "rouge1_fmeasure_stderr": 0.0021937168466157598, "rouge1_precision": 0.3089441350943573, "rouge1_precision_stderr": 0.004102889940813672, "rouge1_recall": 0.24074943276751445, "rouge1_recall_stderr": 0.003032127470747412, "rouge2_fmeasure": 0.059137197237102555, "rouge2_fmeasure_stderr": 0.001324739329998592, "rouge2_precision": 0.09455554262896264, "rouge2_precision_stderr": 0.0027438614759175994, "rouge2_recall": 0.06467333871804612, "rouge2_recall_stderr": 0.0016038407383362659, "rougeL_fmeasure": 0.1662109540653676, "rougeL_fmeasure_stderr": 0.0016717193324765926, "rougeL_precision": 0.24465844035612289, "rougeL_precision_stderr": 0.0035853339617273367, "rougeL_recall": 0.1831916992404158, "rougeL_recall_stderr": 0.002344577075560811, "rougeLsum_fmeasure": 0.20438491432913247, "rougeLsum_fmeasure_stderr": 0.002069409366853919, "rougeLsum_precision": 0.29211358606681354, "rougeLsum_precision_stderr": 0.003976453867941921, "rougeLsum_recall": 0.22585792605316923, "rougeLsum_recall_stderr": 0.0028508176182757737}}, "3": {"tldr_en": {"bleu": 2.665998479871419, "bleu_stderr": 0.08994134689726999, "rouge1_fmeasure": 0.1802175746907868, "rouge1_fmeasure_stderr": 0.002434167660435035, "rouge1_precision": 0.27297871819584885, "rouge1_precision_stderr": 0.004413443965010257, "rouge1_recall": 0.19387830893477428, "rouge1_recall_stderr": 0.003190515995935855, "rouge2_fmeasure": 0.04850106238325285, "rouge2_fmeasure_stderr": 0.0012362829727260978, "rouge2_precision": 0.0832544835003457, "rouge2_precision_stderr": 0.002719797818406377, "rouge2_recall": 0.05192450103297081, "rouge2_recall_stderr": 0.001491001725745325, "rougeL_fmeasure": 0.1390362065262866, "rougeL_fmeasure_stderr": 0.0018806047212560535, "rougeL_precision": 0.21946768960635085, "rougeL_precision_stderr": 0.003849319818918071, "rougeL_recall": 0.1482709055627487, "rougeL_recall_stderr": 0.0024666015852836337, "rougeLsum_fmeasure": 0.17039501412301605, "rougeLsum_fmeasure_stderr": 0.0023041801877448096, "rougeLsum_precision": 0.2599247216074377, "rougeLsum_precision_stderr": 0.00427549980272435, "rougeLsum_recall": 0.1828585094689675, "rougeLsum_recall_stderr": 0.0030087470117644956}}, "4": {"tldr_en": {"bleu": 0.0725357298585346, "bleu_stderr": 0.013930651719942453, "rouge1_fmeasure": 0.061652879562912176, "rouge1_fmeasure_stderr": 0.0021749036321190972, "rouge1_precision": 0.09868618737266148, "rouge1_precision_stderr": 0.0037147280435652465, "rouge1_recall": 0.06420196511318366, "rouge1_recall_stderr": 0.002494948045980099, "rouge2_fmeasure": 0.01750958025608534, "rouge2_fmeasure_stderr": 0.0009320476953682318, "rouge2_precision": 0.031679178192434765, "rouge2_precision_stderr": 0.0019758967341143878, "rouge2_recall": 0.018171992984595604, "rouge2_recall_stderr": 0.0010876943383536489, "rougeL_fmeasure": 0.048802657588927226, "rougeL_fmeasure_stderr": 0.0017391753956356012, "rougeL_precision": 0.08176253884928666, "rougeL_precision_stderr": 0.003228858979972004, "rougeL_recall": 0.049857351811447195, "rougeL_recall_stderr": 0.001937816613452188, "rougeLsum_fmeasure": 0.058065832418471776, "rougeLsum_fmeasure_stderr": 0.00205281864089678, "rougeLsum_precision": 0.09397209096691526, "rougeLsum_precision_stderr": 0.003582863464186521, "rougeLsum_recall": 0.0602108110622043, "rougeLsum_recall_stderr": 0.002336372909703676}}, "5": {"tldr_en": {"bleu": 1.9616074330528383e-14, "bleu_stderr": 5.195252247364212e-13, "rouge1_fmeasure": 0.010537948590324383, "rouge1_fmeasure_stderr": 0.0010135414744074222, "rouge1_precision": 0.01674924771170946, "rouge1_precision_stderr": 0.0016964353612247622, "rouge1_recall": 0.010957907078967278, "rouge1_recall_stderr": 0.0011627090708120852, "rouge2_fmeasure": 0.0029624685534656845, "rouge2_fmeasure_stderr": 0.00040656767024910826, "rouge2_precision": 0.005613265937833622, "rouge2_precision_stderr": 0.00094409034775436, "rouge2_recall": 0.0033065082101974338, "rouge2_recall_stderr": 0.0005596870642007684, "rougeL_fmeasure": 0.00820030019915529, "rougeL_fmeasure_stderr": 0.0007990026813063043, "rougeL_precision": 0.01371230630574887, "rougeL_precision_stderr": 0.0014803854048282376, "rougeL_recall": 0.008519978078912355, "rougeL_recall_stderr": 0.000933206766564994, "rougeLsum_fmeasure": 0.00969839069256146, "rougeLsum_fmeasure_stderr": 0.0009409294523059373, "rougeLsum_precision": 0.015513502386204657, "rougeLsum_precision_stderr": 0.001601969455168809, "rougeLsum_recall": 0.010162241216887946, "rougeLsum_recall_stderr": 0.0010991397070259491}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.7850824101218064, "bleu_stderr": 0.023216552579721703, "rouge1_fmeasure": 0.15066823598630474, "rouge1_fmeasure_stderr": 0.0022981683960765003, "rouge1_precision": 0.15977584493328995, "rouge1_precision_stderr": 0.0022859347158505927, "rouge1_recall": 0.1539484379028086, "rouge1_recall_stderr": 0.002660543634111301, "rouge2_fmeasure": 0.030990000130207643, "rouge2_fmeasure_stderr": 0.0010337793945203646, "rouge2_precision": 0.03271473035538863, "rouge2_precision_stderr": 0.0010436972512763633, "rouge2_recall": 0.03212980195974011, "rouge2_recall_stderr": 0.001126826812738036, "rougeL_fmeasure": 0.11288996737893989, "rougeL_fmeasure_stderr": 0.0015779083138828134, "rougeL_precision": 0.12064803750541192, "rougeL_precision_stderr": 0.0016007639526862958, "rougeL_recall": 0.11445786166271853, "rougeL_recall_stderr": 0.0018048160007272894, "rougeLsum_fmeasure": 0.14024627335824932, "rougeLsum_fmeasure_stderr": 0.002114616288748232, "rougeLsum_precision": 0.14922484107355077, "rougeLsum_precision_stderr": 0.002112996489926639, "rougeLsum_recall": 0.14293364633930528, "rougeLsum_recall_stderr": 0.0024531822698296694}}, "1": {"generate_text_restaurant": {"bleu": 14.205944116663549, "bleu_stderr": 0.19660692794590715, "rouge1_fmeasure": 0.5189271220066034, "rouge1_fmeasure_stderr": 0.002418580107930576, "rouge1_precision": 0.629117579178031, "rouge1_precision_stderr": 0.003151090759031406, "rouge1_recall": 0.4795403225988654, "rouge1_recall_stderr": 0.0031133800859191145, "rouge2_fmeasure": 0.25856091966979955, "rouge2_fmeasure_stderr": 0.0021853354866567033, "rouge2_precision": 0.3188120045749173, "rouge2_precision_stderr": 0.0028332187634174584, "rouge2_recall": 0.237926246408337, "rouge2_recall_stderr": 0.002308197357419717, "rougeL_fmeasure": 0.37557292388861946, "rougeL_fmeasure_stderr": 0.002225179308696749, "rougeL_precision": 0.4597945468948585, "rougeL_precision_stderr": 0.003115684011688492, "rougeL_recall": 0.3452962091320848, "rougeL_recall_stderr": 0.002523106279855593, "rougeLsum_fmeasure": 0.42601080418001464, "rougeLsum_fmeasure_stderr": 0.002443810293877186, "rougeLsum_precision": 0.5186469438087239, "rougeLsum_precision_stderr": 0.0032543870460336514, "rougeLsum_recall": 0.392689079714104, "rougeLsum_recall_stderr": 0.002843610143245334}}, "2": {"generate_text_restaurant": {"bleu": 16.071111072101633, "bleu_stderr": 0.2309824996590655, "rouge1_fmeasure": 0.5425841397857145, "rouge1_fmeasure_stderr": 0.002320322812438713, "rouge1_precision": 0.6296764082676305, "rouge1_precision_stderr": 0.0030193970190650846, "rouge1_recall": 0.5107223124747078, "rouge1_recall_stderr": 0.002988235629344117, "rouge2_fmeasure": 0.27706127487413984, "rouge2_fmeasure_stderr": 0.002211454462264459, "rouge2_precision": 0.3251232789672217, "rouge2_precision_stderr": 0.002759350219866227, "rouge2_recall": 0.2602584858742124, "rouge2_recall_stderr": 0.0023448744796564446, "rougeL_fmeasure": 0.3922571711440134, "rougeL_fmeasure_stderr": 0.0022435462402305216, "rougeL_precision": 0.4578033945398497, "rougeL_precision_stderr": 0.0029467611947813104, "rougeL_recall": 0.3679450110714652, "rougeL_recall_stderr": 0.0025363954803661686, "rougeLsum_fmeasure": 0.44965466525988723, "rougeLsum_fmeasure_stderr": 0.002432527507187589, "rougeLsum_precision": 0.5229069999116139, "rougeLsum_precision_stderr": 0.0031160482350125096, "rougeLsum_recall": 0.42246062614896984, "rougeLsum_recall_stderr": 0.002814320660639173}}, "3": {"generate_text_restaurant": {"bleu": 16.661820832282935, "bleu_stderr": 0.13584464825094733, "rouge1_fmeasure": 0.5468259622064128, "rouge1_fmeasure_stderr": 0.0023368791133733077, "rouge1_precision": 0.6283682086504476, "rouge1_precision_stderr": 0.003027176846837973, "rouge1_recall": 0.5163584155986533, "rouge1_recall_stderr": 0.0029400460301328033, "rouge2_fmeasure": 0.2832952296859798, "rouge2_fmeasure_stderr": 0.002276085768986989, "rouge2_precision": 0.3285227127217099, "rouge2_precision_stderr": 0.002755658404708219, "rouge2_recall": 0.2671750175948068, "rouge2_recall_stderr": 0.0024177007025591667, "rougeL_fmeasure": 0.39607703174222275, "rougeL_fmeasure_stderr": 0.00227620179746131, "rougeL_precision": 0.45680275725143005, "rougeL_precision_stderr": 0.0029287472273553197, "rougeL_recall": 0.3733889530706762, "rougeL_recall_stderr": 0.0025618954567412744, "rougeLsum_fmeasure": 0.45624372790962664, "rougeLsum_fmeasure_stderr": 0.002469472910726116, "rougeLsum_precision": 0.524885452033707, "rougeLsum_precision_stderr": 0.0031174665098531174, "rougeLsum_recall": 0.4304117335660357, "rougeLsum_recall_stderr": 0.0028265389893181872}}, "4": {"generate_text_restaurant": {"bleu": 16.942102975283234, "bleu_stderr": 0.2084981633485294, "rouge1_fmeasure": 0.5483049560940431, "rouge1_fmeasure_stderr": 0.0023136552508774126, "rouge1_precision": 0.6312614007447952, "rouge1_precision_stderr": 0.0030139804215310135, "rouge1_recall": 0.5164328539609403, "rouge1_recall_stderr": 0.0029184430682994886, "rouge2_fmeasure": 0.2852035973993449, "rouge2_fmeasure_stderr": 0.002275074158075912, "rouge2_precision": 0.33080708293449046, "rouge2_precision_stderr": 0.0027488759000423722, "rouge2_recall": 0.268484119351932, "rouge2_recall_stderr": 0.0024140615360160383, "rougeL_fmeasure": 0.39845530783204897, "rougeL_fmeasure_stderr": 0.0022385569603315335, "rougeL_precision": 0.4599303108022324, "rougeL_precision_stderr": 0.0028723372602033163, "rougeL_recall": 0.374903094367329, "rougeL_recall_stderr": 0.002550115376287096, "rougeLsum_fmeasure": 0.45950622151366366, "rougeLsum_fmeasure_stderr": 0.002480687521893093, "rougeLsum_precision": 0.5288947945367807, "rougeLsum_precision_stderr": 0.0031071223574836125, "rougeLsum_recall": 0.43282798313690257, "rougeLsum_recall_stderr": 0.0028550125485603335}}, "5": {"generate_text_restaurant": {"bleu": 17.000541841925337, "bleu_stderr": 0.14715813248977833, "rouge1_fmeasure": 0.5509023290461471, "rouge1_fmeasure_stderr": 0.002273014256699154, "rouge1_precision": 0.6357874949680151, "rouge1_precision_stderr": 0.0030539024570188913, "rouge1_recall": 0.5175444131455442, "rouge1_recall_stderr": 0.002851597583009859, "rouge2_fmeasure": 0.2874214398128714, "rouge2_fmeasure_stderr": 0.002247772455114546, "rouge2_precision": 0.3351646545228709, "rouge2_precision_stderr": 0.002772544496609677, "rouge2_recall": 0.26928603517530647, "rouge2_recall_stderr": 0.0023598376837782593, "rougeL_fmeasure": 0.4024933039267246, "rougeL_fmeasure_stderr": 0.0022578799277417154, "rougeL_precision": 0.46622379065467295, "rougeL_precision_stderr": 0.0029578940902582024, "rougeL_recall": 0.3773597520223647, "rougeL_recall_stderr": 0.0025234920814772013, "rougeLsum_fmeasure": 0.46375340254506775, "rougeLsum_fmeasure_stderr": 0.0024370043326962243, "rougeLsum_precision": 0.5353285504287588, "rougeLsum_precision_stderr": 0.0031267884082468948, "rougeLsum_recall": 0.43552925393049774, "rougeLsum_recall_stderr": 0.0027896003850241997}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.3666808564694861, "bleu_stderr": 0.03712060415164849, "rouge1_fmeasure": 0.19751118294202907, "rouge1_fmeasure_stderr": 0.0024021392095633014, "rouge1_precision": 0.1415584405102576, "rouge1_precision_stderr": 0.0018342220903453939, "rouge1_recall": 0.3423471837012039, "rouge1_recall_stderr": 0.004049885897676762, "rouge2_fmeasure": 0.0387268948293068, "rouge2_fmeasure_stderr": 0.0013216086839954264, "rouge2_precision": 0.027603541246792315, "rouge2_precision_stderr": 0.0009561551319314068, "rouge2_recall": 0.06858319322142874, "rouge2_recall_stderr": 0.002371005302860338, "rougeL_fmeasure": 0.14458151036645295, "rougeL_fmeasure_stderr": 0.001686848606843291, "rougeL_precision": 0.10342088038536323, "rougeL_precision_stderr": 0.001290711690896704, "rougeL_recall": 0.25266361639119705, "rougeL_recall_stderr": 0.00301407641678503, "rougeLsum_fmeasure": 0.15431152112459381, "rougeLsum_fmeasure_stderr": 0.001959929690278966, "rougeLsum_precision": 0.11034350121620272, "rougeLsum_precision_stderr": 0.0014752788971248753, "rougeLsum_recall": 0.2693856262003054, "rougeLsum_recall_stderr": 0.0034626798052445965}}, "1": {"article_DOC_summary": {"bleu": 1.540235314246414, "bleu_stderr": 0.1314459967628547, "rouge1_fmeasure": 0.1928079155440598, "rouge1_fmeasure_stderr": 0.0029028365384471786, "rouge1_precision": 0.1842422097525459, "rouge1_precision_stderr": 0.0035915109505606675, "rouge1_recall": 0.24758207357013493, "rouge1_recall_stderr": 0.0038175154991861764, "rouge2_fmeasure": 0.035920449607511167, "rouge2_fmeasure_stderr": 0.0016275383746289565, "rouge2_precision": 0.0350810866758887, "rouge2_precision_stderr": 0.0018430115089155975, "rouge2_recall": 0.045860090162403284, "rouge2_recall_stderr": 0.0020289388644285935, "rougeL_fmeasure": 0.14610955496991848, "rougeL_fmeasure_stderr": 0.0021852905097776766, "rougeL_precision": 0.13907841269107943, "rougeL_precision_stderr": 0.002731267156479104, "rougeL_recall": 0.18969373624313668, "rougeL_recall_stderr": 0.002976822276402352, "rougeLsum_fmeasure": 0.1483176659092092, "rougeLsum_fmeasure_stderr": 0.0022504198074081118, "rougeLsum_precision": 0.14082105930643332, "rougeLsum_precision_stderr": 0.0027540898674911435, "rougeLsum_recall": 0.19327165768740787, "rougeLsum_recall_stderr": 0.0031712008624335663}}, "2": {"article_DOC_summary": {"bleu": 1.741793111916229, "bleu_stderr": 0.10363017877588075, "rouge1_fmeasure": 0.206746273779149, "rouge1_fmeasure_stderr": 0.0030879518807743977, "rouge1_precision": 0.20230524620768114, "rouge1_precision_stderr": 0.0038441661424815665, "rouge1_recall": 0.25331001020907545, "rouge1_recall_stderr": 0.0037632181020687654, "rouge2_fmeasure": 0.040860035399920204, "rouge2_fmeasure_stderr": 0.0018111215268303762, "rouge2_precision": 0.04149596537792, "rouge2_precision_stderr": 0.0021203757677911335, "rouge2_recall": 0.04828025200420694, "rouge2_recall_stderr": 0.002004013267225401, "rougeL_fmeasure": 0.15710481533718787, "rougeL_fmeasure_stderr": 0.002385044461084046, "rougeL_precision": 0.153811814062988, "rougeL_precision_stderr": 0.0030255266080624807, "rougeL_recall": 0.19308341637681664, "rougeL_recall_stderr": 0.0028697236806549753, "rougeLsum_fmeasure": 0.16020234890977297, "rougeLsum_fmeasure_stderr": 0.00243602987796027, "rougeLsum_precision": 0.15612887410327278, "rougeLsum_precision_stderr": 0.0030266026245471442, "rougeLsum_recall": 0.198203547839264, "rougeLsum_recall_stderr": 0.0030935390563720244}}, "3": {"article_DOC_summary": {"bleu": 1.6113588628510822, "bleu_stderr": 0.11396623599430974, "rouge1_fmeasure": 0.19990081779928884, "rouge1_fmeasure_stderr": 0.0032358151653303128, "rouge1_precision": 0.199842761557467, "rouge1_precision_stderr": 0.003930101978562163, "rouge1_recall": 0.239057741643307, "rouge1_recall_stderr": 0.003952499479320612, "rouge2_fmeasure": 0.03981086125537371, "rouge2_fmeasure_stderr": 0.001755081460450486, "rouge2_precision": 0.04113079612840519, "rouge2_precision_stderr": 0.0020424233402084185, "rouge2_recall": 0.046338285466016295, "rouge2_recall_stderr": 0.0019373451129642888, "rougeL_fmeasure": 0.15112897374382445, "rougeL_fmeasure_stderr": 0.0025072042680957473, "rougeL_precision": 0.15164211505308783, "rougeL_precision_stderr": 0.003114734733434136, "rougeL_recall": 0.18076409869179713, "rougeL_recall_stderr": 0.0029742234432236295, "rougeLsum_fmeasure": 0.15506313451164785, "rougeLsum_fmeasure_stderr": 0.002575539720164216, "rougeLsum_precision": 0.15460054841726079, "rougeLsum_precision_stderr": 0.0031209530961592478, "rougeLsum_recall": 0.18722148840891398, "rougeLsum_recall_stderr": 0.0032584028075633336}}, "4": {"article_DOC_summary": {"bleu": 0.24204436176963062, "bleu_stderr": 0.049836255040165964, "rouge1_fmeasure": 0.05537758901399952, "rouge1_fmeasure_stderr": 0.0032372384689409446, "rouge1_precision": 0.06141305776935732, "rouge1_precision_stderr": 0.0037852792194572084, "rouge1_recall": 0.06166772338370753, "rouge1_recall_stderr": 0.003691117783479449, "rouge2_fmeasure": 0.010826553835341136, "rouge2_fmeasure_stderr": 0.0011334677529914565, "rouge2_precision": 0.012102488503521497, "rouge2_precision_stderr": 0.0014128421789064009, "rouge2_recall": 0.012216685991029144, "rouge2_recall_stderr": 0.0012769493637188112, "rougeL_fmeasure": 0.040349535280182126, "rougeL_fmeasure_stderr": 0.0023730718998770488, "rougeL_precision": 0.04535979131249153, "rougeL_precision_stderr": 0.0028757909611822202, "rougeL_recall": 0.04534764597254796, "rougeL_recall_stderr": 0.002768325752693544, "rougeLsum_fmeasure": 0.04167770978428173, "rougeLsum_fmeasure_stderr": 0.002441565692107216, "rougeLsum_precision": 0.04664371878837957, "rougeLsum_precision_stderr": 0.0029266385774482846, "rougeLsum_recall": 0.04710287355826148, "rougeLsum_recall_stderr": 0.0028941161204292173}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}}
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.333,0.014910846164229868,0
3
+ anli_r2,acc,0.334,0.014922019523732963,0
4
+ anli_r3,acc,0.33416666666666667,0.013622434813136781,0
5
+ arc_challenge,acc,0.22184300341296928,0.012141659068147882,0
6
+ arc_challenge,acc_norm,0.24829351535836178,0.012624912868089767,0
7
+ arc_easy,acc,0.5202020202020202,0.010251405621305368,0
8
+ arc_easy,acc_norm,0.45075757575757575,0.010209906101011119,0
9
+ boolq,acc,0.6033639143730887,0.008556148582031999,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.21956970232832299,,1
12
+ copa,acc,0.63,0.048523658709391,0
13
+ hellaswag,acc,0.35988846843258315,0.004789865379084506,0
14
+ hellaswag,acc_norm,0.4446325433180641,0.004959094146471531,0
15
+ piqa,acc,0.6942328618063112,0.01074962736614163,0
16
+ piqa,acc_norm,0.691512513601741,0.010776164678037157,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.814,0.012310790208412805,0
19
+ sciq,acc_norm,0.733,0.013996674851796283,0
20
+ storycloze_2016,acc,0.6451095670764297,0.01106478765990412,0
21
+ winogrande,acc,0.5193370165745856,0.014041972733712972,0
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.333,
5
- "acc_stderr": 0.014910846164229868
6
- },
7
- "anli_r2": {
8
- "acc": 0.334,
9
- "acc_stderr": 0.014922019523732963
10
- },
11
- "anli_r3": {
12
- "acc": 0.33416666666666667,
13
- "acc_stderr": 0.013622434813136781
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.21956970232832299
19
- },
20
- "copa": {
21
- "acc": 0.63,
22
- "acc_stderr": 0.048523658709391
23
- },
24
- "hellaswag": {
25
- "acc": 0.35988846843258315,
26
- "acc_stderr": 0.004789865379084506,
27
- "acc_norm": 0.4446325433180641,
28
- "acc_norm_stderr": 0.004959094146471531
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5193370165745856,
36
- "acc_stderr": 0.014041972733712972
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6451095670764297,
40
- "acc_stderr": 0.01106478765990412
41
- },
42
- "boolq": {
43
- "acc": 0.6033639143730887,
44
- "acc_stderr": 0.008556148582031999
45
- },
46
- "arc_easy": {
47
- "acc": 0.5202020202020202,
48
- "acc_stderr": 0.010251405621305368,
49
- "acc_norm": 0.45075757575757575,
50
- "acc_norm_stderr": 0.010209906101011119
51
- },
52
- "arc_challenge": {
53
- "acc": 0.22184300341296928,
54
- "acc_stderr": 0.012141659068147882,
55
- "acc_norm": 0.24829351535836178,
56
- "acc_norm_stderr": 0.012624912868089767
57
- },
58
- "sciq": {
59
- "acc": 0.814,
60
- "acc_stderr": 0.012310790208412805,
61
- "acc_norm": 0.733,
62
- "acc_norm_stderr": 0.013996674851796283
63
- },
64
- "piqa": {
65
- "acc": 0.6942328618063112,
66
- "acc_stderr": 0.01074962736614163,
67
- "acc_norm": 0.691512513601741,
68
- "acc_norm_stderr": 0.010776164678037157
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.321,0.014770821817934642,0
3
+ anli_r2,acc,0.321,0.014770821817934647,0
4
+ anli_r3,acc,0.3283333333333333,0.013562032919529017,0
5
+ arc_challenge,acc,0.23037542662116042,0.01230492841874761,0
6
+ arc_challenge,acc_norm,0.25426621160409557,0.012724999945157744,0
7
+ arc_easy,acc,0.5290404040404041,0.010242463826395617,0
8
+ arc_easy,acc_norm,0.49537037037037035,0.010259343705889734,0
9
+ boolq,acc,0.5715596330275229,0.008655028561519777,1
10
+ cb,acc,0.48214285714285715,0.06737697508644648,1
11
+ cb,f1,0.3421052631578947,,1
12
+ copa,acc,0.67,0.04725815626252607,0
13
+ hellaswag,acc,0.3595897231627166,0.0047889940606542745,0
14
+ hellaswag,acc_norm,0.4453296156144194,0.004959864299178128,0
15
+ piqa,acc,0.6882480957562568,0.01080743142487367,0
16
+ piqa,acc_norm,0.6898803046789989,0.010791876566843031,0
17
+ rte,acc,0.5487364620938628,0.02995314924180895,0
18
+ sciq,acc,0.873,0.010534798620855762,0
19
+ sciq,acc_norm,0.865,0.01081165537241605,0
20
+ storycloze_2016,acc,0.6215927311598076,0.011215325833205825,0
21
+ winogrande,acc,0.5564325177584846,0.0139626949076204,0
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.321,
5
- "acc_stderr": 0.014770821817934642
6
- },
7
- "anli_r2": {
8
- "acc": 0.321,
9
- "acc_stderr": 0.014770821817934647
10
- },
11
- "anli_r3": {
12
- "acc": 0.3283333333333333,
13
- "acc_stderr": 0.013562032919529017
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.3421052631578947
19
- },
20
- "copa": {
21
- "acc": 0.67,
22
- "acc_stderr": 0.04725815626252607
23
- },
24
- "hellaswag": {
25
- "acc": 0.3595897231627166,
26
- "acc_stderr": 0.0047889940606542745,
27
- "acc_norm": 0.4453296156144194,
28
- "acc_norm_stderr": 0.004959864299178128
29
- },
30
- "rte": {
31
- "acc": 0.5487364620938628,
32
- "acc_stderr": 0.02995314924180895
33
- },
34
- "winogrande": {
35
- "acc": 0.5564325177584846,
36
- "acc_stderr": 0.0139626949076204
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6215927311598076,
40
- "acc_stderr": 0.011215325833205825
41
- },
42
- "boolq": {
43
- "acc": 0.5715596330275229,
44
- "acc_stderr": 0.008655028561519777
45
- },
46
- "arc_easy": {
47
- "acc": 0.5290404040404041,
48
- "acc_stderr": 0.010242463826395617,
49
- "acc_norm": 0.49537037037037035,
50
- "acc_norm_stderr": 0.010259343705889734
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23037542662116042,
54
- "acc_stderr": 0.01230492841874761,
55
- "acc_norm": 0.25426621160409557,
56
- "acc_norm_stderr": 0.012724999945157744
57
- },
58
- "sciq": {
59
- "acc": 0.873,
60
- "acc_stderr": 0.010534798620855762,
61
- "acc_norm": 0.865,
62
- "acc_norm_stderr": 0.01081165537241605
63
- },
64
- "piqa": {
65
- "acc": 0.6882480957562568,
66
- "acc_stderr": 0.01080743142487367,
67
- "acc_norm": 0.6898803046789989,
68
- "acc_norm_stderr": 0.010791876566843031
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.319,0.014746404865473496,0
3
+ anli_r2,acc,0.348,0.015070604603768408,0
4
+ anli_r3,acc,0.32166666666666666,0.013490095282989521,0
5
+ arc_challenge,acc,0.2380546075085324,0.012445770028026205,0
6
+ arc_challenge,acc_norm,0.26621160409556316,0.012915774781523214,0
7
+ arc_easy,acc,0.5357744107744108,0.01023348870972654,0
8
+ arc_easy,acc_norm,0.5130471380471381,0.010256289925058438,0
9
+ boolq,acc,0.5706422018348624,0.008657333755353677,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.33659445609131144,,1
12
+ copa,acc,0.61,0.04902071300001974,0
13
+ hellaswag,acc,0.36008763194582755,0.004790445139186363,0
14
+ hellaswag,acc_norm,0.44811790479984065,0.00496284620612548,0
15
+ piqa,acc,0.7007616974972797,0.010684130673134581,0
16
+ piqa,acc_norm,0.6974972796517954,0.010717199698083896,0
17
+ rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.899,0.009533618929340992,0
19
+ sciq,acc_norm,0.884,0.010131468138757004,0
20
+ storycloze_2016,acc,0.6290753607696419,0.011170519624693493,0
21
+ winogrande,acc,0.531965272296764,0.014023739221166384,0
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.319,
5
- "acc_stderr": 0.014746404865473496
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.015070604603768408
10
- },
11
- "anli_r3": {
12
- "acc": 0.32166666666666666,
13
- "acc_stderr": 0.013490095282989521
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.33659445609131144
19
- },
20
- "copa": {
21
- "acc": 0.61,
22
- "acc_stderr": 0.04902071300001974
23
- },
24
- "hellaswag": {
25
- "acc": 0.36008763194582755,
26
- "acc_stderr": 0.004790445139186363,
27
- "acc_norm": 0.44811790479984065,
28
- "acc_norm_stderr": 0.00496284620612548
29
- },
30
- "rte": {
31
- "acc": 0.5054151624548736,
32
- "acc_stderr": 0.030094698123239966
33
- },
34
- "winogrande": {
35
- "acc": 0.531965272296764,
36
- "acc_stderr": 0.014023739221166384
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6290753607696419,
40
- "acc_stderr": 0.011170519624693493
41
- },
42
- "boolq": {
43
- "acc": 0.5706422018348624,
44
- "acc_stderr": 0.008657333755353677
45
- },
46
- "arc_easy": {
47
- "acc": 0.5357744107744108,
48
- "acc_stderr": 0.01023348870972654,
49
- "acc_norm": 0.5130471380471381,
50
- "acc_norm_stderr": 0.010256289925058438
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2380546075085324,
54
- "acc_stderr": 0.012445770028026205,
55
- "acc_norm": 0.26621160409556316,
56
- "acc_norm_stderr": 0.012915774781523214
57
- },
58
- "sciq": {
59
- "acc": 0.899,
60
- "acc_stderr": 0.009533618929340992,
61
- "acc_norm": 0.884,
62
- "acc_norm_stderr": 0.010131468138757004
63
- },
64
- "piqa": {
65
- "acc": 0.7007616974972797,
66
- "acc_stderr": 0.010684130673134581,
67
- "acc_norm": 0.6974972796517954,
68
- "acc_norm_stderr": 0.010717199698083896
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.3,0.014498627873361428,0
3
+ anli_r2,acc,0.338,0.01496596071022449,0
4
+ anli_r3,acc,0.3333333333333333,0.013613950010225601,0
5
+ arc_challenge,acc,0.23037542662116042,0.01230492841874761,0
6
+ arc_challenge,acc_norm,0.25853242320819114,0.012794553754288672,0
7
+ arc_easy,acc,0.5374579124579124,0.010230952104570801,0
8
+ arc_easy,acc_norm,0.5147306397306397,0.010255329977562103,0
9
+ boolq,acc,0.5605504587155963,0.00868069312581018,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.3726459510357816,,1
12
+ copa,acc,0.63,0.04852365870939099,0
13
+ hellaswag,acc,0.3593905596494722,0.004788412062375705,0
14
+ hellaswag,acc_norm,0.45030870344552876,0.004965078477435574,0
15
+ piqa,acc,0.7029379760609358,0.010661725404814798,0
16
+ piqa,acc_norm,0.7023939064200218,0.010667353792388205,0
17
+ rte,acc,0.51985559566787,0.030072723167317184,0
18
+ sciq,acc,0.903,0.009363689373248106,0
19
+ sciq,acc_norm,0.894,0.00973955126578513,0
20
+ storycloze_2016,acc,0.632816675574559,0.011147041781368648,0
21
+ winogrande,acc,0.5422257300710339,0.014002284504422435,0
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_3_lm-eval_global_step80108_2023-05-13-13-52-19_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.3,
5
- "acc_stderr": 0.014498627873361428
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.01496596071022449
10
- },
11
- "anli_r3": {
12
- "acc": 0.3333333333333333,
13
- "acc_stderr": 0.013613950010225601
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.3726459510357816
19
- },
20
- "copa": {
21
- "acc": 0.63,
22
- "acc_stderr": 0.04852365870939099
23
- },
24
- "hellaswag": {
25
- "acc": 0.3593905596494722,
26
- "acc_stderr": 0.004788412062375705,
27
- "acc_norm": 0.45030870344552876,
28
- "acc_norm_stderr": 0.004965078477435574
29
- },
30
- "rte": {
31
- "acc": 0.51985559566787,
32
- "acc_stderr": 0.030072723167317184
33
- },
34
- "winogrande": {
35
- "acc": 0.5422257300710339,
36
- "acc_stderr": 0.014002284504422435
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.632816675574559,
40
- "acc_stderr": 0.011147041781368648
41
- },
42
- "boolq": {
43
- "acc": 0.5605504587155963,
44
- "acc_stderr": 0.00868069312581018
45
- },
46
- "arc_easy": {
47
- "acc": 0.5374579124579124,
48
- "acc_stderr": 0.010230952104570801,
49
- "acc_norm": 0.5147306397306397,
50
- "acc_norm_stderr": 0.010255329977562103
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23037542662116042,
54
- "acc_stderr": 0.01230492841874761,
55
- "acc_norm": 0.25853242320819114,
56
- "acc_norm_stderr": 0.012794553754288672
57
- },
58
- "sciq": {
59
- "acc": 0.903,
60
- "acc_stderr": 0.009363689373248106,
61
- "acc_norm": 0.894,
62
- "acc_norm_stderr": 0.00973955126578513
63
- },
64
- "piqa": {
65
- "acc": 0.7029379760609358,
66
- "acc_stderr": 0.010661725404814798,
67
- "acc_norm": 0.7023939064200218,
68
- "acc_norm_stderr": 0.010667353792388205
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.305,0.014566646394664392,0
3
+ anli_r2,acc,0.352,0.015110404505648668,0
4
+ anli_r3,acc,0.32666666666666666,0.013544340907003663,0
5
+ arc_challenge,acc,0.23208191126279865,0.012336718284948854,0
6
+ arc_challenge,acc_norm,0.26109215017064846,0.012835523909473847,0
7
+ arc_easy,acc,0.5328282828282829,0.010237645778853874,0
8
+ arc_easy,acc_norm,0.5180976430976431,0.010253060653479164,0
9
+ boolq,acc,0.544954128440367,0.008709637955263423,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.3107226107226107,,1
12
+ copa,acc,0.65,0.047937248544110196,0
13
+ hellaswag,acc,0.3594901414060944,0.004788703173474754,0
14
+ hellaswag,acc_norm,0.4451304521011751,0.004959645263390237,0
15
+ piqa,acc,0.6980413492927094,0.010711732891588357,0
16
+ piqa,acc_norm,0.6964091403699674,0.010728079893076364,0
17
+ rte,acc,0.44404332129963897,0.029907396333795997,0
18
+ sciq,acc,0.906,0.009233052000787733,0
19
+ sciq,acc_norm,0.894,0.009739551265785138,0
20
+ storycloze_2016,acc,0.6419027258150721,0.011087006809925708,0
21
+ winogrande,acc,0.5367008681925809,0.01401457845884326,0
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_4_lm-eval_global_step80108_2023-05-13-13-52-19_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.305,
5
- "acc_stderr": 0.014566646394664392
6
- },
7
- "anli_r2": {
8
- "acc": 0.352,
9
- "acc_stderr": 0.015110404505648668
10
- },
11
- "anli_r3": {
12
- "acc": 0.32666666666666666,
13
- "acc_stderr": 0.013544340907003663
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.3107226107226107
19
- },
20
- "copa": {
21
- "acc": 0.65,
22
- "acc_stderr": 0.047937248544110196
23
- },
24
- "hellaswag": {
25
- "acc": 0.3594901414060944,
26
- "acc_stderr": 0.004788703173474754,
27
- "acc_norm": 0.4451304521011751,
28
- "acc_norm_stderr": 0.004959645263390237
29
- },
30
- "rte": {
31
- "acc": 0.44404332129963897,
32
- "acc_stderr": 0.029907396333795997
33
- },
34
- "winogrande": {
35
- "acc": 0.5367008681925809,
36
- "acc_stderr": 0.01401457845884326
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6419027258150721,
40
- "acc_stderr": 0.011087006809925708
41
- },
42
- "boolq": {
43
- "acc": 0.544954128440367,
44
- "acc_stderr": 0.008709637955263423
45
- },
46
- "arc_easy": {
47
- "acc": 0.5328282828282829,
48
- "acc_stderr": 0.010237645778853874,
49
- "acc_norm": 0.5180976430976431,
50
- "acc_norm_stderr": 0.010253060653479164
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23208191126279865,
54
- "acc_stderr": 0.012336718284948854,
55
- "acc_norm": 0.26109215017064846,
56
- "acc_norm_stderr": 0.012835523909473847
57
- },
58
- "sciq": {
59
- "acc": 0.906,
60
- "acc_stderr": 0.009233052000787733,
61
- "acc_norm": 0.894,
62
- "acc_norm_stderr": 0.009739551265785138
63
- },
64
- "piqa": {
65
- "acc": 0.6980413492927094,
66
- "acc_stderr": 0.010711732891588357,
67
- "acc_norm": 0.6964091403699674,
68
- "acc_norm_stderr": 0.010728079893076364
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.338,0.014965960710224494,0
3
+ anli_r2,acc,0.337,0.014955087918653607,0
4
+ anli_r3,acc,0.33666666666666667,0.0136476029424064,0
5
+ arc_challenge,acc,0.23208191126279865,0.012336718284948854,0
6
+ arc_challenge,acc_norm,0.2525597269624573,0.012696728980207704,0
7
+ arc_easy,acc,0.5353535353535354,0.010234104543411438,0
8
+ arc_easy,acc_norm,0.5290404040404041,0.010242463826395614,0
9
+ boolq,acc,0.5244648318042814,0.008734580382857543,1
10
+ cb,acc,0.48214285714285715,0.06737697508644648,1
11
+ cb,f1,0.3363636363636364,,1
12
+ copa,acc,0.63,0.04852365870939099,0
13
+ hellaswag,acc,0.3599880501892053,0.00479015537099345,0
14
+ hellaswag,acc_norm,0.44722166899024096,0.004961904949171393,0
15
+ piqa,acc,0.6942328618063112,0.010749627366141636,0
16
+ piqa,acc_norm,0.6936887921653971,0.010754970032367321,0
17
+ rte,acc,0.5451263537906137,0.02997363649541526,0
18
+ sciq,acc,0.904,0.009320454434783212,0
19
+ sciq,acc_norm,0.899,0.009533618929340987,0
20
+ storycloze_2016,acc,0.6338856226616782,0.011140201326066447,0
21
+ winogrande,acc,0.5374901341752171,0.014012928183336576,0
4b284b84b10c4pyseed2/evaluation/rankeval/4b284b84b10c4pyseed2_5_lm-eval_global_step80108_2023-05-13-13-52-19_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.338,
5
- "acc_stderr": 0.014965960710224494
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653607
10
- },
11
- "anli_r3": {
12
- "acc": 0.33666666666666667,
13
- "acc_stderr": 0.0136476029424064
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.3363636363636364
19
- },
20
- "copa": {
21
- "acc": 0.63,
22
- "acc_stderr": 0.04852365870939099
23
- },
24
- "hellaswag": {
25
- "acc": 0.3599880501892053,
26
- "acc_stderr": 0.00479015537099345,
27
- "acc_norm": 0.44722166899024096,
28
- "acc_norm_stderr": 0.004961904949171393
29
- },
30
- "rte": {
31
- "acc": 0.5451263537906137,
32
- "acc_stderr": 0.02997363649541526
33
- },
34
- "winogrande": {
35
- "acc": 0.5374901341752171,
36
- "acc_stderr": 0.014012928183336576
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6338856226616782,
40
- "acc_stderr": 0.011140201326066447
41
- },
42
- "boolq": {
43
- "acc": 0.5244648318042814,
44
- "acc_stderr": 0.008734580382857543
45
- },
46
- "arc_easy": {
47
- "acc": 0.5353535353535354,
48
- "acc_stderr": 0.010234104543411438,
49
- "acc_norm": 0.5290404040404041,
50
- "acc_norm_stderr": 0.010242463826395614
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23208191126279865,
54
- "acc_stderr": 0.012336718284948854,
55
- "acc_norm": 0.2525597269624573,
56
- "acc_norm_stderr": 0.012696728980207704
57
- },
58
- "sciq": {
59
- "acc": 0.904,
60
- "acc_stderr": 0.009320454434783212,
61
- "acc_norm": 0.899,
62
- "acc_norm_stderr": 0.009533618929340987
63
- },
64
- "piqa": {
65
- "acc": 0.6942328618063112,
66
- "acc_stderr": 0.010749627366141636,
67
- "acc_norm": 0.6936887921653971,
68
- "acc_norm_stderr": 0.010754970032367321
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed3/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.008921217264042648
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.008921217264042648
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.24687670135963216
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.24687670135963216
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.27547143470016905
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.27547143470016905
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2861781255157219
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2861781255157219
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2903858863495276
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2903858863495276
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2956772604735498
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2956772604735498
14
+ e2e_nlg_cleaned,5,average,multiple,0.23391843761044054
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.038403716706480975
16
+ gem_xsum,0,median,rouge2_fmeasure,0.038403716706480975
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.031204994080253484
18
+ gem_xsum,1,median,rouge2_fmeasure,0.031204994080253484
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.0352286750067272
20
+ gem_xsum,2,median,rouge2_fmeasure,0.0352286750067272
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.0350656521495642
22
+ gem_xsum,3,median,rouge2_fmeasure,0.0350656521495642
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.008761882130394149
24
+ gem_xsum,4,median,rouge2_fmeasure,0.008761882130394149
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00035443021875998775
26
+ gem_xsum,5,median,rouge2_fmeasure,0.00035443021875998775
27
+ gem_xsum,5,average,multiple,0.02483655838203
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04616588548171051
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.04616588548171051
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08899166586908325
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08899166586908325
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09401312544902726
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.09401312544902726
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0957513484343238
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.0957513484343238
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.09486765760399402
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.09486765760399402
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1059062507269122
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.1059062507269122
40
+ web_nlg_en,5,average,multiple,0.0876159889275085
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.041400427593311015
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.041400427593311015
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04868595803993464
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.04868595803993464
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.055184577596106794
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.055184577596106794
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04573481227623241
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04573481227623241
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015593405995693707
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.015593405995693707
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0025367525205927523
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0025367525205927523
53
+ wiki_lingua_en,5,average,multiple,0.03485598900364522
4b284b84b10c4pyseed3/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.42620041379310336, "bleu_stderr": 0.04744139420236661, "rouge1_fmeasure": 0.10693418277318126, "rouge1_fmeasure_stderr": 0.0022130023993521437, "rouge1_precision": 0.0691775648686926, "rouge1_precision_stderr": 0.0016503427557122247, "rouge1_recall": 0.32773608546833954, "rouge1_recall_stderr": 0.004637247322176256, "rouge2_fmeasure": 0.04616588548171051, "rouge2_fmeasure_stderr": 0.0013847002123746326, "rouge2_precision": 0.0301266773286609, "rouge2_precision_stderr": 0.0010026938917210792, "rouge2_recall": 0.13791712704620593, "rouge2_recall_stderr": 0.0034364020474699345, "rougeL_fmeasure": 0.09944420906595677, "rougeL_fmeasure_stderr": 0.0019306683660170168, "rougeL_precision": 0.06398810484534598, "rougeL_precision_stderr": 0.0014153823277166157, "rougeL_recall": 0.30991209188143926, "rougeL_recall_stderr": 0.0043729197109120595, "rougeLsum_fmeasure": 0.0992188808452852, "rougeLsum_fmeasure_stderr": 0.0020658917264297283, "rougeLsum_precision": 0.06428324100714272, "rougeLsum_precision_stderr": 0.0015422219926810477, "rougeLsum_recall": 0.30301251912680405, "rougeLsum_recall_stderr": 0.004293941889375713}}, "1": {"PALM_prompt": {"bleu": 0.7896248877512438, "bleu_stderr": 0.04378952181301939, "rouge1_fmeasure": 0.17270187137498372, "rouge1_fmeasure_stderr": 0.0039611078582274025, "rouge1_precision": 0.1573791983940724, "rouge1_precision_stderr": 0.004885287034074671, "rouge1_recall": 0.32267743495858053, "rouge1_recall_stderr": 0.005199153399215404, "rouge2_fmeasure": 0.08899166586908325, "rouge2_fmeasure_stderr": 0.0026700035605150834, "rouge2_precision": 0.08434063015480048, "rouge2_precision_stderr": 0.003467970748362486, "rouge2_recall": 0.1683906505608992, "rouge2_recall_stderr": 0.003721311150353144, "rougeL_fmeasure": 0.154260045157592, "rougeL_fmeasure_stderr": 0.0033859969676789787, "rougeL_precision": 0.13919254041881315, "rougeL_precision_stderr": 0.004269665126635142, "rougeL_recall": 0.29750207569774706, "rougeL_recall_stderr": 0.004696978501629025, "rougeLsum_fmeasure": 0.15815293904291208, "rougeLsum_fmeasure_stderr": 0.003489311470494366, "rougeLsum_precision": 0.14332379688258434, "rougeLsum_precision_stderr": 0.004394526375773512, "rougeLsum_recall": 0.30182729666211616, "rougeLsum_recall_stderr": 0.004755358082997335}}, "2": {"PALM_prompt": {"bleu": 0.8185988147091975, "bleu_stderr": 0.031167193943362814, "rouge1_fmeasure": 0.17810978375983086, "rouge1_fmeasure_stderr": 0.004000593361592057, "rouge1_precision": 0.15788312305455665, "rouge1_precision_stderr": 0.0048620294985970095, "rouge1_recall": 0.3418470717884752, "rouge1_recall_stderr": 0.004837763494219531, "rouge2_fmeasure": 0.09401312544902726, "rouge2_fmeasure_stderr": 0.002776525016089949, "rouge2_precision": 0.08618243800912999, "rouge2_precision_stderr": 0.0034242914868625987, "rouge2_recall": 0.18205273463879626, "rouge2_recall_stderr": 0.003746573085222888, "rougeL_fmeasure": 0.16012386446958077, "rougeL_fmeasure_stderr": 0.0034427444278546772, "rougeL_precision": 0.13995710430411293, "rougeL_precision_stderr": 0.0042258755680366885, "rougeL_recall": 0.3176005083080862, "rougeL_recall_stderr": 0.004433457208163548, "rougeLsum_fmeasure": 0.16423046526923485, "rougeLsum_fmeasure_stderr": 0.0035562406771821875, "rougeLsum_precision": 0.14422849291310377, "rougeLsum_precision_stderr": 0.004374634242078078, "rougeLsum_recall": 0.3227717408592296, "rougeLsum_recall_stderr": 0.0044960908916778335}}, "3": {"PALM_prompt": {"bleu": 0.8615862421964587, "bleu_stderr": 0.041307188779765594, "rouge1_fmeasure": 0.18230770768701607, "rouge1_fmeasure_stderr": 0.004067744882596013, "rouge1_precision": 0.16034021621345107, "rouge1_precision_stderr": 0.00492104337227451, "rouge1_recall": 0.3489810930628782, "rouge1_recall_stderr": 0.004874435565231993, "rouge2_fmeasure": 0.0957513484343238, "rouge2_fmeasure_stderr": 0.002760624435993053, "rouge2_precision": 0.0878609902380165, "rouge2_precision_stderr": 0.0034315400612001425, "rouge2_recall": 0.1841999711130487, "rouge2_recall_stderr": 0.003697561904560479, "rougeL_fmeasure": 0.16203930173447764, "rougeL_fmeasure_stderr": 0.0034022667237539194, "rougeL_precision": 0.14059537511350467, "rougeL_precision_stderr": 0.004182187067596055, "rougeL_recall": 0.3214492134352525, "rougeL_recall_stderr": 0.004418433333388622, "rougeLsum_fmeasure": 0.16769243046364843, "rougeLsum_fmeasure_stderr": 0.0036010895340958735, "rougeLsum_precision": 0.14689411558946444, "rougeLsum_precision_stderr": 0.004463420320817473, "rougeLsum_recall": 0.32816046563066886, "rougeLsum_recall_stderr": 0.004521013158739452}}, "4": {"PALM_prompt": {"bleu": 0.9078967495785858, "bleu_stderr": 0.06199342796845484, "rouge1_fmeasure": 0.1808110212069158, "rouge1_fmeasure_stderr": 0.003932389511051684, "rouge1_precision": 0.15822797510203446, "rouge1_precision_stderr": 0.004799645911513736, "rouge1_recall": 0.35955942360322946, "rouge1_recall_stderr": 0.004858476039481075, "rouge2_fmeasure": 0.09486765760399402, "rouge2_fmeasure_stderr": 0.002716857158419665, "rouge2_precision": 0.08570094130642221, "rouge2_precision_stderr": 0.0032976315326201148, "rouge2_recall": 0.1914726235970588, "rouge2_recall_stderr": 0.003763461263571178, "rougeL_fmeasure": 0.16052571277909425, "rougeL_fmeasure_stderr": 0.0033097551322571576, "rougeL_precision": 0.13827155825737705, "rougeL_precision_stderr": 0.004090970598019047, "rougeL_recall": 0.33108893571795484, "rougeL_recall_stderr": 0.004441814248040771, "rougeLsum_fmeasure": 0.16642566787069105, "rougeLsum_fmeasure_stderr": 0.003485708948631489, "rougeLsum_precision": 0.14478427984968617, "rougeLsum_precision_stderr": 0.0043468501593388175, "rougeLsum_recall": 0.3385335196650446, "rougeLsum_recall_stderr": 0.004541378164710768}}, "5": {"PALM_prompt": {"bleu": 1.0214653425849234, "bleu_stderr": 0.051718802950406026, "rouge1_fmeasure": 0.1980771743113352, "rouge1_fmeasure_stderr": 0.0042397446260209045, "rouge1_precision": 0.17598952447989255, "rouge1_precision_stderr": 0.005109223943052461, "rouge1_recall": 0.37137161642723315, "rouge1_recall_stderr": 0.004895960712486457, "rouge2_fmeasure": 0.1059062507269122, "rouge2_fmeasure_stderr": 0.0029063076470396475, "rouge2_precision": 0.09718312872280611, "rouge2_precision_stderr": 0.003543443468697334, "rouge2_recall": 0.19896096589564022, "rouge2_recall_stderr": 0.003757611433412461, "rougeL_fmeasure": 0.17394746443193657, "rougeL_fmeasure_stderr": 0.003500984838687859, "rougeL_precision": 0.1524952131718066, "rougeL_precision_stderr": 0.004319950451993815, "rougeL_recall": 0.3379337552779587, "rougeL_recall_stderr": 0.004319819601203137, "rougeLsum_fmeasure": 0.18065092950858816, "rougeLsum_fmeasure_stderr": 0.003697677362596537, "rougeLsum_precision": 0.15932050683179452, "rougeLsum_precision_stderr": 0.0045446535843402846, "rougeLsum_recall": 0.34633231005438553, "rougeLsum_recall_stderr": 0.004438946898889946}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.9852070761745417, "bleu_stderr": 0.06299035997470342, "rouge1_fmeasure": 0.19103706374893056, "rouge1_fmeasure_stderr": 0.002043735292347794, "rouge1_precision": 0.16860667344204994, "rouge1_precision_stderr": 0.0021841411395797645, "rouge1_recall": 0.2661731179948035, "rouge1_recall_stderr": 0.00284529639888626, "rouge2_fmeasure": 0.041400427593311015, "rouge2_fmeasure_stderr": 0.0009488470121354593, "rouge2_precision": 0.03671423362447407, "rouge2_precision_stderr": 0.0008960018500616462, "rouge2_recall": 0.059001018243924924, "rouge2_recall_stderr": 0.0015155891903579815, "rougeL_fmeasure": 0.14463343003355403, "rougeL_fmeasure_stderr": 0.0014297659683361935, "rougeL_precision": 0.12613497698427706, "rougeL_precision_stderr": 0.001499797247781424, "rougeL_recall": 0.2064330083805086, "rougeL_recall_stderr": 0.0022522252253678104, "rougeLsum_fmeasure": 0.17784845250905368, "rougeLsum_fmeasure_stderr": 0.0019045988975403565, "rougeLsum_precision": 0.1567855983219522, "rougeLsum_precision_stderr": 0.0020295517089179986, "rougeLsum_recall": 0.2483349619872116, "rougeLsum_recall_stderr": 0.0026794537862227194}}, "1": {"tldr_en": {"bleu": 3.1171475217856495, "bleu_stderr": 0.0732823638652547, "rouge1_fmeasure": 0.19704746668072642, "rouge1_fmeasure_stderr": 0.0021042412574123225, "rouge1_precision": 0.2658507547888603, "rouge1_precision_stderr": 0.004026465911511502, "rouge1_recall": 0.2328368462346454, "rouge1_recall_stderr": 0.0029937833551213467, "rouge2_fmeasure": 0.04868595803993464, "rouge2_fmeasure_stderr": 0.0011798314842419708, "rouge2_precision": 0.0748908482587982, "rouge2_precision_stderr": 0.0025476836148153152, "rouge2_recall": 0.05760057760434295, "rouge2_recall_stderr": 0.0015041020667321094, "rougeL_fmeasure": 0.1514771579545783, "rougeL_fmeasure_stderr": 0.0015957456773506335, "rougeL_precision": 0.21205729170234144, "rougeL_precision_stderr": 0.0035529202213708147, "rougeL_recall": 0.17862428270219943, "rougeL_recall_stderr": 0.0023073429411224035, "rougeLsum_fmeasure": 0.18519966002425114, "rougeLsum_fmeasure_stderr": 0.0019761628687157812, "rougeLsum_precision": 0.25061241113995614, "rougeLsum_precision_stderr": 0.0038496579826934173, "rougeLsum_recall": 0.21923152019893813, "rougeLsum_recall_stderr": 0.0028436451438153894}}, "2": {"tldr_en": {"bleu": 3.4054599658125952, "bleu_stderr": 0.09231440486257102, "rouge1_fmeasure": 0.20560776290639593, "rouge1_fmeasure_stderr": 0.00217736823647119, "rouge1_precision": 0.3027031155227182, "rouge1_precision_stderr": 0.004342038045089806, "rouge1_recall": 0.22922672006822598, "rouge1_recall_stderr": 0.002979758776265915, "rouge2_fmeasure": 0.055184577596106794, "rouge2_fmeasure_stderr": 0.00125807152066459, "rouge2_precision": 0.0934700698168058, "rouge2_precision_stderr": 0.002885135667813671, "rouge2_recall": 0.06042261792614234, "rouge2_recall_stderr": 0.0015396678909290415, "rougeL_fmeasure": 0.1588044948539496, "rougeL_fmeasure_stderr": 0.001660286240181, "rougeL_precision": 0.2422810450258744, "rougeL_precision_stderr": 0.003793118315211268, "rougeL_recall": 0.17672966874029722, "rougeL_recall_stderr": 0.0023137956277683286, "rougeLsum_fmeasure": 0.19285081056007744, "rougeLsum_fmeasure_stderr": 0.0020600874117571713, "rougeLsum_precision": 0.2853014120609418, "rougeLsum_precision_stderr": 0.004180231533196898, "rougeLsum_recall": 0.21513643359500906, "rougeLsum_recall_stderr": 0.0028236262979142858}}, "3": {"tldr_en": {"bleu": 2.4729182434316845, "bleu_stderr": 0.10012650610258268, "rouge1_fmeasure": 0.17108887149601257, "rouge1_fmeasure_stderr": 0.0023757734793230243, "rouge1_precision": 0.2720995231813778, "rouge1_precision_stderr": 0.004571151466538639, "rouge1_recall": 0.18407862548000867, "rouge1_recall_stderr": 0.003112747498941597, "rouge2_fmeasure": 0.04573481227623241, "rouge2_fmeasure_stderr": 0.0012079588728888353, "rouge2_precision": 0.08258605803564657, "rouge2_precision_stderr": 0.0027367206182666052, "rouge2_recall": 0.04885841616109182, "rouge2_recall_stderr": 0.001465000767012408, "rougeL_fmeasure": 0.134646407467939, "rougeL_fmeasure_stderr": 0.001841407151547327, "rougeL_precision": 0.2226588719810121, "rougeL_precision_stderr": 0.003997487635899364, "rougeL_recall": 0.14367889553942859, "rougeL_recall_stderr": 0.0024217014629324563, "rougeLsum_fmeasure": 0.16167387736565225, "rougeLsum_fmeasure_stderr": 0.002248008777332088, "rougeLsum_precision": 0.2583722675377218, "rougeLsum_precision_stderr": 0.004405335044810428, "rougeLsum_recall": 0.17390969762570954, "rougeLsum_recall_stderr": 0.0029535029212216235}}, "4": {"tldr_en": {"bleu": 0.07534938261429568, "bleu_stderr": 0.009704997490064675, "rouge1_fmeasure": 0.0567144191881176, "rouge1_fmeasure_stderr": 0.0020102332262398916, "rouge1_precision": 0.09491794784193329, "rouge1_precision_stderr": 0.0037097710151794178, "rouge1_recall": 0.060899885159978906, "rouge1_recall_stderr": 0.002413174863836663, "rouge2_fmeasure": 0.015593405995693707, "rouge2_fmeasure_stderr": 0.0008587013242704453, "rouge2_precision": 0.03081132865016433, "rouge2_precision_stderr": 0.0020591475926861843, "rouge2_recall": 0.016965395276364273, "rouge2_recall_stderr": 0.0010423476942867118, "rougeL_fmeasure": 0.04491920157032178, "rougeL_fmeasure_stderr": 0.0015902262936374793, "rougeL_precision": 0.07894647636505078, "rougeL_precision_stderr": 0.0032347199346148634, "rougeL_recall": 0.047644376674010516, "rougeL_recall_stderr": 0.0018879539517060884, "rougeLsum_fmeasure": 0.05351208812836595, "rougeLsum_fmeasure_stderr": 0.0018972742099033458, "rougeLsum_precision": 0.09067740043709584, "rougeLsum_precision_stderr": 0.0035871220290805417, "rougeLsum_recall": 0.057217365239962695, "rougeLsum_recall_stderr": 0.0022671391933435326}}, "5": {"tldr_en": {"bleu": 6.707484906406634e-16, "bleu_stderr": 5.552150278336005e-15, "rouge1_fmeasure": 0.00885254442210785, "rouge1_fmeasure_stderr": 0.0008818591247815259, "rouge1_precision": 0.01653643601323088, "rouge1_precision_stderr": 0.0017360359101834665, "rouge1_recall": 0.008925814740369277, "rouge1_recall_stderr": 0.0010099485069104677, "rouge2_fmeasure": 0.0025367525205927523, "rouge2_fmeasure_stderr": 0.00037559906889018366, "rouge2_precision": 0.0057275660825049195, "rouge2_precision_stderr": 0.0009608629353948467, "rouge2_recall": 0.0024005366481846674, "rouge2_recall_stderr": 0.00036284006123770687, "rougeL_fmeasure": 0.007154574258422284, "rougeL_fmeasure_stderr": 0.0007226245089219507, "rougeL_precision": 0.01409370718079132, "rougeL_precision_stderr": 0.0015556216825301857, "rougeL_recall": 0.0071320563211524305, "rougeL_recall_stderr": 0.0008188296967830902, "rougeLsum_fmeasure": 0.008486765696774931, "rougeLsum_fmeasure_stderr": 0.0008463173281760578, "rougeLsum_precision": 0.01599374113140704, "rougeLsum_precision_stderr": 0.0016942291943178897, "rougeLsum_recall": 0.008513262007398674, "rougeLsum_recall_stderr": 0.0009589404294616282}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.4706328406470207, "bleu_stderr": 0.02402757674832147, "rouge1_fmeasure": 0.11778977209562722, "rouge1_fmeasure_stderr": 0.0011376450413788633, "rouge1_precision": 0.13001535554848212, "rouge1_precision_stderr": 0.001124629045107318, "rouge1_recall": 0.11373948556796343, "rouge1_recall_stderr": 0.0013698942103208453, "rouge2_fmeasure": 0.008921217264042648, "rouge2_fmeasure_stderr": 0.0006011323057364968, "rouge2_precision": 0.008708039839815999, "rouge2_precision_stderr": 0.0005776055587872283, "rouge2_recall": 0.009747251976877848, "rouge2_recall_stderr": 0.0006799991254366815, "rougeL_fmeasure": 0.10383431246692064, "rougeL_fmeasure_stderr": 0.0008398575208485227, "rougeL_precision": 0.11561583703651156, "rougeL_precision_stderr": 0.0008722657139688917, "rougeL_recall": 0.099204631786365, "rougeL_recall_stderr": 0.0010001330996009493, "rougeLsum_fmeasure": 0.10042917241344387, "rougeLsum_fmeasure_stderr": 0.0010796150038405905, "rougeLsum_precision": 0.11046233295040617, "rougeLsum_precision_stderr": 0.0010735844974838226, "rougeLsum_recall": 0.09746516930902722, "rougeLsum_recall_stderr": 0.0012884717055250637}}, "1": {"generate_text_restaurant": {"bleu": 13.497846580624541, "bleu_stderr": 0.13629785742458836, "rouge1_fmeasure": 0.5084044586304373, "rouge1_fmeasure_stderr": 0.0024097331353898563, "rouge1_precision": 0.6171679643723159, "rouge1_precision_stderr": 0.0031444818652590573, "rouge1_recall": 0.4728136217430111, "rouge1_recall_stderr": 0.003177619396202482, "rouge2_fmeasure": 0.24687670135963216, "rouge2_fmeasure_stderr": 0.002157904322408305, "rouge2_precision": 0.303186978886608, "rouge2_precision_stderr": 0.002780601166944455, "rouge2_recall": 0.2296098597966958, "rouge2_recall_stderr": 0.0023224207476145774, "rougeL_fmeasure": 0.36246021573102866, "rougeL_fmeasure_stderr": 0.0021575978359088957, "rougeL_precision": 0.4441493181410414, "rougeL_precision_stderr": 0.003020356023768683, "rougeL_recall": 0.3356980558328371, "rougeL_recall_stderr": 0.002550171106453558, "rougeLsum_fmeasure": 0.41065174912709046, "rougeLsum_fmeasure_stderr": 0.0024189825815710666, "rougeLsum_precision": 0.5003782404198703, "rougeLsum_precision_stderr": 0.0032008715366790724, "rougeLsum_recall": 0.38120490327470263, "rougeLsum_recall_stderr": 0.0028930501713341243}}, "2": {"generate_text_restaurant": {"bleu": 16.39076318976977, "bleu_stderr": 0.22117088142057217, "rouge1_fmeasure": 0.5426304200474867, "rouge1_fmeasure_stderr": 0.0023812885636568487, "rouge1_precision": 0.6193845693054287, "rouge1_precision_stderr": 0.0030631073183171615, "rouge1_recall": 0.5194487348948328, "rouge1_recall_stderr": 0.0030700997248089574, "rouge2_fmeasure": 0.27547143470016905, "rouge2_fmeasure_stderr": 0.00225860123685758, "rouge2_precision": 0.31727263084561413, "rouge2_precision_stderr": 0.002768520883139812, "rouge2_recall": 0.2635688218195235, "rouge2_recall_stderr": 0.002430620981236374, "rougeL_fmeasure": 0.38502736323927617, "rougeL_fmeasure_stderr": 0.002241661145743132, "rougeL_precision": 0.441502834398647, "rougeL_precision_stderr": 0.0028948597295175814, "rougeL_recall": 0.3678508824404482, "rougeL_recall_stderr": 0.002585901329346113, "rougeLsum_fmeasure": 0.4450941954735718, "rougeLsum_fmeasure_stderr": 0.002498529702980092, "rougeLsum_precision": 0.5082838863294896, "rougeLsum_precision_stderr": 0.0031130524002642808, "rougeLsum_recall": 0.4258014533583386, "rougeLsum_recall_stderr": 0.002918578065823685}}, "3": {"generate_text_restaurant": {"bleu": 17.47512494933624, "bleu_stderr": 0.3161221490612832, "rouge1_fmeasure": 0.5526196543881303, "rouge1_fmeasure_stderr": 0.002368681827618165, "rouge1_precision": 0.617307785563921, "rouge1_precision_stderr": 0.0029349642071550613, "rouge1_recall": 0.5347467408140201, "rouge1_recall_stderr": 0.0030832783507968586, "rouge2_fmeasure": 0.2861781255157219, "rouge2_fmeasure_stderr": 0.0022468702783637463, "rouge2_precision": 0.32141477337313595, "rouge2_precision_stderr": 0.0026272621038607954, "rouge2_recall": 0.27706737300866596, "rouge2_recall_stderr": 0.0024574457008832646, "rougeL_fmeasure": 0.3930597590221422, "rougeL_fmeasure_stderr": 0.0022719855826562113, "rougeL_precision": 0.44058634250670686, "rougeL_precision_stderr": 0.0027940635014028427, "rougeL_recall": 0.3796119263695181, "rougeL_recall_stderr": 0.002637452524581856, "rougeLsum_fmeasure": 0.45605452188436363, "rougeLsum_fmeasure_stderr": 0.0025194390898929993, "rougeLsum_precision": 0.509014566881018, "rougeLsum_precision_stderr": 0.0029852357045497464, "rougeLsum_recall": 0.44138371830643647, "rougeLsum_recall_stderr": 0.002986548244767623}}, "4": {"generate_text_restaurant": {"bleu": 18.009396791161844, "bleu_stderr": 0.19008845098360633, "rouge1_fmeasure": 0.5573340749029256, "rouge1_fmeasure_stderr": 0.002357215792839256, "rouge1_precision": 0.6146621583397208, "rouge1_precision_stderr": 0.0029816057289008017, "rouge1_recall": 0.5424698564590708, "rouge1_recall_stderr": 0.0029649113493789296, "rouge2_fmeasure": 0.2903858863495276, "rouge2_fmeasure_stderr": 0.002307372222942283, "rouge2_precision": 0.3218711274514794, "rouge2_precision_stderr": 0.0026807768947368035, "rouge2_recall": 0.28282453554846243, "rouge2_recall_stderr": 0.002478020826547479, "rougeL_fmeasure": 0.3979277918045228, "rougeL_fmeasure_stderr": 0.002300951079783796, "rougeL_precision": 0.4396112576083185, "rougeL_precision_stderr": 0.002794840989445228, "rougeL_recall": 0.3872654759045874, "rougeL_recall_stderr": 0.002639448716853359, "rougeLsum_fmeasure": 0.46361359292896365, "rougeLsum_fmeasure_stderr": 0.002533101840745723, "rougeLsum_precision": 0.510791028202631, "rougeLsum_precision_stderr": 0.003018485730410244, "rougeLsum_recall": 0.4516991855287819, "rougeLsum_recall_stderr": 0.002952066207042656}}, "5": {"generate_text_restaurant": {"bleu": 18.228472478078046, "bleu_stderr": 0.11799874471262083, "rouge1_fmeasure": 0.5611673630553764, "rouge1_fmeasure_stderr": 0.002275369180699541, "rouge1_precision": 0.6199035607849839, "rouge1_precision_stderr": 0.0029619994688247927, "rouge1_recall": 0.5430677222902416, "rouge1_recall_stderr": 0.002844834877238402, "rouge2_fmeasure": 0.2956772604735498, "rouge2_fmeasure_stderr": 0.0022741537308339547, "rouge2_precision": 0.328851450184167, "rouge2_precision_stderr": 0.002688937709606261, "rouge2_recall": 0.28583223233597027, "rouge2_recall_stderr": 0.0024229514120271316, "rougeL_fmeasure": 0.4042315271040694, "rougeL_fmeasure_stderr": 0.0022662587321749515, "rougeL_precision": 0.44724055673705954, "rougeL_precision_stderr": 0.002805452221532952, "rougeL_recall": 0.3910928070603256, "rougeL_recall_stderr": 0.0025737577651484527, "rougeLsum_fmeasure": 0.468836840599471, "rougeLsum_fmeasure_stderr": 0.0024861995836027717, "rougeLsum_precision": 0.5180901487507985, "rougeLsum_precision_stderr": 0.0030570748009817517, "rougeLsum_recall": 0.45362451608584736, "rougeLsum_recall_stderr": 0.0028518118059115394}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.3635381485723308, "bleu_stderr": 0.06050569505655452, "rouge1_fmeasure": 0.19593044088320605, "rouge1_fmeasure_stderr": 0.0023793786175484786, "rouge1_precision": 0.14082451804834303, "rouge1_precision_stderr": 0.001812950354275324, "rouge1_recall": 0.3383507216343538, "rouge1_recall_stderr": 0.004089460179758747, "rouge2_fmeasure": 0.038403716706480975, "rouge2_fmeasure_stderr": 0.0013280974828837335, "rouge2_precision": 0.0273256555636052, "rouge2_precision_stderr": 0.0009729530541541723, "rouge2_recall": 0.06848894350180496, "rouge2_recall_stderr": 0.00242885163586263, "rougeL_fmeasure": 0.14229991508716772, "rougeL_fmeasure_stderr": 0.0016746774192458954, "rougeL_precision": 0.10211117618246757, "rougeL_precision_stderr": 0.0012776170044425407, "rougeL_recall": 0.24740100878147744, "rougeL_recall_stderr": 0.0030220836589787484, "rougeLsum_fmeasure": 0.1546903637435788, "rougeLsum_fmeasure_stderr": 0.002002671689798246, "rougeLsum_precision": 0.11095884552432918, "rougeLsum_precision_stderr": 0.0015056892321386529, "rougeLsum_recall": 0.2687519899215706, "rougeLsum_recall_stderr": 0.0035675347714954377}}, "1": {"article_DOC_summary": {"bleu": 1.1406556737637281, "bleu_stderr": 0.09981541250484698, "rouge1_fmeasure": 0.18301590616218394, "rouge1_fmeasure_stderr": 0.002832599681157286, "rouge1_precision": 0.17167329534096928, "rouge1_precision_stderr": 0.003445888231874815, "rouge1_recall": 0.24124968128906518, "rouge1_recall_stderr": 0.003663359734593557, "rouge2_fmeasure": 0.031204994080253484, "rouge2_fmeasure_stderr": 0.0015420497224887875, "rouge2_precision": 0.030156660759442292, "rouge2_precision_stderr": 0.0016962687149287595, "rouge2_recall": 0.04083749130750451, "rouge2_recall_stderr": 0.0019985100499555065, "rougeL_fmeasure": 0.1409441587412128, "rougeL_fmeasure_stderr": 0.0020941529397323897, "rougeL_precision": 0.13131243496598874, "rougeL_precision_stderr": 0.0025520873538345964, "rougeL_recall": 0.1882023954021015, "rougeL_recall_stderr": 0.002904293144403106, "rougeLsum_fmeasure": 0.14155571577698547, "rougeLsum_fmeasure_stderr": 0.002164663392443069, "rougeLsum_precision": 0.13203814985770898, "rougeLsum_precision_stderr": 0.002603695986474907, "rougeLsum_recall": 0.18872432393145935, "rougeLsum_recall_stderr": 0.003009469520035859}}, "2": {"article_DOC_summary": {"bleu": 1.4843882007540472, "bleu_stderr": 0.20033451641735328, "rouge1_fmeasure": 0.19389771480407614, "rouge1_fmeasure_stderr": 0.003134502695385656, "rouge1_precision": 0.19485259009643732, "rouge1_precision_stderr": 0.003807470115583579, "rouge1_recall": 0.2259402670734466, "rouge1_recall_stderr": 0.003518336478116781, "rouge2_fmeasure": 0.0352286750067272, "rouge2_fmeasure_stderr": 0.0017730032817056618, "rouge2_precision": 0.036982684605802583, "rouge2_precision_stderr": 0.0020465819455126153, "rouge2_recall": 0.039321663608461656, "rouge2_recall_stderr": 0.0018667847674129444, "rougeL_fmeasure": 0.14791939549523306, "rougeL_fmeasure_stderr": 0.002385502600929309, "rougeL_precision": 0.14856192844308974, "rougeL_precision_stderr": 0.0029186428427654923, "rougeL_recall": 0.1728672983845539, "rougeL_recall_stderr": 0.002674553032710122, "rougeLsum_fmeasure": 0.14998706737422282, "rougeLsum_fmeasure_stderr": 0.002418195781099263, "rougeLsum_precision": 0.1502521915850602, "rougeLsum_precision_stderr": 0.0029328903711452643, "rougeLsum_recall": 0.17613103926935098, "rougeLsum_recall_stderr": 0.0028040202181348825}}, "3": {"article_DOC_summary": {"bleu": 1.5664417550993706, "bleu_stderr": 0.1194791755353572, "rouge1_fmeasure": 0.18807446336158895, "rouge1_fmeasure_stderr": 0.003277658812407121, "rouge1_precision": 0.19104397654826316, "rouge1_precision_stderr": 0.003922877574340018, "rouge1_recall": 0.21982216880155725, "rouge1_recall_stderr": 0.0038846598039613896, "rouge2_fmeasure": 0.0350656521495642, "rouge2_fmeasure_stderr": 0.001716415729043433, "rouge2_precision": 0.03683914309520842, "rouge2_precision_stderr": 0.001972341139405416, "rouge2_recall": 0.04001740214314747, "rouge2_recall_stderr": 0.0019737645631021654, "rougeL_fmeasure": 0.14228766836223364, "rougeL_fmeasure_stderr": 0.0024286286876757245, "rougeL_precision": 0.14490892838858457, "rougeL_precision_stderr": 0.002985858871192099, "rougeL_recall": 0.16691062603223022, "rougeL_recall_stderr": 0.0029286478537987972, "rougeLsum_fmeasure": 0.1446671413309568, "rougeLsum_fmeasure_stderr": 0.0024867543614546968, "rougeLsum_precision": 0.1469816688598794, "rougeLsum_precision_stderr": 0.0030211862593581013, "rougeLsum_recall": 0.17049348721325655, "rougeLsum_recall_stderr": 0.003095033114717209}}, "4": {"article_DOC_summary": {"bleu": 0.22385698318497463, "bleu_stderr": 0.06694821289848427, "rouge1_fmeasure": 0.04896560426335061, "rouge1_fmeasure_stderr": 0.0028151354920170728, "rouge1_precision": 0.05425545395180225, "rouge1_precision_stderr": 0.003345073255269582, "rouge1_recall": 0.054372012877136756, "rouge1_recall_stderr": 0.0032340989651964015, "rouge2_fmeasure": 0.008761882130394149, "rouge2_fmeasure_stderr": 0.000984526743636971, "rouge2_precision": 0.00997866888571845, "rouge2_precision_stderr": 0.0012899550432704228, "rouge2_recall": 0.009224690419605674, "rouge2_recall_stderr": 0.001017008457046733, "rougeL_fmeasure": 0.03677251614614851, "rougeL_fmeasure_stderr": 0.002113252639836489, "rougeL_precision": 0.0412065017957668, "rougeL_precision_stderr": 0.0025816419770219065, "rougeL_recall": 0.04072292230689789, "rougeL_recall_stderr": 0.002404661059938926, "rougeLsum_fmeasure": 0.03797500164773236, "rougeLsum_fmeasure_stderr": 0.0022004881145241103, "rougeLsum_precision": 0.0424150651678533, "rougeLsum_precision_stderr": 0.002653224532922604, "rougeLsum_recall": 0.04204554594766696, "rougeLsum_recall_stderr": 0.0025081460507114465}}, "5": {"article_DOC_summary": {"bleu": 2.7346193490711173e-40, "bleu_stderr": 1.539388589903143e-37, "rouge1_fmeasure": 0.0027977277650179035, "rouge1_fmeasure_stderr": 0.0007979974499904037, "rouge1_precision": 0.003119355834175625, "rouge1_precision_stderr": 0.0009253611464056162, "rouge1_recall": 0.0026282060581554016, "rouge1_recall_stderr": 0.0007328898031043521, "rouge2_fmeasure": 0.00035443021875998775, "rouge2_fmeasure_stderr": 0.0001726913474248355, "rouge2_precision": 0.00039547882587779784, "rouge2_precision_stderr": 0.00019959735923657775, "rouge2_recall": 0.000334348565480641, "rouge2_recall_stderr": 0.0001630639562483575, "rougeL_fmeasure": 0.0021515320117000713, "rougeL_fmeasure_stderr": 0.0005852809484131203, "rougeL_precision": 0.002372330849826615, "rougeL_precision_stderr": 0.0006620063801604332, "rougeL_recall": 0.0020460099013736585, "rougeL_recall_stderr": 0.0005511097020631205, "rougeLsum_fmeasure": 0.002311466063055002, "rougeLsum_fmeasure_stderr": 0.0006354227534703968, "rougeLsum_precision": 0.0025545778481113487, "rougeLsum_precision_stderr": 0.0007199617326289864, "rougeLsum_recall": 0.002188948723557764, "rougeLsum_recall_stderr": 0.0005936593974260942}}}}
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.334,0.01492201952373296,0
3
+ anli_r2,acc,0.334,0.014922019523732963,0
4
+ anli_r3,acc,0.33666666666666667,0.013647602942406396,0
5
+ arc_challenge,acc,0.2175767918088737,0.012057262020972502,0
6
+ arc_challenge,acc_norm,0.23890784982935154,0.012461071376316623,0
7
+ arc_easy,acc,0.5218855218855218,0.01024995042723416,0
8
+ arc_easy,acc_norm,0.4583333333333333,0.010224097209176594,0
9
+ boolq,acc,0.6107033639143731,0.008528016290984541,1
10
+ cb,acc,0.375,0.06527912098338669,1
11
+ cb,f1,0.1818181818181818,,1
12
+ copa,acc,0.68,0.046882617226215034,0
13
+ hellaswag,acc,0.3625771758613822,0.004797616754372308,0
14
+ hellaswag,acc_norm,0.44722166899024096,0.004961904949171391,0
15
+ piqa,acc,0.6893362350380848,0.010797078933727687,0
16
+ piqa,acc_norm,0.6920565832426551,0.010770892367463676,0
17
+ rte,acc,0.5234657039711191,0.03006330041190266,0
18
+ sciq,acc,0.837,0.011686212712746835,0
19
+ sciq,acc_norm,0.74,0.013877773329774164,0
20
+ storycloze_2016,acc,0.6493853554249065,0.011034317290463294,0
21
+ winogrande,acc,0.5193370165745856,0.014041972733712969,0
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.334,
5
- "acc_stderr": 0.01492201952373296
6
- },
7
- "anli_r2": {
8
- "acc": 0.334,
9
- "acc_stderr": 0.014922019523732963
10
- },
11
- "anli_r3": {
12
- "acc": 0.33666666666666667,
13
- "acc_stderr": 0.013647602942406396
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.1818181818181818
19
- },
20
- "copa": {
21
- "acc": 0.68,
22
- "acc_stderr": 0.046882617226215034
23
- },
24
- "hellaswag": {
25
- "acc": 0.3625771758613822,
26
- "acc_stderr": 0.004797616754372308,
27
- "acc_norm": 0.44722166899024096,
28
- "acc_norm_stderr": 0.004961904949171391
29
- },
30
- "rte": {
31
- "acc": 0.5234657039711191,
32
- "acc_stderr": 0.03006330041190266
33
- },
34
- "winogrande": {
35
- "acc": 0.5193370165745856,
36
- "acc_stderr": 0.014041972733712969
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6493853554249065,
40
- "acc_stderr": 0.011034317290463294
41
- },
42
- "boolq": {
43
- "acc": 0.6107033639143731,
44
- "acc_stderr": 0.008528016290984541
45
- },
46
- "arc_easy": {
47
- "acc": 0.5218855218855218,
48
- "acc_stderr": 0.01024995042723416,
49
- "acc_norm": 0.4583333333333333,
50
- "acc_norm_stderr": 0.010224097209176594
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2175767918088737,
54
- "acc_stderr": 0.012057262020972502,
55
- "acc_norm": 0.23890784982935154,
56
- "acc_norm_stderr": 0.012461071376316623
57
- },
58
- "sciq": {
59
- "acc": 0.837,
60
- "acc_stderr": 0.011686212712746835,
61
- "acc_norm": 0.74,
62
- "acc_norm_stderr": 0.013877773329774164
63
- },
64
- "piqa": {
65
- "acc": 0.6893362350380848,
66
- "acc_stderr": 0.010797078933727687,
67
- "acc_norm": 0.6920565832426551,
68
- "acc_norm_stderr": 0.010770892367463676
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.324,0.014806864733738857,0
3
+ anli_r2,acc,0.348,0.01507060460376841,0
4
+ anli_r3,acc,0.34,0.0136804957257678,0
5
+ arc_challenge,acc,0.22098976109215018,0.012124929206818258,0
6
+ arc_challenge,acc_norm,0.24914675767918087,0.012639407111926439,0
7
+ arc_easy,acc,0.5336700336700336,0.010236494647406476,0
8
+ arc_easy,acc_norm,0.5033670033670034,0.01025955089379893,0
9
+ boolq,acc,0.6110091743119266,0.008526800159503202,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.2168674698795181,,1
12
+ copa,acc,0.69,0.04648231987117316,0
13
+ hellaswag,acc,0.359788886675961,0.004789575163418651,0
14
+ hellaswag,acc_norm,0.4447321250746863,0.004959204773046197,0
15
+ piqa,acc,0.6817192600652884,0.010868093932082231,0
16
+ piqa,acc_norm,0.6882480957562568,0.010807431424873669,0
17
+ rte,acc,0.5306859205776173,0.030039730592197812,0
18
+ sciq,acc,0.877,0.010391293421849877,0
19
+ sciq,acc_norm,0.865,0.010811655372416053,0
20
+ storycloze_2016,acc,0.6269374665954035,0.011183612906093182,0
21
+ winogrande,acc,0.5177584846093133,0.014043619596174964,0
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738857
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.01507060460376841
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.0136804957257678
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.2168674698795181
19
- },
20
- "copa": {
21
- "acc": 0.69,
22
- "acc_stderr": 0.04648231987117316
23
- },
24
- "hellaswag": {
25
- "acc": 0.359788886675961,
26
- "acc_stderr": 0.004789575163418651,
27
- "acc_norm": 0.4447321250746863,
28
- "acc_norm_stderr": 0.004959204773046197
29
- },
30
- "rte": {
31
- "acc": 0.5306859205776173,
32
- "acc_stderr": 0.030039730592197812
33
- },
34
- "winogrande": {
35
- "acc": 0.5177584846093133,
36
- "acc_stderr": 0.014043619596174964
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6269374665954035,
40
- "acc_stderr": 0.011183612906093182
41
- },
42
- "boolq": {
43
- "acc": 0.6110091743119266,
44
- "acc_stderr": 0.008526800159503202
45
- },
46
- "arc_easy": {
47
- "acc": 0.5336700336700336,
48
- "acc_stderr": 0.010236494647406476,
49
- "acc_norm": 0.5033670033670034,
50
- "acc_norm_stderr": 0.01025955089379893
51
- },
52
- "arc_challenge": {
53
- "acc": 0.22098976109215018,
54
- "acc_stderr": 0.012124929206818258,
55
- "acc_norm": 0.24914675767918087,
56
- "acc_norm_stderr": 0.012639407111926439
57
- },
58
- "sciq": {
59
- "acc": 0.877,
60
- "acc_stderr": 0.010391293421849877,
61
- "acc_norm": 0.865,
62
- "acc_norm_stderr": 0.010811655372416053
63
- },
64
- "piqa": {
65
- "acc": 0.6817192600652884,
66
- "acc_stderr": 0.010868093932082231,
67
- "acc_norm": 0.6882480957562568,
68
- "acc_norm_stderr": 0.010807431424873669
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.326,0.014830507204541038,0
3
+ anli_r2,acc,0.345,0.015039986742055235,0
4
+ anli_r3,acc,0.3425,0.013704669762934728,0
5
+ arc_challenge,acc,0.22525597269624573,0.0122078399954073,0
6
+ arc_challenge,acc_norm,0.2508532423208191,0.01266819862131543,0
7
+ arc_easy,acc,0.531986531986532,0.010238767643185712,0
8
+ arc_easy,acc_norm,0.5164141414141414,0.010254253565929305,0
9
+ boolq,acc,0.5984709480122324,0.008573784490094752,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.3524384112619406,,1
12
+ copa,acc,0.67,0.047258156262526066,0
13
+ hellaswag,acc,0.3594901414060944,0.0047887031734747615,0
14
+ hellaswag,acc_norm,0.4470225054769966,0.004961693567208812,0
15
+ piqa,acc,0.6920565832426551,0.01077089236746368,0
16
+ piqa,acc_norm,0.6974972796517954,0.01071719969808389,0
17
+ rte,acc,0.5018050541516246,0.030096267148976626,0
18
+ sciq,acc,0.893,0.009779910359847167,0
19
+ sciq,acc_norm,0.891,0.009859828407037188,0
20
+ storycloze_2016,acc,0.6247995724211651,0.011196472580587938,0
21
+ winogrande,acc,0.5193370165745856,0.01404197273371297,0
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.326,
5
- "acc_stderr": 0.014830507204541038
6
- },
7
- "anli_r2": {
8
- "acc": 0.345,
9
- "acc_stderr": 0.015039986742055235
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934728
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.3524384112619406
19
- },
20
- "copa": {
21
- "acc": 0.67,
22
- "acc_stderr": 0.047258156262526066
23
- },
24
- "hellaswag": {
25
- "acc": 0.3594901414060944,
26
- "acc_stderr": 0.0047887031734747615,
27
- "acc_norm": 0.4470225054769966,
28
- "acc_norm_stderr": 0.004961693567208812
29
- },
30
- "rte": {
31
- "acc": 0.5018050541516246,
32
- "acc_stderr": 0.030096267148976626
33
- },
34
- "winogrande": {
35
- "acc": 0.5193370165745856,
36
- "acc_stderr": 0.01404197273371297
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6247995724211651,
40
- "acc_stderr": 0.011196472580587938
41
- },
42
- "boolq": {
43
- "acc": 0.5984709480122324,
44
- "acc_stderr": 0.008573784490094752
45
- },
46
- "arc_easy": {
47
- "acc": 0.531986531986532,
48
- "acc_stderr": 0.010238767643185712,
49
- "acc_norm": 0.5164141414141414,
50
- "acc_norm_stderr": 0.010254253565929305
51
- },
52
- "arc_challenge": {
53
- "acc": 0.22525597269624573,
54
- "acc_stderr": 0.0122078399954073,
55
- "acc_norm": 0.2508532423208191,
56
- "acc_norm_stderr": 0.01266819862131543
57
- },
58
- "sciq": {
59
- "acc": 0.893,
60
- "acc_stderr": 0.009779910359847167,
61
- "acc_norm": 0.891,
62
- "acc_norm_stderr": 0.009859828407037188
63
- },
64
- "piqa": {
65
- "acc": 0.6920565832426551,
66
- "acc_stderr": 0.01077089236746368,
67
- "acc_norm": 0.6974972796517954,
68
- "acc_norm_stderr": 0.01071719969808389
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.313,0.01467127282297789,0
3
+ anli_r2,acc,0.352,0.015110404505648666,0
4
+ anli_r3,acc,0.31916666666666665,0.013462309712005129,0
5
+ arc_challenge,acc,0.23208191126279865,0.012336718284948854,0
6
+ arc_challenge,acc_norm,0.26023890784982934,0.012821930225112558,0
7
+ arc_easy,acc,0.5353535353535354,0.010234104543411435,0
8
+ arc_easy,acc_norm,0.515993265993266,0.010254533589288184,0
9
+ boolq,acc,0.6027522935779817,0.008558401855851154,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.31417624521072796,,1
12
+ copa,acc,0.66,0.04760952285695237,0
13
+ hellaswag,acc,0.3615813582951603,0.004794764843685289,0
14
+ hellaswag,acc_norm,0.4511053574985063,0.004965866098318165,0
15
+ piqa,acc,0.6953210010881393,0.010738889044325165,0
16
+ piqa,acc_norm,0.6947769314472253,0.01074426704560648,0
17
+ rte,acc,0.5306859205776173,0.030039730592197812,0
18
+ sciq,acc,0.89,0.009899393819724442,0
19
+ sciq,acc_norm,0.889,0.009938701010583726,0
20
+ storycloze_2016,acc,0.6349545697487974,0.011133301783914874,0
21
+ winogrande,acc,0.5430149960536701,0.01400038676159829,0
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_3_lm-eval_global_step80108_2023-05-13-13-52-19_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.313,
5
- "acc_stderr": 0.01467127282297789
6
- },
7
- "anli_r2": {
8
- "acc": 0.352,
9
- "acc_stderr": 0.015110404505648666
10
- },
11
- "anli_r3": {
12
- "acc": 0.31916666666666665,
13
- "acc_stderr": 0.013462309712005129
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.31417624521072796
19
- },
20
- "copa": {
21
- "acc": 0.66,
22
- "acc_stderr": 0.04760952285695237
23
- },
24
- "hellaswag": {
25
- "acc": 0.3615813582951603,
26
- "acc_stderr": 0.004794764843685289,
27
- "acc_norm": 0.4511053574985063,
28
- "acc_norm_stderr": 0.004965866098318165
29
- },
30
- "rte": {
31
- "acc": 0.5306859205776173,
32
- "acc_stderr": 0.030039730592197812
33
- },
34
- "winogrande": {
35
- "acc": 0.5430149960536701,
36
- "acc_stderr": 0.01400038676159829
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6349545697487974,
40
- "acc_stderr": 0.011133301783914874
41
- },
42
- "boolq": {
43
- "acc": 0.6027522935779817,
44
- "acc_stderr": 0.008558401855851154
45
- },
46
- "arc_easy": {
47
- "acc": 0.5353535353535354,
48
- "acc_stderr": 0.010234104543411435,
49
- "acc_norm": 0.515993265993266,
50
- "acc_norm_stderr": 0.010254533589288184
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23208191126279865,
54
- "acc_stderr": 0.012336718284948854,
55
- "acc_norm": 0.26023890784982934,
56
- "acc_norm_stderr": 0.012821930225112558
57
- },
58
- "sciq": {
59
- "acc": 0.89,
60
- "acc_stderr": 0.009899393819724442,
61
- "acc_norm": 0.889,
62
- "acc_norm_stderr": 0.009938701010583726
63
- },
64
- "piqa": {
65
- "acc": 0.6953210010881393,
66
- "acc_stderr": 0.010738889044325165,
67
- "acc_norm": 0.6947769314472253,
68
- "acc_norm_stderr": 0.01074426704560648
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.014944140233795027,0
3
+ anli_r2,acc,0.348,0.01507060460376841,0
4
+ anli_r3,acc,0.33166666666666667,0.013596836729485159,0
5
+ arc_challenge,acc,0.2431740614334471,0.01253655414458709,0
6
+ arc_challenge,acc_norm,0.26023890784982934,0.01282193022511256,0
7
+ arc_easy,acc,0.5315656565656566,0.010239317603199505,0
8
+ arc_easy,acc_norm,0.5189393939393939,0.0102524204968945,0
9
+ boolq,acc,0.6009174311926605,0.008565077958836789,1
10
+ cb,acc,0.5178571428571429,0.06737697508644647,1
11
+ cb,f1,0.2735042735042735,,1
12
+ copa,acc,0.67,0.047258156262526066,0
13
+ hellaswag,acc,0.3632742481577375,0.004799599840397386,0
14
+ hellaswag,acc_norm,0.44742083250348536,0.004962115526014296,0
15
+ piqa,acc,0.6964091403699674,0.010728079893076352,0
16
+ piqa,acc_norm,0.7002176278563657,0.010689686967138089,0
17
+ rte,acc,0.44404332129963897,0.02990739633379599,0
18
+ sciq,acc,0.901,0.009449248027662727,0
19
+ sciq,acc_norm,0.899,0.009533618929340968,0
20
+ storycloze_2016,acc,0.6317477284874399,0.01115382325853174,0
21
+ winogrande,acc,0.5311760063141279,0.014025142640639513,0
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_4_lm-eval_global_step80108_2023-05-13-13-52-19_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.014944140233795027
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.01507060460376841
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.013596836729485159
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644647,
18
- "f1": 0.2735042735042735
19
- },
20
- "copa": {
21
- "acc": 0.67,
22
- "acc_stderr": 0.047258156262526066
23
- },
24
- "hellaswag": {
25
- "acc": 0.3632742481577375,
26
- "acc_stderr": 0.004799599840397386,
27
- "acc_norm": 0.44742083250348536,
28
- "acc_norm_stderr": 0.004962115526014296
29
- },
30
- "rte": {
31
- "acc": 0.44404332129963897,
32
- "acc_stderr": 0.02990739633379599
33
- },
34
- "winogrande": {
35
- "acc": 0.5311760063141279,
36
- "acc_stderr": 0.014025142640639513
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6317477284874399,
40
- "acc_stderr": 0.01115382325853174
41
- },
42
- "boolq": {
43
- "acc": 0.6009174311926605,
44
- "acc_stderr": 0.008565077958836789
45
- },
46
- "arc_easy": {
47
- "acc": 0.5315656565656566,
48
- "acc_stderr": 0.010239317603199505,
49
- "acc_norm": 0.5189393939393939,
50
- "acc_norm_stderr": 0.0102524204968945
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2431740614334471,
54
- "acc_stderr": 0.01253655414458709,
55
- "acc_norm": 0.26023890784982934,
56
- "acc_norm_stderr": 0.01282193022511256
57
- },
58
- "sciq": {
59
- "acc": 0.901,
60
- "acc_stderr": 0.009449248027662727,
61
- "acc_norm": 0.899,
62
- "acc_norm_stderr": 0.009533618929340968
63
- },
64
- "piqa": {
65
- "acc": 0.6964091403699674,
66
- "acc_stderr": 0.010728079893076352,
67
- "acc_norm": 0.7002176278563657,
68
- "acc_norm_stderr": 0.010689686967138089
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.348,0.01507060460376841,0
3
+ anli_r2,acc,0.332,0.01489959724281148,0
4
+ anli_r3,acc,0.31916666666666665,0.013462309712005127,0
5
+ arc_challenge,acc,0.23122866894197952,0.012320858834772273,0
6
+ arc_challenge,acc_norm,0.2645051194539249,0.012889272949313368,0
7
+ arc_easy,acc,0.5429292929292929,0.01022189756425605,0
8
+ arc_easy,acc_norm,0.5214646464646465,0.01025032515945665,0
9
+ boolq,acc,0.6048929663608563,0.0085504542482809,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.28097560975609753,,1
12
+ copa,acc,0.68,0.04688261722621505,0
13
+ hellaswag,acc,0.3611830312686716,0.004793617835645059,0
14
+ hellaswag,acc_norm,0.4444333798048198,0.004958872288442147,0
15
+ piqa,acc,0.6980413492927094,0.010711732891588364,0
16
+ piqa,acc_norm,0.6947769314472253,0.01074426704560648,0
17
+ rte,acc,0.49097472924187724,0.030091559826331334,0
18
+ sciq,acc,0.907,0.009188875634996681,0
19
+ sciq,acc_norm,0.908,0.0091443763931511,0
20
+ storycloze_2016,acc,0.632816675574559,0.01114704178136865,0
21
+ winogrande,acc,0.5374901341752171,0.014012928183336574,0
4b284b84b10c4pyseed3/evaluation/rankeval/4b284b84b10c4pyseed3_5_lm-eval_global_step80108_2023-05-13-13-52-19_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.348,
5
- "acc_stderr": 0.01507060460376841
6
- },
7
- "anli_r2": {
8
- "acc": 0.332,
9
- "acc_stderr": 0.01489959724281148
10
- },
11
- "anli_r3": {
12
- "acc": 0.31916666666666665,
13
- "acc_stderr": 0.013462309712005127
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.28097560975609753
19
- },
20
- "copa": {
21
- "acc": 0.68,
22
- "acc_stderr": 0.04688261722621505
23
- },
24
- "hellaswag": {
25
- "acc": 0.3611830312686716,
26
- "acc_stderr": 0.004793617835645059,
27
- "acc_norm": 0.4444333798048198,
28
- "acc_norm_stderr": 0.004958872288442147
29
- },
30
- "rte": {
31
- "acc": 0.49097472924187724,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5374901341752171,
36
- "acc_stderr": 0.014012928183336574
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.632816675574559,
40
- "acc_stderr": 0.01114704178136865
41
- },
42
- "boolq": {
43
- "acc": 0.6048929663608563,
44
- "acc_stderr": 0.0085504542482809
45
- },
46
- "arc_easy": {
47
- "acc": 0.5429292929292929,
48
- "acc_stderr": 0.01022189756425605,
49
- "acc_norm": 0.5214646464646465,
50
- "acc_norm_stderr": 0.01025032515945665
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23122866894197952,
54
- "acc_stderr": 0.012320858834772273,
55
- "acc_norm": 0.2645051194539249,
56
- "acc_norm_stderr": 0.012889272949313368
57
- },
58
- "sciq": {
59
- "acc": 0.907,
60
- "acc_stderr": 0.009188875634996681,
61
- "acc_norm": 0.908,
62
- "acc_norm_stderr": 0.0091443763931511
63
- },
64
- "piqa": {
65
- "acc": 0.6980413492927094,
66
- "acc_stderr": 0.010711732891588364,
67
- "acc_norm": 0.6947769314472253,
68
- "acc_norm_stderr": 0.01074426704560648
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed4/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.019875685552769976
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.019875685552769976
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2458860659703384
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2458860659703384
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.28277007498560486
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.28277007498560486
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.29667899957066707
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.29667899957066707
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2995925527308815
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2995925527308815
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.30257765977165146
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.30257765977165146
14
+ e2e_nlg_cleaned,5,average,multiple,0.24123017309698555
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03753366067926952
16
+ gem_xsum,0,median,rouge2_fmeasure,0.03753366067926952
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.043055373688934306
18
+ gem_xsum,1,median,rouge2_fmeasure,0.043055373688934306
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04816447431232779
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04816447431232779
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04433348477474905
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04433348477474905
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01036458826567488
24
+ gem_xsum,4,median,rouge2_fmeasure,0.01036458826567488
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0
27
+ gem_xsum,5,average,multiple,0.030575263620159256
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05054966908352207
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05054966908352207
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08509943057807091
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.08509943057807091
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09958032315087247
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.09958032315087247
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10535121143782143
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.10535121143782143
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10380815592757812
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.10380815592757812
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1124270685660257
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.1124270685660257
40
+ web_nlg_en,5,average,multiple,0.09280264312398179
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.0374002473743124
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.0374002473743124
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03408094461273399
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.03408094461273399
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.051994446683955885
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.051994446683955885
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04665370905924281
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04665370905924281
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.016160010218408506
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.016160010218408506
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0025899683938137505
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0025899683938137505
53
+ wiki_lingua_en,5,average,multiple,0.031479887723744555
4b284b84b10c4pyseed4/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.37687776486311836, "bleu_stderr": 0.03684406202058806, "rouge1_fmeasure": 0.10993367038197949, "rouge1_fmeasure_stderr": 0.0019204362806703376, "rouge1_precision": 0.07218893046258328, "rouge1_precision_stderr": 0.0015989305177609053, "rouge1_recall": 0.3176774420607981, "rouge1_recall_stderr": 0.004935231830501088, "rouge2_fmeasure": 0.05054966908352207, "rouge2_fmeasure_stderr": 0.001197068121606185, "rouge2_precision": 0.03323889651238857, "rouge2_precision_stderr": 0.001042847648886505, "rouge2_recall": 0.15151563407536922, "rouge2_recall_stderr": 0.0034853090030710367, "rougeL_fmeasure": 0.10560682489631945, "rougeL_fmeasure_stderr": 0.0018070678180295806, "rougeL_precision": 0.06924280717773994, "rougeL_precision_stderr": 0.0015257437945109431, "rougeL_recall": 0.30736969757277377, "rougeL_recall_stderr": 0.004785090864980112, "rougeLsum_fmeasure": 0.10273331022191107, "rougeLsum_fmeasure_stderr": 0.001797594804780858, "rougeLsum_precision": 0.06766003883037222, "rougeLsum_precision_stderr": 0.001533804508079854, "rougeLsum_recall": 0.2956013015757491, "rougeLsum_recall_stderr": 0.004546271507628255}}, "1": {"PALM_prompt": {"bleu": 0.7331262163875337, "bleu_stderr": 0.06286081358741213, "rouge1_fmeasure": 0.16897025773670477, "rouge1_fmeasure_stderr": 0.00375751255247858, "rouge1_precision": 0.1489217395815071, "rouge1_precision_stderr": 0.004702917388242341, "rouge1_recall": 0.34160819597956565, "rouge1_recall_stderr": 0.0051690583415908535, "rouge2_fmeasure": 0.08509943057807091, "rouge2_fmeasure_stderr": 0.0024540103133841662, "rouge2_precision": 0.07486331655754194, "rouge2_precision_stderr": 0.0029731777438258902, "rouge2_recall": 0.17523637532876682, "rouge2_recall_stderr": 0.0036250150096323653, "rougeL_fmeasure": 0.15252761660929445, "rougeL_fmeasure_stderr": 0.003181032883291334, "rougeL_precision": 0.13339733487645286, "rougeL_precision_stderr": 0.0041246203563033825, "rougeL_recall": 0.3177993743731315, "rougeL_recall_stderr": 0.004719316374591283, "rougeLsum_fmeasure": 0.15531207061049937, "rougeLsum_fmeasure_stderr": 0.003266292118944347, "rougeLsum_precision": 0.13627770806959558, "rougeLsum_precision_stderr": 0.004224051889654252, "rougeLsum_recall": 0.3208601301863638, "rougeLsum_recall_stderr": 0.004725034951969304}}, "2": {"PALM_prompt": {"bleu": 0.8062281926485573, "bleu_stderr": 0.02668785352209344, "rouge1_fmeasure": 0.19154364519510117, "rouge1_fmeasure_stderr": 0.004206980279080634, "rouge1_precision": 0.1714407410547217, "rouge1_precision_stderr": 0.005122240818287197, "rouge1_recall": 0.35509767734977477, "rouge1_recall_stderr": 0.004890976660089392, "rouge2_fmeasure": 0.09958032315087247, "rouge2_fmeasure_stderr": 0.002891661398861244, "rouge2_precision": 0.09115502152266673, "rouge2_precision_stderr": 0.00347088337534582, "rouge2_recall": 0.1867493604498584, "rouge2_recall_stderr": 0.0037109520417074795, "rougeL_fmeasure": 0.1694121395769645, "rougeL_fmeasure_stderr": 0.003526259582751137, "rougeL_precision": 0.14987264131739117, "rougeL_precision_stderr": 0.00440001618966381, "rougeL_recall": 0.3257379537954472, "rougeL_recall_stderr": 0.0044068872204719375, "rougeLsum_fmeasure": 0.1745725058970203, "rougeLsum_fmeasure_stderr": 0.003657570294487318, "rougeLsum_precision": 0.1550521624934605, "rougeLsum_precision_stderr": 0.004554083480685833, "rougeLsum_recall": 0.33221264407068385, "rougeLsum_recall_stderr": 0.004487237900882061}}, "3": {"PALM_prompt": {"bleu": 0.9185609333562764, "bleu_stderr": 0.03886633337963366, "rouge1_fmeasure": 0.1961967100431015, "rouge1_fmeasure_stderr": 0.004352689803770881, "rouge1_precision": 0.17708906886107606, "rouge1_precision_stderr": 0.0053130963828574, "rouge1_recall": 0.3609706610678434, "rouge1_recall_stderr": 0.004919960324695938, "rouge2_fmeasure": 0.10535121143782143, "rouge2_fmeasure_stderr": 0.0030449049071045718, "rouge2_precision": 0.09757147172089832, "rouge2_precision_stderr": 0.003684127344769059, "rouge2_recall": 0.19331737518575148, "rouge2_recall_stderr": 0.003758863007652629, "rougeL_fmeasure": 0.17364281001136087, "rougeL_fmeasure_stderr": 0.003591977175817447, "rougeL_precision": 0.15465889142319666, "rougeL_precision_stderr": 0.004506531082634542, "rougeL_recall": 0.33229654831015554, "rougeL_recall_stderr": 0.004401097217227131, "rougeLsum_fmeasure": 0.17841891362629145, "rougeLsum_fmeasure_stderr": 0.0037581650385131735, "rougeLsum_precision": 0.15998055130325117, "rougeLsum_precision_stderr": 0.004719394117745236, "rougeLsum_recall": 0.3374146553768538, "rougeLsum_recall_stderr": 0.004466355187011048}}, "4": {"PALM_prompt": {"bleu": 0.9138064929250845, "bleu_stderr": 0.06298005107949606, "rouge1_fmeasure": 0.19584188887829568, "rouge1_fmeasure_stderr": 0.004217435544277068, "rouge1_precision": 0.1798886006639874, "rouge1_precision_stderr": 0.005311518484999322, "rouge1_recall": 0.3634726076113904, "rouge1_recall_stderr": 0.0047117980695889185, "rouge2_fmeasure": 0.10380815592757812, "rouge2_fmeasure_stderr": 0.0029084446712034397, "rouge2_precision": 0.09762692415556522, "rouge2_precision_stderr": 0.0036079829856292414, "rouge2_recall": 0.1955993065430807, "rouge2_recall_stderr": 0.0037047436826261122, "rougeL_fmeasure": 0.17403911858594262, "rougeL_fmeasure_stderr": 0.003526088408873111, "rougeL_precision": 0.1570664229103416, "rougeL_precision_stderr": 0.004490252070021222, "rougeL_recall": 0.3362616471502904, "rougeL_recall_stderr": 0.00428615781566303, "rougeLsum_fmeasure": 0.18081310443901466, "rougeLsum_fmeasure_stderr": 0.0037345515302121307, "rougeLsum_precision": 0.16477024941528062, "rougeLsum_precision_stderr": 0.004784498889503596, "rougeLsum_recall": 0.34374379256441034, "rougeLsum_recall_stderr": 0.004360169982679271}}, "5": {"PALM_prompt": {"bleu": 0.9979794206004006, "bleu_stderr": 0.046116489490293164, "rouge1_fmeasure": 0.2072521979826879, "rouge1_fmeasure_stderr": 0.004374853807345208, "rouge1_precision": 0.1915522292689441, "rouge1_precision_stderr": 0.005531353290238754, "rouge1_recall": 0.37621061814646006, "rouge1_recall_stderr": 0.004819161696715767, "rouge2_fmeasure": 0.1124270685660257, "rouge2_fmeasure_stderr": 0.0030208226695706652, "rouge2_precision": 0.10768850050531253, "rouge2_precision_stderr": 0.0038453481996777606, "rouge2_recall": 0.2047835101789359, "rouge2_recall_stderr": 0.0038071017547460597, "rougeL_fmeasure": 0.18362209233435506, "rougeL_fmeasure_stderr": 0.0035980114815775165, "rougeL_precision": 0.16753012595318004, "rougeL_precision_stderr": 0.0047010458205493785, "rougeL_recall": 0.3468435554889511, "rougeL_recall_stderr": 0.004296173023629769, "rougeLsum_fmeasure": 0.18962495148248798, "rougeLsum_fmeasure_stderr": 0.0037909912216198853, "rougeLsum_precision": 0.17453717309387756, "rougeLsum_precision_stderr": 0.004972374005930917, "rougeLsum_recall": 0.35356068966593823, "rougeLsum_recall_stderr": 0.00438869392150967}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.4124638677986368, "bleu_stderr": 0.0913949395876688, "rouge1_fmeasure": 0.1656497200775244, "rouge1_fmeasure_stderr": 0.0023403270932876754, "rouge1_precision": 0.15241899486484972, "rouge1_precision_stderr": 0.0025758580761762874, "rouge1_recall": 0.22622003784875558, "rouge1_recall_stderr": 0.003225159419344125, "rouge2_fmeasure": 0.0374002473743124, "rouge2_fmeasure_stderr": 0.001001869454258014, "rouge2_precision": 0.03365869334044392, "rouge2_precision_stderr": 0.0009496244318742025, "rouge2_recall": 0.052192489445291104, "rouge2_recall_stderr": 0.001507407427250903, "rougeL_fmeasure": 0.12738926732808806, "rougeL_fmeasure_stderr": 0.0017146135931701816, "rougeL_precision": 0.1167535496480465, "rougeL_precision_stderr": 0.001979884815504692, "rougeL_recall": 0.177913806828241, "rougeL_recall_stderr": 0.002573001517553247, "rougeLsum_fmeasure": 0.15345530395613216, "rougeLsum_fmeasure_stderr": 0.0021822059357909573, "rougeLsum_precision": 0.14146299620687117, "rougeLsum_precision_stderr": 0.002432354325938443, "rougeLsum_recall": 0.2099572906512204, "rougeLsum_recall_stderr": 0.0030342576169786286}}, "1": {"tldr_en": {"bleu": 2.285443178397197, "bleu_stderr": 0.12233037542442222, "rouge1_fmeasure": 0.16127019002684886, "rouge1_fmeasure_stderr": 0.0021093903974917323, "rouge1_precision": 0.2122115543086062, "rouge1_precision_stderr": 0.0037182868245250215, "rouge1_recall": 0.18726907691017194, "rouge1_recall_stderr": 0.00283359191936615, "rouge2_fmeasure": 0.03408094461273399, "rouge2_fmeasure_stderr": 0.0010753282767068434, "rouge2_precision": 0.053221431038386904, "rouge2_precision_stderr": 0.0022491648794078552, "rouge2_recall": 0.03932598055256129, "rouge2_recall_stderr": 0.001321956870981169, "rougeL_fmeasure": 0.12418650427656959, "rougeL_fmeasure_stderr": 0.00158846876288037, "rougeL_precision": 0.16833637285552672, "rougeL_precision_stderr": 0.0031731714621385503, "rougeL_recall": 0.14387587260034954, "rougeL_recall_stderr": 0.0021412096973104887, "rougeLsum_fmeasure": 0.15199294493088417, "rougeLsum_fmeasure_stderr": 0.0019727612256225237, "rougeLsum_precision": 0.20072521032678659, "rougeLsum_precision_stderr": 0.0035454968350690256, "rougeLsum_recall": 0.17623013074850405, "rougeLsum_recall_stderr": 0.002643723922954331}}, "2": {"tldr_en": {"bleu": 3.3515359139986787, "bleu_stderr": 0.12082556190965345, "rouge1_fmeasure": 0.19895013761874297, "rouge1_fmeasure_stderr": 0.0021803232439580743, "rouge1_precision": 0.27634714405388516, "rouge1_precision_stderr": 0.004053769032973144, "rouge1_recall": 0.22670793822109728, "rouge1_recall_stderr": 0.002962026981114172, "rouge2_fmeasure": 0.051994446683955885, "rouge2_fmeasure_stderr": 0.0012610814251367089, "rouge2_precision": 0.08213054646721735, "rouge2_precision_stderr": 0.0026986546324134194, "rouge2_recall": 0.05815967071065101, "rouge2_recall_stderr": 0.0014965259307692736, "rougeL_fmeasure": 0.1548337816267131, "rougeL_fmeasure_stderr": 0.0016871850033081633, "rougeL_precision": 0.22215168912004465, "rougeL_precision_stderr": 0.0035599687704218146, "rougeL_recall": 0.17597887611402802, "rougeL_recall_stderr": 0.0023085064692905766, "rougeLsum_fmeasure": 0.18715896075785868, "rougeLsum_fmeasure_stderr": 0.00205756543067454, "rougeLsum_precision": 0.26237581882554983, "rougeLsum_precision_stderr": 0.003957519290330249, "rougeLsum_recall": 0.2128274073770694, "rougeLsum_recall_stderr": 0.0027819241586660034}}, "3": {"tldr_en": {"bleu": 2.7530645054476843, "bleu_stderr": 0.0997208403803559, "rouge1_fmeasure": 0.17257314674183524, "rouge1_fmeasure_stderr": 0.0024092622990308935, "rouge1_precision": 0.265352120681397, "rouge1_precision_stderr": 0.004522856591801314, "rouge1_recall": 0.18687001453482432, "rouge1_recall_stderr": 0.003120211129655446, "rouge2_fmeasure": 0.04665370905924281, "rouge2_fmeasure_stderr": 0.0012472758965020566, "rouge2_precision": 0.08084504326991336, "rouge2_precision_stderr": 0.002780291101777949, "rouge2_recall": 0.05027384895387968, "rouge2_recall_stderr": 0.001505382389120513, "rougeL_fmeasure": 0.13592046345162895, "rougeL_fmeasure_stderr": 0.0018829794064955324, "rougeL_precision": 0.2165144128062488, "rougeL_precision_stderr": 0.003948750939502902, "rougeL_recall": 0.14649334550428744, "rougeL_recall_stderr": 0.0024587358102167657, "rougeLsum_fmeasure": 0.16227137067332212, "rougeLsum_fmeasure_stderr": 0.002275037334492735, "rougeLsum_precision": 0.25163522027691293, "rougeLsum_precision_stderr": 0.004363908236656514, "rougeLsum_recall": 0.1753723309422891, "rougeLsum_recall_stderr": 0.0029425819913422263}}, "4": {"tldr_en": {"bleu": 0.07933724232422462, "bleu_stderr": 0.011643166061897523, "rouge1_fmeasure": 0.0591848306216064, "rouge1_fmeasure_stderr": 0.0020964766447059695, "rouge1_precision": 0.09613744260997967, "rouge1_precision_stderr": 0.0036759103009842233, "rouge1_recall": 0.061851532629326224, "rouge1_recall_stderr": 0.0023896673695294225, "rouge2_fmeasure": 0.016160010218408506, "rouge2_fmeasure_stderr": 0.0008891970026205639, "rouge2_precision": 0.02888399780694895, "rouge2_precision_stderr": 0.0018570333624466481, "rouge2_recall": 0.016369492815724262, "rouge2_recall_stderr": 0.0009392471973071003, "rougeL_fmeasure": 0.047211539864124935, "rougeL_fmeasure_stderr": 0.0016824165200882981, "rougeL_precision": 0.07942098803183872, "rougeL_precision_stderr": 0.003155971380848626, "rougeL_recall": 0.04862002344033355, "rougeL_recall_stderr": 0.001864772758982904, "rougeLsum_fmeasure": 0.05566602328784957, "rougeLsum_fmeasure_stderr": 0.001974302642935771, "rougeLsum_precision": 0.09160683954518144, "rougeLsum_precision_stderr": 0.0035491244154694512, "rougeLsum_recall": 0.05798616147639231, "rougeLsum_recall_stderr": 0.0022404251869301965}}, "5": {"tldr_en": {"bleu": 8.777175803724697e-14, "bleu_stderr": 9.081686271757665e-12, "rouge1_fmeasure": 0.009917981525185601, "rouge1_fmeasure_stderr": 0.0009832678179967852, "rouge1_precision": 0.016450222619484842, "rouge1_precision_stderr": 0.0017035052448524074, "rouge1_recall": 0.010626060130620773, "rouge1_recall_stderr": 0.0011535446624529534, "rouge2_fmeasure": 0.0025899683938137505, "rouge2_fmeasure_stderr": 0.0003735919580894839, "rouge2_precision": 0.004970619964230962, "rouge2_precision_stderr": 0.0008899410081918668, "rouge2_recall": 0.0028278660850337943, "rouge2_recall_stderr": 0.0004524607196131978, "rougeL_fmeasure": 0.007576393607841029, "rougeL_fmeasure_stderr": 0.0007684492822090447, "rougeL_precision": 0.013209887820998075, "rougeL_precision_stderr": 0.0014642323654067313, "rougeL_recall": 0.008125712247962694, "rougeL_recall_stderr": 0.0009130225272738773, "rougeLsum_fmeasure": 0.009326154250301305, "rougeLsum_fmeasure_stderr": 0.0009201789217205499, "rougeLsum_precision": 0.015751842943279706, "rougeLsum_precision_stderr": 0.0016546373822257892, "rougeLsum_recall": 0.010023884760103447, "rougeLsum_recall_stderr": 0.0010922162907297795}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.45157430779084795, "bleu_stderr": 0.020312067005750303, "rouge1_fmeasure": 0.12126740156287513, "rouge1_fmeasure_stderr": 0.002453178154700304, "rouge1_precision": 0.11904027947959508, "rouge1_precision_stderr": 0.0023446375085163784, "rouge1_recall": 0.13243756044602717, "rouge1_recall_stderr": 0.0029757523756588486, "rouge2_fmeasure": 0.019875685552769976, "rouge2_fmeasure_stderr": 0.0010470891564325782, "rouge2_precision": 0.018112020399388428, "rouge2_precision_stderr": 0.0009650885423511505, "rouge2_recall": 0.024078996694527267, "rouge2_recall_stderr": 0.0013022047590821445, "rougeL_fmeasure": 0.09555010635521315, "rougeL_fmeasure_stderr": 0.0017736595417563498, "rougeL_precision": 0.09416799170833806, "rougeL_precision_stderr": 0.0017151752196978443, "rougeL_recall": 0.10377264991208293, "rougeL_recall_stderr": 0.00214142301132201, "rougeLsum_fmeasure": 0.10880624077941087, "rougeLsum_fmeasure_stderr": 0.002242675222061131, "rougeLsum_precision": 0.10650075357501369, "rougeLsum_precision_stderr": 0.002121228890440893, "rougeLsum_recall": 0.1193830394118181, "rougeLsum_recall_stderr": 0.00276121157366126}}, "1": {"generate_text_restaurant": {"bleu": 13.519253103394183, "bleu_stderr": 0.24429832540427907, "rouge1_fmeasure": 0.5097898106288475, "rouge1_fmeasure_stderr": 0.0024753487988326975, "rouge1_precision": 0.6152062167827251, "rouge1_precision_stderr": 0.003204414361798554, "rouge1_recall": 0.4746865167466136, "rouge1_recall_stderr": 0.0031769026037383515, "rouge2_fmeasure": 0.2458860659703384, "rouge2_fmeasure_stderr": 0.0022156534867832744, "rouge2_precision": 0.30010183611056995, "rouge2_precision_stderr": 0.002828250589814168, "rouge2_recall": 0.2289720227066138, "rouge2_recall_stderr": 0.002357812531538643, "rougeL_fmeasure": 0.3645735058212257, "rougeL_fmeasure_stderr": 0.002245888336923098, "rougeL_precision": 0.4437729713274751, "rougeL_precision_stderr": 0.003063605923863207, "rougeL_recall": 0.3381245050342295, "rougeL_recall_stderr": 0.0025918282483563737, "rougeLsum_fmeasure": 0.4144563002637345, "rougeLsum_fmeasure_stderr": 0.0025005645125822976, "rougeLsum_precision": 0.501909988982239, "rougeLsum_precision_stderr": 0.003262136424657466, "rougeLsum_recall": 0.38520269092999865, "rougeLsum_recall_stderr": 0.0029233354995314466}}, "2": {"generate_text_restaurant": {"bleu": 16.532587062141825, "bleu_stderr": 0.24628122688413334, "rouge1_fmeasure": 0.5535435393060334, "rouge1_fmeasure_stderr": 0.0023181561502634744, "rouge1_precision": 0.6391995937597211, "rouge1_precision_stderr": 0.003005481083260041, "rouge1_recall": 0.5239222252831156, "rouge1_recall_stderr": 0.0030310288003591664, "rouge2_fmeasure": 0.28277007498560486, "rouge2_fmeasure_stderr": 0.0022866524907569213, "rouge2_precision": 0.3294768007338346, "rouge2_precision_stderr": 0.0027954986440451293, "rouge2_recall": 0.267528520989109, "rouge2_recall_stderr": 0.0024485234887143942, "rougeL_fmeasure": 0.3950519159135797, "rougeL_fmeasure_stderr": 0.0022585818970320315, "rougeL_precision": 0.4584441418091013, "rougeL_precision_stderr": 0.002944068965018029, "rougeL_recall": 0.37300127880426376, "rougeL_recall_stderr": 0.002589164473121498, "rougeLsum_fmeasure": 0.4535459365510049, "rougeLsum_fmeasure_stderr": 0.0024759202231393806, "rougeLsum_precision": 0.5245113443589354, "rougeLsum_precision_stderr": 0.003126688701324384, "rougeLsum_recall": 0.428702991593199, "rougeLsum_recall_stderr": 0.0028852783980185476}}, "3": {"generate_text_restaurant": {"bleu": 17.90398157020638, "bleu_stderr": 0.21044536455674664, "rouge1_fmeasure": 0.5663606550952044, "rouge1_fmeasure_stderr": 0.0023331762866810792, "rouge1_precision": 0.6401832525822847, "rouge1_precision_stderr": 0.0029717952973887483, "rouge1_recall": 0.5420898062454031, "rouge1_recall_stderr": 0.003034436774393523, "rouge2_fmeasure": 0.29667899957066707, "rouge2_fmeasure_stderr": 0.0023563351483575604, "rouge2_precision": 0.337049008209819, "rouge2_precision_stderr": 0.0027762392358002446, "rouge2_recall": 0.28450727274979776, "rouge2_recall_stderr": 0.002554220788784057, "rougeL_fmeasure": 0.40482387885914795, "rougeL_fmeasure_stderr": 0.0023224567983998303, "rougeL_precision": 0.4592214530942679, "rougeL_precision_stderr": 0.0029194293050611676, "rougeL_recall": 0.38675823900046985, "rougeL_recall_stderr": 0.002653794453051874, "rougeLsum_fmeasure": 0.46563733868309887, "rougeLsum_fmeasure_stderr": 0.00251193491076274, "rougeLsum_precision": 0.5270027613054903, "rougeLsum_precision_stderr": 0.003112254757314433, "rougeLsum_recall": 0.44518959531331104, "rougeLsum_recall_stderr": 0.0029203507620738155}}, "4": {"generate_text_restaurant": {"bleu": 18.190285924587176, "bleu_stderr": 0.1638135744035078, "rouge1_fmeasure": 0.568802850827204, "rouge1_fmeasure_stderr": 0.002330482193693418, "rouge1_precision": 0.6426079486703759, "rouge1_precision_stderr": 0.002995814360085234, "rouge1_recall": 0.5426774653794442, "rouge1_recall_stderr": 0.002974153563271246, "rouge2_fmeasure": 0.2995925527308815, "rouge2_fmeasure_stderr": 0.0023201466187651645, "rouge2_precision": 0.34077031298246946, "rouge2_precision_stderr": 0.0027563967571098387, "rouge2_recall": 0.285814700524824, "rouge2_recall_stderr": 0.002491572781759064, "rougeL_fmeasure": 0.4081098640042389, "rougeL_fmeasure_stderr": 0.002344302992696762, "rougeL_precision": 0.46230684946979184, "rougeL_precision_stderr": 0.0029251094137301872, "rougeL_recall": 0.38880143243534787, "rougeL_recall_stderr": 0.0026493526307550383, "rougeLsum_fmeasure": 0.47218224276577575, "rougeLsum_fmeasure_stderr": 0.0025245194090443065, "rougeLsum_precision": 0.5335422595556751, "rougeLsum_precision_stderr": 0.0030991974296569903, "rougeLsum_recall": 0.45022345528876645, "rougeLsum_recall_stderr": 0.002908788520691941}}, "5": {"generate_text_restaurant": {"bleu": 18.211026701872324, "bleu_stderr": 0.30144615462613134, "rouge1_fmeasure": 0.5705499580808961, "rouge1_fmeasure_stderr": 0.002298437967096296, "rouge1_precision": 0.6465557735169711, "rouge1_precision_stderr": 0.0029875460613617345, "rouge1_recall": 0.5419122055283038, "rouge1_recall_stderr": 0.0029126395431989037, "rouge2_fmeasure": 0.30257765977165146, "rouge2_fmeasure_stderr": 0.0023357899915752876, "rouge2_precision": 0.3453499399492929, "rouge2_precision_stderr": 0.002793233931803476, "rouge2_recall": 0.28738168401692166, "rouge2_recall_stderr": 0.0024887619252834084, "rougeL_fmeasure": 0.4097982646744773, "rougeL_fmeasure_stderr": 0.0022980595262775106, "rougeL_precision": 0.46538464288158515, "rougeL_precision_stderr": 0.0028927145385525713, "rougeL_recall": 0.3889824764835787, "rougeL_recall_stderr": 0.0026053767805729136, "rougeLsum_fmeasure": 0.4737608635950133, "rougeLsum_fmeasure_stderr": 0.0024827552604322002, "rougeLsum_precision": 0.5375927529422523, "rougeLsum_precision_stderr": 0.0031147670230521206, "rougeLsum_recall": 0.44946513034344515, "rougeLsum_recall_stderr": 0.002838853223535406}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.3037801114698624, "bleu_stderr": 0.0680619690949938, "rouge1_fmeasure": 0.20403975963704862, "rouge1_fmeasure_stderr": 0.002319289771450204, "rouge1_precision": 0.1469605934979757, "rouge1_precision_stderr": 0.0017865724277199948, "rouge1_recall": 0.3506422207988605, "rouge1_recall_stderr": 0.003986742971646998, "rouge2_fmeasure": 0.03753366067926952, "rouge2_fmeasure_stderr": 0.0013078865741626474, "rouge2_precision": 0.026744641241732637, "rouge2_precision_stderr": 0.0009422276007560746, "rouge2_recall": 0.06643486894158677, "rouge2_recall_stderr": 0.002386033428452579, "rougeL_fmeasure": 0.14389180965207674, "rougeL_fmeasure_stderr": 0.001617135793928111, "rougeL_precision": 0.10351977997625217, "rougeL_precision_stderr": 0.0012474436254771419, "rougeL_recall": 0.24870680758461805, "rougeL_recall_stderr": 0.0029112495820551383, "rougeLsum_fmeasure": 0.1596041065260684, "rougeLsum_fmeasure_stderr": 0.0019365489034377087, "rougeLsum_precision": 0.11474003075800913, "rougeLsum_precision_stderr": 0.0014646256993554021, "rougeLsum_recall": 0.2755300286180654, "rougeLsum_recall_stderr": 0.0034159886285073187}}, "1": {"article_DOC_summary": {"bleu": 2.168559209994118, "bleu_stderr": 0.19261438655149657, "rouge1_fmeasure": 0.2157560715659971, "rouge1_fmeasure_stderr": 0.0033705430690821898, "rouge1_precision": 0.22602947500467982, "rouge1_precision_stderr": 0.004119165978624385, "rouge1_recall": 0.23440867515632266, "rouge1_recall_stderr": 0.0037365422011483326, "rouge2_fmeasure": 0.043055373688934306, "rouge2_fmeasure_stderr": 0.002069633541517841, "rouge2_precision": 0.04592289638609667, "rouge2_precision_stderr": 0.0023185763635273395, "rouge2_recall": 0.04556855878424412, "rouge2_recall_stderr": 0.0021493670944354244, "rougeL_fmeasure": 0.16096135585574012, "rougeL_fmeasure_stderr": 0.002621329499887791, "rougeL_precision": 0.1685531717852416, "rougeL_precision_stderr": 0.003182125073698959, "rougeL_recall": 0.1754200215304007, "rougeL_recall_stderr": 0.0028986455712001757, "rougeLsum_fmeasure": 0.16285837059497105, "rougeLsum_fmeasure_stderr": 0.0026510785772407936, "rougeLsum_precision": 0.170122073153452, "rougeLsum_precision_stderr": 0.0031868674106724855, "rougeLsum_recall": 0.17832126706963158, "rougeLsum_recall_stderr": 0.003034366151076325}}, "2": {"article_DOC_summary": {"bleu": 2.6249618135375687, "bleu_stderr": 0.24538192806609463, "rouge1_fmeasure": 0.2285075423791832, "rouge1_fmeasure_stderr": 0.0034187200506676963, "rouge1_precision": 0.24890172495306734, "rouge1_precision_stderr": 0.004118420628611812, "rouge1_recall": 0.2282385147731723, "rouge1_recall_stderr": 0.003558425026436, "rouge2_fmeasure": 0.04816447431232779, "rouge2_fmeasure_stderr": 0.0021228962417586235, "rouge2_precision": 0.05352044108848955, "rouge2_precision_stderr": 0.002442043526846074, "rouge2_recall": 0.04720092118923226, "rouge2_recall_stderr": 0.0020790734792126065, "rougeL_fmeasure": 0.17085822389220204, "rougeL_fmeasure_stderr": 0.0026636249494725543, "rougeL_precision": 0.186053603967522, "rougeL_precision_stderr": 0.0032096455158984885, "rougeL_recall": 0.17082987354933227, "rougeL_recall_stderr": 0.002739076869406153, "rougeLsum_fmeasure": 0.1722430045797887, "rougeLsum_fmeasure_stderr": 0.00267553314413038, "rougeLsum_precision": 0.18739175143511438, "rougeLsum_precision_stderr": 0.003213663387935582, "rougeLsum_recall": 0.1723698558903064, "rougeLsum_recall_stderr": 0.002768453468484286}}, "3": {"article_DOC_summary": {"bleu": 2.3805127986599266, "bleu_stderr": 0.12707789500554498, "rouge1_fmeasure": 0.21580377017329594, "rouge1_fmeasure_stderr": 0.00361950598897709, "rouge1_precision": 0.23695305900983066, "rouge1_precision_stderr": 0.004292416342462862, "rouge1_recall": 0.21521830097594002, "rouge1_recall_stderr": 0.0037611332592705727, "rouge2_fmeasure": 0.04433348477474905, "rouge2_fmeasure_stderr": 0.002034325676810133, "rouge2_precision": 0.0499513379933262, "rouge2_precision_stderr": 0.0023665140307114396, "rouge2_recall": 0.0434126797948661, "rouge2_recall_stderr": 0.0020088708573565597, "rougeL_fmeasure": 0.16094461354838566, "rougeL_fmeasure_stderr": 0.002836256881281109, "rougeL_precision": 0.1774080493096266, "rougeL_precision_stderr": 0.003422872640882046, "rougeL_recall": 0.16027273164729536, "rougeL_recall_stderr": 0.0029059912834387253, "rougeLsum_fmeasure": 0.1627788398441729, "rougeLsum_fmeasure_stderr": 0.002858369326723205, "rougeLsum_precision": 0.17921755273539258, "rougeLsum_precision_stderr": 0.0034349720388968576, "rougeLsum_recall": 0.1623691302794534, "rougeLsum_recall_stderr": 0.002956632470703601}}, "4": {"article_DOC_summary": {"bleu": 0.05589247441592409, "bleu_stderr": 0.02878165974261674, "rouge1_fmeasure": 0.051618708263210175, "rouge1_fmeasure_stderr": 0.003227894215590072, "rouge1_precision": 0.06448847219263751, "rouge1_precision_stderr": 0.004195684351970971, "rouge1_recall": 0.0492814222658454, "rouge1_recall_stderr": 0.003194070647438585, "rouge2_fmeasure": 0.01036458826567488, "rouge2_fmeasure_stderr": 0.0012194138335374684, "rouge2_precision": 0.012255432940403887, "rouge2_precision_stderr": 0.001451425462748262, "rouge2_recall": 0.009788249292977388, "rouge2_recall_stderr": 0.0011811100501098585, "rougeL_fmeasure": 0.03852778744386901, "rougeL_fmeasure_stderr": 0.0024723231124259898, "rougeL_precision": 0.04957733430301481, "rougeL_precision_stderr": 0.0034412246022957156, "rougeL_recall": 0.036630679493315124, "rougeL_recall_stderr": 0.0024250580808923396, "rougeLsum_fmeasure": 0.03879473101200802, "rougeLsum_fmeasure_stderr": 0.002485972178510378, "rougeLsum_precision": 0.049855400831076535, "rougeLsum_precision_stderr": 0.0034486786998475854, "rougeLsum_recall": 0.0368906646433281, "rougeLsum_recall_stderr": 0.0024479195719888545}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}}
4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.334,0.01492201952373296,0
3
+ anli_r2,acc,0.334,0.014922019523732963,0
4
+ anli_r3,acc,0.335,0.013630871843821479,0
5
+ arc_challenge,acc,0.21416382252559726,0.011988383205966496,0
6
+ arc_challenge,acc_norm,0.25170648464163825,0.012682496334042961,0
7
+ arc_easy,acc,0.515993265993266,0.01025453358928817,0
8
+ arc_easy,acc_norm,0.45707070707070707,0.01022189756425606,0
9
+ boolq,acc,0.5740061162079511,0.008648732832949143,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.1940928270042194,,1
12
+ copa,acc,0.71,0.045604802157206845,0
13
+ hellaswag,acc,0.36277633937462656,0.004798184463156354,0
14
+ hellaswag,acc_norm,0.4466241784505079,0.004961268387512965,0
15
+ piqa,acc,0.6882480957562568,0.010807431424873677,0
16
+ piqa,acc_norm,0.6920565832426551,0.010770892367463676,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.832,0.011828605831454266,0
19
+ sciq,acc_norm,0.73,0.01404625563263391,0
20
+ storycloze_2016,acc,0.6472474612506681,0.011049673577950937,0
21
+ winogrande,acc,0.5445935280189423,0.013996485037729782,0
4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_0_lm-eval_global_step80108_2023-05-13-13-52-19_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.334,
5
- "acc_stderr": 0.01492201952373296
6
- },
7
- "anli_r2": {
8
- "acc": 0.334,
9
- "acc_stderr": 0.014922019523732963
10
- },
11
- "anli_r3": {
12
- "acc": 0.335,
13
- "acc_stderr": 0.013630871843821479
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.1940928270042194
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.36277633937462656,
26
- "acc_stderr": 0.004798184463156354,
27
- "acc_norm": 0.4466241784505079,
28
- "acc_norm_stderr": 0.004961268387512965
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5445935280189423,
36
- "acc_stderr": 0.013996485037729782
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6472474612506681,
40
- "acc_stderr": 0.011049673577950937
41
- },
42
- "boolq": {
43
- "acc": 0.5740061162079511,
44
- "acc_stderr": 0.008648732832949143
45
- },
46
- "arc_easy": {
47
- "acc": 0.515993265993266,
48
- "acc_stderr": 0.01025453358928817,
49
- "acc_norm": 0.45707070707070707,
50
- "acc_norm_stderr": 0.01022189756425606
51
- },
52
- "arc_challenge": {
53
- "acc": 0.21416382252559726,
54
- "acc_stderr": 0.011988383205966496,
55
- "acc_norm": 0.25170648464163825,
56
- "acc_norm_stderr": 0.012682496334042961
57
- },
58
- "sciq": {
59
- "acc": 0.832,
60
- "acc_stderr": 0.011828605831454266,
61
- "acc_norm": 0.73,
62
- "acc_norm_stderr": 0.01404625563263391
63
- },
64
- "piqa": {
65
- "acc": 0.6882480957562568,
66
- "acc_stderr": 0.010807431424873677,
67
- "acc_norm": 0.6920565832426551,
68
- "acc_norm_stderr": 0.010770892367463676
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.324,0.014806864733738863,0
3
+ anli_r2,acc,0.349,0.015080663991563104,0
4
+ anli_r3,acc,0.32666666666666666,0.013544340907003663,0
5
+ arc_challenge,acc,0.2235494880546075,0.012174896631202605,0
6
+ arc_challenge,acc_norm,0.25,0.012653835621466646,0
7
+ arc_easy,acc,0.5286195286195287,0.010242962617927197,0
8
+ arc_easy,acc_norm,0.49915824915824913,0.010259768981815241,0
9
+ boolq,acc,0.4972477064220184,0.008744922485713843,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.30431086824529446,,1
12
+ copa,acc,0.7,0.046056618647183814,0
13
+ hellaswag,acc,0.3571997610037841,0.004781950883460504,0
14
+ hellaswag,acc_norm,0.44811790479984065,0.004962846206125481,0
15
+ piqa,acc,0.6838955386289445,0.010848148455700453,0
16
+ piqa,acc_norm,0.6855277475516867,0.010833009065106565,0
17
+ rte,acc,0.5667870036101083,0.029826764082138288,0
18
+ sciq,acc,0.887,0.010016552866696856,0
19
+ sciq,acc_norm,0.876,0.010427498872343958,0
20
+ storycloze_2016,acc,0.6162479957242116,0.011245591019345452,0
21
+ winogrande,acc,0.5122336227308603,0.01404827882040562,0
4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_1_lm-eval_global_step80108_2023-05-13-13-52-19_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738863
6
- },
7
- "anli_r2": {
8
- "acc": 0.349,
9
- "acc_stderr": 0.015080663991563104
10
- },
11
- "anli_r3": {
12
- "acc": 0.32666666666666666,
13
- "acc_stderr": 0.013544340907003663
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.30431086824529446
19
- },
20
- "copa": {
21
- "acc": 0.7,
22
- "acc_stderr": 0.046056618647183814
23
- },
24
- "hellaswag": {
25
- "acc": 0.3571997610037841,
26
- "acc_stderr": 0.004781950883460504,
27
- "acc_norm": 0.44811790479984065,
28
- "acc_norm_stderr": 0.004962846206125481
29
- },
30
- "rte": {
31
- "acc": 0.5667870036101083,
32
- "acc_stderr": 0.029826764082138288
33
- },
34
- "winogrande": {
35
- "acc": 0.5122336227308603,
36
- "acc_stderr": 0.01404827882040562
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6162479957242116,
40
- "acc_stderr": 0.011245591019345452
41
- },
42
- "boolq": {
43
- "acc": 0.4972477064220184,
44
- "acc_stderr": 0.008744922485713843
45
- },
46
- "arc_easy": {
47
- "acc": 0.5286195286195287,
48
- "acc_stderr": 0.010242962617927197,
49
- "acc_norm": 0.49915824915824913,
50
- "acc_norm_stderr": 0.010259768981815241
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2235494880546075,
54
- "acc_stderr": 0.012174896631202605,
55
- "acc_norm": 0.25,
56
- "acc_norm_stderr": 0.012653835621466646
57
- },
58
- "sciq": {
59
- "acc": 0.887,
60
- "acc_stderr": 0.010016552866696856,
61
- "acc_norm": 0.876,
62
- "acc_norm_stderr": 0.010427498872343958
63
- },
64
- "piqa": {
65
- "acc": 0.6838955386289445,
66
- "acc_stderr": 0.010848148455700453,
67
- "acc_norm": 0.6855277475516867,
68
- "acc_norm_stderr": 0.010833009065106565
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.313,0.01467127282297789,0
3
+ anli_r2,acc,0.338,0.014965960710224475,0
4
+ anli_r3,acc,0.3383333333333333,0.013664144006618275,0
5
+ arc_challenge,acc,0.23208191126279865,0.012336718284948854,0
6
+ arc_challenge,acc_norm,0.257679180887372,0.012780770562768409,0
7
+ arc_easy,acc,0.5336700336700336,0.010236494647406476,0
8
+ arc_easy,acc_norm,0.5185185185185185,0.010252744217435626,0
9
+ boolq,acc,0.48623853211009177,0.008741742106878652,1
10
+ cb,acc,0.4107142857142857,0.06633634150359538,1
11
+ cb,f1,0.2798088410991637,,1
12
+ copa,acc,0.69,0.04648231987117316,0
13
+ hellaswag,acc,0.3572993427604063,0.004782246931194997,0
14
+ hellaswag,acc_norm,0.44981079466241786,0.004964579685712437,0
15
+ piqa,acc,0.6871599564744287,0.010817714425701102,0
16
+ piqa,acc_norm,0.6893362350380848,0.010797078933727673,0
17
+ rte,acc,0.555956678700361,0.02990739633379598,0
18
+ sciq,acc,0.903,0.009363689373248095,0
19
+ sciq,acc_norm,0.89,0.00989939381972443,0
20
+ storycloze_2016,acc,0.6215927311598076,0.011215325833205825,0
21
+ winogrande,acc,0.531965272296764,0.014023739221166382,0
4b284b84b10c4pyseed4/evaluation/rankeval/4b284b84b10c4pyseed4_2_lm-eval_global_step80108_2023-05-13-13-52-19_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.313,
5
- "acc_stderr": 0.01467127282297789
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224475
10
- },
11
- "anli_r3": {
12
- "acc": 0.3383333333333333,
13
- "acc_stderr": 0.013664144006618275
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.06633634150359538,
18
- "f1": 0.2798088410991637
19
- },
20
- "copa": {
21
- "acc": 0.69,
22
- "acc_stderr": 0.04648231987117316
23
- },
24
- "hellaswag": {
25
- "acc": 0.3572993427604063,
26
- "acc_stderr": 0.004782246931194997,
27
- "acc_norm": 0.44981079466241786,
28
- "acc_norm_stderr": 0.004964579685712437
29
- },
30
- "rte": {
31
- "acc": 0.555956678700361,
32
- "acc_stderr": 0.02990739633379598
33
- },
34
- "winogrande": {
35
- "acc": 0.531965272296764,
36
- "acc_stderr": 0.014023739221166382
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6215927311598076,
40
- "acc_stderr": 0.011215325833205825
41
- },
42
- "boolq": {
43
- "acc": 0.48623853211009177,
44
- "acc_stderr": 0.008741742106878652
45
- },
46
- "arc_easy": {
47
- "acc": 0.5336700336700336,
48
- "acc_stderr": 0.010236494647406476,
49
- "acc_norm": 0.5185185185185185,
50
- "acc_norm_stderr": 0.010252744217435626
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23208191126279865,
54
- "acc_stderr": 0.012336718284948854,
55
- "acc_norm": 0.257679180887372,
56
- "acc_norm_stderr": 0.012780770562768409
57
- },
58
- "sciq": {
59
- "acc": 0.903,
60
- "acc_stderr": 0.009363689373248095,
61
- "acc_norm": 0.89,
62
- "acc_norm_stderr": 0.00989939381972443
63
- },
64
- "piqa": {
65
- "acc": 0.6871599564744287,
66
- "acc_stderr": 0.010817714425701102,
67
- "acc_norm": 0.6893362350380848,
68
- "acc_norm_stderr": 0.010797078933727673
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }