Muennighoff commited on
Commit
b9a33e8
·
1 Parent(s): f392bed
perplexity/evaluation/rankeval/perplexity_4.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6425076452599389,
44
  "acc_stderr": 0.008382336069484898
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6425076452599389,
44
  "acc_stderr": 0.008382336069484898
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6506734006734006,
48
+ "acc_stderr": 0.009782853449399284,
49
+ "acc_norm": 0.6300505050505051,
50
+ "acc_norm_stderr": 0.009906656266021148
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.3165529010238908,
54
+ "acc_stderr": 0.01359243151906808,
55
+ "acc_norm": 0.3378839590443686,
56
+ "acc_norm_stderr": 0.013822047922283509
57
+ },
58
+ "sciq": {
59
+ "acc": 0.935,
60
+ "acc_stderr": 0.007799733061832011,
61
+ "acc_norm": 0.925,
62
+ "acc_norm_stderr": 0.008333333333333364
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7622415669205659,
66
+ "acc_stderr": 0.009932525779525489,
67
+ "acc_norm": 0.779651795429815,
68
+ "acc_norm_stderr": 0.009670535456853148
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity/evaluation/rankeval/perplexity_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6425076452599389,
44
  "acc_stderr": 0.008382336069484898
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6425076452599389,
44
  "acc_stderr": 0.008382336069484898
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6506734006734006,
48
+ "acc_stderr": 0.009782853449399284,
49
+ "acc_norm": 0.6300505050505051,
50
+ "acc_norm_stderr": 0.009906656266021148
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.3165529010238908,
54
+ "acc_stderr": 0.01359243151906808,
55
+ "acc_norm": 0.3378839590443686,
56
+ "acc_norm_stderr": 0.013822047922283509
57
+ },
58
+ "sciq": {
59
+ "acc": 0.935,
60
+ "acc_stderr": 0.007799733061832011,
61
+ "acc_norm": 0.925,
62
+ "acc_norm_stderr": 0.008333333333333364
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7622415669205659,
66
+ "acc_stderr": 0.009932525779525489,
67
+ "acc_norm": 0.779651795429815,
68
+ "acc_norm_stderr": 0.009670535456853148
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity/evaluation/rankeval/perplexity_5.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.7477284874398717,
40
  "acc_stderr": 0.010043504206387307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.7477284874398717,
40
  "acc_stderr": 0.010043504206387307
41
+ },
42
+ "boolq": {
43
+ "acc": 0.634862385321101,
44
+ "acc_stderr": 0.008420941009417812
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6447811447811448,
48
+ "acc_stderr": 0.009820245899287117,
49
+ "acc_norm": 0.627104377104377,
50
+ "acc_norm_stderr": 0.009922743197129238
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.3250853242320819,
54
+ "acc_stderr": 0.013688147309729119,
55
+ "acc_norm": 0.3515358361774744,
56
+ "acc_norm_stderr": 0.013952413699600938
57
+ },
58
+ "sciq": {
59
+ "acc": 0.935,
60
+ "acc_stderr": 0.007799733061832013,
61
+ "acc_norm": 0.933,
62
+ "acc_norm_stderr": 0.007910345983177549
63
+ },
64
+ "piqa": {
65
+ "acc": 0.763873775843308,
66
+ "acc_stderr": 0.009908965890558213,
67
+ "acc_norm": 0.7834602829162133,
68
+ "acc_norm_stderr": 0.009609984714384593
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity/evaluation/rankeval/perplexity_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.7477284874398717,
40
  "acc_stderr": 0.010043504206387307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.7477284874398717,
40
  "acc_stderr": 0.010043504206387307
41
+ },
42
+ "boolq": {
43
+ "acc": 0.634862385321101,
44
+ "acc_stderr": 0.008420941009417812
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6447811447811448,
48
+ "acc_stderr": 0.009820245899287117,
49
+ "acc_norm": 0.627104377104377,
50
+ "acc_norm_stderr": 0.009922743197129238
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.3250853242320819,
54
+ "acc_stderr": 0.013688147309729119,
55
+ "acc_norm": 0.3515358361774744,
56
+ "acc_norm_stderr": 0.013952413699600938
57
+ },
58
+ "sciq": {
59
+ "acc": 0.935,
60
+ "acc_stderr": 0.007799733061832013,
61
+ "acc_norm": 0.933,
62
+ "acc_norm_stderr": 0.007910345983177549
63
+ },
64
+ "piqa": {
65
+ "acc": 0.763873775843308,
66
+ "acc_stderr": 0.009908965890558213,
67
+ "acc_norm": 0.7834602829162133,
68
+ "acc_norm_stderr": 0.009609984714384593
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity25/evaluation/rankeval/perplexity25_4.json CHANGED
@@ -54,6 +54,18 @@
54
  "acc_stderr": 0.013715847940719346,
55
  "acc_norm": 0.3728668941979522,
56
  "acc_norm_stderr": 0.014131176760131163
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
  },
59
  "versions": {
@@ -68,6 +80,8 @@
68
  "storycloze_2016": 0,
69
  "boolq": 1,
70
  "arc_easy": 0,
71
- "arc_challenge": 0
 
 
72
  }
73
  }
 
54
  "acc_stderr": 0.013715847940719346,
55
  "acc_norm": 0.3728668941979522,
56
  "acc_norm_stderr": 0.014131176760131163
57
+ },
58
+ "sciq": {
59
+ "acc": 0.927,
60
+ "acc_stderr": 0.008230354715244054,
61
+ "acc_norm": 0.921,
62
+ "acc_norm_stderr": 0.008534156773333435
63
+ },
64
+ "piqa": {
65
+ "acc": 0.73449401523395,
66
+ "acc_stderr": 0.010303308653024427,
67
+ "acc_norm": 0.7383025027203483,
68
+ "acc_norm_stderr": 0.010255630772708227
69
  }
70
  },
71
  "versions": {
 
80
  "storycloze_2016": 0,
81
  "boolq": 1,
82
  "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity25/evaluation/rankeval/perplexity25_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json CHANGED
@@ -54,6 +54,18 @@
54
  "acc_stderr": 0.013715847940719346,
55
  "acc_norm": 0.3728668941979522,
56
  "acc_norm_stderr": 0.014131176760131163
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
  },
59
  "versions": {
@@ -68,6 +80,8 @@
68
  "storycloze_2016": 0,
69
  "boolq": 1,
70
  "arc_easy": 0,
71
- "arc_challenge": 0
 
 
72
  }
73
  }
 
54
  "acc_stderr": 0.013715847940719346,
55
  "acc_norm": 0.3728668941979522,
56
  "acc_norm_stderr": 0.014131176760131163
57
+ },
58
+ "sciq": {
59
+ "acc": 0.927,
60
+ "acc_stderr": 0.008230354715244054,
61
+ "acc_norm": 0.921,
62
+ "acc_norm_stderr": 0.008534156773333435
63
+ },
64
+ "piqa": {
65
+ "acc": 0.73449401523395,
66
+ "acc_stderr": 0.010303308653024427,
67
+ "acc_norm": 0.7383025027203483,
68
+ "acc_norm_stderr": 0.010255630772708227
69
  }
70
  },
71
  "versions": {
 
80
  "storycloze_2016": 0,
81
  "boolq": 1,
82
  "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity25/evaluation/rankeval/perplexity25_5.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.7493319080705505,
40
  "acc_stderr": 0.010022263975606228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.7493319080705505,
40
  "acc_stderr": 0.010022263975606228
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6501529051987768,
44
+ "acc_stderr": 0.008341409251946758
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.680976430976431,
48
+ "acc_stderr": 0.009564133249441074,
49
+ "acc_norm": 0.6658249158249159,
50
+ "acc_norm_stderr": 0.009679106032919058
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.34044368600682595,
54
+ "acc_stderr": 0.013847460518892981,
55
+ "acc_norm": 0.36945392491467577,
56
+ "acc_norm_stderr": 0.0141045783664919
57
+ },
58
+ "sciq": {
59
+ "acc": 0.927,
60
+ "acc_stderr": 0.00823035471524406,
61
+ "acc_norm": 0.921,
62
+ "acc_norm_stderr": 0.008534156773333442
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7383025027203483,
66
+ "acc_stderr": 0.010255630772708229,
67
+ "acc_norm": 0.735038084874864,
68
+ "acc_norm_stderr": 0.010296557993316037
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity25/evaluation/rankeval/perplexity25_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.7493319080705505,
40
  "acc_stderr": 0.010022263975606228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.7493319080705505,
40
  "acc_stderr": 0.010022263975606228
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6501529051987768,
44
+ "acc_stderr": 0.008341409251946758
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.680976430976431,
48
+ "acc_stderr": 0.009564133249441074,
49
+ "acc_norm": 0.6658249158249159,
50
+ "acc_norm_stderr": 0.009679106032919058
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.34044368600682595,
54
+ "acc_stderr": 0.013847460518892981,
55
+ "acc_norm": 0.36945392491467577,
56
+ "acc_norm_stderr": 0.0141045783664919
57
+ },
58
+ "sciq": {
59
+ "acc": 0.927,
60
+ "acc_stderr": 0.00823035471524406,
61
+ "acc_norm": 0.921,
62
+ "acc_norm_stderr": 0.008534156773333442
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7383025027203483,
66
+ "acc_stderr": 0.010255630772708229,
67
+ "acc_norm": 0.735038084874864,
68
+ "acc_norm_stderr": 0.010296557993316037
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity50/evaluation/rankeval/perplexity50_5.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6204892966360857,
44
  "acc_stderr": 0.00848734197575683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6204892966360857,
44
  "acc_stderr": 0.00848734197575683
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6662457912457912,
48
+ "acc_stderr": 0.009676065683575472,
49
+ "acc_norm": 0.656986531986532,
50
+ "acc_norm_stderr": 0.009740965666489234
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.3302047781569966,
54
+ "acc_stderr": 0.013743085603760427,
55
+ "acc_norm": 0.3464163822525597,
56
+ "acc_norm_stderr": 0.013905011180063246
57
+ },
58
+ "sciq": {
59
+ "acc": 0.922,
60
+ "acc_stderr": 0.008484573530118585,
61
+ "acc_norm": 0.922,
62
+ "acc_norm_stderr": 0.008484573530118585
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7573449401523396,
66
+ "acc_stderr": 0.010002002569708698,
67
+ "acc_norm": 0.7633297062023939,
68
+ "acc_norm_stderr": 0.009916841655042809
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
perplexity50/evaluation/rankeval/perplexity50_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6204892966360857,
44
  "acc_stderr": 0.00848734197575683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6204892966360857,
44
  "acc_stderr": 0.00848734197575683
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6662457912457912,
48
+ "acc_stderr": 0.009676065683575472,
49
+ "acc_norm": 0.656986531986532,
50
+ "acc_norm_stderr": 0.009740965666489234
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.3302047781569966,
54
+ "acc_stderr": 0.013743085603760427,
55
+ "acc_norm": 0.3464163822525597,
56
+ "acc_norm_stderr": 0.013905011180063246
57
+ },
58
+ "sciq": {
59
+ "acc": 0.922,
60
+ "acc_stderr": 0.008484573530118585,
61
+ "acc_norm": 0.922,
62
+ "acc_norm_stderr": 0.008484573530118585
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7573449401523396,
66
+ "acc_stderr": 0.010002002569708698,
67
+ "acc_norm": 0.7633297062023939,
68
+ "acc_norm_stderr": 0.009916841655042809
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }