Muennighoff commited on
Commit
006459a
·
1 Parent(s): fd4b89b
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_2.csv CHANGED
@@ -12,6 +12,10 @@ cb,f1,0.2557471264367816,,1
12
  copa,acc,0.73,0.044619604333847394,0
13
  hellaswag,acc,0.45030870344552876,0.004965078477435579,0
14
  hellaswag,acc_norm,0.599183429595698,0.004890623693243619,0
 
 
15
  rte,acc,0.5342960288808665,0.030025579819366426,0
 
 
16
  storycloze_2016,acc,0.6996258685195083,0.010600915927985021,0
17
  winogrande,acc,0.5611681136543015,0.013946933444507032,0
 
12
  copa,acc,0.73,0.044619604333847394,0
13
  hellaswag,acc,0.45030870344552876,0.004965078477435579,0
14
  hellaswag,acc_norm,0.599183429595698,0.004890623693243619,0
15
+ piqa,acc,0.7524483133841132,0.010069703966857106,0
16
+ piqa,acc_norm,0.750272034820457,0.010099232969867469,0
17
  rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.881,0.010244215145336664,0
19
+ sciq,acc_norm,0.87,0.010640169792499349,0
20
  storycloze_2016,acc,0.6996258685195083,0.010600915927985021,0
21
  winogrande,acc,0.5611681136543015,0.013946933444507032,0
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_2.json CHANGED
@@ -54,6 +54,18 @@
54
  "acc_stderr": 0.012980954547659554,
55
  "acc_norm": 0.2883959044368601,
56
  "acc_norm_stderr": 0.013238394422428175
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
  },
59
  "versions": {
@@ -68,6 +80,8 @@
68
  "storycloze_2016": 0,
69
  "boolq": 1,
70
  "arc_easy": 0,
71
- "arc_challenge": 0
 
 
72
  }
73
  }
 
54
  "acc_stderr": 0.012980954547659554,
55
  "acc_norm": 0.2883959044368601,
56
  "acc_norm_stderr": 0.013238394422428175
57
+ },
58
+ "sciq": {
59
+ "acc": 0.881,
60
+ "acc_stderr": 0.010244215145336664,
61
+ "acc_norm": 0.87,
62
+ "acc_norm_stderr": 0.010640169792499349
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7524483133841132,
66
+ "acc_stderr": 0.010069703966857106,
67
+ "acc_norm": 0.750272034820457,
68
+ "acc_norm_stderr": 0.010099232969867469
69
  }
70
  },
71
  "versions": {
 
80
  "storycloze_2016": 0,
81
  "boolq": 1,
82
  "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_3.csv CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.33,0.014876872027456732,0
3
  anli_r2,acc,0.371,0.015283736211823188,0
4
  anli_r3,acc,0.32166666666666666,0.013490095282989521,0
 
 
 
 
 
5
  cb,acc,0.35714285714285715,0.06460957383809221,1
6
  cb,f1,0.3051529790660225,,1
7
  copa,acc,0.81,0.03942772444036623,0
8
  hellaswag,acc,0.45309699263095,0.004967778940011933,0
9
  hellaswag,acc_norm,0.6016729735112527,0.004885529674958343,0
 
 
10
  rte,acc,0.5234657039711191,0.03006330041190266,0
 
 
11
  storycloze_2016,acc,0.7076429716729022,0.010518239729787736,0
12
  winogrande,acc,0.5706393054459353,0.01391153749996916,0
 
2
  anli_r1,acc,0.33,0.014876872027456732,0
3
  anli_r2,acc,0.371,0.015283736211823188,0
4
  anli_r3,acc,0.32166666666666666,0.013490095282989521,0
5
+ arc_challenge,acc,0.2713310580204778,0.012993807727545803,0
6
+ arc_challenge,acc_norm,0.28498293515358364,0.013191348179838795,0
7
+ arc_easy,acc,0.5942760942760943,0.010075755540128871,0
8
+ arc_easy,acc_norm,0.5820707070707071,0.010120628211017883,0
9
+ boolq,acc,0.5804281345565749,0.008631175489166717,1
10
  cb,acc,0.35714285714285715,0.06460957383809221,1
11
  cb,f1,0.3051529790660225,,1
12
  copa,acc,0.81,0.03942772444036623,0
13
  hellaswag,acc,0.45309699263095,0.004967778940011933,0
14
  hellaswag,acc_norm,0.6016729735112527,0.004885529674958343,0
15
+ piqa,acc,0.750272034820457,0.010099232969867493,0
16
+ piqa,acc_norm,0.7546245919477693,0.010039831320422387,0
17
  rte,acc,0.5234657039711191,0.03006330041190266,0
18
+ sciq,acc,0.875,0.010463483381956722,0
19
+ sciq,acc_norm,0.865,0.010811655372416051,0
20
  storycloze_2016,acc,0.7076429716729022,0.010518239729787736,0
21
  winogrande,acc,0.5706393054459353,0.01391153749996916,0
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_3.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.7076429716729022,
40
  "acc_stderr": 0.010518239729787736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
 
 
 
 
 
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.7076429716729022,
40
  "acc_stderr": 0.010518239729787736
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5804281345565749,
44
+ "acc_stderr": 0.008631175489166717
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5942760942760943,
48
+ "acc_stderr": 0.010075755540128871,
49
+ "acc_norm": 0.5820707070707071,
50
+ "acc_norm_stderr": 0.010120628211017883
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2713310580204778,
54
+ "acc_stderr": 0.012993807727545803,
55
+ "acc_norm": 0.28498293515358364,
56
+ "acc_norm_stderr": 0.013191348179838795
57
+ },
58
+ "sciq": {
59
+ "acc": 0.875,
60
+ "acc_stderr": 0.010463483381956722,
61
+ "acc_norm": 0.865,
62
+ "acc_norm_stderr": 0.010811655372416051
63
+ },
64
+ "piqa": {
65
+ "acc": 0.750272034820457,
66
+ "acc_stderr": 0.010099232969867493,
67
+ "acc_norm": 0.7546245919477693,
68
+ "acc_norm_stderr": 0.010039831320422387
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_4.csv CHANGED
@@ -2,10 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.339,0.01497675877162034,0
3
  anli_r2,acc,0.342,0.01500870618212173,0
4
  anli_r3,acc,0.33416666666666667,0.01362243481313678,0
 
 
 
 
 
5
  cb,acc,0.375,0.06527912098338669,1
6
  cb,f1,0.3162533392229865,,1
7
  copa,acc,0.78,0.04163331998932261,0
8
  hellaswag,acc,0.44981079466241786,0.004964579685712441,0
9
  hellaswag,acc_norm,0.6027683728340968,0.0048832465794966485,0
 
 
10
  rte,acc,0.516245487364621,0.030080573208738064,0
 
 
 
11
  winogrande,acc,0.5816890292028414,0.013863669961195911,0
 
2
  anli_r1,acc,0.339,0.01497675877162034,0
3
  anli_r2,acc,0.342,0.01500870618212173,0
4
  anli_r3,acc,0.33416666666666667,0.01362243481313678,0
5
+ arc_challenge,acc,0.2773037542662116,0.013082095839059374,0
6
+ arc_challenge,acc_norm,0.2883959044368601,0.013238394422428173,0
7
+ arc_easy,acc,0.6056397306397306,0.010028176038392999,0
8
+ arc_easy,acc_norm,0.5862794612794613,0.010105878530238137,0
9
+ boolq,acc,0.5954128440366973,0.008584355308932694,1
10
  cb,acc,0.375,0.06527912098338669,1
11
  cb,f1,0.3162533392229865,,1
12
  copa,acc,0.78,0.04163331998932261,0
13
  hellaswag,acc,0.44981079466241786,0.004964579685712441,0
14
  hellaswag,acc_norm,0.6027683728340968,0.0048832465794966485,0
15
+ piqa,acc,0.7470076169749728,0.010142888698862462,0
16
+ piqa,acc_norm,0.7540805223068553,0.010047331865625182,0
17
  rte,acc,0.516245487364621,0.030080573208738064,0
18
+ sciq,acc,0.888,0.009977753031397236,0
19
+ sciq,acc_norm,0.884,0.010131468138756976,0
20
+ storycloze_2016,acc,0.7071084981293426,0.010523873293246304,0
21
  winogrande,acc,0.5816890292028414,0.013863669961195911,0
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_4.json CHANGED
@@ -34,6 +34,38 @@
34
  "winogrande": {
35
  "acc": 0.5816890292028414,
36
  "acc_stderr": 0.013863669961195911
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  }
38
  },
39
  "versions": {
@@ -44,6 +76,12 @@
44
  "copa": 0,
45
  "hellaswag": 0,
46
  "rte": 0,
47
- "winogrande": 0
 
 
 
 
 
 
48
  }
49
  }
 
34
  "winogrande": {
35
  "acc": 0.5816890292028414,
36
  "acc_stderr": 0.013863669961195911
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.7071084981293426,
40
+ "acc_stderr": 0.010523873293246304
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5954128440366973,
44
+ "acc_stderr": 0.008584355308932694
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6056397306397306,
48
+ "acc_stderr": 0.010028176038392999,
49
+ "acc_norm": 0.5862794612794613,
50
+ "acc_norm_stderr": 0.010105878530238137
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2773037542662116,
54
+ "acc_stderr": 0.013082095839059374,
55
+ "acc_norm": 0.2883959044368601,
56
+ "acc_norm_stderr": 0.013238394422428173
57
+ },
58
+ "sciq": {
59
+ "acc": 0.888,
60
+ "acc_stderr": 0.009977753031397236,
61
+ "acc_norm": 0.884,
62
+ "acc_norm_stderr": 0.010131468138756976
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7470076169749728,
66
+ "acc_stderr": 0.010142888698862462,
67
+ "acc_norm": 0.7540805223068553,
68
+ "acc_norm_stderr": 0.010047331865625182
69
  }
70
  },
71
  "versions": {
 
76
  "copa": 0,
77
  "hellaswag": 0,
78
  "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.35,0.015090650341444233,0
3
  anli_r2,acc,0.361,0.015195720118175127,0
4
  anli_r3,acc,0.31416666666666665,0.013405399314984101,0
 
 
 
 
 
5
  cb,acc,0.39285714285714285,0.0658538889806635,1
6
  cb,f1,0.3300527326188914,,1
7
  copa,acc,0.7,0.046056618647183814,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.35,0.015090650341444233,0
3
  anli_r2,acc,0.361,0.015195720118175127,0
4
  anli_r3,acc,0.31416666666666665,0.013405399314984101,0
5
+ arc_challenge,acc,0.2764505119453925,0.013069662474252425,0
6
+ arc_challenge,acc_norm,0.2977815699658703,0.013363080107244487,0
7
+ arc_easy,acc,0.5942760942760943,0.010075755540128873,0
8
+ arc_easy,acc_norm,0.5728114478114478,0.010150415974210875,0
9
+ boolq,acc,0.5929663608562691,0.00859256288706887,1
10
  cb,acc,0.39285714285714285,0.0658538889806635,1
11
  cb,f1,0.3300527326188914,,1
12
  copa,acc,0.7,0.046056618647183814,0
13
+ hellaswag,acc,0.45180242979486157,0.004966544724452228,0
14
+ hellaswag,acc_norm,0.6045608444532962,0.00487945547466382,0
15
+ piqa,acc,0.7453754080522307,0.01016443223706049,0
16
+ piqa,acc_norm,0.7551686615886833,0.01003230910556881,0
17
+ rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.898,0.009575368801653897,0
19
+ sciq,acc_norm,0.892,0.009820001651345703,0
20
+ storycloze_2016,acc,0.6985569214323891,0.010611646032767588,0
21
+ winogrande,acc,0.5603788476716653,0.013949649776015694,0
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.7,
22
  "acc_stderr": 0.046056618647183814
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.7,
22
  "acc_stderr": 0.046056618647183814
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.45180242979486157,
26
+ "acc_stderr": 0.004966544724452228,
27
+ "acc_norm": 0.6045608444532962,
28
+ "acc_norm_stderr": 0.00487945547466382
29
+ },
30
+ "rte": {
31
+ "acc": 0.5342960288808665,
32
+ "acc_stderr": 0.030025579819366426
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5603788476716653,
36
+ "acc_stderr": 0.013949649776015694
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6985569214323891,
40
+ "acc_stderr": 0.010611646032767588
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5929663608562691,
44
+ "acc_stderr": 0.00859256288706887
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5942760942760943,
48
+ "acc_stderr": 0.010075755540128873,
49
+ "acc_norm": 0.5728114478114478,
50
+ "acc_norm_stderr": 0.010150415974210875
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2764505119453925,
54
+ "acc_stderr": 0.013069662474252425,
55
+ "acc_norm": 0.2977815699658703,
56
+ "acc_norm_stderr": 0.013363080107244487
57
+ },
58
+ "sciq": {
59
+ "acc": 0.898,
60
+ "acc_stderr": 0.009575368801653897,
61
+ "acc_norm": 0.892,
62
+ "acc_norm_stderr": 0.009820001651345703
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7453754080522307,
66
+ "acc_stderr": 0.01016443223706049,
67
+ "acc_norm": 0.7551686615886833,
68
+ "acc_norm_stderr": 0.01003230910556881
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }