Commit
·
006459a
1
Parent(s):
fd4b89b
Add eval
Browse files- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_2.csv +4 -0
- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_2.json +15 -1
- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_3.csv +9 -0
- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_3.json +34 -1
- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_4.csv +10 -0
- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_4.json +39 -1
- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_5.csv +14 -0
- evaluation/rankeval/lm1-4b2-84b-c4-perplexity_5.json +56 -1
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_2.csv
CHANGED
@@ -12,6 +12,10 @@ cb,f1,0.2557471264367816,,1
|
|
12 |
copa,acc,0.73,0.044619604333847394,0
|
13 |
hellaswag,acc,0.45030870344552876,0.004965078477435579,0
|
14 |
hellaswag,acc_norm,0.599183429595698,0.004890623693243619,0
|
|
|
|
|
15 |
rte,acc,0.5342960288808665,0.030025579819366426,0
|
|
|
|
|
16 |
storycloze_2016,acc,0.6996258685195083,0.010600915927985021,0
|
17 |
winogrande,acc,0.5611681136543015,0.013946933444507032,0
|
|
|
12 |
copa,acc,0.73,0.044619604333847394,0
|
13 |
hellaswag,acc,0.45030870344552876,0.004965078477435579,0
|
14 |
hellaswag,acc_norm,0.599183429595698,0.004890623693243619,0
|
15 |
+
piqa,acc,0.7524483133841132,0.010069703966857106,0
|
16 |
+
piqa,acc_norm,0.750272034820457,0.010099232969867469,0
|
17 |
rte,acc,0.5342960288808665,0.030025579819366426,0
|
18 |
+
sciq,acc,0.881,0.010244215145336664,0
|
19 |
+
sciq,acc_norm,0.87,0.010640169792499349,0
|
20 |
storycloze_2016,acc,0.6996258685195083,0.010600915927985021,0
|
21 |
winogrande,acc,0.5611681136543015,0.013946933444507032,0
|
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_2.json
CHANGED
@@ -54,6 +54,18 @@
|
|
54 |
"acc_stderr": 0.012980954547659554,
|
55 |
"acc_norm": 0.2883959044368601,
|
56 |
"acc_norm_stderr": 0.013238394422428175
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
}
|
58 |
},
|
59 |
"versions": {
|
@@ -68,6 +80,8 @@
|
|
68 |
"storycloze_2016": 0,
|
69 |
"boolq": 1,
|
70 |
"arc_easy": 0,
|
71 |
-
"arc_challenge": 0
|
|
|
|
|
72 |
}
|
73 |
}
|
|
|
54 |
"acc_stderr": 0.012980954547659554,
|
55 |
"acc_norm": 0.2883959044368601,
|
56 |
"acc_norm_stderr": 0.013238394422428175
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.881,
|
60 |
+
"acc_stderr": 0.010244215145336664,
|
61 |
+
"acc_norm": 0.87,
|
62 |
+
"acc_norm_stderr": 0.010640169792499349
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7524483133841132,
|
66 |
+
"acc_stderr": 0.010069703966857106,
|
67 |
+
"acc_norm": 0.750272034820457,
|
68 |
+
"acc_norm_stderr": 0.010099232969867469
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_3.csv
CHANGED
@@ -2,11 +2,20 @@ task,metric,value,err,version
|
|
2 |
anli_r1,acc,0.33,0.014876872027456732,0
|
3 |
anli_r2,acc,0.371,0.015283736211823188,0
|
4 |
anli_r3,acc,0.32166666666666666,0.013490095282989521,0
|
|
|
|
|
|
|
|
|
|
|
5 |
cb,acc,0.35714285714285715,0.06460957383809221,1
|
6 |
cb,f1,0.3051529790660225,,1
|
7 |
copa,acc,0.81,0.03942772444036623,0
|
8 |
hellaswag,acc,0.45309699263095,0.004967778940011933,0
|
9 |
hellaswag,acc_norm,0.6016729735112527,0.004885529674958343,0
|
|
|
|
|
10 |
rte,acc,0.5234657039711191,0.03006330041190266,0
|
|
|
|
|
11 |
storycloze_2016,acc,0.7076429716729022,0.010518239729787736,0
|
12 |
winogrande,acc,0.5706393054459353,0.01391153749996916,0
|
|
|
2 |
anli_r1,acc,0.33,0.014876872027456732,0
|
3 |
anli_r2,acc,0.371,0.015283736211823188,0
|
4 |
anli_r3,acc,0.32166666666666666,0.013490095282989521,0
|
5 |
+
arc_challenge,acc,0.2713310580204778,0.012993807727545803,0
|
6 |
+
arc_challenge,acc_norm,0.28498293515358364,0.013191348179838795,0
|
7 |
+
arc_easy,acc,0.5942760942760943,0.010075755540128871,0
|
8 |
+
arc_easy,acc_norm,0.5820707070707071,0.010120628211017883,0
|
9 |
+
boolq,acc,0.5804281345565749,0.008631175489166717,1
|
10 |
cb,acc,0.35714285714285715,0.06460957383809221,1
|
11 |
cb,f1,0.3051529790660225,,1
|
12 |
copa,acc,0.81,0.03942772444036623,0
|
13 |
hellaswag,acc,0.45309699263095,0.004967778940011933,0
|
14 |
hellaswag,acc_norm,0.6016729735112527,0.004885529674958343,0
|
15 |
+
piqa,acc,0.750272034820457,0.010099232969867493,0
|
16 |
+
piqa,acc_norm,0.7546245919477693,0.010039831320422387,0
|
17 |
rte,acc,0.5234657039711191,0.03006330041190266,0
|
18 |
+
sciq,acc,0.875,0.010463483381956722,0
|
19 |
+
sciq,acc_norm,0.865,0.010811655372416051,0
|
20 |
storycloze_2016,acc,0.7076429716729022,0.010518239729787736,0
|
21 |
winogrande,acc,0.5706393054459353,0.01391153749996916,0
|
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_3.json
CHANGED
@@ -38,6 +38,34 @@
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.7076429716729022,
|
40 |
"acc_stderr": 0.010518239729787736
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
}
|
42 |
},
|
43 |
"versions": {
|
@@ -49,6 +77,11 @@
|
|
49 |
"hellaswag": 0,
|
50 |
"rte": 0,
|
51 |
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.7076429716729022,
|
40 |
"acc_stderr": 0.010518239729787736
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5804281345565749,
|
44 |
+
"acc_stderr": 0.008631175489166717
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.5942760942760943,
|
48 |
+
"acc_stderr": 0.010075755540128871,
|
49 |
+
"acc_norm": 0.5820707070707071,
|
50 |
+
"acc_norm_stderr": 0.010120628211017883
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.2713310580204778,
|
54 |
+
"acc_stderr": 0.012993807727545803,
|
55 |
+
"acc_norm": 0.28498293515358364,
|
56 |
+
"acc_norm_stderr": 0.013191348179838795
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.875,
|
60 |
+
"acc_stderr": 0.010463483381956722,
|
61 |
+
"acc_norm": 0.865,
|
62 |
+
"acc_norm_stderr": 0.010811655372416051
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.750272034820457,
|
66 |
+
"acc_stderr": 0.010099232969867493,
|
67 |
+
"acc_norm": 0.7546245919477693,
|
68 |
+
"acc_norm_stderr": 0.010039831320422387
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_4.csv
CHANGED
@@ -2,10 +2,20 @@ task,metric,value,err,version
|
|
2 |
anli_r1,acc,0.339,0.01497675877162034,0
|
3 |
anli_r2,acc,0.342,0.01500870618212173,0
|
4 |
anli_r3,acc,0.33416666666666667,0.01362243481313678,0
|
|
|
|
|
|
|
|
|
|
|
5 |
cb,acc,0.375,0.06527912098338669,1
|
6 |
cb,f1,0.3162533392229865,,1
|
7 |
copa,acc,0.78,0.04163331998932261,0
|
8 |
hellaswag,acc,0.44981079466241786,0.004964579685712441,0
|
9 |
hellaswag,acc_norm,0.6027683728340968,0.0048832465794966485,0
|
|
|
|
|
10 |
rte,acc,0.516245487364621,0.030080573208738064,0
|
|
|
|
|
|
|
11 |
winogrande,acc,0.5816890292028414,0.013863669961195911,0
|
|
|
2 |
anli_r1,acc,0.339,0.01497675877162034,0
|
3 |
anli_r2,acc,0.342,0.01500870618212173,0
|
4 |
anli_r3,acc,0.33416666666666667,0.01362243481313678,0
|
5 |
+
arc_challenge,acc,0.2773037542662116,0.013082095839059374,0
|
6 |
+
arc_challenge,acc_norm,0.2883959044368601,0.013238394422428173,0
|
7 |
+
arc_easy,acc,0.6056397306397306,0.010028176038392999,0
|
8 |
+
arc_easy,acc_norm,0.5862794612794613,0.010105878530238137,0
|
9 |
+
boolq,acc,0.5954128440366973,0.008584355308932694,1
|
10 |
cb,acc,0.375,0.06527912098338669,1
|
11 |
cb,f1,0.3162533392229865,,1
|
12 |
copa,acc,0.78,0.04163331998932261,0
|
13 |
hellaswag,acc,0.44981079466241786,0.004964579685712441,0
|
14 |
hellaswag,acc_norm,0.6027683728340968,0.0048832465794966485,0
|
15 |
+
piqa,acc,0.7470076169749728,0.010142888698862462,0
|
16 |
+
piqa,acc_norm,0.7540805223068553,0.010047331865625182,0
|
17 |
rte,acc,0.516245487364621,0.030080573208738064,0
|
18 |
+
sciq,acc,0.888,0.009977753031397236,0
|
19 |
+
sciq,acc_norm,0.884,0.010131468138756976,0
|
20 |
+
storycloze_2016,acc,0.7071084981293426,0.010523873293246304,0
|
21 |
winogrande,acc,0.5816890292028414,0.013863669961195911,0
|
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_4.json
CHANGED
@@ -34,6 +34,38 @@
|
|
34 |
"winogrande": {
|
35 |
"acc": 0.5816890292028414,
|
36 |
"acc_stderr": 0.013863669961195911
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
}
|
38 |
},
|
39 |
"versions": {
|
@@ -44,6 +76,12 @@
|
|
44 |
"copa": 0,
|
45 |
"hellaswag": 0,
|
46 |
"rte": 0,
|
47 |
-
"winogrande": 0
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
}
|
49 |
}
|
|
|
34 |
"winogrande": {
|
35 |
"acc": 0.5816890292028414,
|
36 |
"acc_stderr": 0.013863669961195911
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.7071084981293426,
|
40 |
+
"acc_stderr": 0.010523873293246304
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5954128440366973,
|
44 |
+
"acc_stderr": 0.008584355308932694
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.6056397306397306,
|
48 |
+
"acc_stderr": 0.010028176038392999,
|
49 |
+
"acc_norm": 0.5862794612794613,
|
50 |
+
"acc_norm_stderr": 0.010105878530238137
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.2773037542662116,
|
54 |
+
"acc_stderr": 0.013082095839059374,
|
55 |
+
"acc_norm": 0.2883959044368601,
|
56 |
+
"acc_norm_stderr": 0.013238394422428173
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.888,
|
60 |
+
"acc_stderr": 0.009977753031397236,
|
61 |
+
"acc_norm": 0.884,
|
62 |
+
"acc_norm_stderr": 0.010131468138756976
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7470076169749728,
|
66 |
+
"acc_stderr": 0.010142888698862462,
|
67 |
+
"acc_norm": 0.7540805223068553,
|
68 |
+
"acc_norm_stderr": 0.010047331865625182
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
76 |
"copa": 0,
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_5.csv
CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
|
|
2 |
anli_r1,acc,0.35,0.015090650341444233,0
|
3 |
anli_r2,acc,0.361,0.015195720118175127,0
|
4 |
anli_r3,acc,0.31416666666666665,0.013405399314984101,0
|
|
|
|
|
|
|
|
|
|
|
5 |
cb,acc,0.39285714285714285,0.0658538889806635,1
|
6 |
cb,f1,0.3300527326188914,,1
|
7 |
copa,acc,0.7,0.046056618647183814,0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
anli_r1,acc,0.35,0.015090650341444233,0
|
3 |
anli_r2,acc,0.361,0.015195720118175127,0
|
4 |
anli_r3,acc,0.31416666666666665,0.013405399314984101,0
|
5 |
+
arc_challenge,acc,0.2764505119453925,0.013069662474252425,0
|
6 |
+
arc_challenge,acc_norm,0.2977815699658703,0.013363080107244487,0
|
7 |
+
arc_easy,acc,0.5942760942760943,0.010075755540128873,0
|
8 |
+
arc_easy,acc_norm,0.5728114478114478,0.010150415974210875,0
|
9 |
+
boolq,acc,0.5929663608562691,0.00859256288706887,1
|
10 |
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
cb,f1,0.3300527326188914,,1
|
12 |
copa,acc,0.7,0.046056618647183814,0
|
13 |
+
hellaswag,acc,0.45180242979486157,0.004966544724452228,0
|
14 |
+
hellaswag,acc_norm,0.6045608444532962,0.00487945547466382,0
|
15 |
+
piqa,acc,0.7453754080522307,0.01016443223706049,0
|
16 |
+
piqa,acc_norm,0.7551686615886833,0.01003230910556881,0
|
17 |
+
rte,acc,0.5342960288808665,0.030025579819366426,0
|
18 |
+
sciq,acc,0.898,0.009575368801653897,0
|
19 |
+
sciq,acc_norm,0.892,0.009820001651345703,0
|
20 |
+
storycloze_2016,acc,0.6985569214323891,0.010611646032767588,0
|
21 |
+
winogrande,acc,0.5603788476716653,0.013949649776015694,0
|
evaluation/rankeval/lm1-4b2-84b-c4-perplexity_5.json
CHANGED
@@ -20,6 +20,52 @@
|
|
20 |
"copa": {
|
21 |
"acc": 0.7,
|
22 |
"acc_stderr": 0.046056618647183814
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
}
|
24 |
},
|
25 |
"versions": {
|
@@ -27,6 +73,15 @@
|
|
27 |
"anli_r2": 0,
|
28 |
"anli_r3": 0,
|
29 |
"cb": 1,
|
30 |
-
"copa": 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
}
|
32 |
}
|
|
|
20 |
"copa": {
|
21 |
"acc": 0.7,
|
22 |
"acc_stderr": 0.046056618647183814
|
23 |
+
},
|
24 |
+
"hellaswag": {
|
25 |
+
"acc": 0.45180242979486157,
|
26 |
+
"acc_stderr": 0.004966544724452228,
|
27 |
+
"acc_norm": 0.6045608444532962,
|
28 |
+
"acc_norm_stderr": 0.00487945547466382
|
29 |
+
},
|
30 |
+
"rte": {
|
31 |
+
"acc": 0.5342960288808665,
|
32 |
+
"acc_stderr": 0.030025579819366426
|
33 |
+
},
|
34 |
+
"winogrande": {
|
35 |
+
"acc": 0.5603788476716653,
|
36 |
+
"acc_stderr": 0.013949649776015694
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.6985569214323891,
|
40 |
+
"acc_stderr": 0.010611646032767588
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.5929663608562691,
|
44 |
+
"acc_stderr": 0.00859256288706887
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.5942760942760943,
|
48 |
+
"acc_stderr": 0.010075755540128873,
|
49 |
+
"acc_norm": 0.5728114478114478,
|
50 |
+
"acc_norm_stderr": 0.010150415974210875
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.2764505119453925,
|
54 |
+
"acc_stderr": 0.013069662474252425,
|
55 |
+
"acc_norm": 0.2977815699658703,
|
56 |
+
"acc_norm_stderr": 0.013363080107244487
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.898,
|
60 |
+
"acc_stderr": 0.009575368801653897,
|
61 |
+
"acc_norm": 0.892,
|
62 |
+
"acc_norm_stderr": 0.009820001651345703
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7453754080522307,
|
66 |
+
"acc_stderr": 0.01016443223706049,
|
67 |
+
"acc_norm": 0.7551686615886833,
|
68 |
+
"acc_norm_stderr": 0.01003230910556881
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
73 |
"anli_r2": 0,
|
74 |
"anli_r3": 0,
|
75 |
"cb": 1,
|
76 |
+
"copa": 0,
|
77 |
+
"hellaswag": 0,
|
78 |
+
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|