diff --git a/4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json deleted file mode 100644 index 99f874d3a25c8315a5d493aed9776de54f8fc547..0000000000000000000000000000000000000000 --- a/4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932575 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.014922019523732961 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077249 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.23306878306878312 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4695279824736108, - "acc_stderr": 0.0049805063294075845, - "acc_norm": 0.6132244572794264, - "acc_norm_stderr": 0.004860162076330956 - }, - "rte": { - "acc": 0.5812274368231047, - "acc_stderr": 0.02969666108123484 - }, - "winogrande": { - "acc": 0.5753749013417522, - "acc_stderr": 0.013891893150264218 - }, - "storycloze_2016": { - "acc": 0.711918760021379, - "acc_stderr": 0.010472537019822578 - }, - "boolq": { - "acc": 0.5464831804281346, - "acc_stderr": 0.008707182331111644 - }, - "arc_easy": { - "acc": 0.5538720538720538, - "acc_stderr": 0.01020005782876501, - "acc_norm": 0.4936868686868687, - "acc_norm_stderr": 0.01025896566804443 - }, - "arc_challenge": { - "acc": 0.2636518771331058, - "acc_stderr": 0.012875929151297049, - "acc_norm": 0.2883959044368601, - "acc_norm_stderr": 0.013238394422428175 - }, - "sciq": { - "acc": 0.82, - "acc_stderr": 0.012155153135511965, - "acc_norm": 0.749, - "acc_norm_stderr": 0.013718133516888921 - }, - "piqa": { - "acc": 0.73449401523395, - "acc_stderr": 0.010303308653024429, - "acc_norm": 0.7475516866158868, - "acc_norm_stderr": 0.010135665547362354 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json deleted file mode 100644 index 766eff5aaab727fcee1b34569d0050e6bdf55b46..0000000000000000000000000000000000000000 --- a/4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229868 - }, - "anli_r2": { - "acc": 0.326, - "acc_stderr": 0.01483050720454104 - }, - "anli_r3": { - "acc": 0.3475, - "acc_stderr": 0.013751753243291852 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.37227304714989445 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.47191794463254333, - "acc_stderr": 0.004981905293878145, - "acc_norm": 0.6139215295757817, - "acc_norm_stderr": 0.004858539527872466 - }, - "rte": { - "acc": 0.5703971119133574, - "acc_stderr": 0.029796668829124674 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.013911537499969163 - }, - "storycloze_2016": { - "acc": 0.7151256012827365, - "acc_stderr": 0.01043751398661172 - }, - "boolq": { - "acc": 0.5669724770642202, - "acc_stderr": 0.00866625130551806 - }, - "arc_easy": { - "acc": 0.5913299663299664, - "acc_stderr": 0.010087174498762883, - "acc_norm": 0.5496632996632996, - "acc_norm_stderr": 0.010209047724374145 - }, - "arc_challenge": { - "acc": 0.2627986348122867, - "acc_stderr": 0.012862523175351333, - "acc_norm": 0.30716723549488056, - "acc_norm_stderr": 0.013481034054980943 - }, - "sciq": { - "acc": 0.836, - "acc_stderr": 0.011715000693181331, - "acc_norm": 0.781, - "acc_norm_stderr": 0.013084731950262012 - }, - "piqa": { - "acc": 0.7448313384113167, - "acc_stderr": 0.010171571592521822, - "acc_norm": 0.7535364526659413, - "acc_norm_stderr": 0.01005481078967181 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json deleted file mode 100644 index 4adaea156e9fa3d315781bc5dc84f4c8de4c462b..0000000000000000000000000000000000000000 --- a/4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928354 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795027 - }, - "anli_r3": { - "acc": 0.3383333333333333, - "acc_stderr": 0.013664144006618266 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.06737697508644648, - "f1": 0.3338011695906433 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4697271459868552, - "acc_stderr": 0.004980627287147585, - "acc_norm": 0.6141206930890261, - "acc_norm_stderr": 0.004858074013443988 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.574585635359116, - "acc_stderr": 0.013895257666646378 - }, - "storycloze_2016": { - "acc": 0.7156600748262961, - "acc_stderr": 0.010431614128665253 - }, - "boolq": { - "acc": 0.5660550458715596, - "acc_stderr": 0.008668405003744129 - }, - "arc_easy": { - "acc": 0.5993265993265994, - "acc_stderr": 0.01005530447425557, - "acc_norm": 0.5576599326599326, - "acc_norm_stderr": 0.01019133444422085 - }, - "arc_challenge": { - "acc": 0.2781569965870307, - "acc_stderr": 0.013094469919538805, - "acc_norm": 0.30887372013651876, - "acc_norm_stderr": 0.013501770929344003 - }, - "sciq": { - "acc": 0.835, - "acc_stderr": 0.011743632866916145, - "acc_norm": 0.79, - "acc_norm_stderr": 0.01288666233227453 - }, - "piqa": { - "acc": 0.7470076169749728, - "acc_stderr": 0.01014288869886246, - "acc_norm": 0.7519042437431991, - "acc_norm_stderr": 0.010077118315574706 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json deleted file mode 100644 index cfec1a1379ef3785a474c68e2d94a790aae2ea7e..0000000000000000000000000000000000000000 --- a/4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811485 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.014922019523732963 - }, - "anli_r3": { - "acc": 0.35, - "acc_stderr": 0.013774667009018554 - }, - "cb": { - "acc": 0.6071428571428571, - "acc_stderr": 0.0658538889806635, - "f1": 0.42400932400932395 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036622 - }, - "hellaswag": { - "acc": 0.47241585341565423, - "acc_stderr": 0.004982182323923561, - "acc_norm": 0.6199960167297351, - "acc_norm_stderr": 0.004843954338451449 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529113 - }, - "winogrande": { - "acc": 0.5737963693764798, - "acc_stderr": 0.013898585965412338 - }, - "storycloze_2016": { - "acc": 0.7124532335649385, - "acc_stderr": 0.010466744473098363 - }, - "boolq": { - "acc": 0.5587155963302752, - "acc_stderr": 0.008684548127832637 - }, - "arc_easy": { - "acc": 0.5955387205387206, - "acc_stderr": 0.010070746648278783, - "acc_norm": 0.5740740740740741, - "acc_norm_stderr": 0.010146568651002255 - }, - "arc_challenge": { - "acc": 0.2815699658703072, - "acc_stderr": 0.013143376735009022, - "acc_norm": 0.3122866894197952, - "acc_norm_stderr": 0.013542598541688067 - }, - "sciq": { - "acc": 0.841, - "acc_stderr": 0.01156947936827129, - "acc_norm": 0.796, - "acc_norm_stderr": 0.012749374359024384 - }, - "piqa": { - "acc": 0.7513601741022851, - "acc_stderr": 0.01008451123429685, - "acc_norm": 0.7578890097932536, - "acc_norm_stderr": 0.009994371269104397 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json deleted file mode 100644 index 362c01eb141bc03ddaa0a42615c78c3520a1c857..0000000000000000000000000000000000000000 --- a/4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.337, - "acc_stderr": 0.014955087918653603 - }, - "anli_r2": { - "acc": 0.349, - "acc_stderr": 0.015080663991563102 - }, - "anli_r3": { - "acc": 0.36666666666666664, - "acc_stderr": 0.013916893275819938 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.067031892279424, - "f1": 0.3176100628930817 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4722166899024099, - "acc_stderr": 0.004982072108448081, - "acc_norm": 0.6184027086237801, - "acc_norm_stderr": 0.004847857546957481 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.03000984891252911 - }, - "winogrande": { - "acc": 0.56353591160221, - "acc_stderr": 0.013938569465677023 - }, - "storycloze_2016": { - "acc": 0.7194013896312133, - "acc_stderr": 0.010389809647288821 - }, - "boolq": { - "acc": 0.5636085626911315, - "acc_stderr": 0.008674000467432068 - }, - "arc_easy": { - "acc": 0.6039562289562289, - "acc_stderr": 0.010035580962097942, - "acc_norm": 0.5702861952861953, - "acc_norm_stderr": 0.010157908005763674 - }, - "arc_challenge": { - "acc": 0.2790102389078498, - "acc_stderr": 0.013106784883601346, - "acc_norm": 0.3165529010238908, - "acc_norm_stderr": 0.013592431519068077 - }, - "sciq": { - "acc": 0.842, - "acc_stderr": 0.011539894677559568, - "acc_norm": 0.789, - "acc_norm_stderr": 0.012909130321042092 - }, - "piqa": { - "acc": 0.7431991294885746, - "acc_stderr": 0.010192864802278045, - "acc_norm": 0.7568008705114254, - "acc_norm_stderr": 0.010009611953858915 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json deleted file mode 100644 index 013a059067dc8e5e4a9354909d895aeb9562a851..0000000000000000000000000000000000000000 --- a/4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811487 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928357 - }, - "anli_r3": { - "acc": 0.3541666666666667, - "acc_stderr": 0.013811933499570954 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942395, - "f1": 0.38376730002345766 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.47400916152160927, - "acc_stderr": 0.004983035420235716, - "acc_norm": 0.619896434973113, - "acc_norm_stderr": 0.004844199910173026 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5722178374112076, - "acc_stderr": 0.013905134013839944 - }, - "storycloze_2016": { - "acc": 0.7177979690005345, - "acc_stderr": 0.010407834479647675 - }, - "boolq": { - "acc": 0.5648318042813456, - "acc_stderr": 0.008671229580582118 - }, - "arc_easy": { - "acc": 0.5997474747474747, - "acc_stderr": 0.010053550119896127, - "acc_norm": 0.569023569023569, - "acc_norm_stderr": 0.010161552863493746 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_0.json b/4b284b12bc4/evaluation/rankeval/4b284b12bc4_0.json similarity index 100% rename from 4b284b12bc4/evaluation/4b284b12bc4_0.json rename to 4b284b12bc4/evaluation/rankeval/4b284b12bc4_0.json diff --git a/4b284b12bc4/evaluation/4b284b12bc4_1.json b/4b284b12bc4/evaluation/rankeval/4b284b12bc4_1.json similarity index 100% rename from 4b284b12bc4/evaluation/4b284b12bc4_1.json rename to 4b284b12bc4/evaluation/rankeval/4b284b12bc4_1.json diff --git a/4b284b12bc4/evaluation/4b284b12bc4_2.json b/4b284b12bc4/evaluation/rankeval/4b284b12bc4_2.json similarity index 100% rename from 4b284b12bc4/evaluation/4b284b12bc4_2.json rename to 4b284b12bc4/evaluation/rankeval/4b284b12bc4_2.json diff --git a/4b284b12bc4/evaluation/4b284b12bc4_3.json b/4b284b12bc4/evaluation/rankeval/4b284b12bc4_3.json similarity index 100% rename from 4b284b12bc4/evaluation/4b284b12bc4_3.json rename to 4b284b12bc4/evaluation/rankeval/4b284b12bc4_3.json diff --git a/4b284b12bc4/evaluation/4b284b12bc4_4.json b/4b284b12bc4/evaluation/rankeval/4b284b12bc4_4.json similarity index 100% rename from 4b284b12bc4/evaluation/4b284b12bc4_4.json rename to 4b284b12bc4/evaluation/rankeval/4b284b12bc4_4.json diff --git a/4b284b12bc4/evaluation/4b284b12bc4_5.json b/4b284b12bc4/evaluation/rankeval/4b284b12bc4_5.json similarity index 71% rename from 4b284b12bc4/evaluation/4b284b12bc4_5.json rename to 4b284b12bc4/evaluation/rankeval/4b284b12bc4_5.json index 013a059067dc8e5e4a9354909d895aeb9562a851..5e0fa20b822406e3484f2308d4c517eb73cb5499 100644 --- a/4b284b12bc4/evaluation/4b284b12bc4_5.json +++ b/4b284b12bc4/evaluation/rankeval/4b284b12bc4_5.json @@ -48,6 +48,24 @@ "acc_stderr": 0.010053550119896127, "acc_norm": 0.569023569023569, "acc_norm_stderr": 0.010161552863493746 + }, + "arc_challenge": { + "acc": 0.27559726962457337, + "acc_stderr": 0.01305716965576184, + "acc_norm": 0.31569965870307165, + "acc_norm_stderr": 0.013582571095815291 + }, + "sciq": { + "acc": 0.844, + "acc_stderr": 0.01148023500612236, + "acc_norm": 0.794, + "acc_norm_stderr": 0.012795613612786551 + }, + "piqa": { + "acc": 0.7399347116430903, + "acc_stderr": 0.0102348932490613, + "acc_norm": 0.7595212187159956, + "acc_norm_stderr": 0.009971345364651064 } }, "versions": { @@ -61,6 +79,9 @@ "winogrande": 0, "storycloze_2016": 0, "boolq": 1, - "arc_easy": 0 + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json deleted file mode 100644 index 6a235436eb04fb33ff7a7cb572bbde1279ced56f..0000000000000000000000000000000000000000 --- a/4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811478 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928362 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874075 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813058, - "f1": 0.3888888888888889 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.469627564230233, - "acc_stderr": 0.004980566907790459, - "acc_norm": 0.6134236207926708, - "acc_norm_stderr": 0.004859699562451462 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5737963693764798, - "acc_stderr": 0.013898585965412338 - }, - "storycloze_2016": { - "acc": 0.7108498129342598, - "acc_stderr": 0.010484068799942072 - }, - "boolq": { - "acc": 0.5623853211009174, - "acc_stderr": 0.008676717715731632 - }, - "arc_easy": { - "acc": 0.6052188552188552, - "acc_stderr": 0.010030038935883584, - "acc_norm": 0.5429292929292929, - "acc_norm_stderr": 0.01022189756425604 - }, - "arc_challenge": { - "acc": 0.26791808873720135, - "acc_stderr": 0.012942030195136437, - "acc_norm": 0.2883959044368601, - "acc_norm_stderr": 0.013238394422428171 - }, - "sciq": { - "acc": 0.852, - "acc_stderr": 0.011234866364235235, - "acc_norm": 0.764, - "acc_norm_stderr": 0.013434451402438678 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.00999437126910438, - "acc_norm": 0.7622415669205659, - "acc_norm_stderr": 0.009932525779525492 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json deleted file mode 100644 index 4572b5ea5b09badd72d95a263315a8e40e583db3..0000000000000000000000000000000000000000 --- a/4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.32, - "acc_stderr": 0.014758652303574886 - }, - "anli_r2": { - "acc": 0.324, - "acc_stderr": 0.014806864733738854 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.01376707539507725 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942397, - "f1": 0.3890671420083185 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4640509858593906, - "acc_stderr": 0.0049768677965835555, - "acc_norm": 0.6082453694483171, - "acc_norm_stderr": 0.004871447106554927 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.574585635359116, - "acc_stderr": 0.013895257666646378 - }, - "storycloze_2016": { - "acc": 0.711918760021379, - "acc_stderr": 0.010472537019822582 - }, - "boolq": { - "acc": 0.5409785932721712, - "acc_stderr": 0.008715635308774412 - }, - "arc_easy": { - "acc": 0.6342592592592593, - "acc_stderr": 0.009882988069418829, - "acc_norm": 0.5837542087542088, - "acc_norm_stderr": 0.01011481940450087 - }, - "arc_challenge": { - "acc": 0.2901023890784983, - "acc_stderr": 0.013261573677520764, - "acc_norm": 0.30119453924914674, - "acc_norm_stderr": 0.013406741767847638 - }, - "sciq": { - "acc": 0.896, - "acc_stderr": 0.009658016218524301, - "acc_norm": 0.88, - "acc_norm_stderr": 0.010281328012747386 - }, - "piqa": { - "acc": 0.7551686615886833, - "acc_stderr": 0.010032309105568793, - "acc_norm": 0.766050054406964, - "acc_norm_stderr": 0.009877236895137436 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json deleted file mode 100644 index 3e49aa8d33cfba49045ebe2954fa4cb4c5d0b629..0000000000000000000000000000000000000000 --- a/4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.343, - "acc_stderr": 0.015019206922356953 - }, - "anli_r2": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r3": { - "acc": 0.325, - "acc_stderr": 0.013526454480351028 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.3058470764617691 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932263 - }, - "hellaswag": { - "acc": 0.45727942640908187, - "acc_stderr": 0.004971534874389935, - "acc_norm": 0.602867954590719, - "acc_norm_stderr": 0.004883037758919964 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5808997632202052, - "acc_stderr": 0.013867325192210116 - }, - "storycloze_2016": { - "acc": 0.7215392838054516, - "acc_stderr": 0.010365521460604415 - }, - "boolq": { - "acc": 0.5489296636085627, - "acc_stderr": 0.008703080962379622 - }, - "arc_easy": { - "acc": 0.6325757575757576, - "acc_stderr": 0.009892552616211558, - "acc_norm": 0.617003367003367, - "acc_norm_stderr": 0.009974920384536479 - }, - "arc_challenge": { - "acc": 0.2901023890784983, - "acc_stderr": 0.013261573677520759, - "acc_norm": 0.31313993174061433, - "acc_norm_stderr": 0.013552671543623496 - }, - "sciq": { - "acc": 0.906, - "acc_stderr": 0.009233052000787738, - "acc_norm": 0.891, - "acc_norm_stderr": 0.009859828407037186 - }, - "piqa": { - "acc": 0.7540805223068553, - "acc_stderr": 0.010047331865625194, - "acc_norm": 0.7698585418933623, - "acc_norm_stderr": 0.009820832826839796 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json deleted file mode 100644 index 408779e0bfcec61f04346c964e88daf60d169fc6..0000000000000000000000000000000000000000 --- a/4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.348, - "acc_stderr": 0.015070604603768408 - }, - "anli_r2": { - "acc": 0.36, - "acc_stderr": 0.01518652793204012 - }, - "anli_r3": { - "acc": 0.35083333333333333, - "acc_stderr": 0.013782212417178195 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.40387403446226977 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4567815176259709, - "acc_stderr": 0.004971106265046551, - "acc_norm": 0.5992830113523202, - "acc_norm_stderr": 0.004890422457747258 - }, - "rte": { - "acc": 0.48375451263537905, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.569060773480663, - "acc_stderr": 0.013917796623335966 - }, - "storycloze_2016": { - "acc": 0.7247461250668092, - "acc_stderr": 0.010328538400500567 - }, - "boolq": { - "acc": 0.5498470948012233, - "acc_stderr": 0.008701488203356937 - }, - "arc_easy": { - "acc": 0.6266835016835017, - "acc_stderr": 0.009925009142802903, - "acc_norm": 0.6203703703703703, - "acc_norm_stderr": 0.009958037725468558 - }, - "arc_challenge": { - "acc": 0.2901023890784983, - "acc_stderr": 0.013261573677520769, - "acc_norm": 0.31143344709897613, - "acc_norm_stderr": 0.013532472099850949 - }, - "sciq": { - "acc": 0.923, - "acc_stderr": 0.008434580140240632, - "acc_norm": 0.903, - "acc_norm_stderr": 0.00936368937324812 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.009994371269104387, - "acc_norm": 0.7682263329706203, - "acc_norm_stderr": 0.00984514377279405 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json deleted file mode 100644 index efe239b5e466409417c045d2c72414349219ebf4..0000000000000000000000000000000000000000 --- a/4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.36, - "acc_stderr": 0.015186527932040117 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706625 - }, - "anli_r3": { - "acc": 0.3625, - "acc_stderr": 0.01388303787422552 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942395, - "f1": 0.4538378958668814 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.45180242979486157, - "acc_stderr": 0.004966544724452227, - "acc_norm": 0.5955984863572994, - "acc_norm_stderr": 0.004897728370737246 - }, - "rte": { - "acc": 0.48375451263537905, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.013911537499969163 - }, - "storycloze_2016": { - "acc": 0.7177979690005345, - "acc_stderr": 0.010407834479647672 - }, - "boolq": { - "acc": 0.545565749235474, - "acc_stderr": 0.008708665643758015 - }, - "arc_easy": { - "acc": 0.640993265993266, - "acc_stderr": 0.009843424713072174, - "acc_norm": 0.6186868686868687, - "acc_norm_stderr": 0.009966542497171025 - }, - "arc_challenge": { - "acc": 0.302901023890785, - "acc_stderr": 0.013428241573185349, - "acc_norm": 0.32337883959044367, - "acc_norm_stderr": 0.013669421630012129 - }, - "sciq": { - "acc": 0.915, - "acc_stderr": 0.008823426366942331, - "acc_norm": 0.912, - "acc_norm_stderr": 0.008963053962592085 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.009994371269104385, - "acc_norm": 0.7752992383025027, - "acc_norm_stderr": 0.009738282586548389 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json deleted file mode 100644 index dd8c4c2b41c1fc72c753d73d7b2e4bde3577f7e0..0000000000000000000000000000000000000000 --- a/4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.363, - "acc_stderr": 0.015213890444671281 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706624 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767794 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942397, - "f1": 0.3974410235905637 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.44981079466241786, - "acc_stderr": 0.004964579685712439, - "acc_norm": 0.6002788289185421, - "acc_norm_stderr": 0.004888398535520516 - }, - "rte": { - "acc": 0.49097472924187724, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5785319652722968, - "acc_stderr": 0.013878072377497603 - }, - "storycloze_2016": { - "acc": 0.7113842864778194, - "acc_stderr": 0.01047831178564294 - }, - "boolq": { - "acc": 0.5376146788990825, - "acc_stderr": 0.008720273736433679 - }, - "arc_easy": { - "acc": 0.6447811447811448, - "acc_stderr": 0.009820245899287117, - "acc_norm": 0.625, - "acc_norm_stderr": 0.009933992677987828 - }, - "arc_challenge": { - "acc": 0.2986348122866894, - "acc_stderr": 0.013374078615068756, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053052 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_0.json b/4b284b17bc4/evaluation/rankeval/4b284b17bc4_0.json similarity index 100% rename from 4b284b17bc4/evaluation/4b284b17bc4_0.json rename to 4b284b17bc4/evaluation/rankeval/4b284b17bc4_0.json diff --git a/4b284b17bc4/evaluation/4b284b17bc4_1.json b/4b284b17bc4/evaluation/rankeval/4b284b17bc4_1.json similarity index 100% rename from 4b284b17bc4/evaluation/4b284b17bc4_1.json rename to 4b284b17bc4/evaluation/rankeval/4b284b17bc4_1.json diff --git a/4b284b17bc4/evaluation/4b284b17bc4_2.json b/4b284b17bc4/evaluation/rankeval/4b284b17bc4_2.json similarity index 100% rename from 4b284b17bc4/evaluation/4b284b17bc4_2.json rename to 4b284b17bc4/evaluation/rankeval/4b284b17bc4_2.json diff --git a/4b284b17bc4/evaluation/4b284b17bc4_3.json b/4b284b17bc4/evaluation/rankeval/4b284b17bc4_3.json similarity index 100% rename from 4b284b17bc4/evaluation/4b284b17bc4_3.json rename to 4b284b17bc4/evaluation/rankeval/4b284b17bc4_3.json diff --git a/4b284b17bc4/evaluation/4b284b17bc4_4.json b/4b284b17bc4/evaluation/rankeval/4b284b17bc4_4.json similarity index 100% rename from 4b284b17bc4/evaluation/4b284b17bc4_4.json rename to 4b284b17bc4/evaluation/rankeval/4b284b17bc4_4.json diff --git a/4b284b17bc4/evaluation/4b284b17bc4_5.json b/4b284b17bc4/evaluation/rankeval/4b284b17bc4_5.json similarity index 81% rename from 4b284b17bc4/evaluation/4b284b17bc4_5.json rename to 4b284b17bc4/evaluation/rankeval/4b284b17bc4_5.json index dd8c4c2b41c1fc72c753d73d7b2e4bde3577f7e0..c04f6ceb568c297b3086ce5f66014ee10321d9d4 100644 --- a/4b284b17bc4/evaluation/4b284b17bc4_5.json +++ b/4b284b17bc4/evaluation/rankeval/4b284b17bc4_5.json @@ -54,6 +54,18 @@ "acc_stderr": 0.013374078615068756, "acc_norm": 0.310580204778157, "acc_norm_stderr": 0.013522292098053052 + }, + "sciq": { + "acc": 0.918, + "acc_stderr": 0.00868051561552374, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151117 + }, + "piqa": { + "acc": 0.7584330794341676, + "acc_stderr": 0.00998671800180446, + "acc_norm": 0.7671381936887922, + "acc_norm_stderr": 0.009861236071080757 } }, "versions": { @@ -68,6 +80,8 @@ "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, - "arc_challenge": 0 + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json deleted file mode 100644 index 5aac52607ef41dfad6ed0068fdd89edf603e0a02..0000000000000000000000000000000000000000 --- a/4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811485 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.0149550879186536 - }, - "anli_r3": { - "acc": 0.355, - "acc_stderr": 0.013819249004047296 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.4347442680776014 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.4841665006970723, - "acc_stderr": 0.004987278910505115, - "acc_norm": 0.6352320254929297, - "acc_norm_stderr": 0.004803812631994966 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.03003973059219781 - }, - "winogrande": { - "acc": 0.5990528808208366, - "acc_stderr": 0.013773974554948033 - }, - "storycloze_2016": { - "acc": 0.7151256012827365, - "acc_stderr": 0.010437513986611718 - }, - "boolq": { - "acc": 0.5669724770642202, - "acc_stderr": 0.008666251305518059 - }, - "arc_easy": { - "acc": 0.617003367003367, - "acc_stderr": 0.009974920384536469, - "acc_norm": 0.5462962962962963, - "acc_norm_stderr": 0.010215708295494117 - }, - "arc_challenge": { - "acc": 0.28668941979522183, - "acc_stderr": 0.013214986329274757, - "acc_norm": 0.30631399317406144, - "acc_norm_stderr": 0.013470584417276513 - }, - "sciq": { - "acc": 0.845, - "acc_stderr": 0.011450157470799475, - "acc_norm": 0.757, - "acc_norm_stderr": 0.013569640199177458 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.00999437126910438, - "acc_norm": 0.7676822633297062, - "acc_norm_stderr": 0.009853201384168243 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json deleted file mode 100644 index 8e72f1ec94f0cbd37c4dd210b0ac569424148625..0000000000000000000000000000000000000000 --- a/4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.01494414023379502 - }, - "anli_r2": { - "acc": 0.315, - "acc_stderr": 0.014696631960792506 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.0136804957257678 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.38181818181818183 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768077 - }, - "hellaswag": { - "acc": 0.48137821151165106, - "acc_stderr": 0.004986319587524962, - "acc_norm": 0.6344353714399522, - "acc_norm_stderr": 0.004806039039008954 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5974743488555643, - "acc_stderr": 0.013782866831703048 - }, - "storycloze_2016": { - "acc": 0.7044361304115446, - "acc_stderr": 0.01055177883937378 - }, - "boolq": { - "acc": 0.5669724770642202, - "acc_stderr": 0.008666251305518059 - }, - "arc_easy": { - "acc": 0.6220538720538721, - "acc_stderr": 0.009949405744045452, - "acc_norm": 0.5787037037037037, - "acc_norm_stderr": 0.010131882498193127 - }, - "arc_challenge": { - "acc": 0.29266211604095566, - "acc_stderr": 0.01329591610361942, - "acc_norm": 0.32849829351535836, - "acc_norm_stderr": 0.013724978465537357 - }, - "sciq": { - "acc": 0.891, - "acc_stderr": 0.00985982840703719, - "acc_norm": 0.871, - "acc_norm_stderr": 0.010605256784796579 - }, - "piqa": { - "acc": 0.7551686615886833, - "acc_stderr": 0.010032309105568788, - "acc_norm": 0.764961915125136, - "acc_norm_stderr": 0.009893146688805308 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json deleted file mode 100644 index eb7e2dbe363df0445d27b1f90445c4562a1b6234..0000000000000000000000000000000000000000 --- a/4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.327, - "acc_stderr": 0.014842213153411247 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.01491084616422986 - }, - "anli_r3": { - "acc": 0.3408333333333333, - "acc_stderr": 0.01368860079329693 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.3829365079365079 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932262 - }, - "hellaswag": { - "acc": 0.48048197570205137, - "acc_stderr": 0.00498597821493792, - "acc_norm": 0.6397132045409281, - "acc_norm_stderr": 0.004791024004587989 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.6053670086819258, - "acc_stderr": 0.013736915172371883 - }, - "storycloze_2016": { - "acc": 0.7161945483698557, - "acc_stderr": 0.01042569627973092 - }, - "boolq": { - "acc": 0.5920489296636086, - "acc_stderr": 0.008595583792654892 - }, - "arc_easy": { - "acc": 0.622895622895623, - "acc_stderr": 0.009945041946366499, - "acc_norm": 0.6018518518518519, - "acc_norm_stderr": 0.010044662374653398 - }, - "arc_challenge": { - "acc": 0.295221843003413, - "acc_stderr": 0.013329750293382318, - "acc_norm": 0.32337883959044367, - "acc_norm_stderr": 0.013669421630012129 - }, - "sciq": { - "acc": 0.903, - "acc_stderr": 0.009363689373248092, - "acc_norm": 0.882, - "acc_norm_stderr": 0.010206869264381791 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.009994371269104376, - "acc_norm": 0.7682263329706203, - "acc_norm_stderr": 0.009845143772794043 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json deleted file mode 100644 index c475d11569c652fd452aac5f851bd783c9fe644a..0000000000000000000000000000000000000000 --- a/4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.338, - "acc_stderr": 0.014965960710224496 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055238 - }, - "anli_r3": { - "acc": 0.3566666666666667, - "acc_stderr": 0.013833742805050717 - }, - "cb": { - "acc": 0.6071428571428571, - "acc_stderr": 0.0658538889806635, - "f1": 0.5367003367003368 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4826727743477395, - "acc_stderr": 0.004986784319771787, - "acc_norm": 0.6368253335988847, - "acc_norm_stderr": 0.004799317209902001 - }, - "rte": { - "acc": 0.5631768953068592, - "acc_stderr": 0.029855247390314945 - }, - "winogrande": { - "acc": 0.6037884767166535, - "acc_stderr": 0.013746404157154949 - }, - "storycloze_2016": { - "acc": 0.7204703367183325, - "acc_stderr": 0.01037770209970486 - }, - "boolq": { - "acc": 0.5923547400611621, - "acc_stderr": 0.008594580270731619 - }, - "arc_easy": { - "acc": 0.627104377104377, - "acc_stderr": 0.009922743197129257, - "acc_norm": 0.609006734006734, - "acc_norm_stderr": 0.010012992232540631 - }, - "arc_challenge": { - "acc": 0.29436860068259385, - "acc_stderr": 0.013318528460539429, - "acc_norm": 0.3319112627986348, - "acc_norm_stderr": 0.01376098820088054 - }, - "sciq": { - "acc": 0.913, - "acc_stderr": 0.0089168666307459, - "acc_norm": 0.897, - "acc_norm_stderr": 0.009616833339695798 - }, - "piqa": { - "acc": 0.7589771490750816, - "acc_stderr": 0.009979042717267314, - "acc_norm": 0.7742110990206746, - "acc_norm_stderr": 0.009754980670917311 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json deleted file mode 100644 index a3b9ddb1c138667efb85173f7c157562aeef6d68..0000000000000000000000000000000000000000 --- a/4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.343, - "acc_stderr": 0.015019206922356951 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.01505026612756445 - }, - "anli_r3": { - "acc": 0.36083333333333334, - "acc_stderr": 0.01386918025244486 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942395, - "f1": 0.4583333333333333 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.48157737502489545, - "acc_stderr": 0.0049863932662691625, - "acc_norm": 0.6417048396733719, - "acc_norm_stderr": 0.00478519504988916 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529113 - }, - "winogrande": { - "acc": 0.6085240726124704, - "acc_stderr": 0.01371748707129085 - }, - "storycloze_2016": { - "acc": 0.7338321753073223, - "acc_stderr": 0.010220104800551206 - }, - "boolq": { - "acc": 0.6119266055045871, - "acc_stderr": 0.00852313058476084 - }, - "arc_easy": { - "acc": 0.6283670033670034, - "acc_stderr": 0.00991589712365879, - "acc_norm": 0.6153198653198653, - "acc_norm_stderr": 0.009983171707008997 - }, - "arc_challenge": { - "acc": 0.2960750853242321, - "acc_stderr": 0.013340916085246271, - "acc_norm": 0.3242320819112628, - "acc_norm_stderr": 0.013678810399518819 - }, - "sciq": { - "acc": 0.923, - "acc_stderr": 0.008434580140240648, - "acc_norm": 0.912, - "acc_norm_stderr": 0.008963053962592074 - }, - "piqa": { - "acc": 0.7595212187159956, - "acc_stderr": 0.009971345364651078, - "acc_norm": 0.7676822633297062, - "acc_norm_stderr": 0.009853201384168243 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json deleted file mode 100644 index ed2114b78288086a92003220343f1d693f0907ab..0000000000000000000000000000000000000000 --- a/4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.351, - "acc_stderr": 0.015100563798316405 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055237 - }, - "anli_r3": { - "acc": 0.345, - "acc_stderr": 0.013728421539454878 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813058, - "f1": 0.37671957671957673 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.4827723561043617, - "acc_stderr": 0.004986818680313444, - "acc_norm": 0.6446922923720374, - "acc_norm_stderr": 0.004776283203468094 - }, - "rte": { - "acc": 0.5776173285198556, - "acc_stderr": 0.02973162264649588 - }, - "winogrande": { - "acc": 0.595895816890292, - "acc_stderr": 0.013791610664670845 - }, - "storycloze_2016": { - "acc": 0.7252805986103688, - "acc_stderr": 0.010322309878339507 - }, - "boolq": { - "acc": 0.6146788990825688, - "acc_stderr": 0.008511930879680652 - }, - "arc_easy": { - "acc": 0.6300505050505051, - "acc_stderr": 0.009906656266021155, - "acc_norm": 0.6111111111111112, - "acc_norm_stderr": 0.01000324833531377 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_0.json b/4b284b21bc4/evaluation/rankeval/4b284b21bc4_0.json similarity index 100% rename from 4b284b21bc4/evaluation/4b284b21bc4_0.json rename to 4b284b21bc4/evaluation/rankeval/4b284b21bc4_0.json diff --git a/4b284b21bc4/evaluation/4b284b21bc4_1.json b/4b284b21bc4/evaluation/rankeval/4b284b21bc4_1.json similarity index 100% rename from 4b284b21bc4/evaluation/4b284b21bc4_1.json rename to 4b284b21bc4/evaluation/rankeval/4b284b21bc4_1.json diff --git a/4b284b21bc4/evaluation/4b284b21bc4_2.json b/4b284b21bc4/evaluation/rankeval/4b284b21bc4_2.json similarity index 100% rename from 4b284b21bc4/evaluation/4b284b21bc4_2.json rename to 4b284b21bc4/evaluation/rankeval/4b284b21bc4_2.json diff --git a/4b284b21bc4/evaluation/4b284b21bc4_3.json b/4b284b21bc4/evaluation/rankeval/4b284b21bc4_3.json similarity index 100% rename from 4b284b21bc4/evaluation/4b284b21bc4_3.json rename to 4b284b21bc4/evaluation/rankeval/4b284b21bc4_3.json diff --git a/4b284b21bc4/evaluation/4b284b21bc4_4.json b/4b284b21bc4/evaluation/rankeval/4b284b21bc4_4.json similarity index 100% rename from 4b284b21bc4/evaluation/4b284b21bc4_4.json rename to 4b284b21bc4/evaluation/rankeval/4b284b21bc4_4.json diff --git a/4b284b21bc4/evaluation/4b284b21bc4_5.json b/4b284b21bc4/evaluation/rankeval/4b284b21bc4_5.json similarity index 71% rename from 4b284b21bc4/evaluation/4b284b21bc4_5.json rename to 4b284b21bc4/evaluation/rankeval/4b284b21bc4_5.json index ed2114b78288086a92003220343f1d693f0907ab..26adc78f4bb5438648adac6504bd080462d4e7fb 100644 --- a/4b284b21bc4/evaluation/4b284b21bc4_5.json +++ b/4b284b21bc4/evaluation/rankeval/4b284b21bc4_5.json @@ -48,6 +48,24 @@ "acc_stderr": 0.009906656266021155, "acc_norm": 0.6111111111111112, "acc_norm_stderr": 0.01000324833531377 + }, + "arc_challenge": { + "acc": 0.30716723549488056, + "acc_stderr": 0.013481034054980945, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.013669421630012122 + }, + "sciq": { + "acc": 0.919, + "acc_stderr": 0.008632121032139978, + "acc_norm": 0.907, + "acc_norm_stderr": 0.009188875634996669 + }, + "piqa": { + "acc": 0.7529923830250272, + "acc_stderr": 0.010062268140772625, + "acc_norm": 0.7671381936887922, + "acc_norm_stderr": 0.009861236071080753 } }, "versions": { @@ -61,6 +79,9 @@ "winogrande": 0, "storycloze_2016": 0, "boolq": 1, - "arc_easy": 0 + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json deleted file mode 100644 index 092d1b898067aa5e148ebd915a430c3d5464c4bd..0000000000000000000000000000000000000000 --- a/4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.344, - "acc_stderr": 0.015029633724408947 - }, - "anli_r2": { - "acc": 0.321, - "acc_stderr": 0.01477082181793464 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.01375943749887408 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809221, - "f1": 0.1754385964912281 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4792869946225851, - "acc_stderr": 0.004985498055190357, - "acc_norm": 0.6265684126667994, - "acc_norm_stderr": 0.004827266662144035 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366422 - }, - "winogrande": { - "acc": 0.5753749013417522, - "acc_stderr": 0.013891893150264213 - }, - "storycloze_2016": { - "acc": 0.7231427044361304, - "acc_stderr": 0.01034711289027692 - }, - "boolq": { - "acc": 0.5700305810397553, - "acc_stderr": 0.008658853690729254 - }, - "arc_easy": { - "acc": 0.5984848484848485, - "acc_stderr": 0.010058790020755567, - "acc_norm": 0.5395622895622896, - "acc_norm_stderr": 0.01022761638628902 - }, - "arc_challenge": { - "acc": 0.27986348122866894, - "acc_stderr": 0.013119040897725922, - "acc_norm": 0.31143344709897613, - "acc_norm_stderr": 0.013532472099850942 - }, - "sciq": { - "acc": 0.848, - "acc_stderr": 0.011358918303475274, - "acc_norm": 0.769, - "acc_norm_stderr": 0.013334797216936438 - }, - "piqa": { - "acc": 0.7584330794341676, - "acc_stderr": 0.009986718001804467, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json deleted file mode 100644 index 564bd37debe84193e51b636c92f826bee997af45..0000000000000000000000000000000000000000 --- a/4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.321, - "acc_stderr": 0.014770821817934644 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767803 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.32099491681373216 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.48078072097191793, - "acc_stderr": 0.004986093791041653, - "acc_norm": 0.6337382991435969, - "acc_norm_stderr": 0.004807975515446487 - }, - "rte": { - "acc": 0.5740072202166066, - "acc_stderr": 0.029764956741777645 - }, - "winogrande": { - "acc": 0.590370955011839, - "acc_stderr": 0.013821049109655453 - }, - "storycloze_2016": { - "acc": 0.7204703367183325, - "acc_stderr": 0.01037770209970486 - }, - "boolq": { - "acc": 0.5948012232415902, - "acc_stderr": 0.008586427929715515 - }, - "arc_easy": { - "acc": 0.6262626262626263, - "acc_stderr": 0.009927267058259628, - "acc_norm": 0.5917508417508418, - "acc_norm_stderr": 0.01008556619579125 - }, - "arc_challenge": { - "acc": 0.29266211604095566, - "acc_stderr": 0.013295916103619417, - "acc_norm": 0.32337883959044367, - "acc_norm_stderr": 0.013669421630012132 - }, - "sciq": { - "acc": 0.904, - "acc_stderr": 0.009320454434783227, - "acc_norm": 0.885, - "acc_norm_stderr": 0.01009340759490462 - }, - "piqa": { - "acc": 0.7622415669205659, - "acc_stderr": 0.009932525779525489, - "acc_norm": 0.763873775843308, - "acc_norm_stderr": 0.009908965890558218 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json deleted file mode 100644 index 25d172096cee2a6033e16c51c520c23abfe04837..0000000000000000000000000000000000000000 --- a/4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r3": { - "acc": 0.3233333333333333, - "acc_stderr": 0.013508372867300217 - }, - "cb": { - "acc": 0.25, - "acc_stderr": 0.058387420812114225, - "f1": 0.22987012987012986 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.47988448516231824, - "acc_stderr": 0.004985741706385727, - "acc_norm": 0.6363274248157738, - "acc_norm_stderr": 0.004800728138792371 - }, - "rte": { - "acc": 0.5631768953068592, - "acc_stderr": 0.02985524739031495 - }, - "winogrande": { - "acc": 0.5824782951854776, - "acc_stderr": 0.013859978264440248 - }, - "storycloze_2016": { - "acc": 0.7177979690005345, - "acc_stderr": 0.010407834479647673 - }, - "boolq": { - "acc": 0.627217125382263, - "acc_stderr": 0.008457255867914694 - }, - "arc_easy": { - "acc": 0.6308922558922558, - "acc_stderr": 0.009901987410242742, - "acc_norm": 0.6123737373737373, - "acc_norm_stderr": 0.009997307914447612 - }, - "arc_challenge": { - "acc": 0.30204778156996587, - "acc_stderr": 0.01341751914471642, - "acc_norm": 0.3216723549488055, - "acc_norm_stderr": 0.013650488084494162 - }, - "sciq": { - "acc": 0.914, - "acc_stderr": 0.008870325962594766, - "acc_norm": 0.883, - "acc_norm_stderr": 0.010169287802713329 - }, - "piqa": { - "acc": 0.7606093579978237, - "acc_stderr": 0.009955884250291681, - "acc_norm": 0.76550598476605, - "acc_norm_stderr": 0.009885203143240543 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json deleted file mode 100644 index 49db01008785eeb84ede436c087b77082fb99bee..0000000000000000000000000000000000000000 --- a/4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095524 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795021 - }, - "anli_r3": { - "acc": 0.3233333333333333, - "acc_stderr": 0.013508372867300212 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.3565868967138097 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.4790878311093408, - "acc_stderr": 0.004985415250690914, - "acc_norm": 0.634833698466441, - "acc_norm_stderr": 0.004804927608773137 - }, - "rte": { - "acc": 0.6064981949458483, - "acc_stderr": 0.029405839314203194 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.7295563869588455, - "acc_stderr": 0.010271810373331027 - }, - "boolq": { - "acc": 0.6241590214067279, - "acc_stderr": 0.008471147248160107 - }, - "arc_easy": { - "acc": 0.6372053872053872, - "acc_stderr": 0.009865936757013942, - "acc_norm": 0.6186868686868687, - "acc_norm_stderr": 0.009966542497171021 - }, - "arc_challenge": { - "acc": 0.30119453924914674, - "acc_stderr": 0.013406741767847624, - "acc_norm": 0.32337883959044367, - "acc_norm_stderr": 0.01366942163001213 - }, - "sciq": { - "acc": 0.91, - "acc_stderr": 0.00905439020486644, - "acc_norm": 0.897, - "acc_norm_stderr": 0.009616833339695796 - }, - "piqa": { - "acc": 0.7540805223068553, - "acc_stderr": 0.01004733186562519, - "acc_norm": 0.7687704026115343, - "acc_norm_stderr": 0.009837063180625334 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json deleted file mode 100644 index 5a0d62eba9e09ae1ee6783fa2e6d5d765560dd65..0000000000000000000000000000000000000000 --- a/4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.345, - "acc_stderr": 0.015039986742055235 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r3": { - "acc": 0.31416666666666665, - "acc_stderr": 0.013405399314984096 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.3647495361781076 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.4819757020513842, - "acc_stderr": 0.004986538243846636, - "acc_norm": 0.6387173869747063, - "acc_norm_stderr": 0.004793904922401888 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5832675611681136, - "acc_stderr": 0.013856250072796322 - }, - "storycloze_2016": { - "acc": 0.7386424371993586, - "acc_stderr": 0.010160471460690485 - }, - "boolq": { - "acc": 0.6275229357798165, - "acc_stderr": 0.008455846866956085 - }, - "arc_easy": { - "acc": 0.6405723905723906, - "acc_stderr": 0.009845958893373766, - "acc_norm": 0.6212121212121212, - "acc_norm_stderr": 0.00995373765654204 - }, - "arc_challenge": { - "acc": 0.30204778156996587, - "acc_stderr": 0.01341751914471642, - "acc_norm": 0.32764505119453924, - "acc_norm_stderr": 0.013715847940719344 - }, - "sciq": { - "acc": 0.92, - "acc_stderr": 0.008583336977753653, - "acc_norm": 0.907, - "acc_norm_stderr": 0.009188875634996702 - }, - "piqa": { - "acc": 0.7551686615886833, - "acc_stderr": 0.01003230910556879, - "acc_norm": 0.76550598476605, - "acc_norm_stderr": 0.00988520314324054 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json deleted file mode 100644 index 63413370f8d5a712578417c5c52021fcc4999356..0000000000000000000000000000000000000000 --- a/4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811475 - }, - "anli_r2": { - "acc": 0.316, - "acc_stderr": 0.014709193056057106 - }, - "anli_r3": { - "acc": 0.31666666666666665, - "acc_stderr": 0.013434078660827384 - }, - "cb": { - "acc": 0.30357142857142855, - "acc_stderr": 0.06199938655510754, - "f1": 0.2503507986266607 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4788886675960964, - "acc_stderr": 0.004985331652408345, - "acc_norm": 0.6412069308902609, - "acc_norm_stderr": 0.004786660691181937 - }, - "rte": { - "acc": 0.5740072202166066, - "acc_stderr": 0.02976495674177765 - }, - "winogrande": { - "acc": 0.5911602209944752, - "acc_stderr": 0.013816954295135684 - }, - "storycloze_2016": { - "acc": 0.7279529663281668, - "acc_stderr": 0.010290888060871242 - }, - "boolq": { - "acc": 0.6275229357798165, - "acc_stderr": 0.008455846866956086 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1 - } -} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_0.json b/4b284b28bc4/evaluation/rankeval/4b284b28bc4_0.json similarity index 100% rename from 4b284b28bc4/evaluation/4b284b28bc4_0.json rename to 4b284b28bc4/evaluation/rankeval/4b284b28bc4_0.json diff --git a/4b284b28bc4/evaluation/4b284b28bc4_1.json b/4b284b28bc4/evaluation/rankeval/4b284b28bc4_1.json similarity index 100% rename from 4b284b28bc4/evaluation/4b284b28bc4_1.json rename to 4b284b28bc4/evaluation/rankeval/4b284b28bc4_1.json diff --git a/4b284b28bc4/evaluation/4b284b28bc4_2.json b/4b284b28bc4/evaluation/rankeval/4b284b28bc4_2.json similarity index 100% rename from 4b284b28bc4/evaluation/4b284b28bc4_2.json rename to 4b284b28bc4/evaluation/rankeval/4b284b28bc4_2.json diff --git a/4b284b28bc4/evaluation/4b284b28bc4_3.json b/4b284b28bc4/evaluation/rankeval/4b284b28bc4_3.json similarity index 100% rename from 4b284b28bc4/evaluation/4b284b28bc4_3.json rename to 4b284b28bc4/evaluation/rankeval/4b284b28bc4_3.json diff --git a/4b284b28bc4/evaluation/4b284b28bc4_4.json b/4b284b28bc4/evaluation/rankeval/4b284b28bc4_4.json similarity index 100% rename from 4b284b28bc4/evaluation/4b284b28bc4_4.json rename to 4b284b28bc4/evaluation/rankeval/4b284b28bc4_4.json diff --git a/4b284b28bc4/evaluation/4b284b28bc4_5.json b/4b284b28bc4/evaluation/rankeval/4b284b28bc4_5.json similarity index 61% rename from 4b284b28bc4/evaluation/4b284b28bc4_5.json rename to 4b284b28bc4/evaluation/rankeval/4b284b28bc4_5.json index 63413370f8d5a712578417c5c52021fcc4999356..25bb69f70330a99445d40d4f21b3c3c78bc262ad 100644 --- a/4b284b28bc4/evaluation/4b284b28bc4_5.json +++ b/4b284b28bc4/evaluation/rankeval/4b284b28bc4_5.json @@ -42,6 +42,30 @@ "boolq": { "acc": 0.6275229357798165, "acc_stderr": 0.008455846866956086 + }, + "arc_easy": { + "acc": 0.6401515151515151, + "acc_stderr": 0.009848484848484846, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.009908978578665755 + }, + "arc_challenge": { + "acc": 0.30887372013651876, + "acc_stderr": 0.013501770929344003, + "acc_norm": 0.32849829351535836, + "acc_norm_stderr": 0.013724978465537377 + }, + "sciq": { + "acc": 0.921, + "acc_stderr": 0.008534156773333445, + "acc_norm": 0.908, + "acc_norm_stderr": 0.00914437639315112 + }, + "piqa": { + "acc": 0.750272034820457, + "acc_stderr": 0.010099232969867486, + "acc_norm": 0.764961915125136, + "acc_norm_stderr": 0.009893146688805312 } }, "versions": { @@ -54,6 +78,10 @@ "rte": 0, "winogrande": 0, "storycloze_2016": 0, - "boolq": 1 + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json deleted file mode 100644 index 3296a9419420e5ec52b95fea7b62c31a9f88794e..0000000000000000000000000000000000000000 --- a/4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.014888272588203931 - }, - "anli_r2": { - "acc": 0.342, - "acc_stderr": 0.01500870618212173 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767784 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.45393112410656267 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4833698466440948, - "acc_stderr": 0.004987020679861267, - "acc_norm": 0.63433578968333, - "acc_norm_stderr": 0.004806316342709393 - }, - "rte": { - "acc": 0.5776173285198556, - "acc_stderr": 0.029731622646495887 - }, - "winogrande": { - "acc": 0.5864246250986582, - "acc_stderr": 0.013840971763195303 - }, - "storycloze_2016": { - "acc": 0.7204703367183325, - "acc_stderr": 0.01037770209970486 - }, - "boolq": { - "acc": 0.5253822629969419, - "acc_stderr": 0.0087337795418535 - }, - "arc_easy": { - "acc": 0.6224747474747475, - "acc_stderr": 0.00994722783346943, - "acc_norm": 0.5462962962962963, - "acc_norm_stderr": 0.010215708295494117 - }, - "arc_challenge": { - "acc": 0.27986348122866894, - "acc_stderr": 0.013119040897725922, - "acc_norm": 0.29266211604095566, - "acc_norm_stderr": 0.01329591610361942 - }, - "sciq": { - "acc": 0.837, - "acc_stderr": 0.011686212712746849, - "acc_norm": 0.757, - "acc_norm_stderr": 0.013569640199177458 - }, - "piqa": { - "acc": 0.7448313384113167, - "acc_stderr": 0.010171571592521822, - "acc_norm": 0.76550598476605, - "acc_norm_stderr": 0.00988520314324054 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json deleted file mode 100644 index cf339e43bef0da4ef04f37f057d335bb54e5bfc7..0000000000000000000000000000000000000000 --- a/4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.31, - "acc_stderr": 0.014632638658632902 - }, - "anli_r2": { - "acc": 0.31, - "acc_stderr": 0.014632638658632905 - }, - "anli_r3": { - "acc": 0.3283333333333333, - "acc_stderr": 0.013562032919529017 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.29749748849204566 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4803823939454292, - "acc_stderr": 0.004985939292819582, - "acc_norm": 0.6294562836088429, - "acc_norm_stderr": 0.004819633668832538 - }, - "rte": { - "acc": 0.44765342960288806, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5887924230465666, - "acc_stderr": 0.013829128358676874 - }, - "storycloze_2016": { - "acc": 0.7049706039551042, - "acc_stderr": 0.010546232606962289 - }, - "boolq": { - "acc": 0.5522935779816514, - "acc_stderr": 0.008697094687974059 - }, - "arc_easy": { - "acc": 0.6262626262626263, - "acc_stderr": 0.009927267058259621, - "acc_norm": 0.5934343434343434, - "acc_norm_stderr": 0.010079056419223527 - }, - "arc_challenge": { - "acc": 0.2883959044368601, - "acc_stderr": 0.013238394422428173, - "acc_norm": 0.3148464163822526, - "acc_norm_stderr": 0.01357265770308495 - }, - "sciq": { - "acc": 0.892, - "acc_stderr": 0.0098200016513457, - "acc_norm": 0.869, - "acc_norm_stderr": 0.010674874844837954 - }, - "piqa": { - "acc": 0.7486398258977149, - "acc_stderr": 0.010121156016819259, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json deleted file mode 100644 index 22cf079e8ba914e5c154466c57a2d14c743145c3..0000000000000000000000000000000000000000 --- a/4b284b42bc4/evaluation/4b284b42bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.306, - "acc_stderr": 0.014580006055436969 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.014876872027456734 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070709007 - }, - "cb": { - "acc": 0.25, - "acc_stderr": 0.058387420812114225, - "f1": 0.2376010151606224 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.47679745070703045, - "acc_stderr": 0.004984405935541087, - "acc_norm": 0.6308504282015535, - "acc_norm_stderr": 0.004815882719278393 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.6101026045777427, - "acc_stderr": 0.013707547317008462 - }, - "storycloze_2016": { - "acc": 0.7199358631747729, - "acc_stderr": 0.01038376499392048 - }, - "boolq": { - "acc": 0.5889908256880734, - "acc_stderr": 0.008605429733982185 - }, - "arc_easy": { - "acc": 0.6388888888888888, - "acc_stderr": 0.00985601342581124, - "acc_norm": 0.6182659932659933, - "acc_norm_stderr": 0.009968648851839672 - }, - "arc_challenge": { - "acc": 0.29948805460750855, - "acc_stderr": 0.013385021637313565, - "acc_norm": 0.3148464163822526, - "acc_norm_stderr": 0.01357265770308495 - }, - "sciq": { - "acc": 0.902, - "acc_stderr": 0.009406619184621238, - "acc_norm": 0.89, - "acc_norm_stderr": 0.009899393819724444 - }, - "piqa": { - "acc": 0.750272034820457, - "acc_stderr": 0.010099232969867488, - "acc_norm": 0.763873775843308, - "acc_norm_stderr": 0.009908965890558218 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json deleted file mode 100644 index 61b8c23bda963770070ae812b469efca25c03862..0000000000000000000000000000000000000000 --- a/4b284b42bc4/evaluation/4b284b42bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.314, - "acc_stderr": 0.014683991951087962 - }, - "anli_r2": { - "acc": 0.342, - "acc_stderr": 0.015008706182121734 - }, - "anli_r3": { - "acc": 0.32416666666666666, - "acc_stderr": 0.013517438120881636 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.37437732746529967 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4823740290778729, - "acc_stderr": 0.004986680048438317, - "acc_norm": 0.6320454092810197, - "acc_norm_stderr": 0.004812633280078256 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5887924230465666, - "acc_stderr": 0.013829128358676878 - }, - "storycloze_2016": { - "acc": 0.7215392838054516, - "acc_stderr": 0.010365521460604417 - }, - "boolq": { - "acc": 0.599388379204893, - "acc_stderr": 0.008570545612096372 - }, - "arc_easy": { - "acc": 0.6342592592592593, - "acc_stderr": 0.00988298806941883, - "acc_norm": 0.6212121212121212, - "acc_norm_stderr": 0.00995373765654204 - }, - "arc_challenge": { - "acc": 0.29180887372013653, - "acc_stderr": 0.013284525292403503, - "acc_norm": 0.3046075085324232, - "acc_norm_stderr": 0.01344952210993249 - }, - "sciq": { - "acc": 0.917, - "acc_stderr": 0.00872852720607479, - "acc_norm": 0.902, - "acc_norm_stderr": 0.009406619184621236 - }, - "piqa": { - "acc": 0.7600652883569097, - "acc_stderr": 0.009963625892809544, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json deleted file mode 100644 index 758e16da35807d91bcb07151f22b10f3a8b3d60b..0000000000000000000000000000000000000000 --- a/4b284b42bc4/evaluation/4b284b42bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932572 - }, - "anli_r2": { - "acc": 0.352, - "acc_stderr": 0.015110404505648663 - }, - "anli_r3": { - "acc": 0.3233333333333333, - "acc_stderr": 0.013508372867300215 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.3098047785547785 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932262 - }, - "hellaswag": { - "acc": 0.47849034056960765, - "acc_stderr": 0.00498516207433611, - "acc_norm": 0.6403106950806612, - "acc_norm_stderr": 0.00478928472395585 - }, - "rte": { - "acc": 0.4729241877256318, - "acc_stderr": 0.030052303463143706 - }, - "winogrande": { - "acc": 0.595895816890292, - "acc_stderr": 0.01379161066467086 - }, - "storycloze_2016": { - "acc": 0.7279529663281668, - "acc_stderr": 0.01029088806087124 - }, - "boolq": { - "acc": 0.6143730886850153, - "acc_stderr": 0.008513189460768057 - }, - "arc_easy": { - "acc": 0.6447811447811448, - "acc_stderr": 0.009820245899287119, - "acc_norm": 0.6195286195286195, - "acc_norm_stderr": 0.009962305992058567 - }, - "arc_challenge": { - "acc": 0.295221843003413, - "acc_stderr": 0.013329750293382316, - "acc_norm": 0.3046075085324232, - "acc_norm_stderr": 0.013449522109932487 - }, - "sciq": { - "acc": 0.918, - "acc_stderr": 0.008680515615523705, - "acc_norm": 0.902, - "acc_norm_stderr": 0.009406619184621224 - }, - "piqa": { - "acc": 0.7562568008705114, - "acc_stderr": 0.010017199471500619, - "acc_norm": 0.7622415669205659, - "acc_norm_stderr": 0.009932525779525492 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json deleted file mode 100644 index ddad2a41639e40b03f98968f1bc0776dfbd23de4..0000000000000000000000000000000000000000 --- a/4b284b42bc4/evaluation/4b284b42bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.323, - "acc_stderr": 0.014794927843348639 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811475 - }, - "anli_r3": { - "acc": 0.3275, - "acc_stderr": 0.013553211167251961 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.32470238095238096 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4792869946225851, - "acc_stderr": 0.004985498055190358, - "acc_norm": 0.6384186417048396, - "acc_norm_stderr": 0.004794764843685288 - }, - "rte": { - "acc": 0.5054151624548736, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5880031570639306, - "acc_stderr": 0.013833112857645937 - }, - "storycloze_2016": { - "acc": 0.7306253340459647, - "acc_stderr": 0.010258997754057014 - }, - "boolq": { - "acc": 0.618960244648318, - "acc_stderr": 0.008493937524439337 - }, - "arc_easy": { - "acc": 0.6426767676767676, - "acc_stderr": 0.00983320561246312, - "acc_norm": 0.625, - "acc_norm_stderr": 0.009933992677987828 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_0.json b/4b284b42bc4/evaluation/rankeval/4b284b42bc4_0.json similarity index 100% rename from 4b284b42bc4/evaluation/4b284b42bc4_0.json rename to 4b284b42bc4/evaluation/rankeval/4b284b42bc4_0.json diff --git a/4b284b42bc4/evaluation/4b284b42bc4_1.json b/4b284b42bc4/evaluation/rankeval/4b284b42bc4_1.json similarity index 100% rename from 4b284b42bc4/evaluation/4b284b42bc4_1.json rename to 4b284b42bc4/evaluation/rankeval/4b284b42bc4_1.json diff --git a/4b284b42bc4/evaluation/4b284b42bc4_2.json b/4b284b42bc4/evaluation/rankeval/4b284b42bc4_2.json similarity index 100% rename from 4b284b42bc4/evaluation/4b284b42bc4_2.json rename to 4b284b42bc4/evaluation/rankeval/4b284b42bc4_2.json diff --git a/4b284b42bc4/evaluation/4b284b42bc4_3.json b/4b284b42bc4/evaluation/rankeval/4b284b42bc4_3.json similarity index 100% rename from 4b284b42bc4/evaluation/4b284b42bc4_3.json rename to 4b284b42bc4/evaluation/rankeval/4b284b42bc4_3.json diff --git a/4b284b42bc4/evaluation/4b284b42bc4_4.json b/4b284b42bc4/evaluation/rankeval/4b284b42bc4_4.json similarity index 100% rename from 4b284b42bc4/evaluation/4b284b42bc4_4.json rename to 4b284b42bc4/evaluation/rankeval/4b284b42bc4_4.json diff --git a/4b284b42bc4/evaluation/4b284b42bc4_5.json b/4b284b42bc4/evaluation/rankeval/4b284b42bc4_5.json similarity index 70% rename from 4b284b42bc4/evaluation/4b284b42bc4_5.json rename to 4b284b42bc4/evaluation/rankeval/4b284b42bc4_5.json index ddad2a41639e40b03f98968f1bc0776dfbd23de4..3546c29a41653e7ddae8120d8f47b1083787508d 100644 --- a/4b284b42bc4/evaluation/4b284b42bc4_5.json +++ b/4b284b42bc4/evaluation/rankeval/4b284b42bc4_5.json @@ -48,6 +48,24 @@ "acc_stderr": 0.00983320561246312, "acc_norm": 0.625, "acc_norm_stderr": 0.009933992677987828 + }, + "arc_challenge": { + "acc": 0.29948805460750855, + "acc_stderr": 0.013385021637313565, + "acc_norm": 0.31313993174061433, + "acc_norm_stderr": 0.013552671543623504 + }, + "sciq": { + "acc": 0.919, + "acc_stderr": 0.008632121032139964, + "acc_norm": 0.911, + "acc_norm_stderr": 0.009008893392651526 + }, + "piqa": { + "acc": 0.7557127312295974, + "acc_stderr": 0.010024765172284247, + "acc_norm": 0.7616974972796517, + "acc_norm_stderr": 0.009940334245876222 } }, "versions": { @@ -61,6 +79,9 @@ "winogrande": 0, "storycloze_2016": 0, "boolq": 1, - "arc_easy": 0 + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_5.json b/4b284b84bc4/evaluation/4b284b84bc4_5.json index 44bf5e7100931dff1543cecb01710e6eaf24ce07..fa71de74d0dca54d60d4087a32d9f3d84a7c80e9 100644 --- a/4b284b84bc4/evaluation/4b284b84bc4_5.json +++ b/4b284b84bc4/evaluation/4b284b84bc4_5.json @@ -48,6 +48,24 @@ "acc_stderr": 0.010091953527506246, "acc_norm": 0.5791245791245792, "acc_norm_stderr": 0.01013050216406634 + }, + "arc_challenge": { + "acc": 0.28754266211604096, + "acc_stderr": 0.01322671905626613, + "acc_norm": 0.31313993174061433, + "acc_norm_stderr": 0.013552671543623504 + }, + "sciq": { + "acc": 0.918, + "acc_stderr": 0.008680515615523746, + "acc_norm": 0.917, + "acc_norm_stderr": 0.00872852720607479 + }, + "piqa": { + "acc": 0.7317736670293797, + "acc_stderr": 0.010336761992404485, + "acc_norm": 0.7448313384113167, + "acc_norm_stderr": 0.010171571592521828 } }, "versions": { @@ -61,6 +79,9 @@ "winogrande": 0, "storycloze_2016": 0, "boolq": 1, - "arc_easy": 0 + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json b/4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json index 44bf5e7100931dff1543cecb01710e6eaf24ce07..fa71de74d0dca54d60d4087a32d9f3d84a7c80e9 100644 --- a/4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json +++ b/4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json @@ -48,6 +48,24 @@ "acc_stderr": 0.010091953527506246, "acc_norm": 0.5791245791245792, "acc_norm_stderr": 0.01013050216406634 + }, + "arc_challenge": { + "acc": 0.28754266211604096, + "acc_stderr": 0.01322671905626613, + "acc_norm": 0.31313993174061433, + "acc_norm_stderr": 0.013552671543623504 + }, + "sciq": { + "acc": 0.918, + "acc_stderr": 0.008680515615523746, + "acc_norm": 0.917, + "acc_norm_stderr": 0.00872852720607479 + }, + "piqa": { + "acc": 0.7317736670293797, + "acc_stderr": 0.010336761992404485, + "acc_norm": 0.7448313384113167, + "acc_norm_stderr": 0.010171571592521828 } }, "versions": { @@ -61,6 +79,9 @@ "winogrande": 0, "storycloze_2016": 0, "boolq": 1, - "arc_easy": 0 + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file