Update README.md
Browse files
README.md
CHANGED
@@ -5,6 +5,9 @@ license: apache-2.0
|
|
5 |
|
6 |
WIP
|
7 |
|
|
|
|
|
|
|
8 |
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|
9 |
|----------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|
10 |
|leaderboard_gpqa | N/A| | | | | | | |
|
@@ -12,9 +15,42 @@ WIP
|
|
12 |
| - leaderboard_gpqa_extended| 1|none | 0|acc_norm|↑ |0.2308|± |0.0180|
|
13 |
| - leaderboard_gpqa_main | 1|none | 0|acc_norm|↑ |0.2679|± |0.0209|
|
14 |
|
|
|
|
|
15 |
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|
16 |
|-------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|
17 |
|leaderboard_musr | N/A| | | | | | | |
|
18 |
| - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm|↑ |0.5160|± |0.0317|
|
19 |
| - leaderboard_musr_object_placements| 1|none | 0|acc_norm|↑ |0.2383|± |0.0267|
|
20 |
-
| - leaderboard_musr_team_allocation | 1|none | 0|acc_norm|↑ |0.4400|± |0.0315|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
WIP
|
7 |
|
8 |
+
## GPQA
|
9 |
+
|
10 |
+
|
11 |
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|
12 |
|----------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|
13 |
|leaderboard_gpqa | N/A| | | | | | | |
|
|
|
15 |
| - leaderboard_gpqa_extended| 1|none | 0|acc_norm|↑ |0.2308|± |0.0180|
|
16 |
| - leaderboard_gpqa_main | 1|none | 0|acc_norm|↑ |0.2679|± |0.0209|
|
17 |
|
18 |
+
## MuSR
|
19 |
+
|
20 |
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|
21 |
|-------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|
22 |
|leaderboard_musr | N/A| | | | | | | |
|
23 |
| - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm|↑ |0.5160|± |0.0317|
|
24 |
| - leaderboard_musr_object_placements| 1|none | 0|acc_norm|↑ |0.2383|± |0.0267|
|
25 |
+
| - leaderboard_musr_team_allocation | 1|none | 0|acc_norm|↑ |0.4400|± |0.0315|
|
26 |
+
|
27 |
+
|
28 |
+
## BBH
|
29 |
+
|
30 |
+
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|
31 |
+
|----------------------------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
|
32 |
+
|leaderboard_bbh | N/A| | | | | | | |
|
33 |
+
| - leaderboard_bbh_boolean_expressions | 1|none | 3|acc_norm|↑ |0.5480|± |0.0315|
|
34 |
+
| - leaderboard_bbh_causal_judgement | 1|none | 3|acc_norm|↑ |0.4652|± |0.0366|
|
35 |
+
| - leaderboard_bbh_date_understanding | 1|none | 3|acc_norm|↑ |0.1560|± |0.0230|
|
36 |
+
| - leaderboard_bbh_disambiguation_qa | 1|none | 3|acc_norm|↑ |0.3120|± |0.0294|
|
37 |
+
| - leaderboard_bbh_formal_fallacies | 1|none | 3|acc_norm|↑ |0.5240|± |0.0316|
|
38 |
+
| - leaderboard_bbh_geometric_shapes | 1|none | 3|acc_norm|↑ |0.2040|± |0.0255|
|
39 |
+
| - leaderboard_bbh_hyperbaton | 1|none | 3|acc_norm|↑ |0.5000|± |0.0317|
|
40 |
+
| - leaderboard_bbh_logical_deduction_five_objects | 1|none | 3|acc_norm|↑ |0.2240|± |0.0264|
|
41 |
+
| - leaderboard_bbh_logical_deduction_seven_objects | 1|none | 3|acc_norm|↑ |0.1440|± |0.0222|
|
42 |
+
| - leaderboard_bbh_logical_deduction_three_objects | 1|none | 3|acc_norm|↑ |0.3320|± |0.0298|
|
43 |
+
| - leaderboard_bbh_movie_recommendation | 1|none | 3|acc_norm|↑ |0.2440|± |0.0272|
|
44 |
+
| - leaderboard_bbh_navigate | 1|none | 3|acc_norm|↑ |0.5800|± |0.0313|
|
45 |
+
| - leaderboard_bbh_object_counting | 1|none | 3|acc_norm|↑ |0.2080|± |0.0257|
|
46 |
+
| - leaderboard_bbh_penguins_in_a_table | 1|none | 3|acc_norm|↑ |0.2123|± |0.0340|
|
47 |
+
| - leaderboard_bbh_reasoning_about_colored_objects | 1|none | 3|acc_norm|↑ |0.1320|± |0.0215|
|
48 |
+
| - leaderboard_bbh_ruin_names | 1|none | 3|acc_norm|↑ |0.2480|± |0.0274|
|
49 |
+
| - leaderboard_bbh_salient_translation_error_detection | 1|none | 3|acc_norm|↑ |0.2120|± |0.0259|
|
50 |
+
| - leaderboard_bbh_snarks | 1|none | 3|acc_norm|↑ |0.5281|± |0.0375|
|
51 |
+
| - leaderboard_bbh_sports_understanding | 1|none | 3|acc_norm|↑ |0.4600|± |0.0316|
|
52 |
+
| - leaderboard_bbh_temporal_sequences | 1|none | 3|acc_norm|↑ |0.2800|± |0.0285|
|
53 |
+
| - leaderboard_bbh_tracking_shuffled_objects_five_objects | 1|none | 3|acc_norm|↑ |0.1720|± |0.0239|
|
54 |
+
| - leaderboard_bbh_tracking_shuffled_objects_seven_objects| 1|none | 3|acc_norm|↑ |0.1440|± |0.0222|
|
55 |
+
| - leaderboard_bbh_tracking_shuffled_objects_three_objects| 1|none | 3|acc_norm|↑ |0.3000|± |0.0290|
|
56 |
+
| - leaderboard_bbh_web_of_lies | 1|none | 3|acc_norm|↑ |0.5480|± |0.0315|
|