aloobun committed on
Commit
68a4ea4
·
verified ·
1 Parent(s): a0756d4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +37 -1
README.md CHANGED
@@ -5,6 +5,9 @@ license: apache-2.0
5
 
6
  WIP
7
 
 
 
 
8
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
9
  |----------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
10
  |leaderboard_gpqa | N/A| | | | | | | |
@@ -12,9 +15,42 @@ WIP
12
  | - leaderboard_gpqa_extended| 1|none | 0|acc_norm|↑ |0.2308|± |0.0180|
13
  | - leaderboard_gpqa_main | 1|none | 0|acc_norm|↑ |0.2679|± |0.0209|
14
 
 
 
15
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
16
  |-------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
17
  |leaderboard_musr | N/A| | | | | | | |
18
  | - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm|↑ |0.5160|± |0.0317|
19
  | - leaderboard_musr_object_placements| 1|none | 0|acc_norm|↑ |0.2383|± |0.0267|
20
- | - leaderboard_musr_team_allocation | 1|none | 0|acc_norm|↑ |0.4400|± |0.0315|
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  WIP
7
 
8
+ ## GPQA
9
+
10
+
11
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
12
  |----------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
13
  |leaderboard_gpqa | N/A| | | | | | | |
 
15
  | - leaderboard_gpqa_extended| 1|none | 0|acc_norm|↑ |0.2308|± |0.0180|
16
  | - leaderboard_gpqa_main | 1|none | 0|acc_norm|↑ |0.2679|± |0.0209|
17
 
18
+ ## MUSR
19
+
20
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
21
  |-------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
22
  |leaderboard_musr | N/A| | | | | | | |
23
  | - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm|↑ |0.5160|± |0.0317|
24
  | - leaderboard_musr_object_placements| 1|none | 0|acc_norm|↑ |0.2383|± |0.0267|
25
+ | - leaderboard_musr_team_allocation | 1|none | 0|acc_norm|↑ |0.4400|± |0.0315|
26
+
27
+
28
+ ## BBH
29
+
30
+ | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
31
+ |----------------------------------------------------------|-------|------|-----:|--------|---|-----:|---|-----:|
32
+ |leaderboard_bbh | N/A| | | | | | | |
33
+ | - leaderboard_bbh_boolean_expressions | 1|none | 3|acc_norm|↑ |0.5480|± |0.0315|
34
+ | - leaderboard_bbh_causal_judgement | 1|none | 3|acc_norm|↑ |0.4652|± |0.0366|
35
+ | - leaderboard_bbh_date_understanding | 1|none | 3|acc_norm|↑ |0.1560|± |0.0230|
36
+ | - leaderboard_bbh_disambiguation_qa | 1|none | 3|acc_norm|↑ |0.3120|± |0.0294|
37
+ | - leaderboard_bbh_formal_fallacies | 1|none | 3|acc_norm|↑ |0.5240|± |0.0316|
38
+ | - leaderboard_bbh_geometric_shapes | 1|none | 3|acc_norm|↑ |0.2040|± |0.0255|
39
+ | - leaderboard_bbh_hyperbaton | 1|none | 3|acc_norm|↑ |0.5000|± |0.0317|
40
+ | - leaderboard_bbh_logical_deduction_five_objects | 1|none | 3|acc_norm|↑ |0.2240|± |0.0264|
41
+ | - leaderboard_bbh_logical_deduction_seven_objects | 1|none | 3|acc_norm|↑ |0.1440|± |0.0222|
42
+ | - leaderboard_bbh_logical_deduction_three_objects | 1|none | 3|acc_norm|↑ |0.3320|± |0.0298|
43
+ | - leaderboard_bbh_movie_recommendation | 1|none | 3|acc_norm|↑ |0.2440|± |0.0272|
44
+ | - leaderboard_bbh_navigate | 1|none | 3|acc_norm|↑ |0.5800|± |0.0313|
45
+ | - leaderboard_bbh_object_counting | 1|none | 3|acc_norm|↑ |0.2080|± |0.0257|
46
+ | - leaderboard_bbh_penguins_in_a_table | 1|none | 3|acc_norm|↑ |0.2123|± |0.0340|
47
+ | - leaderboard_bbh_reasoning_about_colored_objects | 1|none | 3|acc_norm|↑ |0.1320|± |0.0215|
48
+ | - leaderboard_bbh_ruin_names | 1|none | 3|acc_norm|↑ |0.2480|± |0.0274|
49
+ | - leaderboard_bbh_salient_translation_error_detection | 1|none | 3|acc_norm|↑ |0.2120|± |0.0259|
50
+ | - leaderboard_bbh_snarks | 1|none | 3|acc_norm|↑ |0.5281|± |0.0375|
51
+ | - leaderboard_bbh_sports_understanding | 1|none | 3|acc_norm|↑ |0.4600|± |0.0316|
52
+ | - leaderboard_bbh_temporal_sequences | 1|none | 3|acc_norm|↑ |0.2800|± |0.0285|
53
+ | - leaderboard_bbh_tracking_shuffled_objects_five_objects | 1|none | 3|acc_norm|↑ |0.1720|± |0.0239|
54
+ | - leaderboard_bbh_tracking_shuffled_objects_seven_objects| 1|none | 3|acc_norm|↑ |0.1440|± |0.0222|
55
+ | - leaderboard_bbh_tracking_shuffled_objects_three_objects| 1|none | 3|acc_norm|↑ |0.3000|± |0.0290|
56
+ | - leaderboard_bbh_web_of_lies | 1|none | 3|acc_norm|↑ |0.5480|± |0.0315|