Update README.md
Browse files
README.md
CHANGED
@@ -37,9 +37,9 @@ model-index:
|
|
37 |
type: openai_humaneval
|
38 |
name: OpenAI HumanEval
|
39 |
metrics:
|
40 |
-
- name: pass@1 (t=0.
|
41 |
type: pass@1
|
42 |
-
value:
|
43 |
- name: pass@10 (t=0.8)
|
44 |
type: pass@10
|
45 |
value: 65.755
|
@@ -49,9 +49,9 @@ model-index:
|
|
49 |
type: mbpp
|
50 |
name: Mostly Basic Python Problems (mbpp)
|
51 |
metrics:
|
52 |
-
- name: pass@1 (t=0.
|
53 |
type: pass@1
|
54 |
-
value: 39.
|
55 |
- name: pass@10 (t=0.8)
|
56 |
type: pass@10
|
57 |
value: 59.895
|
@@ -61,7 +61,7 @@ model-index:
|
|
61 |
type: race
|
62 |
name: RACE
|
63 |
metrics:
|
64 |
-
- name: accuracy
|
65 |
type: accuracy
|
66 |
value: 41.148
|
67 |
- task:
|
@@ -70,7 +70,10 @@ model-index:
|
|
70 |
type: mmlu
|
71 |
name: Measuring Massive Multitask Language Understanding (MMLU)
|
72 |
metrics:
|
73 |
-
- name: accuracy
|
|
|
|
|
|
|
74 |
type: accuracy
|
75 |
value: 52.789
|
76 |
- task:
|
@@ -79,7 +82,7 @@ model-index:
|
|
79 |
type: truthful_qa
|
80 |
name: Truthful QA
|
81 |
metrics:
|
82 |
-
- name: accuracy
|
83 |
type: accuracy
|
84 |
value: 47.29
|
85 |
- task:
|
@@ -100,7 +103,7 @@ model-index:
|
|
100 |
type: copa
|
101 |
name: COPA
|
102 |
metrics:
|
103 |
-
- name: accuracy
|
104 |
type: accuracy
|
105 |
value: 85
|
106 |
- task:
|
@@ -109,7 +112,7 @@ model-index:
|
|
109 |
type: boolq
|
110 |
name: Boolq
|
111 |
metrics:
|
112 |
-
- name: accuracy
|
113 |
type: accuracy
|
114 |
value: 82.783
|
115 |
- task:
|
@@ -118,7 +121,7 @@ model-index:
|
|
118 |
type: openbookqa
|
119 |
name: Openbook QA
|
120 |
metrics:
|
121 |
-
- name: accuracy
|
122 |
type: accuracy
|
123 |
value: 42
|
124 |
- task:
|
@@ -139,7 +142,7 @@ model-index:
|
|
139 |
type: piqa
|
140 |
name: PIQA
|
141 |
metrics:
|
142 |
-
- name: accuracy
|
143 |
type: accuracy
|
144 |
value: 77.856
|
145 |
- task:
|
@@ -148,7 +151,7 @@ model-index:
|
|
148 |
type: ai2_arc
|
149 |
name: ARC (Easy)
|
150 |
metrics:
|
151 |
-
- name: accuracy
|
152 |
type: accuracy
|
153 |
value: 70.328
|
154 |
- task:
|
|
|
37 |
type: openai_humaneval
|
38 |
name: OpenAI HumanEval
|
39 |
metrics:
|
40 |
+
- name: pass@1 (t=0.2)
|
41 |
type: pass@1
|
42 |
+
value: 34.116
|
43 |
- name: pass@10 (t=0.8)
|
44 |
type: pass@10
|
45 |
value: 65.755
|
|
|
49 |
type: mbpp
|
50 |
name: Mostly Basic Python Problems (mbpp)
|
51 |
metrics:
|
52 |
+
- name: pass@1 (t=0.1)
|
53 |
type: pass@1
|
54 |
+
value: 39.112
|
55 |
- name: pass@10 (t=0.8)
|
56 |
type: pass@10
|
57 |
value: 59.895
|
|
|
61 |
type: race
|
62 |
name: RACE
|
63 |
metrics:
|
64 |
+
- name: accuracy (0 shot)
|
65 |
type: accuracy
|
66 |
value: 41.148
|
67 |
- task:
|
|
|
70 |
type: mmlu
|
71 |
name: Measuring Massive Multitask Language Understanding (MMLU)
|
72 |
metrics:
|
73 |
+
- name: accuracy (5 shot)
|
74 |
+
type: accuracy
|
75 |
+
value: 53.215
|
76 |
+
- name: accuracy (0 shot)
|
77 |
type: accuracy
|
78 |
value: 52.789
|
79 |
- task:
|
|
|
82 |
type: truthful_qa
|
83 |
name: Truthful QA
|
84 |
metrics:
|
85 |
+
- name: accuracy (0 shot)
|
86 |
type: accuracy
|
87 |
value: 47.29
|
88 |
- task:
|
|
|
103 |
type: copa
|
104 |
name: COPA
|
105 |
metrics:
|
106 |
+
- name: accuracy (0 shot)
|
107 |
type: accuracy
|
108 |
value: 85
|
109 |
- task:
|
|
|
112 |
type: boolq
|
113 |
name: Boolq
|
114 |
metrics:
|
115 |
+
- name: accuracy (0 shot)
|
116 |
type: accuracy
|
117 |
value: 82.783
|
118 |
- task:
|
|
|
121 |
type: openbookqa
|
122 |
name: Openbook QA
|
123 |
metrics:
|
124 |
+
- name: accuracy (0 shot)
|
125 |
type: accuracy
|
126 |
value: 42
|
127 |
- task:
|
|
|
142 |
type: piqa
|
143 |
name: PIQA
|
144 |
metrics:
|
145 |
+
- name: accuracy (0 shot)
|
146 |
type: accuracy
|
147 |
value: 77.856
|
148 |
- task:
|
|
|
151 |
type: ai2_arc
|
152 |
name: ARC (Easy)
|
153 |
metrics:
|
154 |
+
- name: accuracy (0 shot)
|
155 |
type: accuracy
|
156 |
value: 70.328
|
157 |
- task:
|