Update README.md

**README.md** (changed)

In this section, we report the results for EdgeRunner-Tactical-7B models on standard benchmarks.
### Arena-Hard Benchmark

| Model | Score | CI | Avg Tokens |
|:--------------------------------|:----------|:--------------------|:------------|
| gpt-4-turbo-2024-04-09 | 82.63 | (-1.71, +1.57) | 662.0 |
| claude-3-5-sonnet-20240620 | 79.35 | (-1.45, +2.06) | 567.0 |
| gpt-4o-2024-05-13 | 79.21 | (-1.50, +1.66) | 696.0 |
| gpt-4-0125-preview | 77.96 | (-2.12, +1.63) | 619.0 |
| gpt-4o-mini | 74.94 | (-2.40, +1.75) | 668.0 |
| gemini-1.5-pro-api-0514 | 71.96 | (-2.39, +2.10) | 676.0 |
| yi-large-preview | 71.48 | (-2.03, +3.14) | 720.0 |
| glm-4-0520 | 63.84 | (-2.72, +1.81) | 636.0 |
| yi-large | 63.7 | (-2.72, +2.21) | 626.0 |
| deepseek-coder-v2 | 62.3 | (-1.73, +2.41) | 578.0 |
| claude-3-opus-20240229 | 60.36 | (-2.84, +2.75) | 541.0 |
| gemma-2-27b-it | 57.51 | (-2.35, +2.46) | 577.0 |
| glm-4-0116 | 55.72 | (-2.51, +2.31) | 622.0 |
| gemini-1.5-pro-api-0409-preview | 53.37 | (-2.53, +1.89) | 478.0 |
| glm-4-air | 50.88 | (-2.60, +2.45) | 619.0 |
| gpt-4-0314 | 50.0 | (-0.00, +0.00) | 423.0 |
| gemini-1.5-flash-api-0514 | 49.61 | (-2.93, +2.85) | 642.0 |
| qwen2-72b-instruct | 46.86 | (-2.51, +2.22) | 515.0 |
| claude-3-sonnet-20240229 | 46.8 | (-2.94, +2.35) | 552.0 |
| llama-3-70b-instruct | 46.57 | (-2.00, +2.66) | 591.0 |
| claude-3-haiku-20240307 | 41.47 | (-2.15, +2.65) | 505.0 |
| gpt-4-0613 | 37.9 | (-2.21, +2.51) | 354.0 |
| mistral-large-2402 | 37.71 | (-1.88, +2.77) | 400.0 |
| **EdgeRunner-Tactical-7B** | **37.47** | **(-2.74, +2.57)** | **721.0** |
| mixtral-8x22b-instruct-v0.1 | 36.36 | (-2.61, +2.60) | 430.0 |
| qwen1.5-72b-chat | 36.12 | (-2.81, +2.39) | 474.0 |
| phi-3-medium-4k-instruct | 33.37 | (-2.02, +2.25) | 517.0 |
| mistral-medium | 31.9 | (-2.54, +2.13) | 485.0 |
| phi-3-small-8k-instruct | 29.77 | (-2.16, +2.02) | 568.0 |
| mistral-next | 27.37 | (-1.90, +1.99) | 297.0 |
| qwen2-7b-instruct | 25.2 | (-1.55, +2.46) | 618.0 |
| gpt-3.5-turbo-0613 | 24.82 | (-2.15, +1.90) | 401.0 |
| claude-2.0 | 23.99 | (-1.90, +1.75) | 295.0 |
| Arcee-Spark | 23.52 | (-2.03, +1.73) | 622.0 |
| mixtral-8x7b-instruct-v0.1 | 23.4 | (-1.87, +1.73) | 457.0 |
| gpt-3.5-turbo-0125 | 23.34 | (-1.46, +2.31) | 329.0 |
| yi-34b-chat | 23.15 | (-2.15, +1.85) | 611.0 |
| starling-lm-7b-beta | 23.01 | (-1.98, +1.71) | 530.0 |
| claude-2.1 | 22.77 | (-1.48, +2.38) | 290.0 |
| llama-3-8b-instruct | 20.56 | (-1.65, +2.09) | 585.0 |
| gpt-3.5-turbo-1106 | 18.87 | (-1.79, +2.34) | 285.0 |
| gpt-3.5-turbo-0314 | 18.05 | (-1.47, +2.09) | 334.0 |
| gemini-pro | 17.8 | (-1.65, +1.54) | 322.0 |
| phi-3-mini-128k-instruct | 15.43 | (-1.71, +1.60) | 609.0 |
| mistral-7b-instruct | 12.57 | (-1.58, +1.54) | 541.0 |
| gemma-1.1-7b-it | 12.09 | (-1.35, +1.56) | 341.0 |
| llama-2-70b-chat | 11.55 | (-1.18, +1.27) | 595.0 |
| vicuna-33b | 8.63 | (-0.88, +1.28) | 451.0 |
| gemma-7b-it | 7.47 | (-1.05, +1.09) | 378.0 |
| gemma-1.1-2b-it | 3.37 | (-0.67, +0.70) | 316.0 |
| gemma-2b-it | 3.0 | (-0.68, +0.62) | 369.0 |
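The CI column is the bootstrapped 95% confidence interval that Arena-Hard attaches to each score, and gpt-4-0314 sits at exactly 50.0 because it is the baseline every model is compared against. As a rough illustration of where such intervals come from, here is a minimal percentile-bootstrap sketch over hypothetical per-battle outcomes; it is not the benchmark's actual pipeline.

```python
import numpy as np

def bootstrap_ci(outcomes: np.ndarray, n_boot: int = 1000, seed: int = 0):
    """Percentile-bootstrap 95% CI for a mean score over per-battle outcomes."""
    rng = np.random.default_rng(seed)
    score = outcomes.mean()
    boots = np.array([
        rng.choice(outcomes, size=outcomes.size, replace=True).mean()
        for _ in range(n_boot)
    ])
    lo, hi = np.quantile(boots, [0.025, 0.975])
    return 100 * score, 100 * (lo - score), 100 * (hi - score)

# Hypothetical judged battles against the baseline: 1 = win, 0.5 = tie, 0 = loss.
outcomes = np.array([1, 0, 1, 0.5, 1, 0, 0, 1, 1, 0.5, 0, 1])
score, minus, plus = bootstrap_ci(outcomes)
print(f"{score:.2f} ({minus:+.2f}, {plus:+.2f})")
```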
### InfiniteBench

| Task Name | GPT-4 | YaRN-Mistral-7B | Kimi-Chat | Claude 2 | Yi-6B-200K | Yi-34B-200K | Chatglm3-6B-128K | EdgeRunner-Tactical-7B | Qwen2-7B-Instruct |
|:-----------------|:-------|:----------------|:----------|:---------|:-----------|:------------|:-----------------|:-----------------------|:------------------|
| Retrieve.PassKey | 100% | 92.71% | 98.14% | 97.80% | 100.00% | 100.00% | 92.20% | 100% | 100% |
| Retrieve.Number | 100% | 56.61% | 95.42% | 98.14% | 94.92% | 100.00% | 80.68% | 100% | 99.83% |
| Retrieve.KV | 89.00% | < 5% | 53.60% | 65.40% | < 5% | < 5% | < 5% | 2.2% | 1.8% |
| En.Sum | 14.73% | 9.09% | 17.96% | 14.50% | < 5% | < 5% | < 5% | 33.07% | 29.13% |
| En.QA | 22.44% | 9.55% | 16.52% | 11.97% | 9.20% | 12.17% | < 5% | 3.4% | 9.09% |
| En.MC | 67.25% | 27.95% | 72.49% | 62.88% | 36.68% | 38.43% | 10.48% | 66.81% | 66.37% |
| En.Dia | 8.50% | 7.50% | 11.50% | 46.50% | < 5% | < 5% | < 5% | 29% | 17% |
| Zh.QA | 25.96% | 16.98% | 17.93% | 9.64% | 15.07% | 13.61% | < 5% | 4.6% | 11.14% |
| Code.Debug | 37.06% | < 5% | 17.77% | < 5% | 9.14% | 13.96% | 7.36% | 22.08% | 24.61% |
| Code.Run | 23.25% | < 5% | < 5% | < 5% | < 5% | < 5% | < 5% | 0% | 0.5% |
| Math.Calc | < 5% | < 5% | < 5% | < 5% | < 5% | < 5% | < 5% | 0% | 0% |
| Math.Find | 60.00% | 17.14% | 12.57% | 32.29% | < 5% | 25.71% | 7.71% | 29.14% | 31.42% |
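Retrieve.PassKey and Retrieve.Number are needle-in-a-haystack tasks: a short key is buried inside roughly 100K tokens of filler and the model must repeat it back, which is why several models saturate them while Retrieve.KV, Code.Run, and Math.Calc still separate the field. Below is a minimal sketch of how such a prompt can be constructed; the filler sentence and phrasing are illustrative stand-ins, not InfiniteBench's exact template.

```python
import random

def make_passkey_prompt(n_filler_lines: int = 5000, seed: int = 0) -> tuple[str, str]:
    """Hide a 5-digit pass key at a random position inside long filler text."""
    rng = random.Random(seed)
    passkey = str(rng.randint(10000, 99999))
    filler = "The grass is green. The sky is blue. The sun is yellow. Here we go."
    lines = [filler] * n_filler_lines
    lines.insert(rng.randrange(n_filler_lines), f"The pass key is {passkey}. Remember it.")
    prompt = "\n".join(lines) + "\nWhat is the pass key?"
    return prompt, passkey

prompt, answer = make_passkey_prompt()
print(f"{len(prompt):,} characters; expected answer: {answer}")
```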
### GSM@ZeroEval

| Model | Acc | No answer | Reason Lens |
|:------------------------------|:----------|:----------|:------------|
| Llama-3.1-405B-Instruct-Turbo | 95.91 | 0.08 | 365.07 |
| claude-3-5-sonnet-20240620 | 95.6 | 0 | 465.19 |
| claude-3-opus-20240229 | 95.6 | 0 | 410.62 |
| gpt-4o-2024-05-13 | 95.38 | 0 | 479.98 |
| gpt-4o-mini-2024-07-18 | 94.24 | 0 | 463.71 |
| deepseek-chat | 93.93 | 0 | 495.52 |
| deepseek-coder | 93.78 | 0 | 566.89 |
| gemini-1.5-pro | 93.4 | 0 | 389.17 |
| Meta-Llama-3-70B-Instruct | 93.03 | 0 | 352.05 |
| Qwen2-72B-Instruct | 92.65 | 0 | 375.96 |
| claude-3-sonnet-20240229 | 91.51 | 0 | 762.69 |
| gemini-1.5-flash | 91.36 | 0 | 344.61 |
| gemma-2-27b-it@together | 90.22 | 0 | 364.68 |
| claude-3-haiku-20240307 | 88.78 | 0 | 587.65 |
| gemma-2-9b-it | 87.41 | 0 | 394.83 |
| reka-core-20240501 | 87.41 | 0.08 | 414.7 |
| Athene-70B | 86.66 | 0.3 | 253.53 |
| Yi-1.5-34B-Chat | 84.08 | 0.08 | 553.47 |
| Llama-3.1-8B-Instruct | 82.87 | 0.45 | 414.19 |
| Mistral-Nemo-Instruct-2407 | 82.79 | 0 | 349.81 |
| yi-large-preview | 82.64 | 0 | 514.25 |
| **EdgeRunner-Tactical-7B** | **81.12** | **0.08** | **615.89** |
| gpt-3.5-turbo-0125 | 80.36 | 0 | 350.97 |
| command-r-plus | 80.14 | 0.08 | 294.08 |
| Qwen2-7B-Instruct | 80.06 | 0 | 452.6 |
| yi-large | 80.06 | 0 | 479.87 |
| Yi-1.5-9B-Chat | 76.42 | 0.08 | 485.39 |
| Phi-3-mini-4k-instruct | 75.51 | 0 | 462.53 |
| reka-flash-20240226 | 74.68 | 0.45 | 460.06 |
| Mixtral-8x7B-Instruct-v0.1 | 70.13 | 2.27 | 361.12 |
| command-r | 52.99 | 0 | 294.43 |
| Qwen2-1.5B-Instruct | 43.37 | 4.78 | 301.67 |
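ZeroEval prompts models zero-shot, parses a final answer out of each generation, and reports exact-match accuracy (Acc), the fraction of generations where no answer could be parsed (No answer), and the average length of the generated reasoning (Reason Lens). A rough sketch of that accounting follows; the regex-based answer extraction and the word-count length measure are assumptions for illustration, not ZeroEval's actual parser.

```python
import re

def score_zeroeval(outputs: list[str], golds: list[str]) -> dict:
    """Exact-match accuracy, no-answer rate, and average reasoning length."""
    correct = no_answer = 0
    lengths = []
    for out, gold in zip(outputs, golds):
        lengths.append(len(out.split()))  # word count as a stand-in length measure
        numbers = re.findall(r"-?\d[\d,]*(?:\.\d+)?", out)  # crude numeric extraction
        if not numbers:
            no_answer += 1
        elif numbers[-1].replace(",", "") == gold:
            correct += 1
    n = len(outputs)
    return {"Acc": 100 * correct / n,
            "No answer": 100 * no_answer / n,
            "Reason Lens": sum(lengths) / n}

print(score_zeroeval(["Step by step, 6 * 7 = 42, so the answer is 42."], ["42"]))
```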
### MMLU-REDUX@ZeroEval

| Model | Acc | No answer | Reason Lens |
|:------------------------------|:----------|:----------|:------------|
| gpt-4o-2024-05-13 | 88.01 | 0.14 | 629.79 |
| claude-3-5-sonnet-20240620 | 86 | 0.18 | 907.1 |
| Llama-3.1-405B-Instruct-Turbo | 85.64 | 0.76 | 449.71 |
| gpt-4-turbo-2024-04-09 | 85.31 | 0.04 | 631.38 |
| gemini-1.5-pro | 82.76 | 1.94 | 666.7 |
| claude-3-opus-20240229 | 82.54 | 0.58 | 500.35 |
| yi-large-preview | 82.15 | 0.14 | 982.6 |
| gpt-4-0314 | 81.64 | 0.04 | 397.22 |
| Qwen2-72B-Instruct | 81.61 | 0.29 | 486.41 |
| gpt-4o-mini-2024-07-18 | 81.5 | 0.07 | 526 |
| yi-large | 81.17 | 0 | 774.85 |
| deepseek-chat | 80.81 | 0.11 | 691.91 |
| deepseek-coder | 79.63 | 0.14 | 704.72 |
| Meta-Llama-3-70B-Instruct | 78.01 | 0.11 | 520.77 |
| gemini-1.5-flash | 77.36 | 1.26 | 583.45 |
| Athene-70B | 76.64 | 0.04 | 552.61 |
| reka-core-20240501 | 76.42 | 0.76 | 701.67 |
| gemma-2-27b-it@together | 75.67 | 0.61 | 446.51 |
| claude-3-sonnet-20240229 | 74.87 | 0.07 | 671.75 |
| gemma-2-9b-it@nvidia | 72.82 | 0.76 | 499 |
| Yi-1.5-34B-Chat | 72.79 | 1.01 | 620.1 |
| claude-3-haiku-20240307 | 72.32 | 0.04 | 644.59 |
| Phi-3-mini-4k-instruct | 70.34 | 0.43 | 677.09 |
| command-r-plus | 68.61 | 0 | 401.51 |
| gpt-3.5-turbo-0125 | 68.36 | 0.04 | 357.92 |
| **EdgeRunner-Tactical-7B** | **67.71** | **0.65** | **917.6** |
| Llama-3.1-8B-Instruct | 67.13 | 3.38 | 399.54 |
| Qwen2-7B-Instruct | 66.92 | 0.72 | 533.15 |
| Mistral-Nemo-Instruct-2407 | 66.88 | 0.47 | 464.19 |
| Yi-1.5-9B-Chat | 65.05 | 4.61 | 542.87 |
| reka-flash-20240226 | 64.72 | 0.32 | 659.25 |
| Mixtral-8x7B-Instruct-v0.1 | 63.17 | 5.51 | 324.31 |
| Meta-Llama-3-8B-Instruct | 61.66 | 0.97 | 600.81 |
| command-r | 61.12 | 0.04 | 382.23 |
| Qwen2-1.5B-Instruct | 41.11 | 7.74 | 280.56 |
### WildBench

| Model | WB_Elo | RewardScore_Avg | task_macro_reward.K=-1 | Length |
|:---------------------------|:------------|:----------------|:-----------------------|:------------|
| gpt-4o-2024-05-13 | 1248.12 | 50.05 | 40.80 | 3723.52 |
| claude-3-5-sonnet-20240620 | 1229.76 | 46.16 | 37.63 | 2911.85 |
| gpt-4-turbo-2024-04-09 | 1225.29 | 46.19 | 37.17 | 3093.17 |
| gpt-4-0125-preview | 1211.44 | 41.24 | 30.20 | 3335.64 |
| gemini-1.5-pro | 1209.23 | 45.27 | 37.59 | 3247.97 |
| yi-large-preview | 1209.00 | 46.92 | 38.54 | 3512.68 |
| claude-3-opus-20240229 | 1206.56 | 37.03 | 22.35 | 2685.98 |
| Meta-Llama-3-70B-Instruct | 1197.72 | 35.15 | 22.54 | 3046.64 |
| Athene-70B | 1197.41 | 29.77 | 0.00 | 3175.14 |
| deepseek-coder-v2 | 1194.11 | 29.39 | 11.38 | 2795.31 |
| gpt-4o-mini-2024-07-18 | 1192.43 | 28.57 | 0.00 | 3648.13 |
| yi-large | 1191.88 | 33.35 | 17.77 | 3095.34 |
| gemini-1.5-flash | 1190.30 | 37.45 | 26.04 | 3654.40 |
| deepseek-v2-chat-0628 | 1188.07 | 27.00 | 0.00 | 3252.38 |
| gemma-2-9b-it-SimPO | 1184.67 | 26.64 | 0.00 | 4277.67 |
| gemma-2-9b-it-DPO | 1182.43 | 26.61 | 0.00 | 3982.63 |
| nemotron-4-340b-instruct | 1181.77 | 33.76 | 19.85 | 2754.01 |
| claude-3-sonnet-20240229 | 1179.81 | 28.09 | 10.70 | 2670.24 |
| deepseekv2-chat | 1178.76 | 30.41 | 12.60 | 2896.97 |
| gemma-2-27b-it@together | 1178.34 | 24.27 | 0.00 | 2924.55 |
| Qwen2-72B-Instruct | 1176.75 | 24.77 | 5.03 | 2856.45 |
| reka-core-20240501 | 1173.85 | 31.48 | 17.06 | 2592.59 |
| Mistral-Nemo-Instruct-2407 | 1165.29 | 22.19 | 0.00 | 3318.21 |
| Yi-1.5-34B-Chat | 1163.69 | 30.83 | 16.06 | 3523.56 |
| **EdgeRunner-Tactical-7B** | **1162.88** | **22.26** | **0.00** | **3754.66** |
| claude-3-haiku-20240307 | 1160.56 | 16.30 | -6.30 | 2601.03 |
| mistral-large-2402 | 1159.72 | 13.27 | -12.36 | 2514.98 |
| deepseek-v2-coder-0628 | 1155.97 | 22.83 | 0.00 | 2580.18 |
| gemma-2-9b-it | 1154.30 | 21.35 | 0.00 | 2802.89 |
| command-r-plus | 1153.15 | 16.58 | -3.60 | 3293.81 |
| glm-4-9b-chat | 1152.68 | 20.71 | 2.33 | 3692.04 |
| Qwen1.5-72B-Chat-greedy | 1151.97 | 20.83 | 1.72 | 2392.36 |
| Yi-1.5-9B-Chat | 1151.43 | 21.80 | 4.93 | 3468.23 |
| Meta-Llama-3-8B-Instruct | 1140.76 | 6.72 | -15.76 | 2975.19 |
| Qwen2-7B-Instruct | 1137.66 | 16.20 | 0.00 | 3216.43 |
| Starling-LM-7B-beta-ExPO | 1137.58 | 11.28 | -9.01 | 2835.83 |
| Hermes-2-Theta-Llama-3-8B | 1135.99 | 3.18 | -23.28 | 2742.17 |
| Llama-3.1-8B-Instruct | 1135.42 | 16.38 | 0.00 | 3750.60 |
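WB_Elo is an Elo-style rating fitted from LLM-judged pairwise battles, RewardScore_Avg averages per-task rewards, and Length is the average response length (EdgeRunner-Tactical-7B's responses are among the longest here, something reward-based metrics typically try to correct for). For intuition about the rating scale, here is the classic sequential Elo update; WildBench fits its ratings in batch, so treat this only as a sketch of the underlying idea.

```python
def elo_update(r_a: float, r_b: float, score_a: float, k: float = 4.0) -> tuple[float, float]:
    """One Elo update; score_a is 1.0 for an A win, 0.5 for a tie, 0.0 for a loss."""
    expected_a = 1 / (1 + 10 ** ((r_b - r_a) / 400))
    delta = k * (score_a - expected_a)
    return r_a + delta, r_b - delta

# Hypothetical battle between two near-tied entries from the table above:
# EdgeRunner-Tactical-7B (1162.88) beats claude-3-haiku-20240307 (1160.56).
print(elo_update(1162.88, 1160.56, score_a=1.0))
```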
### AlpacaEval 2.0

| Model | Length Controlled Winrate | Win Rate | N Total | Avg Length |
|:----------------------------|:--------------------------|:----------|:--------|:-----------|
| gpt-4o-2024-05-13 | 57.46 | 51.33 | 805 | 1873 |
| gpt-4-turbo-2024-04-09 | 55.02 | 46.12 | 805 | 1802 |
| claude-3-5-sonnet-20240620 | 52.37 | 40.56 | 805 | 1488 |
| yi-large-preview | 51.89 | 57.47 | 805 | 2335 |
| gpt4_1106_preview | 50.0 | 50.0 | 805 | 2049 |
| Qwen1.5-110B-Chat | 43.91 | 33.78 | 805 | 1631 |
| claude-3-opus-20240229 | 40.51 | 29.11 | 805 | 1388 |
| gpt4 | 38.13 | 23.58 | 805 | 1365 |
| Qwen1.5-72B-Chat | 36.57 | 26.5 | 805 | 1549 |
| gpt4_0314 | 35.31 | 22.07 | 805 | 1371 |
| Meta-Llama-3-70B-Instruct | 34.42 | 33.18 | 805 | 1919 |
| **EdgeRunner-Tactical-7B** | **34.41** | **51.28** | **805** | **2735** |
| mistral-large-2402 | 32.65 | 21.44 | 805 | 1362 |
| Mixtral-8x22B-Instruct-v0.1 | 30.88 | 22.21 | 805 | 1445 |
| gpt4_0613 | 30.18 | 15.76 | 805 | 1140 |
| mistral-medium | 28.61 | 21.86 | 805 | 1500 |
| claude-2 | 28.16 | 17.19 | 805 | 1069 |
| Samba-CoE-v0.2 | 27.62 | 21.85 | 805 | 1469 |
| internlm2-chat-20b-ExPO | 27.23 | 46.19 | 805 | 3335 |
| Yi-34B-Chat | 27.19 | 29.66 | 805 | 2123 |
| Starling-LM-7B-beta-ExPO | 26.41 | 29.6 | 805 | 2215 |
| Llama-3.1-8B-Instruct | 26.41 | 30.32 | 805 | 2171 |
| Snorkel-Mistral-PairRM-DPO | 26.39 | 30.22 | 804 | 2736 |
| Arcee-Spark | 25.58 | 26.19 | 805 | 2002 |
| claude-2.1 | 25.25 | 15.73 | 805 | 1096 |
| gemini-pro | 24.38 | 18.18 | 805 | 1456 |
| Qwen1.5-14B-Chat | 23.9 | 18.65 | 805 | 1607 |
| Mixtral-8x7B-Instruct-v0.1 | 23.69 | 18.26 | 805 | 1465 |
| Meta-Llama-3-8B-Instruct | 22.92 | 22.57 | 805 | 1899 |
| Samba-CoE-v0.1 | 22.87 | 16.84 | 805 | 1316 |
| gpt-3.5-turbo-0613 | 22.35 | 14.1 | 805 | 1331 |
| Qwen2-7B-Instruct | 21.51 | 18.93 | 805 | 1793 |
| gpt-3.5-turbo-1106 | 19.3 | 9.18 | 805 | 796 |
| internlm2-chat-20b-ppo | 18.75 | 21.75 | 805 | 2373 |
| claude-2.1_concise | 18.21 | 9.23 | 805 | 573 |
| gpt-3.5-turbo-0301 | 18.09 | 9.62 | 805 | 827 |
| deepseek-llm-67b-chat | 17.84 | 12.09 | 805 | 1151 |
| vicuna-33b-v1.3 | 17.57 | 12.71 | 805 | 1479 |
| Mistral-7B-Instruct-v0.2 | 17.11 | 14.72 | 805 | 1676 |
| OpenHermes-2.5-Mistral-7B | 16.25 | 10.34 | 805 | 1107 |
| Qwen1.5-7B-Chat | 14.75 | 11.77 | 805 | 1594 |
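The length-controlled (LC) win rate is the headline metric here: EdgeRunner-Tactical-7B's raw win rate (51.28) benefits from notably long answers (average length 2735), and controlling for length brings it to 34.41. AlpacaEval 2.0 estimates the LC win rate by regressing judge preferences on answer length and predicting the win rate at zero length difference. The toy sketch below conveys that debiasing idea with a plain logistic regression over the length gap; it is not the official estimator (which is a GLM with additional instruction-difficulty terms), and the data are made up.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Made-up pairwise records: judge preference (1 = model preferred over baseline)
# and the length gap (model answer length minus baseline answer length).
wins = np.array([1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1])
len_gap = np.array([900, 1200, -100, 800, -300, 1000, 700, -200, 1100, 50, 950, 600])

# Rescale the gap so the solver is well-conditioned, then fit P(win | gap).
model = LogisticRegression().fit((len_gap / 1000).reshape(-1, 1), wins)

# Counterfactual win rate at zero length gap: how often would the model win
# if its answers were no longer than the baseline's?
lc_winrate = model.predict_proba([[0.0]])[0, 1]
print(f"raw win rate: {wins.mean():.2%}   length-controlled: {lc_winrate:.2%}")
```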