Spaces:
Running
Running
Your Name
committed on
Commit
·
aa854e6
1
Parent(s):
276df52
update model leaderboard
Browse files
- app.py +1 -1
- rank_data_03_25_2025.json +0 -5
- rank_single_model_03_25_2025.json +245 -30
app.py
CHANGED
@@ -942,7 +942,7 @@ def build_app():
|
|
942 |
gr.Markdown("""
|
943 |
**🎮 Welcome to LMGame Bench!**
|
944 |
|
945 |
-
We invite developers to implement their own gaming agents by replacing our `baseAgent` in `
|
946 |
""", elem_classes="welcome-message")
|
947 |
|
948 |
# Visualization section
|
|
|
942 |
gr.Markdown("""
|
943 |
**🎮 Welcome to LMGame Bench!**
|
944 |
|
945 |
+
We invite developers to implement their own gaming agents by replacing our `baseAgent` in `single_agent_runner.py` and evaluate them on our comprehensive benchmark. Visit our repository at https://github.com/lmgame-org/GamingAgent to get started and join the competition to see how your agent performs!
|
946 |
""", elem_classes="welcome-message")
|
947 |
|
948 |
# Visualization section
|
rank_data_03_25_2025.json
CHANGED
@@ -743,11 +743,6 @@
|
|
743 |
"score": 3.67,
|
744 |
"details": "3,4,4"
|
745 |
},
|
746 |
-
{
|
747 |
-
"model": "🎮 gemini-2.5-flash-preview-05-20",
|
748 |
-
"score": 4.33,
|
749 |
-
"details": "3,4,6"
|
750 |
-
},
|
751 |
{
|
752 |
"model": "🎮 gpt-5-thinking-high",
|
753 |
"score": 9,
|
|
|
743 |
"score": 3.67,
|
744 |
"details": "3,4,4"
|
745 |
},
|
|
|
|
|
|
|
|
|
|
|
746 |
{
|
747 |
"model": "🎮 gpt-5-thinking-high",
|
748 |
"score": 9,
|
rank_single_model_03_25_2025.json
CHANGED
@@ -5,67 +5,67 @@
|
|
5 |
{
|
6 |
"model": "claude-3-5-sonnet-20241022",
|
7 |
"score": 1540.0,
|
8 |
-
"detail_data":
|
9 |
"progress": "1-1"
|
10 |
},
|
11 |
{
|
12 |
"model": "claude-3-7-sonnet-20250219",
|
13 |
"score": 1430.0,
|
14 |
-
"detail_data":
|
15 |
"progress": "1-1"
|
16 |
},
|
17 |
{
|
18 |
"model": "gemini-2.5-flash-preview-04-17",
|
19 |
"score": 1540.7,
|
20 |
-
"detail_data":
|
21 |
"progress": "1-1"
|
22 |
},
|
23 |
{
|
24 |
"model": "gemini-2.5-pro-preview-05-06",
|
25 |
"score": 1025.3,
|
26 |
-
"detail_data":
|
27 |
"progress": "1-1"
|
28 |
},
|
29 |
{
|
30 |
"model": "llama-4-maverick-17b-128e-instruct-fp8",
|
31 |
"score": 786.0,
|
32 |
-
"detail_data":
|
33 |
"progress": "1-1"
|
34 |
},
|
35 |
{
|
36 |
"model": "gpt-4.1-2025-04-14",
|
37 |
"score": 1991.3,
|
38 |
-
"detail_data":
|
39 |
"progress": "1-1"
|
40 |
},
|
41 |
{
|
42 |
"model": "gpt-4o-2024-11-20",
|
43 |
"score": 1028.3,
|
44 |
-
"detail_data":
|
45 |
"progress": "1-1"
|
46 |
},
|
47 |
{
|
48 |
"model": "o1-2024-12-17",
|
49 |
"score": 1434.0,
|
50 |
-
"detail_data":
|
51 |
"progress": "1-1"
|
52 |
},
|
53 |
{
|
54 |
"model": "o3-2025-04-16",
|
55 |
"score": 1955.0,
|
56 |
-
"detail_data":
|
57 |
"progress": "1-1"
|
58 |
},
|
59 |
{
|
60 |
"model": "o4-mini-2025-04-16",
|
61 |
"score": 1348.3,
|
62 |
-
"detail_data":
|
63 |
"progress": "1-1"
|
64 |
},
|
65 |
{
|
66 |
"model": "random (x30)",
|
67 |
"score": 986.97,
|
68 |
-
"detail_data":
|
69 |
"progress": "1-1"
|
70 |
}
|
71 |
]
|
@@ -143,7 +143,7 @@
|
|
143 |
"model": "gemini-2.5-flash-preview-05-20",
|
144 |
"score": 2750,
|
145 |
"details": "3128, 2758, 2364",
|
146 |
-
"highest_tail":
|
147 |
},
|
148 |
{
|
149 |
"model": "claude-sonnet-4-20250514",
|
@@ -156,6 +156,66 @@
|
|
156 |
"score": 2232,
|
157 |
"details": "2212,2856,1628",
|
158 |
"highest_tail": 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
}
|
160 |
]
|
161 |
},
|
@@ -231,6 +291,56 @@
|
|
231 |
"model": "gemini-2.5-pro-preview-06-05",
|
232 |
"score": 13.67,
|
233 |
"details": "12,14,15"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
}
|
235 |
]
|
236 |
},
|
@@ -306,6 +416,56 @@
|
|
306 |
"model": "gemini-2.5-pro-preview-06-05",
|
307 |
"score": 496,
|
308 |
"details": "461,556,471"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
}
|
310 |
]
|
311 |
},
|
@@ -315,73 +475,73 @@
|
|
315 |
{
|
316 |
"model": "claude-3-5-sonnet-20241022",
|
317 |
"score": 0,
|
318 |
-
"detail_box_on_target":
|
319 |
"cracked_levels": "0,0,0"
|
320 |
},
|
321 |
{
|
322 |
"model": "claude-3-7-sonnet-20250219",
|
323 |
"score": 0,
|
324 |
-
"detail_box_on_target":
|
325 |
"cracked_levels": "0,0,0"
|
326 |
},
|
327 |
{
|
328 |
"model": "gemini-2.5-flash-preview-04-17",
|
329 |
"score": 0,
|
330 |
-
"detail_box_on_target":
|
331 |
"cracked_levels": "0,0,0"
|
332 |
},
|
333 |
{
|
334 |
"model": "gemini-2.5-pro-preview-05-06",
|
335 |
"score": 1,
|
336 |
-
"detail_box_on_target":
|
337 |
"cracked_levels": "0,0,0"
|
338 |
},
|
339 |
{
|
340 |
"model": "llama-4-maverick-17b-128e-instruct-fp8",
|
341 |
"score": 0,
|
342 |
-
"detail_box_on_target":
|
343 |
"cracked_levels": "0,0,0"
|
344 |
},
|
345 |
{
|
346 |
"model": "gpt-4.1-2025-04-14",
|
347 |
"score": 0,
|
348 |
-
"detail_box_on_target":
|
349 |
"cracked_levels": "0,0,0"
|
350 |
},
|
351 |
{
|
352 |
"model": "gpt-4o-2024-11-20",
|
353 |
"score": 0,
|
354 |
-
"detail_box_on_target":
|
355 |
"cracked_levels": "0,0,0"
|
356 |
},
|
357 |
{
|
358 |
"model": "o1-2024-12-17",
|
359 |
"score": 0,
|
360 |
-
"detail_box_on_target":
|
361 |
"cracked_levels": "0"
|
362 |
},
|
363 |
{
|
364 |
"model": "o3-2025-04-16",
|
365 |
"score": 2,
|
366 |
-
"detail_box_on_target":
|
367 |
"cracked_levels": "1"
|
368 |
},
|
369 |
{
|
370 |
"model": "o4-mini-2025-04-16",
|
371 |
"score": 1.33,
|
372 |
-
"detail_box_on_target":
|
373 |
"cracked_levels": "0,1,0"
|
374 |
},
|
375 |
{
|
376 |
"model": "random (x30)",
|
377 |
"score": 0,
|
378 |
-
"detail_box_on_target":
|
379 |
"cracked_levels": "0,0,0"
|
380 |
},
|
381 |
{
|
382 |
"model": "claude-sonnet-4-20250514",
|
383 |
"score": 0,
|
384 |
-
"detail_box_on_target":
|
385 |
"cracked_levels": "0,0,0"
|
386 |
},
|
387 |
{
|
@@ -395,7 +555,68 @@
|
|
395 |
"score": 0,
|
396 |
"detail_box_on_target": "0,0,0",
|
397 |
"cracked_levels": "0,0,0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
}
|
|
|
399 |
]
|
400 |
},
|
401 |
"Ace Attorney": {
|
@@ -467,12 +688,6 @@
|
|
467 |
"score": 1.33,
|
468 |
"details": "0,2,2",
|
469 |
"progress": "1:2/5"
|
470 |
-
},
|
471 |
-
{
|
472 |
-
"model": "gemini-2.5-pro-preview-06-05",
|
473 |
-
"score": 1.33,
|
474 |
-
"details": "7,0,0",
|
475 |
-
"progress": "2:2/9"
|
476 |
}
|
477 |
]
|
478 |
}
|
|
|
5 |
{
|
6 |
"model": "claude-3-5-sonnet-20241022",
|
7 |
"score": 1540.0,
|
8 |
+
"detail_data":"1551,1515,1554",
|
9 |
"progress": "1-1"
|
10 |
},
|
11 |
{
|
12 |
"model": "claude-3-7-sonnet-20250219",
|
13 |
"score": 1430.0,
|
14 |
+
"detail_data":"1532,1515,1243",
|
15 |
"progress": "1-1"
|
16 |
},
|
17 |
{
|
18 |
"model": "gemini-2.5-flash-preview-04-17",
|
19 |
"score": 1540.7,
|
20 |
+
"detail_data":"1794,1270,1558",
|
21 |
"progress": "1-1"
|
22 |
},
|
23 |
{
|
24 |
"model": "gemini-2.5-pro-preview-05-06",
|
25 |
"score": 1025.3,
|
26 |
+
"detail_data":"820,1534,722",
|
27 |
"progress": "1-1"
|
28 |
},
|
29 |
{
|
30 |
"model": "llama-4-maverick-17b-128e-instruct-fp8",
|
31 |
"score": 786.0,
|
32 |
+
"detail_data":"837,300,1221",
|
33 |
"progress": "1-1"
|
34 |
},
|
35 |
{
|
36 |
"model": "gpt-4.1-2025-04-14",
|
37 |
"score": 1991.3,
|
38 |
+
"detail_data":"1563,1257,3154",
|
39 |
"progress": "1-1"
|
40 |
},
|
41 |
{
|
42 |
"model": "gpt-4o-2024-11-20",
|
43 |
"score": 1028.3,
|
44 |
+
"detail_data":"1565,297,1223",
|
45 |
"progress": "1-1"
|
46 |
},
|
47 |
{
|
48 |
"model": "o1-2024-12-17",
|
49 |
"score": 1434.0,
|
50 |
+
"detail_data":"1434",
|
51 |
"progress": "1-1"
|
52 |
},
|
53 |
{
|
54 |
"model": "o3-2025-04-16",
|
55 |
"score": 1955.0,
|
56 |
+
"detail_data":"1955",
|
57 |
"progress": "1-1"
|
58 |
},
|
59 |
{
|
60 |
"model": "o4-mini-2025-04-16",
|
61 |
"score": 1348.3,
|
62 |
+
"detail_data":"1554,1245,1246",
|
63 |
"progress": "1-1"
|
64 |
},
|
65 |
{
|
66 |
"model": "random (x30)",
|
67 |
"score": 986.97,
|
68 |
+
"detail_data":"986.97",
|
69 |
"progress": "1-1"
|
70 |
}
|
71 |
]
|
|
|
143 |
"model": "gemini-2.5-flash-preview-05-20",
|
144 |
"score": 2750,
|
145 |
"details": "3128, 2758, 2364",
|
146 |
+
"highest_tail": 256
|
147 |
},
|
148 |
{
|
149 |
"model": "claude-sonnet-4-20250514",
|
|
|
156 |
"score": 2232,
|
157 |
"details": "2212,2856,1628",
|
158 |
"highest_tail": 256
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"model": "claude-opus-4-20250514",
|
162 |
+
"score": 2272,
|
163 |
+
"details": "2272",
|
164 |
+
"highest_tail": 256
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"model": "deepseek-r1-0528",
|
168 |
+
"score": 1104,
|
169 |
+
"details": "1460.0,1156.0, 696.0",
|
170 |
+
"highest_tail": 128
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"model": "qwen3-235B-A22B-fp8",
|
174 |
+
"score": 434.67,
|
175 |
+
"details": "972.0, 168.0, 164.0",
|
176 |
+
"highest_tail": 128
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"model": "grok-4-0709",
|
180 |
+
"score": 4229.33,
|
181 |
+
"details": "2252, 2936, 7500",
|
182 |
+
"highest_tail": 512
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"model": "kimi-k2-0711-preview",
|
186 |
+
"score": 1376.0,
|
187 |
+
"details": "1432.0, 1348.0, 1348.0",
|
188 |
+
"highest_tail": 128
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"model": "glm-4.5",
|
192 |
+
"score": 662.67,
|
193 |
+
"details": "72, 1428, 488",
|
194 |
+
"highest_tail": 128
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"model": "qwen3-235B-A22B-fp8-thinking",
|
198 |
+
"score": 3456,
|
199 |
+
"details": "4112, 3428, 2828",
|
200 |
+
"highest_tail": 512
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"model": "gpt-oss-120b",
|
204 |
+
"score": 1746.67,
|
205 |
+
"details": "1044, 2816,1380",
|
206 |
+
"highest_tail": 256
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"model": "gpt-oss-20b",
|
210 |
+
"score": 3032,
|
211 |
+
"details": "2752, 860, 5484",
|
212 |
+
"highest_tail": 512
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"model": "gpt-5-thinking-high",
|
216 |
+
"score": 3206.67,
|
217 |
+
"details": "3924.0, 2884.0, 2812.0",
|
218 |
+
"highest_tail": 256
|
219 |
}
|
220 |
]
|
221 |
},
|
|
|
291 |
"model": "gemini-2.5-pro-preview-06-05",
|
292 |
"score": 13.67,
|
293 |
"details": "12,14,15"
|
294 |
+
},
|
295 |
+
{
|
296 |
+
"model": "claude-opus-4-20250514",
|
297 |
+
"score": 18,
|
298 |
+
"details": "18"
|
299 |
+
},
|
300 |
+
{
|
301 |
+
"model": "deepseek-r1-0528",
|
302 |
+
"score": 25.33,
|
303 |
+
"details": "23.0, 42.0, 11.0"
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"model": "qwen3-235B-A22B-fp8",
|
307 |
+
"score": 10.67,
|
308 |
+
"details": "10.0, 11.0, 11.0"
|
309 |
+
},
|
310 |
+
{
|
311 |
+
"model": "grok-4-0709",
|
312 |
+
"score": 65.66,
|
313 |
+
"details": "114.0, 61.0, 22.0"
|
314 |
+
},
|
315 |
+
{
|
316 |
+
"model": "kimi-k2-0711-preview",
|
317 |
+
"score": 13.67,
|
318 |
+
"details": "14.0, 15.0, 12.0"
|
319 |
+
},
|
320 |
+
{
|
321 |
+
"model": "glm-4.5",
|
322 |
+
"score": 15.33,
|
323 |
+
"details": "17.0, 10.0, 19"
|
324 |
+
},
|
325 |
+
{
|
326 |
+
"model": "qwen3-235B-A22B-fp8-thinking",
|
327 |
+
"score": 15,
|
328 |
+
"details": "14, 15, 16"
|
329 |
+
},
|
330 |
+
{
|
331 |
+
"model": "gpt-oss-120b",
|
332 |
+
"score": 13,
|
333 |
+
"details": "11, 14, 14"
|
334 |
+
},
|
335 |
+
{
|
336 |
+
"model": "gpt-oss-20b",
|
337 |
+
"score": 12.67,
|
338 |
+
"details": "12, 13, 13"
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"model": "gpt-5-thinking-high",
|
342 |
+
"score": 64.33,
|
343 |
+
"details": "46.0, 73.0, 74.0"
|
344 |
}
|
345 |
]
|
346 |
},
|
|
|
416 |
"model": "gemini-2.5-pro-preview-06-05",
|
417 |
"score": 496,
|
418 |
"details": "461,556,471"
|
419 |
+
},
|
420 |
+
{
|
421 |
+
"model": "claude-opus-4-20250514",
|
422 |
+
"score": 464,
|
423 |
+
"details": "593, 406, 393"
|
424 |
+
},
|
425 |
+
{
|
426 |
+
"model": "deepseek-r1-0528",
|
427 |
+
"score": 498.33,
|
428 |
+
"details": "551.0, 492.0, 452.0"
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"model": "qwen3-235B-A22B-fp8",
|
432 |
+
"score": 472.67,
|
433 |
+
"details": "591.0, 418.0, 409.0"
|
434 |
+
},
|
435 |
+
{
|
436 |
+
"model": "grok-4-0709",
|
437 |
+
"score": 66.0,
|
438 |
+
"details": "34.0, 38.0, 126.0"
|
439 |
+
},
|
440 |
+
{
|
441 |
+
"model": "kimi-k2-0711-preview",
|
442 |
+
"score": 10.67,
|
443 |
+
"details": "10.0, 11.0, 11.0"
|
444 |
+
},
|
445 |
+
{
|
446 |
+
"model": "glm-4.5",
|
447 |
+
"score": 153,
|
448 |
+
"details": "153.0, 180.0, 126"
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"model": "qwen3-235B-A22B-fp8-thinking",
|
452 |
+
"score": 577.33,
|
453 |
+
"details": "449,672,611"
|
454 |
+
},
|
455 |
+
{
|
456 |
+
"model": "gpt-oss-120b",
|
457 |
+
"score": 506,
|
458 |
+
"details": "519, 500, 499"
|
459 |
+
},
|
460 |
+
{
|
461 |
+
"model": "gpt-oss-20b",
|
462 |
+
"score": 595,
|
463 |
+
"details": "648, 581, 556"
|
464 |
+
},
|
465 |
+
{
|
466 |
+
"model": "gpt-5-thinking-high",
|
467 |
+
"score": 678.33,
|
468 |
+
"details": "731, 586, 718"
|
469 |
}
|
470 |
]
|
471 |
},
|
|
|
475 |
{
|
476 |
"model": "claude-3-5-sonnet-20241022",
|
477 |
"score": 0,
|
478 |
+
"detail_box_on_target":"0,0,0",
|
479 |
"cracked_levels": "0,0,0"
|
480 |
},
|
481 |
{
|
482 |
"model": "claude-3-7-sonnet-20250219",
|
483 |
"score": 0,
|
484 |
+
"detail_box_on_target":"0,0,0",
|
485 |
"cracked_levels": "0,0,0"
|
486 |
},
|
487 |
{
|
488 |
"model": "gemini-2.5-flash-preview-04-17",
|
489 |
"score": 0,
|
490 |
+
"detail_box_on_target":"0,0,0",
|
491 |
"cracked_levels": "0,0,0"
|
492 |
},
|
493 |
{
|
494 |
"model": "gemini-2.5-pro-preview-05-06",
|
495 |
"score": 1,
|
496 |
+
"detail_box_on_target":"1,1,1",
|
497 |
"cracked_levels": "0,0,0"
|
498 |
},
|
499 |
{
|
500 |
"model": "llama-4-maverick-17b-128e-instruct-fp8",
|
501 |
"score": 0,
|
502 |
+
"detail_box_on_target":"0,0,0",
|
503 |
"cracked_levels": "0,0,0"
|
504 |
},
|
505 |
{
|
506 |
"model": "gpt-4.1-2025-04-14",
|
507 |
"score": 0,
|
508 |
+
"detail_box_on_target":"0,0,0",
|
509 |
"cracked_levels": "0,0,0"
|
510 |
},
|
511 |
{
|
512 |
"model": "gpt-4o-2024-11-20",
|
513 |
"score": 0,
|
514 |
+
"detail_box_on_target":"0,0,0",
|
515 |
"cracked_levels": "0,0,0"
|
516 |
},
|
517 |
{
|
518 |
"model": "o1-2024-12-17",
|
519 |
"score": 0,
|
520 |
+
"detail_box_on_target":"0",
|
521 |
"cracked_levels": "0"
|
522 |
},
|
523 |
{
|
524 |
"model": "o3-2025-04-16",
|
525 |
"score": 2,
|
526 |
+
"detail_box_on_target":"2",
|
527 |
"cracked_levels": "1"
|
528 |
},
|
529 |
{
|
530 |
"model": "o4-mini-2025-04-16",
|
531 |
"score": 1.33,
|
532 |
+
"detail_box_on_target":"1,2,1",
|
533 |
"cracked_levels": "0,1,0"
|
534 |
},
|
535 |
{
|
536 |
"model": "random (x30)",
|
537 |
"score": 0,
|
538 |
+
"detail_box_on_target":"0,0,0",
|
539 |
"cracked_levels": "0,0,0"
|
540 |
},
|
541 |
{
|
542 |
"model": "claude-sonnet-4-20250514",
|
543 |
"score": 0,
|
544 |
+
"detail_box_on_target":"0,0,0",
|
545 |
"cracked_levels": "0,0,0"
|
546 |
},
|
547 |
{
|
|
|
555 |
"score": 0,
|
556 |
"detail_box_on_target": "0,0,0",
|
557 |
"cracked_levels": "0,0,0"
|
558 |
+
},
|
559 |
+
{
|
560 |
+
"model": "claude-opus-4-20250514",
|
561 |
+
"score": 4,
|
562 |
+
"detail_box_on_target": "4",
|
563 |
+
"cracked_levels": "2"
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"model": "deepseek-r1-0528",
|
567 |
+
"score": 2,
|
568 |
+
"detail_box_on_target": "3,1,2",
|
569 |
+
"cracked_levels": "1, 0, 1"
|
570 |
+
},
|
571 |
+
{
|
572 |
+
"model": "qwen3-235B-A22B-fp8",
|
573 |
+
"score": 1,
|
574 |
+
"detail_box_on_target": "1,1,1",
|
575 |
+
"cracked_levels": "0, 0, 0"
|
576 |
+
},
|
577 |
+
{
|
578 |
+
"model": "grok-4-0709",
|
579 |
+
"score": 2.33,
|
580 |
+
"detail_box_on_target": "2,0,5",
|
581 |
+
"cracked_levels": "1, 0, 3"
|
582 |
+
},
|
583 |
+
{
|
584 |
+
"model": "kimi-k2-0711-preview",
|
585 |
+
"score": 0,
|
586 |
+
"detail_box_on_target": "0,0,0",
|
587 |
+
"cracked_levels": "0,0,0"
|
588 |
+
},
|
589 |
+
{
|
590 |
+
"model": "glm-4.5",
|
591 |
+
"score": 0,
|
592 |
+
"detail_box_on_target": "0,0,0",
|
593 |
+
"cracked_levels": "0,0,0"
|
594 |
+
},
|
595 |
+
{
|
596 |
+
"model": "qwen3-235B-A22B-fp8-thinking",
|
597 |
+
"score": 2,
|
598 |
+
"detail_box_on_target": "2, 3, 1",
|
599 |
+
"cracked_levels": "1, 1, 0"
|
600 |
+
},
|
601 |
+
{
|
602 |
+
"model": "gpt-oss-120b",
|
603 |
+
"score": 5.5,
|
604 |
+
"detail_box_on_target": "5, 6",
|
605 |
+
"cracked_levels": "3,3"
|
606 |
+
},
|
607 |
+
{
|
608 |
+
"model": "gpt-oss-20b",
|
609 |
+
"score": 3,
|
610 |
+
"detail_box_on_target": "2, 4",
|
611 |
+
"cracked_levels": "1, 2"
|
612 |
+
},
|
613 |
+
{
|
614 |
+
"model": "gpt-5-thinking-high",
|
615 |
+
"score": 6,
|
616 |
+
"detail_box_on_target": "9, 6, 3",
|
617 |
+
"cracked_levels": "4, 3, 2"
|
618 |
}
|
619 |
+
|
620 |
]
|
621 |
},
|
622 |
"Ace Attorney": {
|
|
|
688 |
"score": 1.33,
|
689 |
"details": "0,2,2",
|
690 |
"progress": "1:2/5"
|
|
|
|
|
|
|
|
|
|
|
|
|
691 |
}
|
692 |
]
|
693 |
}
|