Your Name committed on
Commit
aa854e6
·
1 Parent(s): 276df52

update model leaderboard

Browse files
app.py CHANGED
@@ -942,7 +942,7 @@ def build_app():
942
  gr.Markdown("""
943
  **🎮 Welcome to LMGame Bench!**
944
 
945
- We invite developers to implement their own gaming agents by replacing our `baseAgent` in `customer_runner.py` and evaluate them on our comprehensive benchmark. Visit our repository at https://github.com/lmgame-org/GamingAgent to get started and join the competition to see how your agent performs!
946
  """, elem_classes="welcome-message")
947
 
948
  # Visualization section
 
942
  gr.Markdown("""
943
  **🎮 Welcome to LMGame Bench!**
944
 
945
+ We invite developers to implement their own gaming agents by replacing our `baseAgent` in `single_agent_runner.py` and evaluate them on our comprehensive benchmark. Visit our repository at https://github.com/lmgame-org/GamingAgent to get started and join the competition to see how your agent performs!
946
  """, elem_classes="welcome-message")
947
 
948
  # Visualization section
rank_data_03_25_2025.json CHANGED
@@ -743,11 +743,6 @@
743
  "score": 3.67,
744
  "details": "3,4,4"
745
  },
746
- {
747
- "model": "🎮 gemini-2.5-flash-preview-05-20",
748
- "score": 4.33,
749
- "details": "3,4,6"
750
- },
751
  {
752
  "model": "🎮 gpt-5-thinking-high",
753
  "score": 9,
 
743
  "score": 3.67,
744
  "details": "3,4,4"
745
  },
 
 
 
 
 
746
  {
747
  "model": "🎮 gpt-5-thinking-high",
748
  "score": 9,
rank_single_model_03_25_2025.json CHANGED
@@ -5,67 +5,67 @@
5
  {
6
  "model": "claude-3-5-sonnet-20241022",
7
  "score": 1540.0,
8
- "detail_data": "1551,1515,1554",
9
  "progress": "1-1"
10
  },
11
  {
12
  "model": "claude-3-7-sonnet-20250219",
13
  "score": 1430.0,
14
- "detail_data": "1532,1515,1243",
15
  "progress": "1-1"
16
  },
17
  {
18
  "model": "gemini-2.5-flash-preview-04-17",
19
  "score": 1540.7,
20
- "detail_data": "1794,1270,1558",
21
  "progress": "1-1"
22
  },
23
  {
24
  "model": "gemini-2.5-pro-preview-05-06",
25
  "score": 1025.3,
26
- "detail_data": "820,1534,722",
27
  "progress": "1-1"
28
  },
29
  {
30
  "model": "llama-4-maverick-17b-128e-instruct-fp8",
31
  "score": 786.0,
32
- "detail_data": "837,300,1221",
33
  "progress": "1-1"
34
  },
35
  {
36
  "model": "gpt-4.1-2025-04-14",
37
  "score": 1991.3,
38
- "detail_data": "1563,1257,3154",
39
  "progress": "1-1"
40
  },
41
  {
42
  "model": "gpt-4o-2024-11-20",
43
  "score": 1028.3,
44
- "detail_data": "1565,297,1223",
45
  "progress": "1-1"
46
  },
47
  {
48
  "model": "o1-2024-12-17",
49
  "score": 1434.0,
50
- "detail_data": "1434",
51
  "progress": "1-1"
52
  },
53
  {
54
  "model": "o3-2025-04-16",
55
  "score": 1955.0,
56
- "detail_data": "1955",
57
  "progress": "1-1"
58
  },
59
  {
60
  "model": "o4-mini-2025-04-16",
61
  "score": 1348.3,
62
- "detail_data": "1554,1245,1246",
63
  "progress": "1-1"
64
  },
65
  {
66
  "model": "random (x30)",
67
  "score": 986.97,
68
- "detail_data": "986.97",
69
  "progress": "1-1"
70
  }
71
  ]
@@ -143,7 +143,7 @@
143
  "model": "gemini-2.5-flash-preview-05-20",
144
  "score": 2750,
145
  "details": "3128, 2758, 2364",
146
- "highest_tail": 128
147
  },
148
  {
149
  "model": "claude-sonnet-4-20250514",
@@ -156,6 +156,66 @@
156
  "score": 2232,
157
  "details": "2212,2856,1628",
158
  "highest_tail": 256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  }
160
  ]
161
  },
@@ -231,6 +291,56 @@
231
  "model": "gemini-2.5-pro-preview-06-05",
232
  "score": 13.67,
233
  "details": "12,14,15"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  }
235
  ]
236
  },
@@ -306,6 +416,56 @@
306
  "model": "gemini-2.5-pro-preview-06-05",
307
  "score": 496,
308
  "details": "461,556,471"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  }
310
  ]
311
  },
@@ -315,73 +475,73 @@
315
  {
316
  "model": "claude-3-5-sonnet-20241022",
317
  "score": 0,
318
- "detail_box_on_target": "0,0,0",
319
  "cracked_levels": "0,0,0"
320
  },
321
  {
322
  "model": "claude-3-7-sonnet-20250219",
323
  "score": 0,
324
- "detail_box_on_target": "0,0,0",
325
  "cracked_levels": "0,0,0"
326
  },
327
  {
328
  "model": "gemini-2.5-flash-preview-04-17",
329
  "score": 0,
330
- "detail_box_on_target": "0,0,0",
331
  "cracked_levels": "0,0,0"
332
  },
333
  {
334
  "model": "gemini-2.5-pro-preview-05-06",
335
  "score": 1,
336
- "detail_box_on_target": "1,1,1",
337
  "cracked_levels": "0,0,0"
338
  },
339
  {
340
  "model": "llama-4-maverick-17b-128e-instruct-fp8",
341
  "score": 0,
342
- "detail_box_on_target": "0,0,0",
343
  "cracked_levels": "0,0,0"
344
  },
345
  {
346
  "model": "gpt-4.1-2025-04-14",
347
  "score": 0,
348
- "detail_box_on_target": "0,0,0",
349
  "cracked_levels": "0,0,0"
350
  },
351
  {
352
  "model": "gpt-4o-2024-11-20",
353
  "score": 0,
354
- "detail_box_on_target": "0,0,0",
355
  "cracked_levels": "0,0,0"
356
  },
357
  {
358
  "model": "o1-2024-12-17",
359
  "score": 0,
360
- "detail_box_on_target": "0",
361
  "cracked_levels": "0"
362
  },
363
  {
364
  "model": "o3-2025-04-16",
365
  "score": 2,
366
- "detail_box_on_target": "2",
367
  "cracked_levels": "1"
368
  },
369
  {
370
  "model": "o4-mini-2025-04-16",
371
  "score": 1.33,
372
- "detail_box_on_target": "1,2,1",
373
  "cracked_levels": "0,1,0"
374
  },
375
  {
376
  "model": "random (x30)",
377
  "score": 0,
378
- "detail_box_on_target": "0,0,0",
379
  "cracked_levels": "0,0,0"
380
  },
381
  {
382
  "model": "claude-sonnet-4-20250514",
383
  "score": 0,
384
- "detail_box_on_target": "0,0,0",
385
  "cracked_levels": "0,0,0"
386
  },
387
  {
@@ -395,7 +555,68 @@
395
  "score": 0,
396
  "detail_box_on_target": "0,0,0",
397
  "cracked_levels": "0,0,0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  }
 
399
  ]
400
  },
401
  "Ace Attorney": {
@@ -467,12 +688,6 @@
467
  "score": 1.33,
468
  "details": "0,2,2",
469
  "progress": "1:2/5"
470
- },
471
- {
472
- "model": "gemini-2.5-pro-preview-06-05",
473
- "score": 1.33,
474
- "details": "7,0,0",
475
- "progress": "2:2/9"
476
  }
477
  ]
478
  }
 
5
  {
6
  "model": "claude-3-5-sonnet-20241022",
7
  "score": 1540.0,
8
+ "detail_data":"1551,1515,1554",
9
  "progress": "1-1"
10
  },
11
  {
12
  "model": "claude-3-7-sonnet-20250219",
13
  "score": 1430.0,
14
+ "detail_data":"1532,1515,1243",
15
  "progress": "1-1"
16
  },
17
  {
18
  "model": "gemini-2.5-flash-preview-04-17",
19
  "score": 1540.7,
20
+ "detail_data":"1794,1270,1558",
21
  "progress": "1-1"
22
  },
23
  {
24
  "model": "gemini-2.5-pro-preview-05-06",
25
  "score": 1025.3,
26
+ "detail_data":"820,1534,722",
27
  "progress": "1-1"
28
  },
29
  {
30
  "model": "llama-4-maverick-17b-128e-instruct-fp8",
31
  "score": 786.0,
32
+ "detail_data":"837,300,1221",
33
  "progress": "1-1"
34
  },
35
  {
36
  "model": "gpt-4.1-2025-04-14",
37
  "score": 1991.3,
38
+ "detail_data":"1563,1257,3154",
39
  "progress": "1-1"
40
  },
41
  {
42
  "model": "gpt-4o-2024-11-20",
43
  "score": 1028.3,
44
+ "detail_data":"1565,297,1223",
45
  "progress": "1-1"
46
  },
47
  {
48
  "model": "o1-2024-12-17",
49
  "score": 1434.0,
50
+ "detail_data":"1434",
51
  "progress": "1-1"
52
  },
53
  {
54
  "model": "o3-2025-04-16",
55
  "score": 1955.0,
56
+ "detail_data":"1955",
57
  "progress": "1-1"
58
  },
59
  {
60
  "model": "o4-mini-2025-04-16",
61
  "score": 1348.3,
62
+ "detail_data":"1554,1245,1246",
63
  "progress": "1-1"
64
  },
65
  {
66
  "model": "random (x30)",
67
  "score": 986.97,
68
+ "detail_data":"986.97",
69
  "progress": "1-1"
70
  }
71
  ]
 
143
  "model": "gemini-2.5-flash-preview-05-20",
144
  "score": 2750,
145
  "details": "3128, 2758, 2364",
146
+ "highest_tail": 256
147
  },
148
  {
149
  "model": "claude-sonnet-4-20250514",
 
156
  "score": 2232,
157
  "details": "2212,2856,1628",
158
  "highest_tail": 256
159
+ },
160
+ {
161
+ "model": "claude-opus-4-20250514",
162
+ "score": 2272,
163
+ "details": "2272",
164
+ "highest_tail": 256
165
+ },
166
+ {
167
+ "model": "deepseek-r1-0528",
168
+ "score": 1104,
169
+ "details": "1460.0,1156.0, 696.0",
170
+ "highest_tail": 128
171
+ },
172
+ {
173
+ "model": "qwen3-235B-A22B-fp8",
174
+ "score": 434.67,
175
+ "details": "972.0, 168.0, 164.0",
176
+ "highest_tail": 128
177
+ },
178
+ {
179
+ "model": "grok-4-0709",
180
+ "score": 4229.33,
181
+ "details": "2252, 2936, 7500",
182
+ "highest_tail": 512
183
+ },
184
+ {
185
+ "model": "kimi-k2-0711-preview",
186
+ "score": 1376.0,
187
+ "details": "1432.0, 1348.0, 1348.0",
188
+ "highest_tail": 128
189
+ },
190
+ {
191
+ "model": "glm-4.5",
192
+ "score": 662.67,
193
+ "details": "72, 1428, 488",
194
+ "highest_tail": 128
195
+ },
196
+ {
197
+ "model": "qwen3-235B-A22B-fp8-thinking",
198
+ "score": 3456,
199
+ "details": "4112, 3428, 2828",
200
+ "highest_tail": 512
201
+ },
202
+ {
203
+ "model": "gpt-oss-120b",
204
+ "score": 1746.67,
205
+ "details": "1044, 2816,1380",
206
+ "highest_tail": 256
207
+ },
208
+ {
209
+ "model": "gpt-oss-20b",
210
+ "score": 3032,
211
+ "details": "2752, 860, 5484",
212
+ "highest_tail": 512
213
+ },
214
+ {
215
+ "model": "gpt-5-thinking-high",
216
+ "score": 3206.67,
217
+ "details": "3924.0, 2884.0, 2812.0",
218
+ "highest_tail": 256
219
  }
220
  ]
221
  },
 
291
  "model": "gemini-2.5-pro-preview-06-05",
292
  "score": 13.67,
293
  "details": "12,14,15"
294
+ },
295
+ {
296
+ "model": "claude-opus-4-20250514",
297
+ "score": 18,
298
+ "details": "18"
299
+ },
300
+ {
301
+ "model": "deepseek-r1-0528",
302
+ "score": 25.33,
303
+ "details": "23.0, 42.0, 11.0"
304
+ },
305
+ {
306
+ "model": "qwen3-235B-A22B-fp8",
307
+ "score": 10.67,
308
+ "details": "10.0, 11.0, 11.0"
309
+ },
310
+ {
311
+ "model": "grok-4-0709",
312
+ "score": 65.66,
313
+ "details": "114.0, 61.0, 22.0"
314
+ },
315
+ {
316
+ "model": "kimi-k2-0711-preview",
317
+ "score": 13.67,
318
+ "details": "14.0, 15.0, 12.0"
319
+ },
320
+ {
321
+ "model": "glm-4.5",
322
+ "score": 15.33,
323
+ "details": "17.0, 10.0, 19"
324
+ },
325
+ {
326
+ "model": "qwen3-235B-A22B-fp8-thinking",
327
+ "score": 15,
328
+ "details": "14, 15, 16"
329
+ },
330
+ {
331
+ "model": "gpt-oss-120b",
332
+ "score": 13,
333
+ "details": "11, 14, 14"
334
+ },
335
+ {
336
+ "model": "gpt-oss-20b",
337
+ "score": 12.67,
338
+ "details": "12, 13, 13"
339
+ },
340
+ {
341
+ "model": "gpt-5-thinking-high",
342
+ "score": 64.33,
343
+ "details": "46.0, 73.0, 74.0"
344
  }
345
  ]
346
  },
 
416
  "model": "gemini-2.5-pro-preview-06-05",
417
  "score": 496,
418
  "details": "461,556,471"
419
+ },
420
+ {
421
+ "model": "claude-opus-4-20250514",
422
+ "score": 464,
423
+ "details": "593, 406, 393"
424
+ },
425
+ {
426
+ "model": "deepseek-r1-0528",
427
+ "score": 498.33,
428
+ "details": "551.0, 492.0, 452.0"
429
+ },
430
+ {
431
+ "model": "qwen3-235B-A22B-fp8",
432
+ "score": 472.67,
433
+ "details": "591.0, 418.0, 409.0"
434
+ },
435
+ {
436
+ "model": "grok-4-0709",
437
+ "score": 66.0,
438
+ "details": "34.0, 38.0, 126.0"
439
+ },
440
+ {
441
+ "model": "kimi-k2-0711-preview",
442
+ "score": 10.67,
443
+ "details": "10.0, 11.0, 11.0"
444
+ },
445
+ {
446
+ "model": "glm-4.5",
447
+ "score": 153,
448
+ "details": "153.0, 180.0, 126"
449
+ },
450
+ {
451
+ "model": "qwen3-235B-A22B-fp8-thinking",
452
+ "score": 577.33,
453
+ "details": "449,672,611"
454
+ },
455
+ {
456
+ "model": "gpt-oss-120b",
457
+ "score": 506,
458
+ "details": "519, 500, 499"
459
+ },
460
+ {
461
+ "model": "gpt-oss-20b",
462
+ "score": 595,
463
+ "details": "648, 581, 556"
464
+ },
465
+ {
466
+ "model": "gpt-5-thinking-high",
467
+ "score": 678.33,
468
+ "details": "731, 586, 718"
469
  }
470
  ]
471
  },
 
475
  {
476
  "model": "claude-3-5-sonnet-20241022",
477
  "score": 0,
478
+ "detail_box_on_target":"0,0,0",
479
  "cracked_levels": "0,0,0"
480
  },
481
  {
482
  "model": "claude-3-7-sonnet-20250219",
483
  "score": 0,
484
+ "detail_box_on_target":"0,0,0",
485
  "cracked_levels": "0,0,0"
486
  },
487
  {
488
  "model": "gemini-2.5-flash-preview-04-17",
489
  "score": 0,
490
+ "detail_box_on_target":"0,0,0",
491
  "cracked_levels": "0,0,0"
492
  },
493
  {
494
  "model": "gemini-2.5-pro-preview-05-06",
495
  "score": 1,
496
+ "detail_box_on_target":"1,1,1",
497
  "cracked_levels": "0,0,0"
498
  },
499
  {
500
  "model": "llama-4-maverick-17b-128e-instruct-fp8",
501
  "score": 0,
502
+ "detail_box_on_target":"0,0,0",
503
  "cracked_levels": "0,0,0"
504
  },
505
  {
506
  "model": "gpt-4.1-2025-04-14",
507
  "score": 0,
508
+ "detail_box_on_target":"0,0,0",
509
  "cracked_levels": "0,0,0"
510
  },
511
  {
512
  "model": "gpt-4o-2024-11-20",
513
  "score": 0,
514
+ "detail_box_on_target":"0,0,0",
515
  "cracked_levels": "0,0,0"
516
  },
517
  {
518
  "model": "o1-2024-12-17",
519
  "score": 0,
520
+ "detail_box_on_target":"0",
521
  "cracked_levels": "0"
522
  },
523
  {
524
  "model": "o3-2025-04-16",
525
  "score": 2,
526
+ "detail_box_on_target":"2",
527
  "cracked_levels": "1"
528
  },
529
  {
530
  "model": "o4-mini-2025-04-16",
531
  "score": 1.33,
532
+ "detail_box_on_target":"1,2,1",
533
  "cracked_levels": "0,1,0"
534
  },
535
  {
536
  "model": "random (x30)",
537
  "score": 0,
538
+ "detail_box_on_target":"0,0,0",
539
  "cracked_levels": "0,0,0"
540
  },
541
  {
542
  "model": "claude-sonnet-4-20250514",
543
  "score": 0,
544
+ "detail_box_on_target":"0,0,0",
545
  "cracked_levels": "0,0,0"
546
  },
547
  {
 
555
  "score": 0,
556
  "detail_box_on_target": "0,0,0",
557
  "cracked_levels": "0,0,0"
558
+ },
559
+ {
560
+ "model": "claude-opus-4-20250514",
561
+ "score": 4,
562
+ "detail_box_on_target": "4",
563
+ "cracked_levels": "2"
564
+ },
565
+ {
566
+ "model": "deepseek-r1-0528",
567
+ "score": 2,
568
+ "detail_box_on_target": "3,1,2",
569
+ "cracked_levels": "1, 0, 1"
570
+ },
571
+ {
572
+ "model": "qwen3-235B-A22B-fp8",
573
+ "score": 1,
574
+ "detail_box_on_target": "1,1,1",
575
+ "cracked_levels": "0, 0, 0"
576
+ },
577
+ {
578
+ "model": "grok-4-0709",
579
+ "score": 2.33,
580
+ "detail_box_on_target": "2,0,5",
581
+ "cracked_levels": "1, 0, 3"
582
+ },
583
+ {
584
+ "model": "kimi-k2-0711-preview",
585
+ "score": 0,
586
+ "detail_box_on_target": "0,0,0",
587
+ "cracked_levels": "0,0,0"
588
+ },
589
+ {
590
+ "model": "glm-4.5",
591
+ "score": 0,
592
+ "detail_box_on_target": "0,0,0",
593
+ "cracked_levels": "0,0,0"
594
+ },
595
+ {
596
+ "model": "qwen3-235B-A22B-fp8-thinking",
597
+ "score": 2,
598
+ "detail_box_on_target": "2, 3, 1",
599
+ "cracked_levels": "1, 1, 0"
600
+ },
601
+ {
602
+ "model": "gpt-oss-120b",
603
+ "score": 5.5,
604
+ "detail_box_on_target": "5, 6",
605
+ "cracked_levels": "3,3"
606
+ },
607
+ {
608
+ "model": "gpt-oss-20b",
609
+ "score": 3,
610
+ "detail_box_on_target": "2, 4",
611
+ "cracked_levels": "1, 2"
612
+ },
613
+ {
614
+ "model": "gpt-5-thinking-high",
615
+ "score": 6,
616
+ "detail_box_on_target": "9, 6, 3",
617
+ "cracked_levels": "4, 3, 2"
618
  }
619
+
620
  ]
621
  },
622
  "Ace Attorney": {
 
688
  "score": 1.33,
689
  "details": "0,2,2",
690
  "progress": "1:2/5"
 
 
 
 
 
 
691
  }
692
  ]
693
  }