ai-progress-charts / simple_bench_leaderboard.jsonl
kaizuberbuehler's picture
Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench
b605a32
{"model": "o1-preview-2024-09-12", "score": 41.7}
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
{"model": "o1-2024-12-17 (high)", "score": 40.1}
{"model": "o1-2024-12-17 (medium)", "score": 36.7}
{"model": "gemini-exp-1206", "score": 31.1}
{"model": "deepseek-r1", "score": 30.9}
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
{"model": "gemini-1.5-pro-002", "score": 27.1}
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
{"model": "claude-3-opus-20240229", "score": 23.5}
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
{"model": "o3-mini-2025-01-31 (high)", "score": 22.8}
{"model": "grok-beta", "score": 22.7}
{"model": "mistral-large-2407", "score": 22.5}
{"model": "llama-3.3-70b-instruct", "score": 19.9}
{"model": "deepseek-v3", "score": 18.9}
{"model": "gemini-2.0-flash-exp", "score": 18.9}
{"model": "o1-mini-2024-09-12", "score": 18.1}
{"model": "gpt-4o-2024-08-06", "score": 17.8}
{"model": "command-r-plus", "score": 17.4}
{"model": "gpt-4o-mini-2024-07-18", "score": 10.7}