zhtw-reasoning-eval-leaderboard

Sleeping

yentinglin committed on Feb 15

Commit

6710791

verified ·

1 Parent(s): 5a4cc35

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,7 +10,7 @@ DESCRIPTION = f"""
 Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """
-BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
 def get_leaderboard_df():

 Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """
+BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2", "aime25:part1", "aime25_part1", "gpqa"]
 def get_leaderboard_df():