Konstantin Chernyshev committed
Commit 79ede97 · 1 Parent(s): 4790bc5

chore: add about section

Files changed (2)
  1. app.py +1 -1
  2. src/about.py +15 -26
app.py CHANGED
@@ -205,4 +205,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=60 * 60)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(ssr=False)
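For context, only the final line of app.py changes here; the scheduler wiring above it stays as-is. Below is a minimal, self-contained sketch of how that tail of the file typically looks in the Hugging Face leaderboard template this Space appears to follow. The restart_space helper, the HfApi call, and the REPO_ID value are assumptions for illustration; only the scheduler lines and the final launch() call mirror the diff above.

# Sketch of the tail of app.py around the changed line (assumed context, not the Space's actual code).
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

REPO_ID = "owner/space-name"  # hypothetical Space id, not taken from this diff
API = HfApi()

def restart_space():
    # Restart the Space periodically so the leaderboard reloads fresh results.
    API.restart_space(repo_id=REPO_ID)

with gr.Blocks() as demo:
    gr.Markdown("leaderboard UI goes here")  # placeholder for the real layout

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=60 * 60)  # hourly restart
scheduler.start()

# The commit adds ssr=False here, presumably to opt out of Gradio's server-side rendering.
demo.queue(default_concurrency_limit=40).launch(ssr=False)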
src/about.py CHANGED
@@ -1,39 +1,28 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-
-# ---------------------------------------------------
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">U-MATH / μ-MATH leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+These datasets are designed to test the mathematical reasoning and meta-evaluation capabilities of Large Language Models (LLMs) on university-level problems.
+U-MATH provides a set of 1,100 university-level mathematical problems, while µ-MATH complements it with a meta-evaluation framework focusing on solution judgment with 1084 LLM solutions.
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = """
-## How it works
+This repository contains the official leaderboard code for the U-MATH and $\mu$-MATH benchmarks. These datasets are designed to test the mathematical reasoning and meta-evaluation capabilities of Large Language Models (LLMs) on university-level problems.
+
+### Overview
+
+U-MATH provides a set of 1,100 university-level mathematical problems, while µ-MATH complements it with a meta-evaluation framework focusing on solution judgment with 1084 LLM solutions.
+
+* 📊 [U-MATH benchmark at Huggingface](https://huggingface.co/datasets/toloka/umath)
+* 🔎 [μ-MATH benchmark at Huggingface](https://huggingface.co/datasets/toloka/mumath)
+* 🗞️ [Paper](https://arxiv.org/abs/2412.03205)
+* 👾 [Evaluation Code at GitHub](https://github.com/Toloka/u-math/)

-## Reproducibility
-To reproduce our results, here is the commands you can run:
+### Licensing Information
+* The contents of the μ-MATH's machine-generated `model_output` column are subject to the underlying LLMs' licensing terms.
+* Contents of all the other dataset U-MATH and μ-MATH fields, as well as the code, are available under the MIT license.

 """
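The constants edited in src/about.py are display strings only; the new About text appears once app.py renders them. Below is a minimal sketch of the usual leaderboard-template wiring. The import path, component layout, tab name, and elem_classes are assumptions for illustration; this commit does not touch that part of app.py.

# Sketch of how app.py typically consumes the constants edited above (assumed wiring).
import gradio as gr

from src.about import INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE

with gr.Blocks() as demo:
    gr.HTML(TITLE)  # leaderboard heading
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")  # short intro above the table
    with gr.Tabs():
        with gr.TabItem("📝 About"):
            # The reworked LLM_BENCHMARKS_TEXT (overview, links, licensing) lands here.
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()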