Konstantin Chernyshev committed · 79ede97 · 1 Parent(s): 4790bc5
chore: add about section

Files changed:
- app.py (+1 -1)
- src/about.py (+15 -26)
app.py
CHANGED
@@ -205,4 +205,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=60 * 60)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(ssr=False)
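For context, this hunk sits at the bottom of `app.py`, where the standard leaderboard template wires up an hourly self-restart before launching the app; the one functional change here disables Gradio's server-side rendering at launch. Below is a minimal runnable sketch of that surrounding pattern, with an assumed `restart_space` helper and a hypothetical repo id (neither appears in this diff):

```python
# Sketch of the pattern around this hunk, under the assumptions noted in
# the comments; not the Space's actual code beyond the diff's four lines.
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

API = HfApi()  # picks up HF_TOKEN from the environment if one is set


def restart_space():
    # Hypothetical repo id for illustration; the real one would live in a
    # config module. Restarting makes the Space re-read fresh results.
    API.restart_space(repo_id="toloka/u-math-leaderboard")


demo = gr.Blocks()  # stand-in for the app assembled earlier in app.py

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=60 * 60)  # once an hour
scheduler.start()

# `ssr=False` is copied verbatim from the commit; current Gradio releases
# spell this keyword `ssr_mode`.
demo.queue(default_concurrency_limit=40).launch(ssr=False)
```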
src/about.py
CHANGED
@@ -1,39 +1,28 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-
-# ---------------------------------------------------
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">U-MATH / μ-MATH leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+These datasets are designed to test the mathematical reasoning and meta-evaluation capabilities of Large Language Models (LLMs) on university-level problems.
+U-MATH provides a set of 1,100 university-level mathematical problems, while μ-MATH complements it with a meta-evaluation framework focusing on solution judgment, built on 1,084 LLM solutions.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = """
-
+This repository contains the official leaderboard code for the U-MATH and μ-MATH benchmarks. These datasets are designed to test the mathematical reasoning and meta-evaluation capabilities of Large Language Models (LLMs) on university-level problems.
+
+### Overview
+
+U-MATH provides a set of 1,100 university-level mathematical problems, while μ-MATH complements it with a meta-evaluation framework focusing on solution judgment, built on 1,084 LLM solutions.
+
+* 📊 [U-MATH benchmark at Huggingface](https://huggingface.co/datasets/toloka/umath)
+* 🔎 [μ-MATH benchmark at Huggingface](https://huggingface.co/datasets/toloka/mumath)
+* 🗞️ [Paper](https://arxiv.org/abs/2412.03205)
+* 👾 [Evaluation Code at GitHub](https://github.com/Toloka/u-math/)
 
-
-
+### Licensing Information
+* The contents of μ-MATH's machine-generated `model_output` column are subject to the underlying LLMs' licensing terms.
+* Contents of all other U-MATH and μ-MATH dataset fields, as well as the code, are available under the MIT license.
 
 """
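These string constants are what the Space renders in its UI. A minimal sketch of how the standard leaderboard template typically consumes them in `app.py` (this wiring is an assumption based on the common template, not part of this commit):

```python
# Sketch of how the about-section constants are typically rendered; the
# layout below follows the usual leaderboard template and is an
# assumption, not code from this commit.
import gradio as gr

from src.about import INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)  # TITLE is raw HTML, so gr.HTML rather than gr.Markdown
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs():
        with gr.TabItem("📝 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=40).launch()
```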