Update metadata (#4)
Lj Miranda committed

Files changed:
- app.py  +5 -3
- requirements.txt  +1 -0
- src/about.py  +63 -7
- src/logo.png  +0 -0
- src/logo.svg  +51 -0
app.py CHANGED

@@ -28,7 +28,7 @@ def restart_space():
 
 
 # 2. Load and populate leaderboard data
-def get_results(source: str, aggregate: bool = False) -> pd.DataFrame:
+def get_results(source: str, aggregate: bool = False) -> tuple[pd.DataFrame, list]:
     results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
     raw_data = [EvalResult.init_from_dict(result) for result in results]
     all_data_json = [v.to_dict() for v in raw_data]
@@ -157,11 +157,13 @@ def download_results():
     return filepath
 
 
+num_models = len(get_results(REPO_RESULTS, aggregate=True)[0])
+
 # 3. Actual setup of the HF Space
 demo = gr.Blocks(css=custom_css)
 with demo:
-    gr.
-
+    with gr.Column(scale=6):
+        gr.Markdown(about.TOP_TEXT.format(str(num_models)))
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem(
requirements.txt CHANGED

@@ -16,3 +16,4 @@ sentencepiece
 tokenizers>=0.15.0
 tqdm
 transformers
+pytz
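Note on the new dependency: `pytz` is pulled in for the Manila-time restart stamp that `src/about.py` builds below. A quick sketch of the call it enables:

```python
# What the new pytz dependency is used for in src/about.py.
from datetime import datetime

import pytz

manila = pytz.timezone("Asia/Manila")  # about.py names this pacific_tz
print(datetime.now(manila).strftime("%H:%M %Z, %d %b %Y"))
# e.g. "14:05 PST, 01 Jan 2025" -- PST here is Philippine Standard Time
```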
src/about.py CHANGED

@@ -1,20 +1,76 @@
+from datetime import datetime
+
+import pytz
+
 NUM_FEWSHOT = 0
 
+pacific_tz = pytz.timezone("Asia/Manila")
+current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
 
 # Leaderboard general information
-
-
-
-
+TOP_TEXT = f"""
+# FilBench: An Open LLM Leaderboard for Filipino
+
+[Code](https://github.com/filbench/filbench) | [Runner](https://github.com/filbench/lighteval) | [Paper (<i>Coming soon!</i>)]() | Total Models: {{}} | Last restart (PHT): {current_time}
+"""
 
 # Leaderboard reproducibility
 LLM_BENCHMARKS_TEXT = """
-
-
-
+**FilBench** is a comprehensive evaluation benchmark for Filipino. We curate 12 sub-tasks across 4 major categories--Cultural Knowledge, Classical NLP, Reading Comprehension, and Generation--and evaluate several models in order to understand their Filipino-centric capabilities.
+
+## Overview
+
+We average four core sections (weighted by the number of instances):
+
+1. **Cultural Knowledge:** Includes instances for measuring the cultural understanding capabilities of LLMs.
+2. **Classical NLP:** Contains questions on standard NLP tasks such as text classification and named-entity recognition.
+3. **Reading Comprehension:** Contains more focused natural language understanding (NLU) tasks and questions from readability benchmarks.
+4. **Generation:** Contains instances for natural language generation (NLG), focused mostly on translation.
+
+
+## Evaluation Runner
+
+We use our own fork of [lighteval](https://github.com/filbench/lighteval) to perform evaluations.
+We highly recommend using the vLLM backend for faster inference.
+Run sequentially, the full FilBench evaluation takes about 4.93 hours on 2 NVIDIA H100 GPUs.
+However, the suite can be parallelized per benchmark: the longest-running task takes approximately 1 hour and 28 minutes, and the shortest only 5.86 minutes.
+
+To evaluate your model on FilBench and have it appear on the leaderboard, follow these steps:
+
+1. First, clone FilBench's lighteval repository and install all dependencies:
+
+```sh
+git clone https://github.com/filbench/lighteval.git
+cd lighteval
+python3 -m venv venv
+source venv/bin/activate
+pip install -e .[dev,vllm]
+```
+
+2. Run the evaluation runner via vLLM:
+
+```sh
+python3 -m lighteval vllm ${MODEL_NAME} ${TASK_NAME} \\
+    --push-to-hub \\
+    --results-org UD-Filipino \\
+    --custom-tasks community_tasks/filbench_evals.py
+```
+
+You can find the list of all FilBench tasks [in this file](https://github.com/filbench/lighteval/blob/main/examples/tasks/all_filbench_tasks.txt).
+
+## Acknowledgements
+
+The authors would like to thank Cohere Labs for the Cohere Research Grant, which includes credits for running the Command models.
 """
 
 # Citation information
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@misc{OALL-2,
+    author = {Miranda, Lester James Validad and Aco, Elyanah and Manuel, Conner and Cruz, Jan Christian Blaise and Imperial, Joseph Marvin},
+    title = {FilBench Leaderboard},
+    year = {2025},
+    publisher = {FilBench},
+    howpublished = {https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard}
+}
 """
src/logo.png ADDED

src/logo.svg ADDED