Lj Miranda committed on
Commit 6829d60 · unverified · 1 Parent(s): 2a64d15

Update metadata (#4)

Files changed (5)
  1. app.py +5 -3
  2. requirements.txt +1 -0
  3. src/about.py +61 -7
  4. src/logo.png +0 -0
  5. src/logo.svg +51 -0
app.py CHANGED
@@ -28,7 +28,7 @@ def restart_space():


  # 2. Load and populate leaderboard data
- def get_results(source: str, aggregate: bool = False) -> pd.DataFrame:
+ def get_results(source: str, aggregate: bool = False) -> tuple[pd.DataFrame, list]:
      results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
      raw_data = [EvalResult.init_from_dict(result) for result in results]
      all_data_json = [v.to_dict() for v in raw_data]
@@ -157,11 +157,13 @@ def download_results():
      return filepath


+ num_models = len(get_results(REPO_RESULTS, aggregate=True)[0])
+
  # 3. Actual setup of the HF Space
  demo = gr.Blocks(css=custom_css)
  with demo:
-     gr.HTML(about.TITLE)
-     gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
+     with gr.Column(scale=6):
+         gr.Markdown(about.TOP_TEXT.format(str(num_models)))

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
          with gr.TabItem(
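Note on the change above: `get_results` now returns a tuple whose first element is the leaderboard DataFrame, so `len(get_results(...)[0])` counts one row per model; the count is then interpolated into `about.TOP_TEXT` via `.format()`. A minimal, hypothetical sketch of that pattern (record fields and helper names are illustrative, not the Space's actual code):

```python
import pandas as pd

def get_results_sketch(records: list[dict]) -> tuple[pd.DataFrame, list]:
    """Illustrative stand-in for get_results(source, aggregate=True):
    returns the leaderboard table plus its column names."""
    df = pd.DataFrame(records)
    return df, list(df.columns)

# Hypothetical aggregated records; the real Space loads these from a HF dataset.
records = [
    {"model": "model-a", "score": 61.2},
    {"model": "model-b", "score": 54.8},
]
num_models = len(get_results_sketch(records)[0])  # one row per model -> 2

# The escaped {{}} in the TOP_TEXT f-string survives as a literal {} and is
# filled in later with .format(), as app.py does above.
top_text_sketch = "Total Models: {}"
print(top_text_sketch.format(str(num_models)))  # Total Models: 2
```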
requirements.txt CHANGED
@@ -16,3 +16,4 @@ sentencepiece
  tokenizers>=0.15.0
  tqdm
  transformers
+ pytz
src/about.py CHANGED
@@ -1,20 +1,74 @@
+ from datetime import datetime
+
+ import pytz
+
  NUM_FEWSHOT = 0

+ pacific_tz = pytz.timezone("Asia/Manila")
+ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")

  # Leaderboard general information
- TITLE = (
-     """<h1 align="center" id="space-title">An Open LLM Leaderboard for Filipino</h1>"""
- )
- INTRODUCTION_TEXT = """Intro Text"""
+ TOP_TEXT = f"""
+ # FilBench: An Open LLM Leaderboard for Filipino
+
+ [Code](https://github.com/filbench/filbench) | [Runner](https://github.com/filbench/lighteval) | [Paper (<i>Coming soon!</i>)]() | Total Models: {{}} | Last restart (PHT): {current_time}
+ """

  # Leaderboard reproducibility
  LLM_BENCHMARKS_TEXT = """
- ## How it works
- ## Reproducibility
- To reproduce our results, here is the commands you can run:
+ **FilBench** is a comprehensive evaluation benchmark for Filipino. We curate 12 sub-tasks across 4 major categories (Cultural Knowledge, Classical NLP, Reading Comprehension, and Generation) and evaluate several models to understand their Filipino-centric capabilities.
+
+ ## Overview
+
+ We average four core sections (weighted by the number of instances):
+
+ 1. **Cultural Knowledge:** Includes instances for measuring the cultural understanding capabilities of LLMs.
+ 2. **Classical NLP:** Contains questions on standard NLP tasks such as text classification and named-entity recognition.
+ 3. **Reading Comprehension:** Contains more focused natural language understanding (NLU) tasks and questions from readability benchmarks.
+ 4. **Generation:** Contains instances for natural language generation (NLG), focused mainly on translation.
+
+ ## Evaluation Runner
+
+ We use our own fork of [lighteval](https://github.com/filbench/lighteval) to perform evaluations.
+ We highly recommend using the vLLM backend for faster inference.
+ Run sequentially, evaluating on FilBench takes about 4.93 hours on 2 NVIDIA H100 GPUs.
+ However, the evaluation suite can be parallelized per benchmark: the longest-running task takes approximately 1 hour and 28 minutes, and the shortest only 5.86 minutes.
+
+ To evaluate your model on FilBench and have it appear on the leaderboard, follow these steps:
+
+ 1. First, clone FilBench's lighteval repository and install all dependencies:
+
+ ```sh
+ git clone https://github.com/filbench/lighteval.git
+ cd lighteval
+ python3 -m venv venv
+ source venv/bin/activate
+ pip install -e ".[dev,vllm]"
+ ```
+
+ 2. Run the evaluation runner via vLLM:
+
+ ```sh
+ python3 -m lighteval vllm ${MODEL_NAME} ${TASK_NAME} \\
+     --push-to-hub \\
+     --results-org UD-Filipino \\
+     --custom-tasks community_tasks/filbench_evals.py
+ ```
+
+ You can find the list of all FilBench tasks [in this file](https://github.com/filbench/lighteval/blob/main/examples/tasks/all_filbench_tasks.txt).
+
+ ## Acknowledgements
+
+ The authors would like to thank Cohere Labs for the Cohere Research Grant, which includes credits for running the Command models.
  """

  # Citation information
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
  CITATION_BUTTON_TEXT = r"""
+ @misc{OALL-2,
+     author = {Miranda, Lester James Validad and Aco, Elyanah and Manuel, Conner and Cruz, Jan Christian Blaise and Imperial, Joseph Marvin},
+     title = {FilBench Leaderboard},
+     year = {2025},
+     publisher = {FilBench},
+     howpublished = {https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard}
+ }
  """
src/logo.png ADDED
src/logo.svg ADDED