Lj Miranda committed on
Commit 6829d60 · unverified · 1 Parent(s): 2a64d15

Update metadata (#4)

Files changed (5)
  1. app.py +5 -3
  2. requirements.txt +1 -0
  3. src/about.py +61 -7
  4. src/logo.png +0 -0
  5. src/logo.svg +51 -0
app.py CHANGED
@@ -28,7 +28,7 @@ def restart_space():


  # 2. Load and populate leaderboard data
- def get_results(source: str, aggregate: bool = False) -> pd.DataFrame:
+ def get_results(source: str, aggregate: bool = False) -> tuple[pd.DataFrame, list]:
      results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
      raw_data = [EvalResult.init_from_dict(result) for result in results]
      all_data_json = [v.to_dict() for v in raw_data]
@@ -157,11 +157,13 @@ def download_results():
      return filepath


+ num_models = len(get_results(REPO_RESULTS, aggregate=True)[0])
+
  # 3. Actual setup of the HF Space
  demo = gr.Blocks(css=custom_css)
  with demo:
-     gr.HTML(about.TITLE)
-     gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
+     with gr.Column(scale=6):
+         gr.Markdown(about.TOP_TEXT.format(str(num_models)))

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
          with gr.TabItem(
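Note on the change above: `get_results` now returns a tuple whose first element is the leaderboard DataFrame, so `len(get_results(...)[0])` counts one row per model; the count is then interpolated into `about.TOP_TEXT` via `.format()`. A minimal, hypothetical sketch of that pattern (record fields and helper names are illustrative, not the Space's actual code):

```python
import pandas as pd

def get_results_sketch(records: list[dict]) -> tuple[pd.DataFrame, list]:
    """Illustrative stand-in for get_results(source, aggregate=True):
    returns the leaderboard table plus its column names."""
    df = pd.DataFrame(records)
    return df, list(df.columns)

# Hypothetical aggregated records; the real Space loads these from a HF dataset.
records = [
    {"model": "model-a", "score": 61.2},
    {"model": "model-b", "score": 54.8},
]
num_models = len(get_results_sketch(records)[0])  # one row per model -> 2

# The escaped {{}} in the TOP_TEXT f-string survives as a literal {} and is
# filled in later with .format(), as app.py does above.
top_text_sketch = "Total Models: {}"
print(top_text_sketch.format(str(num_models)))  # Total Models: 2
```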
requirements.txt CHANGED
@@ -16,3 +16,4 @@ sentencepiece
  tokenizers>=0.15.0
  tqdm
  transformers
+ pytz
src/about.py CHANGED
@@ -1,20 +1,74 @@
+ from datetime import datetime
+
+ import pytz
+
  NUM_FEWSHOT = 0

+ pacific_tz = pytz.timezone("Asia/Manila")
+ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")

  # Leaderboard general information
- TITLE = (
-     """<h1 align="center" id="space-title">An Open LLM Leaderboard for Filipino</h1>"""
- )
- INTRODUCTION_TEXT = """Intro Text"""
+ TOP_TEXT = f"""
+ # FilBench: An Open LLM Leaderboard for Filipino
+
+ [Code](https://github.com/filbench/filbench) | [Runner](https://github.com/filbench/lighteval) | [Paper (<i>Coming soon!</i>)]() | Total Models: {{}} | Last restart (PHT): {current_time}
+ """

  # Leaderboard reproducibility
  LLM_BENCHMARKS_TEXT = """
- ## How it works
- ## Reproducibility
- To reproduce our results, here is the commands you can run:
+ **FilBench** is a comprehensive evaluation benchmark for Filipino. We curate 12 sub-tasks across 4 major categories (Cultural Knowledge, Classical NLP, Reading Comprehension, and Generation) and evaluate several models to understand their Filipino-centric capabilities.
+
+ ## Overview
+
+ We average four core sections (weighted by the number of instances):
+
+ 1. **Cultural Knowledge:** Includes instances for measuring the cultural understanding capabilities of LLMs.
+ 2. **Classical NLP:** Contains questions on standard NLP tasks such as text classification and named-entity recognition.
+ 3. **Reading Comprehension:** Contains more focused natural language understanding (NLU) tasks and questions from readability benchmarks.
+ 4. **Generation:** Contains instances for natural language generation (NLG), focused mainly on translation.
+
+ ## Evaluation Runner
+
+ We use our own fork of [lighteval](https://github.com/filbench/lighteval) to perform evaluations.
+ We highly recommend using the vLLM backend for faster inference.
+ Run sequentially, evaluating on FilBench takes about 4.93 hours on 2 NVIDIA H100 GPUs.
+ However, the evaluation suite can be parallelized per benchmark: the longest-running task takes approximately 1 hour and 28 minutes, and the shortest only 5.86 minutes.
+
+ To evaluate your model on FilBench and have it appear on the leaderboard, follow these steps:
+
+ 1. First, clone FilBench's lighteval repository and install all dependencies:
+
+ ```sh
+ git clone https://github.com/filbench/lighteval.git
+ cd lighteval
+ python3 -m venv venv
+ source venv/bin/activate
+ pip install -e ".[dev,vllm]"
+ ```
+
+ 2. Run the evaluation runner via vLLM:
+
+ ```sh
+ python3 -m lighteval vllm ${MODEL_NAME} ${TASK_NAME} \\
+     --push-to-hub \\
+     --results-org UD-Filipino \\
+     --custom-tasks community_tasks/filbench_evals.py
+ ```
+
+ You can find the list of all FilBench tasks [in this file](https://github.com/filbench/lighteval/blob/main/examples/tasks/all_filbench_tasks.txt).
+
+ ## Acknowledgements
+
+ The authors would like to thank Cohere Labs for the Cohere Research Grant, which includes credits for running the Command models.
  """

  # Citation information
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
  CITATION_BUTTON_TEXT = r"""
+ @misc{OALL-2,
+     author = {Miranda, Lester James Validad and Aco, Elyanah and Manuel, Conner and Cruz, Jan Christian Blaise and Imperial, Joseph Marvin},
+     title = {FilBench Leaderboard},
+     year = {2025},
+     publisher = {FilBench},
+     howpublished = {https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard}
+ }
  """
src/logo.png ADDED
src/logo.svg ADDED