Lj Miranda committed
Commit 96c67b7 · unverified · 1 Parent(s): d777fa4

Add ability to incorporate external submissions (#7)

Files changed (3):
  1. app.py +68 -9
  2. src/about.py +8 -21
  3. src/schema.py +160 -49
app.py CHANGED

@@ -1,10 +1,13 @@
+import logging
 import os
 import re
+import sys

 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from datasets import load_dataset
+from datasets.data_files import EmptyDatasetError
 from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import HfApi

@@ -13,6 +16,13 @@ from src.display.css_html_js import custom_css
 from src.plots import plot_cost_efficiency, plot_parameter_efficiency
 from src.schema import AutoEvalColumn, EvalResult, fields

+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[logging.StreamHandler(sys.stdout)],
+    level=logging.INFO,
+)
+
 # 1. Initialization
 _hf_token = os.environ.get("HF_TOKEN")
 if not _hf_token:
@@ -22,6 +32,7 @@ api = HfApi(token=_hf_token)

 REPO_ID = "UD-Filipino/filbench-leaderboard"
 REPO_RESULTS = "UD-Filipino/filbench-results"
+SUBMISSION_RESULTS = "UD-Filipino/filbench-results-submission"


 def restart_space():
@@ -29,11 +40,44 @@ def restart_space():


 # 2. Load and populate leaderboard data
-def get_results(source: str, aggregate: bool = False) -> tuple[pd.DataFrame, list]:
+def get_results(
+    source: str, aggregate: bool = False, submissions: str = None
+) -> tuple[pd.DataFrame, list]:
+    """Load results from a given source and return a DataFrame with the relevant columns.
+
+    If `aggregate` is True, it returns the aggregated results.
+
+    source (str): The source dataset to load results from.
+    aggregate (bool): Whether to return aggregated results or not.
+    submissions (str, optional): The submissions dataset to load results from.
+    RETURNS (tuple[pd.DataFrame, list]): A tuple containing the DataFrame with results and a list of master columns.
+    """
     results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
     raw_data = [EvalResult.init_from_dict(result) for result in results]
-    all_data_json = [v.to_dict() for v in raw_data]

+    if submissions:
+        try:
+            submission_results = (
+                load_dataset(
+                    submissions, split="train", download_mode="force_redownload"
+                )
+                .to_pandas()
+                .to_dict(orient="records")
+            )
+        except EmptyDatasetError:
+            logging.info("Empty dataset for submissions, skipping...")
+            submission_results = []
+        if len(submission_results) == 0:
+            logging.info("No external submissions found!")
+        else:
+            logging.info(f"Found {len(submission_results)} submission/s!")
+
+        raw_data += [
+            EvalResult.init_from_dict(result, is_submission=True)
+            for result in submission_results
+        ]
+
+    all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df["Incomplete"] = ~df.isna().any(axis=1)
@@ -58,8 +102,12 @@ def get_results(source: str, aggregate: bool = False) -> tuple[pd.DataFrame, list]:
     return df, master_columns


-def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
-    df, master_columns = get_results(source=source, aggregate=aggregate)
+def init_leaderboard(
+    source: str, aggregate: bool = False, submissions: str = None
+) -> Leaderboard:
+    df, master_columns = get_results(
+        source=source, aggregate=aggregate, submissions=submissions
+    )

     return Leaderboard(
         value=df,
@@ -80,6 +128,7 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
         filter_columns=[
             # fmt: off
             ColumnFilter("Incomplete", type="boolean", label="Hide incomplete evaluations", default=True),
+            ColumnFilter("Submission", type="boolean", label="Show only submitted results", default=False),
             # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model type"),
             ColumnFilter(AutoEvalColumn.multilingual.name, type="checkboxgroup", label="Multilinguality"),
@@ -97,8 +146,12 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:


 def get_clean_df() -> pd.DataFrame:
-    df, _ = get_results(source=REPO_RESULTS, aggregate=False)
-    df_agg, _ = get_results(source=REPO_RESULTS, aggregate=True)
+    df, _ = get_results(
+        source=REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
+    )
+    df_agg, _ = get_results(
+        source=REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
+    )

     # Cleanup
     def extract_names(html_string):
@@ -165,19 +218,25 @@ def download_results():
 demo = gr.Blocks(css=custom_css)
 with demo:
     with gr.Column(scale=6):
-        num_models = len(get_results(REPO_RESULTS, aggregate=True)[0])
+        num_models = len(
+            get_results(REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS)[0]
+        )
         gr.Markdown(about.TOP_TEXT.format(str(num_models)))

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem(
             "🏅 FilBench Leaderboard", elem_id="llm-benchmark-tab-table", id=0
         ):
-            leaderboard = init_leaderboard(REPO_RESULTS, aggregate=True)
+            leaderboard = init_leaderboard(
+                REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
+            )

         with gr.TabItem(
             "🔍 FilBench - Detailed", elem_id="llm-benchmark-tab-table", id=1
         ):
-            leaderboard = init_leaderboard(REPO_RESULTS, aggregate=False)
+            leaderboard = init_leaderboard(
+                REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
+            )

         with gr.TabItem("📊 Analysis", id=2):
             df = get_clean_df()
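The `submissions` branch above has to tolerate a submission repo that exists on the Hub but has no rows pushed yet, which is what the `EmptyDatasetError` guard handles. A minimal standalone sketch of that pattern (the `load_submission_records` helper is hypothetical; the repo id is the `SUBMISSION_RESULTS` constant from this diff):

```python
import logging

from datasets import load_dataset
from datasets.data_files import EmptyDatasetError


def load_submission_records(repo_id: str) -> list[dict]:
    """Hypothetical helper mirroring the try/except used in get_results()."""
    try:
        # force_redownload makes sure newly pushed submissions are picked up on restart
        ds = load_dataset(repo_id, split="train", download_mode="force_redownload")
        return ds.to_pandas().to_dict(orient="records")
    except EmptyDatasetError:
        logging.info("Empty dataset for submissions, skipping...")
        return []


records = load_submission_records("UD-Filipino/filbench-results-submission")
```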
src/about.py CHANGED

@@ -11,7 +11,8 @@ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
 TOP_TEXT = f"""
 # FilBench: An Open LLM Leaderboard for Filipino

-[Code](https://github.com/filbench/filbench) | [Runner](https://github.com/filbench/lighteval) | [Paper (<i>Coming soon!</i>)]() | Total Models: {{}} | Last restart (PHT): {current_time}
+[Code](https://github.com/filbench/filbench-eval) | [Paper (<i>Coming soon!</i>)]() | Total Models: {{}} | Last restart (PHT): {current_time}
+📥: Indicates model submissions from the community. If you wish to submit your model evaluations, then please check our instructions on [GitHub](https://github.com/filbench/filbench-eval).
 """

 # Leaderboard reproducibility
@@ -27,7 +28,6 @@ We average four core sections (weighted by the number of instances):
 3. **Reading Comprehension:** Contains more focused natural language understanding (NLU) tasks and questions from readability benchmarks.
 4. **Generation:** Contains instances for natural language generation (NLG), more focused on translation.

-
 ## Evaluation Runner

 We use our own fork of [lighteval](https://github.com/filbench/lighteval) to perform evaluations.
@@ -35,30 +35,17 @@ We highly recommend using the vLLM backend for faster inference.
 Sequentially, evaluating on FilBench can take 4.93 hours on 2 NVIDIA H100 GPUs.
 However, the evaluation suite can be parallelized per benchmark, where the longest-running task can take approximately 1 hour and 28 minutes, and the shortest task takes only 5.86 minutes.

-To evaluate your model on FilBench and for it to appear in the leaderboard, please follow these steps:
-
-1. First clone the FilBench's lighteval repository and install all dependencies:
-
-```sh
-git clone https://github.com/filbench/lighteval.git
-python3 -m venv venv
-pip install -e .[dev,vllm]
-```
-
-2. Run the evaluation runner via vLLM
+To evaluate your model on FilBench and for it to appear in the leaderboard, please follow the steps in our [Github repository](https://github.com/filbench/filbench-eval).

-```sh
-python3 -m lighteval vllm ${MODEL_NAME} ${TASK_NAME} \\
-    --push-to-hub \\
-    --results-org UD-Filipino \\
-    --custom-tasks community_tasks/filbench_evals.py
-```
+## Contact

-You can find the list of all FilBench tasks [in this file](https://github.com/filbench/lighteval/blob/main/examples/tasks/all_filbench_tasks.txt).
+This work was done by Lj V. Miranda ([@ljvmiranda921](https://github.com/ljvmiranda921)), Elyanah Aco ([@elyanah-aco](https://github.com/elyanah-aco)), Conner Manuel ([@connermanuel](https://github.com/connermanuel)), Blaise Cruz ([@jcblaisecruz02](https://github.com/jcblaisecruz02)), and Joseph Imperial ([@imperialite](https://github.com/imperialite)).
+For any questions, please reach out to us via [email protected] or through our [GitHub Issues](https://github.com/filbench/filbench-eval/issues).

 ## Acknowledgements

-The authors would like to thank Cohere Labs for the Cohere Research Grant that includes credits for running the Command models.
+We would like to thank [Cohere Labs](https://cohere.com/research) for providing credits through the [Cohere Research Grant](https://cohere.com/research/grants) to run the Aya model series, and [Together AI](https://together.ai) for additional computational credits for running several open models.
+We also acknowledge the Hugging Face team, particularly the OpenEvals team (Clémentine Fourrier [@clefourrier](https://github.com/clefourrier) and Nathan Habib [@NathanHB](https://github.com/NathanHB)) and Daniel van Strien [@davanstrien](https://github.com/davanstrien), for their support in publishing the FilBench blog post.
 """

 # Citation information
src/schema.py CHANGED

@@ -1,9 +1,18 @@
+import logging
+import sys
 from dataclasses import dataclass, make_dataclass
 from enum import Enum

 import numpy as np

-from src.display.formatting import make_clickable_model
+from src.display.formatting import make_clickable_model, model_hyperlink
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[logging.StreamHandler(sys.stdout)],
+    level=logging.INFO,
+)


 def fields(raw_class):
@@ -147,6 +156,8 @@ auto_eval_cols = [
     ["param_size", ColumnContent, ColumnContent("# Parameters", "number", False, meta=True)],
     ["multilingual", ColumnContent, ColumnContent("Multilingual", "markdown", False, meta=True)],
     ["model_type", ColumnContent, ColumnContent("Model Type", "markdown", False, meta=True)],
+    ["is_submission", ColumnContent, ColumnContent("Submission", "boolean", False, meta=True)],
+    ["submission_date", ColumnContent, ColumnContent("Submission Date", "str", False, meta=True)],
     # fmt: on
 ]
 for task in Tasks:
@@ -199,16 +210,28 @@ class EvalResult:
     average: float
     aggregate_results: dict
     precision: Precision = Precision.Unknown
+    # Submission metadata
+    is_submission: bool = False
+    param_size: float = -1
+    model_type: str = ModelType.UNKNOWN.value
+    multilingual: str = Multilingual.UNKNOWN.value
+    submission_date: str = ""
+    model_url: str = "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard"

     @classmethod
-    def init_from_dict(self, data: dict):
+    def init_from_dict(self, data: dict, is_submission: bool = False) -> "EvalResult":
         """Populate results from a dictionary"""

-        config = data.get("config")
-
-        # Get model details
+        # For model details, use user-provided metadata if it's a submission
+        config_key = "display_metadata" if is_submission else "config"
+        config = data.get(config_key)
         precision = Precision.from_str(config.get("model_dtype"))
-        org_and_model = config.get("model_name", config.get("model_args", None))
+
+        org_and_model = (
+            config.get("hf_id")
+            if is_submission
+            else config.get("model_name", config.get("model_args", None))
+        )
         org_and_model = org_and_model.split("/", 1)

         if len(org_and_model) == 1:
@@ -221,35 +244,105 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

+        results = EvalResult.compute_scores_per_benchmark(data.get("results"))
+        aggregate_results = EvalResult.compute_aggregate_results(results)
+        filbench_score = np.mean(list(aggregate_results.values()))
+
         # Format all results
-        scores = data.get("results")
-        results = {}
+        if is_submission:
+            # Use pre-computed scores and check if they match our computed scores
+            category_scores = data.get("category_scores")
+            aggregate_results_precomputed = {
+                TaskCategory.CULTURAL_KNOWLEDGE.value: category_scores.get(
+                    "CULTURAL_KNOWLEDGE"
+                ),
+                TaskCategory.CLASSICAL_NLP.value: category_scores.get("CLASSICAL_NLP"),
+                TaskCategory.READING_COMPREHENSION.value: category_scores.get(
+                    "READING_COMPREHENSION"
+                ),
+                TaskCategory.TRANSLATION.value: category_scores.get("GENERATION"),
+            }
+            is_similar = EvalResult.compare_category_scores(
+                precomputed=aggregate_results_precomputed,
+                computed=aggregate_results,
+            )
+            if not is_similar:
+                logging.warning("Precomputed and computed category scores differ.")
+                logging.info("Will use computed scores for display.")
+            else:
+                logging.info("Precomputed and computed category scores are similar.")
+                aggregate_results = aggregate_results_precomputed
+
+            # Do the same comparison for FilBench score
+            filbench_score_precomputed = data.get("filbench_score")
+            is_filbench_score_similar = (
+                abs(filbench_score_precomputed - filbench_score) < 1e-2
+            )
+            if not is_filbench_score_similar:
+                logging.warning(
+                    f"Precomputed filbench_score ({filbench_score_precomputed}) and"
+                    f" official FilBench score ({filbench_score}) differ."
+                )
+            average = (
+                filbench_score_precomputed
+                if is_filbench_score_similar
+                else filbench_score
+            )
+            display_metadata = data.get("display_metadata")
+
+            return EvalResult(
+                eval_name=result_key,
+                full_model=full_model,
+                org=org,
+                model=model,
+                precision=precision,
+                results=results,
+                aggregate_results=aggregate_results,
+                average=average,
+                # Display Metadata
+                is_submission=True,
+                submission_date=display_metadata.get("submission_date", ""),
+                param_size=display_metadata.get("num_params", -1),
+                model_type=display_metadata.get("model_type", ModelType.UNKNOWN.value),
+                multilingual=display_metadata.get(
+                    "multilinguality", Multilingual.UNKNOWN.value
+                ),
+                model_url=display_metadata.get(
+                    "url",
+                    "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard",
+                ),
+            )
+        else:
+            return self(
+                eval_name=result_key,
+                full_model=full_model,
+                org=org,
+                model=model,
+                precision=precision,
+                results=results,
+                aggregate_results=aggregate_results,
+                is_submission=False,
+                average=filbench_score,
+            )
+
+    @classmethod
+    def compute_scores_per_benchmark(cls, results: dict) -> dict[str, float]:
+        scores_per_benchmark = {}
         for task in Tasks:
             task = task.value
-            if scores.get(task.benchmark):
-                score = scores.get(task.benchmark).get(task.metric)
+            if results.get(task.benchmark):
+                score = results.get(task.benchmark).get(task.metric)
                 if "acc_" in task.metric:
                     score = score * 100.0
                 if "rougeL" in task.metric:
                     score = score * 100.0
-                results[task.benchmark] = score
+                scores_per_benchmark[task.benchmark] = score
             else:
-                results[task.benchmark] = None
-
-        # Compute weighted average
-        # weighted_total = 0
-        # for task in Tasks:
-        #     task = task.value
-        #     if results[task.benchmark]:
-        #         score = results[task.benchmark]
-        #     else:
-        #         score = 0
-        #     weighted_total += score * task.num_samples
-
-        # total = sum([task.value.num_samples for task in Tasks])
-        # average = weighted_total / total
-
-        # Compute weighted average for each category
+                scores_per_benchmark[task.benchmark] = None
+        return scores_per_benchmark
+
+    @classmethod
+    def compute_aggregate_results(cls, results: dict) -> dict[str, float]:
         aggregate_results = {}
         for task_category in TaskCategory:
             tasks = [
@@ -266,41 +359,59 @@ class EvalResult:
             aggregate_results[task_category.value] = (
                 weighted_total_category / total_category
             )
+        return aggregate_results

-        # Correct average computation
-        average = np.mean(list(aggregate_results.values()))
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            precision=precision,
-            results=results,
-            aggregate_results=aggregate_results,
-            average=average,
-        )
+    @classmethod
+    def compare_category_scores(
+        cls, precomputed: dict, computed: dict, threshold: float = 1e-2
+    ) -> bool:
+        """Compares precomputed and computed category scores."""
+        is_similar = True
+        for key, precomputed_value in precomputed.items():
+            computed_value = computed.get(key)
+            if precomputed_value is not None and computed_value is not None:
+                if abs(precomputed_value - computed_value) > threshold:
+                    logging.warning(
+                        f"Aggregate result for '{key}' differs"
+                        f" (precomputed={precomputed_value}, computed={computed_value})"
+                    )
+                    is_similar = False
+        return is_similar

     def to_dict(self):
         """Converts the EvalResult to a dict compatible with our dataframe display"""

-        model_details = model_registry.get(
-            self.full_model,
-            ModelSUT(
-                param_size=-1,
-                model_type=ModelType.UNKNOWN.value,
-                multilingual=Multilingual.UNKNOWN.value,
-            ),
-        )
+        if not self.is_submission:
+            model_details = model_registry.get(
+                self.full_model,
+                ModelSUT(
+                    param_size=-1,
+                    model_type=ModelType.UNKNOWN.value,
+                    multilingual=Multilingual.UNKNOWN.value,
+                ),
+            )
+        else:
+            model_details = ModelSUT(
+                param_size=self.param_size,
+                model_type=self.model_type,
+                multilingual=self.multilingual,
+            )

+        model_name_with_url = (
+            make_clickable_model(self.full_model)
+            if not self.is_submission
+            else f"📥 {model_hyperlink(self.model_url, self.full_model)}"
+        )
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name
             AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.model.name: model_name_with_url,
             AutoEvalColumn.average.name: self.average,
             AutoEvalColumn.param_size.name: model_details.param_size,
             AutoEvalColumn.model_type.name: model_details.model_type,
             AutoEvalColumn.multilingual.name: model_details.multilingual,
+            AutoEvalColumn.is_submission.name: self.is_submission,
+            AutoEvalColumn.submission_date.name: self.submission_date,
         }

         for task in Tasks:
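Pieced together from the keys that `init_from_dict` reads when `is_submission=True`, a submission record is expected to look roughly like the sketch below. Only the field names are taken from this diff; the benchmark keys under `results` and every value shown are illustrative.

```python
# Hypothetical submission record, inferred from the lookups in
# EvalResult.init_from_dict(..., is_submission=True); values are illustrative.
submission = {
    "display_metadata": {
        "hf_id": "my-org/my-model",    # used in place of config["model_name"]
        "model_dtype": "bfloat16",     # parsed via Precision.from_str
        "num_params": 7.0,             # surfaced as "# Parameters"
        "model_type": "...",           # defaults to ModelType.UNKNOWN.value if missing
        "multilinguality": "...",      # defaults to Multilingual.UNKNOWN.value if missing
        "submission_date": "2025-01-01",
        "url": "https://huggingface.co/my-org/my-model",
    },
    # Per-benchmark metric dicts keyed by Tasks.benchmark (names not shown in this diff).
    "results": {},
    # Pre-computed category scores; cross-checked against the recomputed aggregates.
    "category_scores": {
        "CULTURAL_KNOWLEDGE": 55.0,
        "CLASSICAL_NLP": 60.0,
        "READING_COMPREHENSION": 58.0,
        "GENERATION": 40.0,
    },
    # Pre-computed overall score; used as the displayed average only if it is within
    # 1e-2 of the score recomputed from `results`
    # ((55.0 + 60.0 + 58.0 + 40.0) / 4 = 53.25 here).
    "filbench_score": 53.25,
}
```

Records with this shape are appended to the curated results in `get_results` whenever the `UD-Filipino/filbench-results-submission` dataset is non-empty, and they surface in the UI with the 📥 prefix and the `Submission` filter.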