davidpomerenke committed
Commit 68a93b5 · verified · parent 5102b0a

Upload from GitHub Actions: Add auto-translated datasets

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.github/workflows/nightly-evals.yml CHANGED
@@ -8,8 +8,6 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
-    # checking if this is working in case eval runs take longer than 6h github actions allowance
-    timeout-minutes: 1440 # 24 hours timeout
    steps:
      - uses: actions/checkout@v3

@@ -27,8 +25,6 @@ jobs:
      env:
        OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
-        N_SENTENCES: 20
-        MAX_LANGUAGES: 150
      run: |
        uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
        uv run evals/download_data.py
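For context, a minimal sketch (not part of this commit) of how workflow-level env vars like the two removed above are consumed on the Python side; the defaults mirror the values the old evals/main.py used, as visible in its diff at the bottom of this commit:

```python
import os

# Hedged sketch: reading optional workflow configuration with defaults.
# N_SENTENCES and MAX_LANGUAGES are the variables removed from the workflow above.
n_sentences = int(os.environ.get("N_SENTENCES", 20))       # sentences per task
max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # top languages to evaluate
print(n_sentences, max_languages)
```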
.gitignore CHANGED
@@ -20,5 +20,3 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
-
-
Dockerfile CHANGED
@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock README.md ./
+COPY --chown=user pyproject.toml uv.lock ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md CHANGED
@@ -45,7 +45,6 @@ _Tracking language proficiency of AI models for every language_

 ## Evaluate

-### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
@@ -56,7 +55,3 @@ uv run --extra dev evals/main.py
 uv run evals/backend.py
 cd frontend && npm i && npm start
 ```
-
-## System Architecture
-
-See [system_architecture_diagram.md](system_architecture_diagram.md) for the complete system architecture diagram and component descriptions.
datasets.json CHANGED
@@ -219,7 +219,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented": false,
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -256,7 +256,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented": false,
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -360,7 +360,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": false,
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -375,7 +375,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": false,
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -420,7 +420,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": false,
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
@@ -435,7 +435,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": false,
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
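The six hunks above flip "implemented" from false to true for the auto-translated MMLU, ARC, and TruthfulQA variants. A minimal sketch of how this registry is consumed (the backend reads it with pd.read_json("datasets.json"), as shown in the evals/backend.py diff below); the filter expression is illustrative, not code from the repo:

```python
import pandas as pd

# Sketch: list the machine-translated entries that are now marked implemented.
# "translation", "implemented", "base", and "group" are fields visible in the hunks above.
datasets_df = pd.read_json("datasets.json")
machine = datasets_df[(datasets_df["translation"] == "machine") & datasets_df["implemented"]]
print(machine[["base", "group"]])
```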
evals/__init__.py CHANGED
@@ -0,0 +1 @@
+
evals/backend.py CHANGED
@@ -4,8 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-
-from evals.countries import make_country_table
+from countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
@@ -27,7 +26,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    "truthfulqa_accuracy",
+    # "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
@@ -40,77 +39,28 @@ def compute_normalized_average(df, metrics):
         col_min = normalized_df[col].min()
         col_max = normalized_df[col].max()
         if col_max > col_min:  # Avoid division by zero
-            normalized_df[col] = (normalized_df[col] - col_min) / (
-                col_max - col_min
-            )
+            normalized_df[col] = (normalized_df[col] - col_min) / (col_max - col_min)
         else:
             normalized_df[col] = 0  # If all values are the same, set to 0
     return normalized_df.mean(axis=1, skipna=False)


-def make_model_table(scores_df, models):
-    # Create a combined task_metric for origin
-    scores_df["task_metric_origin"] = (
-        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
-    )
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="model",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="model", columns="task_metric", values="score", aggfunc="mean"
+def make_model_table(df, models):
+    df = (
+        df.groupby(["model", "task", "metric"])
+        .agg({"score": "mean", "bcp_47": "nunique"})
+        .reset_index()
     )
-
-    # Merge the two pivots
-    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="model", columns="task_metric", values="score")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per model+metric
-    origin_presence = (
-        scores_df.groupby(["model", "task_metric", "origin"])
-        .size()
-        .unstack(fill_value=0)
-    )
-    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for model in df.index:
-                try:
-                    counts = origin_presence.loc[(model, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = (
-                    counts.get(machine_col_name, 0) if machine_col_name else 0
-                )
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
             "rank",
@@ -124,81 +74,27 @@ def make_model_table(scores_df, models):
             "license",
             "cost",
             "average",
-            *sorted(list(set(metric_cols))),
+            *task_metrics,
         ]
     ]
     return df


-def make_language_table(scores_df, languages):
-    # Create a combined task_metric for origin
-    scores_df["task_metric_origin"] = (
-        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
-    )
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="bcp_47",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+def make_language_table(df, languages):
+    df = (
+        df.groupby(["bcp_47", "task", "metric"])
+        .agg({"score": "mean", "model": "nunique"})
+        .reset_index()
     )
-
-    # Merge the two pivots
-    df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
-    origin_presence = (
-        scores_df.groupby(["bcp_47", "task_metric", "origin"])
-        .size()
-        .unstack(fill_value=0)
-    )
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for bcp in df.index:
-                try:
-                    counts = origin_presence.loc[(bcp, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = (
-                    counts.get(machine_col_name, 0) if machine_col_name else 0
-                )
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
-
-    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
-    for metric in task_metrics:
-        machine_col = f"{metric}_machine"
-        if machine_col in df.columns:
-            df[f"{metric}_is_machine"] = df[machine_col].notna()
-        else:
-            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
             "bcp_47",
@@ -208,7 +104,7 @@ def make_language_table(scores_df, languages):
             "family",
             "average",
             "in_benchmark",
-            *sorted(list(set(metric_cols))),
+            *task_metrics,
         ]
     ]
     return df
@@ -229,22 +125,10 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df = (
-        scores.groupby(["model", "bcp_47", "task", "metric", "origin"])
-        .mean()
-        .reset_index()
-    )
+    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
-
-    # Identify which metrics have machine translations available
-    machine_translated_metrics = set()
-    for _, row in df.iterrows():
-        if row["origin"] == "machine":
-            metric_name = f"{row['task']}_{row['metric']}"
-            machine_translated_metrics.add(metric_name)
-
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
@@ -259,7 +143,6 @@
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
-        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)
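The net effect of the backend changes is that the origin-aware pivot_table machinery is dropped in favor of a plain groupby-then-pivot. A self-contained toy run of that aggregation (invented scores, not data from the repo):

```python
import pandas as pd

# Toy data standing in for the scores DataFrame.
scores = pd.DataFrame(
    {
        "model": ["m1", "m1", "m2"],
        "task": ["mmlu", "arc", "mmlu"],
        "metric": ["accuracy", "accuracy", "accuracy"],
        "score": [0.8, 0.6, 0.5],
        "bcp_47": ["sw", "sw", "yo"],
    }
)
# Same shape as the new make_model_table: mean score per (model, task, metric),
# then one pivoted column per task_metric.
df = (
    scores.groupby(["model", "task", "metric"])
    .agg({"score": "mean", "bcp_47": "nunique"})
    .reset_index()
)
df["task_metric"] = df["task"] + "_" + df["metric"]
table = df.pivot(index="model", columns="task_metric", values="score")
print(table)  # NaN where a model has no score for a task_metric
```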
evals/countries.py CHANGED
@@ -15,7 +15,6 @@ def population(bcp_47):
     }
     return items

-
 @cache
 def make_country_table(language_table):
     countries = defaultdict(list)
@@ -31,15 +30,10 @@ def make_country_table(language_table):
         )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
-
-        if speaker_pop < 1000:  # Grey out low-population countries
-            score = None  # This will make them appear grey on the map
-        else:
-            score = (
-                sum(entry["score"] * entry["population"] for entry in languages)
-                / speaker_pop
-            )
-
+        score = (
+            sum(entry["score"] * entry["population"] for entry in languages)
+            / speaker_pop
+        )
         countries[country] = {
             "score": score,
             "languages": languages,
evals/datasets_/arc.py CHANGED
@@ -1,10 +1,11 @@
 import random
+from collections import Counter, defaultdict

-from langcodes import standardize_tag
+from langcodes import Language, standardize_tag
 from rich import print
-from models import translate_google, get_google_supported_languages
+from models import translate_google, google_supported_languages
 from tqdm import tqdm
-from datasets import load_dataset, Dataset
+from datasets import Dataset, load_dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -13,33 +14,27 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }


 random.seed(42)
-id_sets_train = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_train = [set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_test = [set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)

 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }


+
+
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
@@ -50,40 +45,37 @@ def load_uhura_arc_easy(language_bcp_47, nr):
         ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
         ds = ds.map(add_choices)
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "masakhane/uhura-arc-easy", task, "human"
+        return "masakhane/uhura-arc-easy", examples, task
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(
-            slug_uhura_arc_easy_translated,
-            tags_uhura_arc_easy_translated[language_bcp_47],
-        )
+        ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
+        # raise Exception(language_bcp_47)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated", task, "machine"
+        return "fair-forward/arc-easy-autotranslated", examples, task
     else:
         return None, None, None

-
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        if lang not in human_translated and lang in google_supported_languages
     ]
     n_samples = 10
-    train_ids = common_ids_train[: n_samples + 3]
-    en_train = _load_dataset(
-        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train"
-    )
+    train_ids = common_ids_train[:n_samples+3]
+    en_train = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train")
     en_train = en_train.filter(lambda x: x["id"] in train_ids)
     test_ids = common_ids_test[:n_samples]
-    en_test = _load_dataset(
-        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test"
-    )
+    en_test = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test")
     en_test = en_test.filter(lambda x: x["id"] in test_ids)
     data = {"train": en_train, "test": en_test}
-
+
     slug = "fair-forward/arc-easy-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
@@ -92,22 +84,16 @@ def translate_arc(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split, data_en in data.items():
-                questions_tr = [
-                    translate_google(q, "en", lang) for q in data_en["question"]
-                ]
+                questions_tr = [translate_google(q, "en", lang) for q in data_en["question"]]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                 choices_texts_concatenated = []
                 for choice in data_en["choices"]:
                     for option in choice["text"]:
                         choices_texts_concatenated.append(option)
-                choices_tr = [
-                    translate_google(c, "en", lang) for c in choices_texts_concatenated
-                ]
+                choices_tr = [translate_google(c, "en", lang) for c in choices_texts_concatenated]
                 choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                 # group into chunks of 4
-                choices_tr = [
-                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
-                ]
+                choices_tr = [choices_tr[i:i+4] for i in range(0, len(choices_tr), 4)]

                 ds_lang = Dataset.from_dict(
                     {
@@ -124,8 +110,5 @@ def translate_arc(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/arc/{lang}_{split}.json",
-                    lines=False,
-                    force_ascii=False,
-                    indent=2,
+                    f"data/translations/arc/{lang}_{split}.json", lines=False, force_ascii=False, indent=2
                 )
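The few-shot logic added to load_uhura_arc_easy leans on the shared ID lists computed at module load: intersecting IDs across all language configs guarantees that item nr refers to the same question everywhere, and the fixed seed keeps the shuffled order stable across runs. A minimal sketch of that idea with invented IDs:

```python
import random

# Only questions present in every language survive the intersection.
id_sets = [{"q1", "q2", "q3", "q4"}, {"q2", "q3", "q4", "q5"}]
common_ids = list(sorted(set.intersection(*id_sets)))
random.seed(42)               # fixed seed -> identical order on every run
random.shuffle(common_ids)
nr = 0
train_ids = common_ids[nr : nr + 3]  # the 3 few-shot examples for item nr
print(train_ids)
```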
evals/datasets_/fleurs.py CHANGED
@@ -11,7 +11,6 @@ fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
 )

-
 def download_file(url, path):
     response = requests.get(url)
     with open(path, "wb") as f:
@@ -35,4 +34,4 @@ def download_fleurs(transcription_langs_eval):
     if not tsv_path.exists():
         print(f"Downloading {tsv_url} to {tsv_path}")
         tsv_path.parent.mkdir(parents=True, exist_ok=True)
-        download_file(tsv_url, tsv_path)
+        download_file(tsv_url, tsv_path)
evals/datasets_/mgsm.py CHANGED
@@ -1,12 +1,10 @@
 import asyncio
 import os
-import random

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
-from langcodes import Language, standardize_tag
-from models import get_google_supported_languages, translate_google
-from rich import print
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import standardize_tag
+from models import google_supported_languages, translate_google
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -39,50 +37,31 @@ def parse_number(i):
     return None


-@cache
-def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
-    """Cache individual MGSM items efficiently"""
-    try:
-        ds = _load_dataset(
-            dataset_slug,
-            subset=subset_tag,
-            split="test",
-            trust_remote_code=trust_remote_code,
-        )
-        if nr >= len(ds):
-            return None
-
-        row = ds[nr]
-
-        # Post-process based on dataset type
-        if dataset_slug == slug_gsm8kx:
-            row["answer_number"] = row["answer"].split("####")[1].strip()
-
-        return row
-    except Exception:
-        # Dataset doesn't exist or doesn't have test split
-        return None
-
-
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
-        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
-        return slug_mgsm, item, "human" if item else (None, None, None)
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
     elif language_bcp_47 in tags_afrimgsm.keys():
-        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
-        return slug_afrimgsm, item, "human" if item else (None, None, None)
-    elif language_bcp_47 in tags_gsm8kx.keys():
-        item = _get_mgsm_item(
-            slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
         )
-        return slug_gsm8kx, item, "machine" if item else (None, None, None)
+        return slug_afrimgsm, ds[nr]
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        item = _get_mgsm_item(
-            slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr
+        ds = _load_dataset(
+            slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
         )
-        return slug_gsm_autotranslated, item, "machine" if item else (None, None, None)
+        return slug_gsm_autotranslated, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
     else:
-        return None, None, None
+        return None, None


 def translate_mgsm(languages):
@@ -90,7 +69,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        if lang not in human_translated and lang in google_supported_languages
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
@@ -117,8 +96,5 @@
         token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
     )
     ds_lang.to_json(
-        f"data/translations/mgsm/{lang}.json",
-        lines=False,
-        force_ascii=False,
-        indent=2,
+        f"data/translations/mgsm/{lang}.json", lines=False, force_ascii=False, indent=2
     )
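The gsm8kx branch derives answer_number from GSM8K-style solutions, where the final numeric answer follows a "####" marker. For illustration (made-up solution text):

```python
# GSM8K convention: the worked solution ends with "#### <answer>".
answer = "She sells 5 + 3 = 8 fruits in total. #### 8"
answer_number = answer.split("####")[1].strip()
assert answer_number == "8"
```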
evals/datasets_/mmlu.py CHANGED
@@ -4,9 +4,9 @@ import random
 from collections import Counter, defaultdict

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
+from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
-from models import get_google_supported_languages, translate_google
+from models import google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -111,7 +111,6 @@ def print_datasets_analysis():
 # MMLUX is translated using DeepL
 # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU

-
 # print_datasets_analysis()
@@ -144,61 +143,32 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated = {
-    standardize_tag(a, macro=True): a
-    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
-}
+tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")

 categories = sorted(
-    list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
-)
-
-
-@cache
-def _get_processed_mmlu_dataset(dataset_name, subset_tag):
-    """Cache processed datasets to avoid reprocessing"""
-    ds = _load_dataset(dataset_name, subset_tag)
-    if dataset_name == "masakhane/afrimmlu":
-        ds = ds.map(parse_choices)
-    elif dataset_name == "CohereForAI/Global-MMLU":
-        ds = ds.map(add_choices)
-    return ds
-
-
-@cache
-def _get_mmlu_item(dataset_name, subset_tag, category, nr):
-    """Cache individual MMLU items efficiently"""
-    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
-    if dataset_name in ["masakhane/afrimmlu", "CohereForAI/Global-MMLU"]:
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-    else:  # fair-forward/mmlu-autotranslated
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
+    list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
+)


-async def load_mmlu(language_bcp_47, nr):
+def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        task = _get_mmlu_item(
-            "masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr
-        )
-        return "masakhane/afrimmlu", task, "human" if task else (None, None, None)
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        ds = ds.map(parse_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
-        task = _get_mmlu_item(
-            "CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr
-        )
-        return "CohereForAI/Global-MMLU", task, "human" if task else (None, None, None)
-    # TODO: add in Okapi, MMLUX @Jonas
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        ds = ds.map(add_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        task = _get_mmlu_item(
-            "fair-forward/mmlu-autotranslated", language_bcp_47, category, nr
-        )
-        return (
-            "fair-forward/mmlu-autotranslated",
-            task,
-            "machine" if task else (None, None, None),
-        )
+        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
@@ -207,10 +177,10 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:150]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 20
+    n_samples = 10

     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
@@ -226,10 +196,8 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
-                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-                filtered = ds.filter(lambda x: x["subject"] == category)
-                for i in range(min(n_samples, len(filtered))):
-                    task = filtered[i]
+                for i in range(n_samples):
+                    task = ds.filter(lambda x: x["subject"] == category)[i]
                     samples.append(task)
         questions_tr = [
             translate_google(s["question"], "en", lang) for s in samples
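load_mmlu keeps its category rotation: item nr cycles through the sorted subject list, so consecutive items sample different subjects. A tiny illustration with invented subject names (the real list comes from afrimmlu's dev split):

```python
categories = ["biology", "history", "law"]  # invented subjects
for nr in range(5):
    print(nr, categories[nr % len(categories)])
# 0 biology / 1 history / 2 law / 3 biology / 4 history
```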
evals/datasets_/truthfulqa.py CHANGED
@@ -8,29 +8,17 @@ import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os

-from datasets import Dataset, load_dataset, DatasetNotFoundError
-from models import translate_google, get_google_supported_languages
+from datasets import Dataset, load_dataset
+from models import translate_google, google_supported_languages

 from datasets_.util import _get_dataset_config_names, _load_dataset

 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
-slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
-
 tags_uhura_truthfulqa = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_truthfulqa)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }

-# Get available auto-translated languages
-try:
-    tags_truthfulqa_autotranslated = {
-        standardize_tag(a, macro=True): a
-        for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
-    }
-except DatasetNotFoundError:
-    tags_truthfulqa_autotranslated = {}
-

 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
@@ -38,42 +26,26 @@ def add_choices(row):
     return row


-async def load_truthfulqa(language_bcp_47, nr):
+def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(
-            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
-        )
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
         ds = ds.map(add_choices)
+        examples = ds["train"]
         task = ds["test"][nr]
-        # Ensure there is a correct answer before returning the task
-        if 1 not in task["labels"]:
-            return None, None, None
-        return "masakhane/uhura-truthfulqa", task, "human"
-    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
-        # Load from auto-translated dataset (same samples as translation)
-        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
-        test_split = ds["test"] if "test" in ds else ds
-        task = test_split[nr]
-        # Ensure there is a correct answer before returning the task
-        if 1 not in task.get("labels", []):
-            return None, None, None
-        return slug_truthfulqa_autotranslated, task, "machine"
-    # TODO: add Okapi, TruthfulQA-X @Jonas
+        return "masakhane/uhura-truthfulqa", examples, task
     else:
         return None, None, None


+
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:150]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 20
-
-    # Set fixed seed for consistent sample selection across all languages
-    random.seed(42)
+    n_samples = 10

     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
@@ -83,47 +55,37 @@ def translate_truthfulqa(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split in ["train", "test"]:
-                ds = _load_dataset(
-                    slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split
-                )
+                ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split)
                 samples = []
                 if split == "train":
                     samples.extend(ds)
                 else:
-                    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-                    for i in range(min(n_samples, len(ds))):
+                    for i in range(n_samples):
                         task = ds[i]
                         samples.append(task)
-
-                # Translate questions
                 questions_tr = [
                     translate_google(s["question"], "en", lang) for s in samples
                 ]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-
-                # Translate choices for each sample
-                all_choices_tr = []
-                all_labels = []
-
+                choices_texts_concatenated = []
                 for s in samples:
-                    # Get choices from mc1_targets
-                    choices = s["mc1_targets"]["choices"]
-                    labels = s["mc1_targets"]["labels"]
-
-                    # Translate choices
-                    choices_tr = [
-                        translate_google(choice, "en", lang) for choice in choices
-                    ]
-                    choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
-
-                    all_choices_tr.append(choices_tr)
-                    all_labels.append(labels)
+                    for choice in eval(s["choices"]):
+                        choices_texts_concatenated.append(choice)
+                choices_tr = [
+                    translate_google(c, "en", lang) for c in choices_texts_concatenated
+                ]
+                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+                # group into chunks of 4
+                choices_tr = [
+                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+                ]

                 ds_lang = Dataset.from_dict(
                     {
+                        "subject": [s["subject"] for s in samples],
                         "question": questions_tr,
-                        "choices": all_choices_tr,
-                        "labels": all_labels,
+                        "choices": choices_tr,
+                        "answer": [s["answer"] for s in samples],
                     }
                 )
                 ds_lang.push_to_hub(
@@ -133,7 +95,7 @@ def translate_truthfulqa(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/truthfulqa/{lang}_{split}.json",
+                    f"data/translations/mmlu/{lang}_{split}.json",
                     lines=False,
                     force_ascii=False,
                     indent=2,
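One detail worth flagging: the new translation loop parses stringified choice lists with eval(). An editorial note, not part of the commit: ast.literal_eval accepts the same literal syntax while refusing arbitrary code, so it is the safer drop-in for data that should only ever be a Python literal:

```python
import ast

# Parses the literal list without executing arbitrary expressions.
choices = ast.literal_eval('["Yes", "No", "Maybe", "Unsure"]')
print(choices[1])  # "No"
```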
evals/datasets_/util.py CHANGED
@@ -12,11 +12,3 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
-
-
-# Cache individual dataset items to avoid reloading entire datasets
-@cache
-def _get_dataset_item(dataset, subset, split, index, **kwargs):
-    """Load a single item from a dataset efficiently"""
-    ds = load_dataset(dataset, subset, split=split, **kwargs)
-    return ds[index] if index < len(ds) else None
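After the revert, util.py keeps only the module-level @cache wrappers. A minimal sketch of the memoization pattern they rely on (functools.cache memoizes by argument tuple, so a dataset is fetched once per (dataset, subset) pair):

```python
from functools import cache

@cache
def load_once(dataset: str, subset: str) -> str:
    print(f"loading {dataset}/{subset}")  # executes only on the first call
    return f"{dataset}/{subset}"

load_once("masakhane/uhura-arc-easy", "eng")
load_once("masakhane/uhura-arc-easy", "eng")  # cache hit: no second print
```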
evals/download_data.py CHANGED
@@ -8,7 +8,6 @@ from pathlib import Path
 import sys
 import huggingface_hub
 from datasets import load_dataset, DatasetDict
-
 # Import fleurs DataFrame directly from its source module
 from datasets_.fleurs import fleurs
@@ -25,25 +24,22 @@ DATA_DIR = project_root / "data"
 FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
 FLEURS_TARGET_DIR = DATA_DIR / "fleurs"

-GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"  # Assumed direct link from https://glottolog.org/meta/downloads
+GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
 GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
 GLOTTOLOG_CSV_NAME = "languoid.csv"

-SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"  # This is HTML, need manual download or parsing
+SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html" # This is HTML, need manual download or parsing
 SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"

-SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"  # Assumed direct link
+SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm" # Assumed direct link
 SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
 SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
-SPBLEU_DICT_URL = (
-    "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
-)
+SPBLEU_DICT_URL = "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
 SPBLEU_DICT_NAME = "dictionary.txt"


 # --- Helper Functions ---

-
 def download_file(url, path: Path):
     """Downloads a file from a URL to a local path."""
     print(f"Downloading {url} to {path}...")
@@ -88,16 +84,11 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
                 break

         if target_zip_path:
-            with (
-                z.open(target_zip_path) as source,
-                open(extract_path / target_filename, "wb") as target,
-            ):
+            with z.open(target_zip_path) as source, open(extract_path / target_filename, "wb") as target:
                 target.write(source.read())
             print(f"Successfully extracted {target_filename}.")
         else:
-            print(
-                f"Error: Could not find {target_filename} within the zip archive."
-            )
+            print(f"Error: Could not find {target_filename} within the zip archive.")

     except zipfile.BadZipFile:
         print("Error: Downloaded file is not a valid zip archive.")
@@ -107,14 +98,13 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):

 # --- Download Functions ---

-
 def download_fleurs_data():
     """Downloads Fleurs audio and text data."""
     print("\n--- Downloading Fleurs Data ---")
     FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)

     # Use the fleurs_tag column from the imported DataFrame
-    fleurs_tags_list = fleurs["fleurs_tag"].tolist()
+    fleurs_tags_list = fleurs['fleurs_tag'].tolist()

     if not fleurs_tags_list:
         print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
@@ -127,9 +117,7 @@ def download_fleurs_data():
         audio_dir = lang_dir / "audio"
         dev_tsv_path = lang_dir / "dev.tsv"
         dev_audio_archive_path = audio_dir / "dev.tar.gz"
-        audio_extracted_marker = (
-            audio_dir / "dev"
-        )  # Check if extraction likely happened
+        audio_extracted_marker = audio_dir / "dev" # Check if extraction likely happened

         # Download TSV
         if not dev_tsv_path.exists():
@@ -141,15 +129,15 @@
         # Download and Extract Audio
         if not audio_extracted_marker.exists():
             if not dev_audio_archive_path.exists():
-                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
-                download_file(tar_url, dev_audio_archive_path)
+                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
+                download_file(tar_url, dev_audio_archive_path)

             if dev_audio_archive_path.exists():
-                extract_tar_gz(dev_audio_archive_path, audio_dir)
+                extract_tar_gz(dev_audio_archive_path, audio_dir)
             else:
                 print(f"Audio archive missing, cannot extract for {lang_tag}")
         else:
-            print(f"Found extracted audio: {audio_extracted_marker}")
+            print(f"Found extracted audio: {audio_extracted_marker}")


 def download_glottolog_data():
@@ -177,9 +165,7 @@ def download_scriptcodes_data():
     # The URL points to an HTML page, not a direct CSV link.
     # Manual download is likely required for ScriptCodes.csv.
     print(f"Cannot automatically download from {SCRIPTCODES_URL}")
-    print(
-        "Please manually download the ISO 15924 codes list (often available as a .txt file)"
-    )
+    print(f"Please manually download the ISO 15924 codes list (often available as a .txt file)")
     print("from the Unicode website or related sources and save it as:")
     print(f"{SCRIPTCODES_TARGET_FILE}")
     if SCRIPTCODES_TARGET_FILE.exists():
@@ -210,24 +196,21 @@

 # --- Main Execution ---

-
 def main():
     """Runs all download functions and the conversion step."""
     print("Starting data download process...")
     DATA_DIR.mkdir(exist_ok=True)

-    # download_fleurs_data()
+    #download_fleurs_data()
     download_glottolog_data()
     download_scriptcodes_data()
     download_spbleu_data()

     print("\nData download process finished.")
     print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
-    print(
-        "Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well"
-    )
+    print("Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well")
     print("in 'evals/datasets_/flores.py' to be read correctly.")


 if __name__ == "__main__":
-    main()
+    main()
evals/languages.py CHANGED
@@ -31,7 +31,6 @@ glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
 )

-
 @cache
 def language_family(bcp_47):
     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
@@ -40,7 +39,6 @@ def language_family(bcp_47):
     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
     return family["name"]

-
 languages["family"] = languages["bcp_47"].apply(language_family)

 # load script codes and names
@@ -48,7 +46,6 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
     columns={"Code": "iso15924", "English Name": "script_name"}
 )

-
 def script_name(iso15924):
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
evals/main.py CHANGED
@@ -1,190 +1,62 @@
 import asyncio
+
 import pandas as pd
-import time
-from datetime import datetime, timedelta
+from languages import languages
 from models import models
 from tasks import tasks
-from languages import languages
-import os
-
+from tqdm.asyncio import tqdm_asyncio
 
 
-async def evaluate():
-    # Configuration - easily adjustable defaults
-    n_sentences = int(
-        os.environ.get("N_SENTENCES", 20)
-    )  # Default: 20 sentences per task
-    max_languages = int(
-        os.environ.get("MAX_LANGUAGES", 150)
-    )  # Default: 150 top languages
-    single_model = os.environ.get(
-        "SINGLE_MODEL"
-    )  # Optional: run only one specific model
-    test_mode = os.environ.get("TEST", "").lower() in (
-        "1",
-        "true",
-        "yes",
-    )  # Optional: skip results loading/saving
-
-    # Keep original DataFrames for saving metadata - distinction added for single model test runs.
-    original_models_df = pd.DataFrame(models)
-    original_languages_df = pd.DataFrame(languages)
-
-    # Create working copies for single evaluation runs
-    models_df = original_models_df.copy()
-    languages_df = original_languages_df.copy()
-    top_languages = languages.head(max_languages)
-
-    # Filter to single model if specified (only affects evaluation, not saving)
-    if single_model:
-        models_df = models_df[models_df["id"] == single_model]
-        if len(models_df) == 0:
-            print(f"Error: Model '{single_model}' not found. Available models:")
-            for model_id in original_models_df["id"]:
-                print(f"  {model_id}")
-            return pd.DataFrame()
-
-    print(
-        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task"
-    )
-    if test_mode:
-        print("TEST MODE: Skipping results loading/saving")
-    start_time = time.time()
-
-    # Load existing results to avoid re-evaluation (skip in test mode)
-    if test_mode:
-        old_results = pd.DataFrame(
-            columns=["model", "bcp_47", "task", "metric", "origin", "score"]
-        )
-    else:
+# ===== config =====
+
+n_sentences = 10
+
+# ===== run evaluation and aggregate results =====
+
+
+async def evaluate():
+    # FIXME we should not need this for-loop, but it helps
+    for n_languages in range(10, 101, 10):
+        print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
-
-    # Get all combinations that need evaluation
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for model in models_df["id"]
-        for lang in top_languages.itertuples()
-        for task_name, task in tasks.items()
-        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-    ]
-
-    # Filter out already evaluated combinations
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    if not old_results.empty:
-        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
-        # set + combis is faster than merge (locally it made a difference for me when loading all data/tasks into memory)
-        mask = ~combis.apply(
-            lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1
-        )
-        combis = combis[mask]
-
-    # Create all evaluation tasks
-    all_tasks = []
-    for i in range(n_sentences):
-        for model, bcp_47, task_name in combis.itertuples(index=False):
-            all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-    print(f"Running {len(all_tasks)} evaluation tasks...")
-
-    # For single model runs, we stop immediately on first API error to inspect.
-    # For full evaluations, we continue despite errors to get maximum coverage.
-    stop_on_error = single_model is not None
-
-    # Process tasks in batches to avoid memory issues (for full evaluation locally that helped a lot)
-    batch_size = 1000
-    all_results = []
-
-    try:
-        for i in range(0, len(all_tasks), batch_size):
-            batch = all_tasks[i : i + batch_size]
-            batch_results = await asyncio.gather(
-                *[
-                    task_func(model, bcp_47, sentence_nr)
-                    for task_func, model, bcp_47, sentence_nr in batch
-                ],
-                return_exceptions=not stop_on_error,
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for model in models["id"]
+            for lang in languages.iloc[:n_languages].itertuples()
+            for task_name, task in tasks.items()
+            if task_name in models[models["id"] == model]["tasks"].iloc[0]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations
+        results = [
+            tasks[task_name](model, bcp_47, i)
+            for i in range(n_sentences)
+            for model, bcp_47, task_name in combis.itertuples(index=False)
+        ]
+        results = await tqdm_asyncio.gather(*results, miniters=1)
+        results = [r for group in results for r in group]
+        args = dict(orient="records", indent=2, force_ascii=False)
+        if results:
+            # aggregate results
+            results = pd.DataFrame(results)
+            results = (
+                results.groupby(["model", "bcp_47", "task", "metric"])
+                .agg({"score": "mean"})
+                .reset_index()
             )
-            all_results.extend(batch_results)
-
-        results = all_results
-
-        # Process results and logging API errors separately to understand what are the main issues.
-        valid_results = []
-        errors = []
-
-        for i, r in enumerate(results):
-            if isinstance(r, Exception):
-                if i < len(all_tasks):
-                    task_info = all_tasks[i]
-                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
-            elif isinstance(r, list):
-                valid_results.extend(r)
-            elif r is not None:
-                valid_results.append(r)
-
-        # log errors and store
-        if errors:
-            with open("errors.log", "w") as f:
-                f.write("model,task,error\n")
-                for error in errors:
-                    f.write(error + "\n")
-
-        # Track model completion (TO BE DELETED - was for local run only)
-        if valid_results:
-            completed_models = set()
-            for result in valid_results:
-                if isinstance(result, dict) and "model" in result:
-                    model = result["model"]
-                    if model not in completed_models:
-                        completed_models.add(model)
-                        print(f"Completed: {model}")
-
-        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")
-
-    # this is for local single model runs - for testing and development
-    except Exception as e:
-        print(f"EVALUATION STOPPED - API Error occurred:")
-        print(f"Error type: {type(e).__name__}")
-        print(f"Error message: {str(e)}")
-        return pd.DataFrame()
-
-    # Save results (skipped in test mode as we do not want to overwrite existing results)
-    if valid_results:
-        results_df = pd.DataFrame(valid_results)
-
-        # Aggregate results
-        results_df = (
-            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-
-        if not test_mode:
-            args = dict(orient="records", indent=2, force_ascii=False)
-
-            # Merge with existing results
-            if not old_results.empty:
-                results_df = pd.concat([old_results, results_df])
-                results_df = results_df.drop_duplicates(
-                    subset=["model", "bcp_47", "task", "metric", "origin"]
-                )
-
-            results_df = results_df.sort_values(
-                by=["model", "bcp_47", "task", "metric"]
-            )
-            results_df.to_json("results.json", **args)
-
-            # Save model and language info (always save complete metadata, not filtered)
-            original_models_df.to_json("models.json", **args)
-            original_languages_df.to_json("languages.json", **args)
-        else:
-            print("TEST MODE: Skipping results saving")
-
-    elapsed = time.time() - start_time
-    print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
-
-    return results_df
-
-    return pd.DataFrame()
+            # save results
+            results = pd.concat([old_results, results])
+            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results.to_json("results.json", **args)
+            # save up-to-date info on models and languages
+            all_models = pd.concat([pd.DataFrame(models), old_models])
+            all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+            all_models.to_json("models.json", **args)
+            pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 if __name__ == "__main__":
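
The resume logic in the new evaluate() is a left join followed by an isna() filter, i.e. a pandas anti-join: combinations that already have a result keep their metric value, while new combinations get NaN and are the only ones re-run. A self-contained illustration (the column values here are made up):

    import pandas as pd

    old_results = pd.DataFrame(
        [{"model": "m1", "bcp_47": "en", "task": "mmlu", "metric": "accuracy", "score": 0.9}]
    )
    combis = pd.DataFrame(
        [("m1", "en", "mmlu"), ("m1", "de", "mmlu")],
        columns=["model", "bcp_47", "task"],
    )
    # Rows without a match in old_results get NaN in "metric" after the left
    # join, which marks them as still to be evaluated.
    merged = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    todo = merged[merged["metric"].isna()][["model", "bcp_47", "task"]]
    print(todo)  # only ("m1", "de", "mmlu") remains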
evals/models.py CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import json
 import re
 from collections import defaultdict
@@ -8,6 +7,7 @@ from os import getenv
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
+from elevenlabs import AsyncElevenLabs
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
@@ -22,17 +22,14 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",
-    "openai/gpt-5-nano",  # include if/when available
     "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
-    "openai/gpt-4o-2024-11-20",  # 10$
-    "openai/gpt-oss-120b",
-    "anthropic/claude-3.7-sonnet",  # 15$ - added for full coverage
-    "anthropic/claude-sonnet-4",  # 15$ - added for full coverage
-    "anthropic/claude-opus-4.1",  # 15$ - added for full coverage
+    # "openai/gpt-4o-2024-11-20", # 10$
+    "openai/gpt-3.5-turbo-0613",  # 2$
+    # "openai/gpt-3.5-turbo", # 1.5$
+    # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
     "mistralai/mistral-saba",  # 0.6$
     "mistralai/mistral-nemo",  # 0.08$
@@ -51,13 +48,10 @@ important_models = [
     "microsoft/phi-4",  # 0.07$
     "microsoft/phi-4-multimodal-instruct",  # 0.1$
     "amazon/nova-micro-v1",  # 0.09$
-    "moonshotai/kimi-k2",  # 0.6$ - added to prevent missing from models.json
-    "x-ai/grok-4",
 ]
 
 blocklist = [
     "google/gemini-2.5-pro-preview",
-    "google/gemini-2.5-pro",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
     "google/gemini-2.5-flash-preview-04-17",
@@ -65,7 +59,6 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
-    "perplexity/sonar-deep-research",
 ]
 
 transcription_models = [
@@ -100,81 +93,28 @@ def get_model(permaslug):
 
 @cache
 def get_historical_popular_models(date: date):
-    try:
-        raw = get("https://openrouter.ai/rankings").text
-
-        # Extract model data from rankingData using regex
-        import re
-        import json
-
-        # Find all count and model_permaslug pairs in the data
-        # Format: "count":number,"model_permaslug":"model/name"
-        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
-        matches = re.findall(pattern, raw)
-
-        if matches:
-            # Aggregate model counts
-            model_counts = {}
-            for count_str, model_slug in matches:
-                count = float(count_str)
-                if not model_slug.startswith("openrouter") and model_slug != "Others":
-                    # Remove variant suffixes for aggregation
-                    base_model = model_slug.split(":")[0]
-                    model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-            # Sort by popularity and return top models
-            sorted_models = sorted(
-                model_counts.items(), key=lambda x: x[1], reverse=True
-            )
-            result = []
-            for model_slug, count in sorted_models[:20]:  # Top 20
-                result.append({"slug": model_slug, "count": int(count)})
-
-            return result
-        else:
-            return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings").text
+    data = re.search(r'{\\"data\\":(.*),\\"isPercentage\\"', raw).group(1)
+    data = json.loads(data.replace("\\", ""))
+    counts = defaultdict(int)
+    for day in data:
+        for model, count in day["ys"].items():
+            if model.startswith("openrouter") or model == "Others":
+                continue
+            counts[model.split(":")[0]] += count
+    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+    models = [get_model(model) for model, _ in counts]
+    return [m for m in models if m]
 
 
 @cache
 def get_current_popular_models(date: date):
-    try:
-        raw = get("https://openrouter.ai/rankings?view=day").text
-
-        # Extract model data from daily rankings
-        import re
-        import json
-
-        # Find all count and model_permaslug pairs in the daily data
-        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
-        matches = re.findall(pattern, raw)
-
-        if matches:
-            # Aggregate model counts
-            model_counts = {}
-            for count_str, model_slug in matches:
-                count = float(count_str)
-                if not model_slug.startswith("openrouter") and model_slug != "Others":
-                    # Remove variant suffixes for aggregation
-                    base_model = model_slug.split(":")[0]
-                    model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-            # Sort by popularity and return top models
-            sorted_models = sorted(
-                model_counts.items(), key=lambda x: x[1], reverse=True
-            )
-            result = []
-            for model_slug, count in sorted_models[:10]:  # Top 10
-                result.append({"slug": model_slug, "count": int(count)})
-
-            return result
-        else:
-            return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
+    data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
+    data = json.loads(data)
+    data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
+    models = [get_model(model["model_permaslug"]) for model in data]
+    return [m for m in models if m]
 
 
 def get_translation_models():
@@ -221,10 +161,7 @@ async def complete(**kwargs) -> str | None:
 
 
 translate_client = translate.Client()
-
-
-def get_google_supported_languages():
-    return [l["language"] for l in translate_client.get_languages()]
+google_supported_languages = [l["language"] for l in translate_client.get_languages()]
 
 
 @cache
@@ -294,15 +231,12 @@ def get_hf_metadata(row):
         return empty
     try:
        info = api.model_info(id)
-        license = ""
-        if (
-            info.card_data
-            and hasattr(info.card_data, "license")
-            and info.card_data.license
-        ):
-            license = (
-                info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
-            )
+        license = (
+            (info.card_data.license or "")
+            .replace("-", " ")
+            .replace("mit", "MIT")
+            .title()
+        )
        return {
            "hf_id": info.id,
            "creation_date": info.created_at,
@@ -315,14 +249,8 @@ def get_hf_metadata(row):
 
 
 def get_cost(row):
-    """
-    row: a row from the OpenRouter models dataframe
-    """
-    try:
-        cost = float(row["endpoint"]["pricing"]["completion"])
-        return round(cost * 1_000_000, 2)
-    except (TypeError, KeyError):
-        return None
+    cost = float(row["endpoint"]["pricing"]["completion"])
+    return round(cost * 1_000_000, 2)
 
 
 @cache
@@ -332,17 +260,8 @@ def load_models(date: date):
         + get_current_popular_models(date.today())[:10]
     )
     popular_models = [m["slug"] for m in popular_models]
-    all_model_candidates = set(important_models + popular_models) - set(blocklist)
-
-    # Validate models exist on OpenRouter before including them
-    valid_models = []
-
-    for model_id in all_model_candidates:
-        metadata = get_or_metadata(model_id)
-        if metadata is not None:
-            valid_models.append(model_id)
-
-    models = pd.DataFrame(sorted(valid_models), columns=["id"])
+    models = set(important_models + popular_models) - set(blocklist)
+    models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
@@ -362,18 +281,9 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
-    # Filter out expensive models to keep costs reasonable
-    models = models[models["cost"] <= 15.0].reset_index(drop=True)
+    # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
-        [
-            "translation_from",
-            "translation_to",
-            "classification",
-            "mmlu",
-            "arc",
-            "truthfulqa",
-            "mgsm",
-        ]
+        ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
    ] * len(models)
    models = pd.concat([models, get_translation_models()])
    return models
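
get_historical_popular_models and get_current_popular_models are cached but take a date argument, so each new day produces a new cache key and the rankings page is re-scraped at most once per day. A sketch of the same pattern, with functools.lru_cache standing in for the joblib-backed @cache used in this file (the function and payload here are placeholders):

    from datetime import date
    from functools import lru_cache


    @lru_cache  # stand-in for the joblib-backed @cache in models.py
    def popular_models_for(day: date) -> list[str]:
        print(f"recomputing for {day}")
        return ["model-a", "model-b"]  # placeholder payload


    popular_models_for(date.today())  # computes and caches
    popular_models_for(date.today())  # cache hit until the date rolls over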
evals/plots.py CHANGED
@@ -9,33 +9,34 @@ df = pd.read_json("../results.json")
 df = df[df["metric"] != "chrf"]
 df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()
 
-
 # Apply logit transformation to classification scores to reduce skewness
 def transform_classification_scores(row):
-    if row["task"] == "classification":
+    if row['task'] == 'classification':
         # Avoid division by zero and infinite values by clipping
-        score = np.clip(row["score"], 0.001, 0.999)
+        score = np.clip(row['score'], 0.001, 0.999)
         # Apply logit transformation (log(p/(1-p)))
         return logit(score)
     else:
-        return row["score"]
-
+        return row['score']
 
-df["score"] = df.apply(transform_classification_scores, axis=1)
+df['score'] = df.apply(transform_classification_scores, axis=1)
 
 # Create a pivot table with tasks as columns and languages as rows
 pivot_df = df.pivot_table(
-    values="score", index="bcp_47", columns="task", aggfunc="mean"
+    values='score',
+    index='bcp_47',
+    columns='task',
+    aggfunc='mean'
 )
 
 # Sort and filter tasks
 ordered_tasks = [
-    "translation_from",
-    "translation_to",
-    "classification",
-    "mmlu",
-    "arc",
-    "mgsm",
+    'translation_from',
+    'translation_to',
+    'classification',
+    'mmlu',
+    'arc',
+    'mgsm',
 ]
 # Drop 'truthfulqa' if present and reindex columns
 pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
@@ -45,29 +46,29 @@ correlation_matrix = pivot_df.corr()
 
 # Create the correlation plot
 plt.figure(figsize=(8, 6))
 # Create mask for upper triangle including diagonal to show only lower triangle
 mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
 
 # Create a heatmap
 sns.heatmap(
-    correlation_matrix,
-    annot=True,
-    cmap="Blues",
+    correlation_matrix,
+    annot=True,
+    cmap='Blues',
     center=0,
     square=True,
     mask=mask,
-    cbar_kws={"shrink": 0.8},
-    fmt=".3f",
+    cbar_kws={"shrink": .8},
+    fmt='.3f'
 )
 
-plt.xlabel("Tasks", fontsize=12)
-plt.ylabel("Tasks", fontsize=12)
-plt.xticks(rotation=45, ha="right")
+plt.xlabel('Tasks', fontsize=12)
+plt.ylabel('Tasks', fontsize=12)
+plt.xticks(rotation=45, ha='right')
 plt.yticks(rotation=0)
 plt.tight_layout()
 
 # Save the plot
-plt.savefig("task_correlation_matrix.png", dpi=300, bbox_inches="tight")
+plt.savefig('task_correlation_matrix.png', dpi=300, bbox_inches='tight')
 plt.show()
 
 # Print correlation values for reference
@@ -76,91 +77,56 @@ print("Note: Classification scores have been logit-transformed to reduce skewnes
 print(correlation_matrix.round(3))
 
 # Also create a scatter plot matrix for pairwise relationships with highlighted languages
-highlighted_languages = ["en", "zh", "hi", "es", "ar"]
-
+highlighted_languages = ['en', 'zh', 'hi', 'es', 'ar']
 
 # Create color mapping
 def get_color_and_label(lang_code):
     if lang_code in highlighted_languages:
-        color_map = {
-            "en": "red",
-            "zh": "blue",
-            "hi": "green",
-            "es": "orange",
-            "ar": "purple",
-        }
+        color_map = {'en': 'red', 'zh': 'blue', 'hi': 'green', 'es': 'orange', 'ar': 'purple'}
         return color_map[lang_code], lang_code
     else:
-        return "lightgray", "Other"
-
+        return 'lightgray', 'Other'
 
 # Create custom scatter plot matrix
 tasks = pivot_df.columns.tolist()
 n_tasks = len(tasks)
 
 fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
-fig.suptitle("Pairwise Task Performance", fontsize=16, fontweight="bold")
+fig.suptitle('Pairwise Task Performance', fontsize=16, fontweight='bold')
 
 # Create legend elements
 legend_elements = []
 for lang in highlighted_languages:
     color, _ = get_color_and_label(lang)
-    legend_elements.append(
-        plt.Line2D(
-            [0],
-            [0],
-            marker="o",
-            color="w",
-            markerfacecolor=color,
-            markersize=8,
-            label=lang,
-        )
-    )
-legend_elements.append(
-    plt.Line2D(
-        [0],
-        [0],
-        marker="o",
-        color="w",
-        markerfacecolor="lightgray",
-        markersize=8,
-        label="Other",
-    )
-)
+    legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=8, label=lang))
+legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightgray', markersize=8, label='Other'))
 
 for i, task_y in enumerate(tasks):
     for j, task_x in enumerate(tasks):
         ax = axes[i, j]
-
+        
         if i == j:
             # Diagonal: histogram
             task_data = pivot_df[task_y].dropna()
             colors = [get_color_and_label(lang)[0] for lang in task_data.index]
-            ax.hist(task_data, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
-            ax.set_title(f"{task_y}", fontsize=10)
+            ax.hist(task_data, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
+            ax.set_title(f'{task_y}', fontsize=10)
         else:
             # Off-diagonal: scatter plot
             for lang_code in pivot_df.index:
-                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(
-                    pivot_df.loc[lang_code, task_y]
-                ):
+                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(pivot_df.loc[lang_code, task_y]):
                     color, _ = get_color_and_label(lang_code)
                     alpha = 0.8 if lang_code in highlighted_languages else 0.3
                     size = 50 if lang_code in highlighted_languages else 20
-                    ax.scatter(
-                        pivot_df.loc[lang_code, task_x],
-                        pivot_df.loc[lang_code, task_y],
-                        c=color,
-                        alpha=alpha,
-                        s=size,
-                    )
-
+                    ax.scatter(pivot_df.loc[lang_code, task_x], pivot_df.loc[lang_code, task_y],
+                             c=color, alpha=alpha, s=size)
+        
         # Set labels
         if i == n_tasks - 1:
             ax.set_xlabel(task_x, fontsize=10)
         if j == 0:
             ax.set_ylabel(task_y, fontsize=10)
-
+        
        # Remove tick labels except for edges
        if i != n_tasks - 1:
            ax.set_xticklabels([])
@@ -170,15 +136,15 @@ for i, task_y in enumerate(tasks):
 # Add legend
 fig.legend(
     handles=legend_elements,
-    loc="lower center",
+    loc='lower center',
     bbox_to_anchor=(0.5, -0.05),
     ncol=len(legend_elements),
     frameon=False,
     fontsize=10,
     handletextpad=0.5,
-    columnspacing=1.0,
+    columnspacing=1.0
 )
 
 plt.tight_layout()
-plt.savefig("task_scatter_matrix.png", dpi=300, bbox_inches="tight")
+plt.savefig('task_scatter_matrix.png', dpi=300, bbox_inches='tight')
 plt.show()
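
Why the logit transform on classification scores: accuracies cluster near 1.0, which compresses differences between languages; logit (the inverse sigmoid) stretches the top of the scale back out. A quick check with scipy:

    import numpy as np
    from scipy.special import logit

    scores = np.array([0.90, 0.95, 0.99, 1.00])
    clipped = np.clip(scores, 0.001, 0.999)  # avoid logit(1.0) = inf
    print(logit(clipped))  # approx. [2.20, 2.94, 4.60, 6.91]: gaps near 1.0 widen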
evals/tasks.py CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import random
 from functools import partial
 from textwrap import dedent
@@ -6,10 +5,10 @@ from textwrap import dedent
 import evaluate
 import pandas as pd
 import sentencepiece as spm
-from datasets_.arc import load_uhura_arc_easy
 from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
+from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
@@ -48,7 +47,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
-    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
     if model == "google/translate-v2":
         original_language = closest_supported_match(
             original_language, supported_languages
@@ -68,7 +66,7 @@
         messages=[
             {
                 "role": "user",
-                "content": translation_prompt,
+                "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
             }
         ],
         temperature=0,
@@ -93,7 +91,6 @@
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
-            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": sentence_nr,
         }
         for metric, score in (
@@ -115,33 +112,57 @@ async def classify_and_evaluate(model, bcp_47, nr):
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-    test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]
-
-    prompt = f"""Classify the following text into one of these topics: {", ".join(top_topics)}.
-Reply with only the topic name.
-
-Text:
-{test_paragraph.text}
-"""
-    response = await complete(
-        model=model,
-        messages=[{"role": "user", "content": prompt}],
-        temperature=0,
-        max_tokens=30,
+    examples = pd.concat(
+        [
+            paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
+            for t in top_topics
+        ]
+    ).sample(frac=1, random_state=nr)
+    test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
+        frac=1, random_state=42
     )
-
-    pred = response.lower().strip() if response else ""
-    true = test_paragraph.topic.lower().strip()
-    others = [t for t in top_topics if t != true]
-    acc = (
-        int(
-            pred.startswith(true)
-            or (true in pred and not any(o in pred for o in others))
-        )
-        if pred
-        else 0
-    )
+    test_paragraph = test_paragraphs.iloc[nr]
+
+    def format_prompt(text):
+        return f"{text}\n\nTopic: {'|'.join(top_topics)}?"
+
+    messages = []
+    for example in examples.itertuples():
+        messages += [
+            {"role": "user", "content": format_prompt(example.text)},
+            {"role": "assistant", "content": example.topic},
+        ]
+    # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
+    # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
+    try:
+        pred = await complete(
+            model=model,
+            messages=[
+                *messages,
+                {
+                    "role": "user",
+                    "content": format_prompt(test_paragraph.text),
+                },
+            ],
+            temperature=0,
+            max_tokens=30,
+        )
+        true = test_paragraph.topic
+        others = [t for t in top_topics if t != true]
+        acc = (
+            int(
+                pred.startswith(true)
+                or (true in pred and not any(o in pred for o in others))
+            )
+            if pred
+            else 0
+        )
+    except Exception as e:
+        if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
+            print(f"Max tokens exceeded for {model} in {bcp_47}")
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,
@@ -149,7 +170,6 @@
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
-            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": nr,
         }
     ]
@@ -212,38 +232,39 @@ def format_multiple_choice(item):
     A: {item["choices"][0]}
     B: {item["choices"][1]}
     C: {item["choices"][2]}
-    D: {item["choices"][3]}"""
+    D: {item["choices"][3]}
+
+    A|B|C|D?"""
 
 
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
+    ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
         return []
 
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-Response format: <reasoning> #### <letter>
-
----
-
-{format_multiple_choice(task)}""",
-        },
-    ]
-    response = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1024,
-    )
-    if response and "####" in response:
-        answer = response.split("####")[-1].strip()
-        acc = int(answer[:1] == task["answer"])
-    else:
-        acc = 0
-
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,
@@ -251,40 +272,39 @@
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,  # Add origin tag to results
             "sentence_nr": nr,
         }
     ]
 
 
 async def arc_and_evaluate(model, language_bcp_47, nr):
-    ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
+    ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
 
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-Response format: <reasoning> #### <letter>
-
----
-
-{format_multiple_choice(task)}""",
-        },
-    ]
-    response = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1024,
-    )
-    if response and "####" in response:
-        answer = response.split("####")[-1].strip()
-        acc = int(answer[:1] == task["answer"])
-    else:
-        acc = 0
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,
@@ -292,7 +312,6 @@
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]
@@ -313,42 +332,40 @@ def format_multiple_choice_truthfulqa(item):
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
+    text += "|".join(letters[: len(item["choices"])]) + "?"
     return text
 
 
 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
-    ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
+    ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
-
-    # Find the correct answer
-    correct_choice_index = task["labels"].index(1)
-    answer = letters[correct_choice_index]
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-Response format: <reasoning> #### <letter>
-
----
-
-{format_multiple_choice_truthfulqa(task)}""",
-        },
-    ]
-    response = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1024,  # Increased for reasoning
-    )
-    if response and "####" in response:
-        pred_answer = response.split("####")[-1].strip()
-        acc = int(pred_answer[:1].upper() == answer)
-    else:
-        acc = 0
-
+    task = shuffle_choices_and_labels(task)
+    answer = letters[task["labels"].index(1)]
+    messages = []
+    for example in examples:
+        example = shuffle_choices_and_labels(example)
+        messages += [
+            {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
+            {"role": "assistant", "content": letters[example["labels"].index(1)]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == answer)
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
    return [
        {
            "model": model,
@@ -356,36 +373,30 @@
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]
 
 
 async def mgsm_and_evaluate(model, language_bcp_47, nr):
-    ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
+    system_prompt = """
+    Solve the math problem. Use reasoning, and finally give the answer as a number.
+    Response format: <reasoning> #### <number>
+    """
+    system_prompt = dedent(system_prompt).strip()
+    ds_slug, question = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
-
-Response format: <reasoning> #### <number>
-
----
-
-{question["question"]}""",
-        },
-    ]
     response = await complete(
         model=model,
-        messages=messages,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": question["question"]},
+        ],
         temperature=0,
         max_tokens=1024,
     )
-    if response and "####" in response:
+    if response and len(response.split("####")) == 2:
         number = response.split("####")[1].strip()
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
@@ -398,7 +409,6 @@
             "task": "mgsm",
             "metric": "accuracy",
             "score": accuracy,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]
@@ -439,8 +449,10 @@ tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
+    # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
+    # "asr": transcribe_and_evaluate,
 }
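
The rewritten multiple-choice tasks share one pattern: each few-shot example becomes a user/assistant message pair, the test item comes last, and max_tokens=1 forces the model to answer with a single option letter instead of free-form reasoning. A schematic of the message layout (the question data here is invented; complete() is this repo's rate-limited OpenRouter wrapper):

    examples = [{"question": "2 + 2?", "choices": ["3", "4"], "answer": "B"}]
    messages = []
    for ex in examples:
        # one demonstration per example: prompt, then the gold letter
        messages += [
            {"role": "user", "content": f"{ex['question']}\nA: {ex['choices'][0]}\nB: {ex['choices'][1]}\n\nA|B?"},
            {"role": "assistant", "content": ex["answer"]},
        ]
    # the actual test item goes last, unanswered
    messages += [{"role": "user", "content": "3 + 3?\nA: 6\nB: 7\n\nA|B?"}]
    # response = await complete(model=model, messages=messages, temperature=0, max_tokens=1)
    # acc = int(response[:1].strip() == "A")

Scoring only the first character keeps the metric robust to trailing whitespace or punctuation in the completion.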
evals/translate.py CHANGED
@@ -6,4 +6,4 @@ from datasets_.mmlu import translate_mmlu
 if __name__ == "__main__":
     translate_mmlu(languages)
     translate_mgsm(languages)
-    translate_arc(languages)
+    translate_arc(languages)
frontend/package-lock.json CHANGED
The diff for this file is too large to render. See raw diff
 
frontend/package.json CHANGED
@@ -6,12 +6,13 @@
     "@observablehq/plot": "^0.6.17",
     "@testing-library/dom": "^10.4.0",
     "@testing-library/jest-dom": "^6.6.3",
-    "@testing-library/react": "^15.0.0",
+    "@testing-library/react": "^16.2.0",
     "@testing-library/user-event": "^13.5.0",
     "primeicons": "^7.0.0",
     "primereact": "^10.9.3",
-    "react": "^18.2.0",
-    "react-dom": "^18.2.0",
+    "react": "^19.0.0",
+    "react-dom": "^19.0.0",
+    "react-scripts": "5.0.1",
     "topojson-simplify": "^3.0.3",
     "web-vitals": "^2.1.4"
   },
@@ -40,8 +41,5 @@
       "last 1 safari version"
     ]
   },
-  "proxy": "http://localhost:8000",
-  "devDependencies": {
-    "react-scripts": "^5.0.1"
-  }
+  "proxy": "http://localhost:8000"
 }
frontend/src/App.js CHANGED
@@ -19,14 +19,9 @@ function App () {
19
  const [loading, setLoading] = useState(true)
20
  const [error, setError] = useState(null)
21
  const [selectedLanguages, setSelectedLanguages] = useState([])
22
- const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
23
  const [dialogVisible, setDialogVisible] = useState(false)
24
  const [aboutVisible, setAboutVisible] = useState(false)
25
  const [contributeVisible, setContributeVisible] = useState(false)
26
-
27
- // Add state for carousel items
28
- const [carouselItems, setCarouselItems] = useState([])
29
- const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
30
 
31
  useEffect(() => {
32
  fetch('/api/data', {
@@ -41,7 +36,6 @@ function App () {
41
  })
42
  .then(jsonData => {
43
  setData(jsonData)
44
- setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
45
  setLoading(false)
46
  })
47
  .catch(err => {
@@ -50,27 +44,8 @@ function App () {
50
  })
51
  }, [selectedLanguages])
52
 
53
- // Create carousel items when data is loaded
54
- useEffect(() => {
55
- if (data) {
56
- // Add a small delay to ensure components are ready
57
- const timer = setTimeout(() => {
58
- setCarouselItems([
59
- <WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
60
- <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
61
- <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
62
- <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
63
- <CostPlot key="costplot-4" data={data} width={750} height={500} />
64
- ]);
65
- }, 100);
66
-
67
- return () => clearTimeout(timer);
68
- }
69
- }, [data])
70
-
71
  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
72
  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
73
-
74
  useEffect(() => {
75
  const handleResize = () => {
76
  setWindowWidth(window.innerWidth)
@@ -80,44 +55,6 @@ function App () {
80
  return () => window.removeEventListener('resize', handleResize)
81
  }, [])
82
 
83
- // Create full-screen carousel items when data or window size changes
84
- useEffect(() => {
85
- if (data) {
86
- const timer = setTimeout(() => {
87
- setFullScreenCarouselItems([
88
- <WorldMap
89
- key="fs-worldmap-0"
90
- data={data.countries}
91
- allLanguages={data.language_table}
92
- width={windowWidth * 0.7}
93
- height={windowHeight * 0.6}
94
- />,
95
- <LanguagePlot
96
- key="fs-langplot-1"
97
- data={data}
98
- width={windowWidth * 0.7}
99
- height={windowHeight * 0.6}
100
- />,
101
- <SpeakerPlot
102
- key="fs-speakerplot-2"
103
- data={data}
104
- width={windowWidth * 0.7}
105
- height={windowHeight * 0.6}
106
- />,
107
- <HistoryPlot
108
- key="fs-histplot-3"
109
- data={data}
110
- width={windowWidth * 0.7}
111
- height={windowHeight * 0.6}
112
- />,
113
- <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
114
- ]);
115
- }, 100);
116
-
117
- return () => clearTimeout(timer);
118
- }
119
- }, [data, windowWidth, windowHeight])
120
-
121
  return (
122
  <PrimeReactProvider>
123
  <div
@@ -132,50 +69,35 @@ function App () {
132
  style={{
133
  backgroundColor: '#fff3cd',
134
  color: '#856404',
135
- padding: '1rem 1.5rem',
136
  marginBottom: '1rem',
137
  border: '1px solid #ffeeba',
138
  borderRadius: '0.25rem',
139
- textAlign: 'center',
140
- lineHeight: '1.5',
141
- position: 'relative'
142
  }}
143
  >
144
  <strong>Work in Progress:</strong> This dashboard is currently under
145
- active development. Evaluation results are not yet final. More extensive evaluation runs will be released later this year.
146
- </div>
147
- <div
148
- style={{
149
- display: 'flex',
150
- justifyContent: 'flex-end',
151
- padding: '0 1.5rem',
152
- marginBottom: '1rem'
153
- }}
154
- >
155
  <a
156
  href='https://github.com/datenlabor-bmz/ai-language-monitor'
157
  target='_blank'
158
  rel='noopener noreferrer'
159
  style={{
160
  textDecoration: 'none',
161
- color: '#6c757d',
162
- fontSize: '1rem',
163
- fontWeight: '500',
164
- padding: '0.5rem 1rem',
165
- borderRadius: '0.375rem',
166
- backgroundColor: '#f8f9fa',
167
- border: '1px solid #e9ecef',
168
- display: 'flex',
169
- alignItems: 'center',
170
- gap: '0.5rem',
171
- transition: 'all 0.2s ease',
172
- ':hover': {
173
- backgroundColor: '#e9ecef',
174
- color: '#495057'
175
- }
176
  }}
177
  >
178
- <i className='pi pi-github' title='View on GitHub' />
 
 
 
 
179
  GitHub
180
  </a>
181
  </div>
@@ -227,88 +149,39 @@ function App () {
227
  <div
228
  style={{
229
  display: 'flex',
230
- gap: '0.75rem',
231
- marginBottom: '2rem',
232
  flexWrap: 'wrap',
233
  justifyContent: 'center'
234
  }}
235
  >
236
- <button
 
 
237
  onClick={() => setAboutVisible(true)}
238
  style={{
239
- background: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
240
- color: 'white',
241
- border: 'none',
- padding: '0.75rem 1.5rem',
- borderRadius: '12px',
- fontSize: '0.95rem',
- fontWeight: '500',
- cursor: 'pointer',
- display: 'flex',
- alignItems: 'center',
- gap: '0.5rem',
- boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
- transition: 'all 0.3s ease',
- ':hover': {
- transform: 'translateY(-2px)',
- boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
- }
- }}
- onMouseEnter={(e) => {
- e.target.style.transform = 'translateY(-2px)';
- e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
  }}
- onMouseLeave={(e) => {
- e.target.style.transform = 'translateY(0)';
- e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
- }}
- >
- <span style={{ fontSize: '1.1rem' }}>📚</span>
- About this tool
- </button>

- <button
  onClick={() => setContributeVisible(true)}
- title='This feature is on our roadmap and will be available soon.'
  style={{
- background: 'linear-gradient(135deg, #ff9a9e 0%, #fecfef 50%, #fecfef 100%)',
- color: '#6b46c1',
- border: 'none',
- padding: '0.75rem 1.5rem',
- borderRadius: '12px',
- fontSize: '0.95rem',
- fontWeight: '500',
- cursor: 'pointer',
- display: 'flex',
- alignItems: 'center',
- gap: '0.5rem',
- boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
- transition: 'all 0.3s ease',
- position: 'relative',
- overflow: 'hidden'
  }}
- onMouseEnter={(e) => {
- e.target.style.transform = 'translateY(-2px)';
- e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
- }}
- onMouseLeave={(e) => {
- e.target.style.transform = 'translateY(0)';
- e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
- }}
- >
- <span style={{ fontSize: '1.1rem' }}>🚀</span>
- Add your model
- <span style={{
- fontSize: '0.75rem',
- backgroundColor: 'rgba(107, 70, 193, 0.15)',
- padding: '0.2rem 0.5rem',
- borderRadius: '6px',
- marginLeft: '0.5rem',
- fontWeight: '600'
- }}>
- soon
- </span>
- </button>
  </div>

  {data && (
@@ -347,7 +220,6 @@ function App () {
  data={data.model_table}
  selectedLanguages={selectedLanguages}
  allLanguages={data.language_table || []}
- machineTranslatedMetrics={machineTranslatedMetrics}
  />
  <LanguageTable
  data={data.language_table}
@@ -376,18 +248,20 @@ function App () {
  color: '#666'
  }}
  />
- {carouselItems.length > 0 && (
- <Carousel
- key={`main-carousel-${carouselItems.length}-${Date.now()}`}
- value={carouselItems}
- numScroll={1}
- numVisible={1}
- itemTemplate={item => item}
- circular={false}
- activeIndex={0}
- style={{ width: '100%', minHeight: '650px' }}
- />
- )}
  </div>
  </>
  )}
@@ -535,16 +409,36 @@ function App () {
  modal
  header={null}
  >
- {fullScreenCarouselItems.length > 0 && (
  <div style={{ width: '100%', height: '100%' }}>
  <Carousel
- key={`fs-carousel-${fullScreenCarouselItems.length}-${Date.now()}`}
- value={fullScreenCarouselItems}
  numScroll={1}
  numVisible={1}
  itemTemplate={item => item}
- circular={false}
- activeIndex={0}
  style={{ width: '100%', height: 'calc(90vh - 120px)' }}
  />
  </div>
@@ -555,4 +449,4 @@ function App () {
  )
  }

- export default App
 
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState(null)
  const [selectedLanguages, setSelectedLanguages] = useState([])
  const [dialogVisible, setDialogVisible] = useState(false)
  const [aboutVisible, setAboutVisible] = useState(false)
  const [contributeVisible, setContributeVisible] = useState(false)

  useEffect(() => {
  fetch('/api/data', {

  })
  .then(jsonData => {
  setData(jsonData)
  setLoading(false)
  })
  .catch(err => {

  })
  }, [selectedLanguages])

  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
  useEffect(() => {
  const handleResize = () => {
  setWindowWidth(window.innerWidth)

  return () => window.removeEventListener('resize', handleResize)
  }, [])

  return (
  <PrimeReactProvider>
  <div

  style={{
  backgroundColor: '#fff3cd',
  color: '#856404',
+ padding: '0.75rem 1.25rem',
  marginBottom: '1rem',
  border: '1px solid #ffeeba',
  borderRadius: '0.25rem',
+ textAlign: 'center'
  }}
  >
  <strong>Work in Progress:</strong> This dashboard is currently under
+ active development. Evaluation results are not yet final.
  <a
  href='https://github.com/datenlabor-bmz/ai-language-monitor'
  target='_blank'
  rel='noopener noreferrer'
  style={{
  textDecoration: 'none',
+ color: '#856404',
+ float: 'right',
+ fontSize: '1.2rem',
+ fontWeight: 'bold',
+ padding: '0 0.5rem',
+ borderRadius: '3px',
+ backgroundColor: 'rgba(255,255,255,0.3)'
  }}
  >
+ <i
+ className='pi pi-github'
+ title='View on GitHub'
+ style={{ marginRight: '0.3rem' }}
+ />
  GitHub
  </a>
  </div>

  <div
  style={{
  display: 'flex',
+ gap: '1rem',
+ marginBottom: '1.5rem',
  flexWrap: 'wrap',
  justifyContent: 'center'
  }}
  >
+ <Button
+ label='📚 About this tool'
+ className='p-button-text'
  onClick={() => setAboutVisible(true)}
  style={{
+ color: '#666',
+ border: '1px solid #ddd',
+ padding: '0.5rem 1rem',
+ borderRadius: '4px',
+ fontSize: '0.9rem'
  }}
+ />

+ <Button
+ label='🚀 Add your model (soon)'
+ className='p-button-text'
  onClick={() => setContributeVisible(true)}
+ tooltip='This feature is on our roadmap and will be available soon.'
+ tooltipOptions={{ position: 'bottom' }}
  style={{
+ color: '#666',
+ border: '1px solid #ddd',
+ padding: '0.5rem 1rem',
+ borderRadius: '4px',
+ fontSize: '0.9rem'
  }}
+ />
  </div>

  {data && (

  data={data.model_table}
  selectedLanguages={selectedLanguages}
  allLanguages={data.language_table || []}
  />
  <LanguageTable
  data={data.language_table}

  color: '#666'
  }}
  />
+ <Carousel
+ value={[
+ <WorldMap data={data.countries} />,
+ <LanguagePlot data={data} />,
+ <SpeakerPlot data={data} />,
+ <HistoryPlot data={data} />,
+ <CostPlot data={data} />
+ ]}
+ numScroll={1}
+ numVisible={1}
+ itemTemplate={item => item}
+ circular
+ style={{ width: '100%', minHeight: '650px' }}
+ />
  </div>
  </>
  )}

  modal
  header={null}
  >
+ {data && (
  <div style={{ width: '100%', height: '100%' }}>
  <Carousel
+ value={[
+ <WorldMap
+ data={data.countries}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <LanguagePlot
+ data={data}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <SpeakerPlot
+ data={data}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <HistoryPlot
+ data={data}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <CostPlot data={data} />
+ ]}
  numScroll={1}
  numVisible={1}
  itemTemplate={item => item}
+ circular
  style={{ width: '100%', height: 'calc(90vh - 120px)' }}
  />
  </div>

  )
  }

+ export default App
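
For orientation, the restored carousel pattern above: PrimeReact's Carousel receives an array of already-rendered plot elements, so the item template is just the identity function. A minimal sketch (the PlotCarousel wrapper and its plots prop are illustrative, not part of the codebase):

import { Carousel } from 'primereact/carousel'

// `plots` is an array of pre-rendered JSX elements,
// e.g. [<WorldMap data={...} />, <CostPlot data={...} />].
const PlotCarousel = ({ plots }) => (
  <Carousel
    value={plots}
    numScroll={1}
    numVisible={1}
    itemTemplate={item => item} // items are JSX already; render them as-is
    circular // shorthand for circular={true}: wrap past the last slide
  />
)

export default PlotCarousel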
frontend/src/components/HistoryPlot.js CHANGED
@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
  ...models.filter(d => d.newRecord),
  {
  creation_date: new Date(),
- maxAverage: models[models.length - 1]?.maxAverage || 0
+ maxAverage: models[models.length - 1].maxAverage
  }
  ],
  {
  x: d => d.creation_date,
- y: d => d.maxAverage || 0,
+ y: d => d.maxAverage,
  curve: 'step-after',
  strokeOpacity: 0.3
  }
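
For context on the reverted lines: the record line appends a synthetic point at the current date so the 'step-after' curve extends the latest record horizontally to today. A sketch of that construction (it assumes models is non-empty and sorted by creation date; the removed ?. and || 0 guards had tolerated an empty array):

// Extend the record line to "now" by reusing the newest model's maxAverage.
const recordLine = [
  ...models.filter(d => d.newRecord),
  {
    creation_date: new Date(),
    maxAverage: models[models.length - 1].maxAverage
  }
]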
frontend/src/components/LanguageTable.js CHANGED
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
  filterElement={familyRowFilterTemplate}
  style={{ minWidth: '10rem' }}
  />
- {ScoreColumns()}
+ {ScoreColumns}
  </DataTable>
  )
  }
frontend/src/components/ModelTable.js CHANGED
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
  import Medal from './Medal'
  import { Slider } from 'primereact/slider'
  import ScoreColumns from './ScoreColumns'
- const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
+ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
  const [filters, setFilters] = useState({
  type: { value: null, matchMode: FilterMatchMode.IN },
  size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  }

  const SliderWithLabel = ({ value, onChange, min, max }) => {
- const p = 10;
- const start = value === null || value[0] === null ? min : Math.log(value[0]) / Math.log(p);
- const stop = value === null || value[1] === null ? max : Math.log(value[1]) / Math.log(p);
- const [_value, _setValue] = useState([start, stop]);
+ const p = 10
+ const start = value === null ? min : Math.log(value[0]) / Math.log(p)
+ const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
+ const [_value, _setValue] = useState([start, stop])
  useEffect(() => {
  const timer = setTimeout(() => {
  onChange({
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  // set to "no filter" when (almost) the whole range is selected
  _value[0] <= min + 0.1 && _value[1] >= max - 0.1
  ? null
- : [p ** _value[0], p ** _value[1]],
- });
- }, 1000);
- return () => clearTimeout(timer);
- }, [_value, onChange, min, max]);
+ : [p ** _value[0], p ** _value[1]]
+ })
+ }, 1000)
+ return () => clearTimeout(timer)
+ }, [_value, onChange, min, max])
  return (
  <div style={{ minWidth: '20rem' }}>
  <div>{formatSize(p ** _value[0])}</div>
@@ -147,35 +147,21 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  }

  const costBodyTemplate = rowData => {
- return (
- <div style={{ textAlign: 'center' }}>
- {rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
- </div>
- )
+ return <div style={{ textAlign: 'center' }}>${rowData.cost?.toFixed(2)}</div>
  }

  const getHeaderText = () => {
- // Count languages that have any evaluation data (any task scores available)
- const evaluatedLanguagesCount = allLanguages.filter(lang => {
- // Check if language has any task scores (not just average)
- const hasAnyScores = [
- 'translation_from_bleu',
- 'translation_to_bleu',
- 'classification_accuracy',
- 'mmlu_accuracy',
- 'arc_accuracy',
- 'truthfulqa_accuracy',
- 'mgsm_accuracy'
- ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
- return hasAnyScores
- }).length
+ // Count languages that have evaluation data (average score available)
+ const evaluatedLanguagesCount = allLanguages.filter(lang =>
+ lang.average !== null && lang.average !== undefined
+ ).length

  if (selectedLanguages.length === 0) {
  return (
  <span>
  <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
  <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
- Performance across {evaluatedLanguagesCount} evaluated languages
+ Average performance across {evaluatedLanguagesCount} evaluated languages
  </span>
  </span>
  )
@@ -259,7 +245,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  body={costBodyTemplate}
  style={{ minWidth: '5rem' }}
  />
- {ScoreColumns(machineTranslatedMetrics)}
+ {ScoreColumns}
  </DataTable>
  )
  }
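
The size filter works on a log10 scale: slider positions are exponents, and p ** position converts a position back to a parameter count. A sketch of the round-trip, assuming p = 10 as in the component:

const p = 10

// slider position -> parameter count, and back
const fromSlider = pos => p ** pos // 9 -> 1e9 (a 1B-parameter model)
const toSlider = size => Math.log(size) / Math.log(p) // 1e9 -> ~9

console.log(fromSlider(9)) // 1000000000
console.log(toSlider(1e9)) // ~9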
frontend/src/components/ScoreColumns.js CHANGED
@@ -2,28 +2,21 @@ import { Column } from 'primereact/column'
  import ScoreField from './ScoreField'

  const scoreBodyTemplate = (field, options = {}) => {
- const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options
+ const { minScore = 0, maxScore = 1 } = options

  return rowData => {
  const score = rowData[field]
- // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
- // otherwise fall back to global list
- const rowFlagKey = `${field}_is_machine`
- const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
- const isMachineTranslated = hasRowFlag
- ? !!rowData[rowFlagKey]
- : machineTranslatedMetrics.includes(field)
- return ScoreField(score, minScore, maxScore, isMachineTranslated)
+ return ScoreField(score, minScore, maxScore)
  }
  }

- const ScoreColumns = (machineTranslatedMetrics = []) => [
+ const ScoreColumns = [
  <Column
  field='average'
  header='Proficiency'
  headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
  sortable
- body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
+ body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
  <Column
@@ -33,8 +26,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('translation_from_bleu', {
  minScore: 0,
- maxScore: 0.5,
- machineTranslatedMetrics
+ maxScore: 0.5
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -45,8 +37,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('translation_to_bleu', {
  minScore: 0,
- maxScore: 0.5,
- machineTranslatedMetrics
+ maxScore: 0.5
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -57,8 +48,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('classification_accuracy', {
  minScore: 0,
- maxScore: 0.5,
- machineTranslatedMetrics
+ maxScore: 0.5
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -79,8 +69,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('mmlu_accuracy', {
  minScore: 0,
- maxScore: 1,
- machineTranslatedMetrics
+ maxScore: 1
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -91,8 +80,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('arc_accuracy', {
  minScore: 0,
- maxScore: 1,
- machineTranslatedMetrics
+ maxScore: 1
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -103,8 +91,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('mgsm_accuracy', {
  minScore: 0,
- maxScore: 1,
- machineTranslatedMetrics
+ maxScore: 1
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
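
With ScoreColumns reverted from a factory function to a plain array of <Column> elements, callers interpolate the array instead of invoking it, as the ModelTable and LanguageTable hunks above show. A sketch of the consuming side (ResultsTable and rows are illustrative names):

import { DataTable } from 'primereact/datatable'
import { Column } from 'primereact/column'
import ScoreColumns from './ScoreColumns'

const ResultsTable = ({ rows }) => (
  <DataTable value={rows}>
    <Column field='name' header='Model' />
    {ScoreColumns} {/* previously {ScoreColumns(machineTranslatedMetrics)} */}
  </DataTable>
)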
frontend/src/components/ScoreField.js CHANGED
@@ -1,4 +1,4 @@
- const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
+ const ScoreField = (score, minScore, maxScore) => {
  let percentage = 100
  let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
  if (score !== null) {
@@ -50,7 +50,6 @@ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
  }}
  >
  {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
- {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
  </span>
  </div>
  )
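
The bar-width computation itself is elided from this hunk; judging from the 'min-max normalization' wording in the ScoreColumns tooltip, it presumably maps score into [minScore, maxScore] and clamps. A hedged sketch of that mapping, to be read as an assumption rather than the actual implementation:

// Assumed bar-width math: min-max normalize, then clamp to [0, 100].
const barWidth = (score, minScore, maxScore) =>
  score === null
    ? 100 // full-width light bar signals missing data, per the barColor fallback
    : Math.max(0, Math.min(100, ((score - minScore) / (maxScore - minScore)) * 100))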
frontend/src/components/SpeakerPlot.js CHANGED
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
  textStrokeOpacity: 0,
  textFillOpacity: 0
  }),
- ...(languages.length >= 40 ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
+ Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
  x: 40,
  y: languages[39].cumSpeakers / 1e6
- })] : [])
+ })
  ]
  })
  containerRef.current.append(plot)
frontend/src/components/WorldMap.js CHANGED
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
  a =>
  `${smoothProgressBar(a.population / pop)} ${
  a.name
- } – ${a.score === null || a.score === undefined ? "n/a" : a.score.toFixed(2)}`
+ } – ${a.score.toFixed(2)}`
  )
  .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
- return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
+ return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
  }

- const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
+ const WorldMap = ({ data, width = 750, height = 500 }) => {
  const containerRef = useRef()
  const [mapData, setMapData] = useState()

@@ -48,22 +48,8 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
  acc[country.iso2] = country
  return acc
  }, {})
- // Count languages that have any evaluation data
- const evaluatedLanguagesCount = allLanguages.filter(lang => {
- const hasAnyScores = [
- 'translation_from_bleu',
- 'translation_to_bleu',
- 'classification_accuracy',
- 'mmlu_accuracy',
- 'arc_accuracy',
- 'truthfulqa_accuracy',
- 'mgsm_accuracy'
- ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
- return hasAnyScores
- }).length
-
  const plot = Plot.plot({
- subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
+ subtitle: 'Language Proficiency Score by Country',
  width: width,
  height: height,
  projection: 'equal-earth',
@@ -75,12 +61,11 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
  })
  ],
  color: {
- scheme: 'RdYlGn',
- unknown: '#d0d0d0',
+ scheme: 'Greens',
+ unknown: 'gray',
  label: 'Score',
  legend: true,
- domain: [0, 1],
- pivot: 0.5
+ domain: [0, 1]
  },
  style: {
  fontFamily: 'monospace'
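
The color change above swaps the diverging red-yellow-green scale (gray #d0d0d0 for missing countries, pivot at 0.5) back to a sequential 'Greens' ramp. A minimal sketch of the restored Observable Plot scale configuration:

import * as Plot from '@observablehq/plot'

const plot = Plot.plot({
  projection: 'equal-earth',
  color: {
    scheme: 'Greens', // sequential ramp instead of diverging RdYlGn
    unknown: 'gray', // countries without a score
    domain: [0, 1], // fixed domain keeps colors comparable across runs
    label: 'Score',
    legend: true
  },
  marks: [] // Plot.geo(...) marks as in the component above
})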
languages.json CHANGED
@@ -7,7 +7,7 @@
  "family":"Indo-European",
  "flores_path":"eng_Latn",
  "fleurs_tag":"en_us",
- "commonvoice_hours":2683.0,
+ "commonvoice_hours":2674.0,
  "commonvoice_locale":"en",
  "in_benchmark":true
  },
@@ -32,7 +32,7 @@
  "flores_path":"hin_Deva",
  "fleurs_tag":"hi_in",
  "commonvoice_hours":16.0,
- "commonvoice_locale":"hi",
+ "commonvoice_locale":"hi-IN",
  "in_benchmark":true
  },
  {
@@ -43,7 +43,7 @@
  "family":"Indo-European",
  "flores_path":"spa_Latn",
  "fleurs_tag":"es_419",
- "commonvoice_hours":449.0,
+ "commonvoice_hours":448.0,
  "commonvoice_locale":"es",
  "in_benchmark":true
  },
@@ -79,7 +79,7 @@
  "family":"Indo-European",
  "flores_path":"fra_Latn",
  "fleurs_tag":"fr_fr",
- "commonvoice_hours":1073.0,
+ "commonvoice_hours":1065.0,
  "commonvoice_locale":"fr",
  "in_benchmark":true
  },
@@ -103,7 +103,7 @@
  "family":"Indo-European",
  "flores_path":"por_Latn",
  "fleurs_tag":"pt_br",
- "commonvoice_hours":181.0,
+ "commonvoice_hours":180.0,
  "commonvoice_locale":"pt",
  "in_benchmark":true
  },
@@ -115,7 +115,7 @@
  "family":"Indo-European",
  "flores_path":"pan_Guru",
  "fleurs_tag":"pa_in",
- "commonvoice_hours":2.5,
+ "commonvoice_hours":2.3,
  "commonvoice_locale":"pa-IN",
  "in_benchmark":true
  },
@@ -127,7 +127,7 @@
  "family":"Indo-European",
  "flores_path":"rus_Cyrl",
  "fleurs_tag":"ru_ru",
- "commonvoice_hours":247.0,
+ "commonvoice_hours":245.0,
  "commonvoice_locale":"ru",
  "in_benchmark":true
  },
@@ -139,7 +139,7 @@
  "family":"Atlantic-Congo",
  "flores_path":"swh_Latn",
  "fleurs_tag":"sw_ke",
- "commonvoice_hours":412.0,
+ "commonvoice_hours":411.0,
  "commonvoice_locale":"sw",
  "in_benchmark":true
  },
@@ -151,7 +151,7 @@
  "family":"Austronesian",
  "flores_path":"ind_Latn",
  "fleurs_tag":"id_id",
- "commonvoice_hours":34.0,
+ "commonvoice_hours":33.0,
  "commonvoice_locale":"id",
  "in_benchmark":true
  },
@@ -163,7 +163,7 @@
  "family":"Indo-European",
  "flores_path":"deu_Latn",
  "fleurs_tag":"de_de",
- "commonvoice_hours":1372.0,
+ "commonvoice_hours":1369.0,
  "commonvoice_locale":"de",
  "in_benchmark":true
  },
@@ -379,7 +379,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":"ps_af",
- "commonvoice_hours":82.0,
+ "commonvoice_hours":81.0,
  "commonvoice_locale":"ps",
  "in_benchmark":false
  },
@@ -439,7 +439,7 @@
  "family":"Indo-European",
  "flores_path":"pol_Latn",
  "fleurs_tag":"pl_pl",
- "commonvoice_hours":176.0,
+ "commonvoice_hours":175.0,
  "commonvoice_locale":"pl",
  "in_benchmark":true
  },
@@ -619,7 +619,7 @@
  "family":"Indo-European",
  "flores_path":"nld_Latn",
  "fleurs_tag":"nl_nl",
- "commonvoice_hours":123.0,
+ "commonvoice_hours":120.0,
  "commonvoice_locale":"nl",
  "in_benchmark":true
  },
@@ -655,7 +655,7 @@
  "family":"Atlantic-Congo",
  "flores_path":"yor_Latn",
  "fleurs_tag":"yo_ng",
- "commonvoice_hours":6.4,
+ "commonvoice_hours":6.3,
  "commonvoice_locale":"yo",
  "in_benchmark":true
  },
@@ -979,7 +979,7 @@
  "family":"Turkic",
  "flores_path":"kaz_Cyrl",
  "fleurs_tag":"kk_kz",
- "commonvoice_hours":2.3,
+ "commonvoice_hours":2.2,
  "commonvoice_locale":"kk",
  "in_benchmark":true
  },
@@ -1027,7 +1027,7 @@
  "family":"Uralic",
  "flores_path":"hun_Latn",
  "fleurs_tag":"hu_hu",
- "commonvoice_hours":94.0,
+ "commonvoice_hours":93.0,
  "commonvoice_locale":"hu",
  "in_benchmark":true
  },
@@ -1099,7 +1099,7 @@
  "family":"Indo-European",
  "flores_path":"ckb_Arab",
  "fleurs_tag":"ckb_iq",
- "commonvoice_hours":136.0,
+ "commonvoice_hours":135.0,
  "commonvoice_locale":"ckb",
  "in_benchmark":true
  },
@@ -1183,7 +1183,7 @@
  "family":"Indo-European",
  "flores_path":"bel_Cyrl",
  "fleurs_tag":"be_by",
- "commonvoice_hours":1812.0,
+ "commonvoice_hours":1810.0,
  "commonvoice_locale":"be",
  "in_benchmark":true
  },
@@ -1207,7 +1207,7 @@
  "family":"Indo-European",
  "flores_path":"tgk_Cyrl",
  "fleurs_tag":"tg_tj",
- "commonvoice_hours":0.6,
+ "commonvoice_hours":0.4,
  "commonvoice_locale":"tg",
  "in_benchmark":true
  },
@@ -1243,7 +1243,7 @@
  "family":"Indo-European",
  "flores_path":"afr_Latn",
  "fleurs_tag":"af_za",
- "commonvoice_hours":0.6,
+ "commonvoice_hours":0.5,
  "commonvoice_locale":"af",
  "in_benchmark":true
  },
@@ -1291,7 +1291,7 @@
  "family":"Indo-European",
  "flores_path":"cat_Latn",
  "fleurs_tag":"ca_es",
- "commonvoice_hours":2883.0,
+ "commonvoice_hours":2863.0,
  "commonvoice_locale":"ca",
  "in_benchmark":true
  },
@@ -1303,7 +1303,7 @@
  "family":"Afro-Asiatic",
  "flores_path":"heb_Hebr",
  "fleurs_tag":"he_il",
- "commonvoice_hours":2.0,
+ "commonvoice_hours":1.4,
  "commonvoice_locale":"he",
  "in_benchmark":true
  },
@@ -1375,7 +1375,7 @@
  "family":"Turkic",
  "flores_path":"uig_Arab",
  "fleurs_tag":null,
- "commonvoice_hours":437.0,
+ "commonvoice_hours":411.0,
  "commonvoice_locale":"ug",
  "in_benchmark":true
  },
@@ -1519,7 +1519,7 @@
  "family":"Indo-European",
  "flores_path":"kmr_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":71.0,
+ "commonvoice_hours":69.0,
  "commonvoice_locale":"kmr",
  "in_benchmark":true
  },
@@ -1555,7 +1555,7 @@
  "family":"Indo-European",
  "flores_path":"slk_Latn",
  "fleurs_tag":"sk_sk",
- "commonvoice_hours":52.0,
+ "commonvoice_hours":51.0,
  "commonvoice_locale":"sk",
  "in_benchmark":true
  },
@@ -1675,7 +1675,7 @@
  "family":"Tupian",
  "flores_path":"gug_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":4.5,
+ "commonvoice_hours":4.0,
  "commonvoice_locale":"gn",
  "in_benchmark":true
  },
@@ -1747,7 +1747,7 @@
  "family":"Indo-European",
  "flores_path":"nob_Latn",
  "fleurs_tag":"nb_no",
- "commonvoice_hours":1.8,
+ "commonvoice_hours":0.5,
  "commonvoice_locale":"nb-NO",
  "in_benchmark":true
  },
@@ -2155,7 +2155,7 @@
  "family":"Kartvelian",
  "flores_path":"kat_Geor",
  "fleurs_tag":"ka_ge",
- "commonvoice_hours":167.0,
+ "commonvoice_hours":166.0,
  "commonvoice_locale":"ka",
  "in_benchmark":true
  },
@@ -2167,7 +2167,7 @@
  "family":"Indo-European",
  "flores_path":"glg_Latn",
  "fleurs_tag":"gl_es",
- "commonvoice_hours":164.0,
+ "commonvoice_hours":117.0,
  "commonvoice_locale":"gl",
  "in_benchmark":true
  },
@@ -2323,7 +2323,7 @@
  "family":"Dravidian",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":11.0,
+ "commonvoice_hours":1.2,
  "commonvoice_locale":"brh",
  "in_benchmark":false
  },
@@ -2623,7 +2623,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":11.0,
+ "commonvoice_hours":0.9,
  "commonvoice_locale":"haz",
  "in_benchmark":false
  },
@@ -2695,7 +2695,7 @@
  "family":"Indo-European",
  "flores_path":"oci_Latn",
  "fleurs_tag":"oc_fr",
- "commonvoice_hours":1.9,
+ "commonvoice_hours":1.8,
  "commonvoice_locale":"oc",
  "in_benchmark":true
  },
@@ -3175,8 +3175,8 @@
  "family":"Atlantic-Congo",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"seh",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -3319,8 +3319,8 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"mfe",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -3331,7 +3331,7 @@
  "family":"Indo-European",
  "flores_path":"gle_Latn",
  "fleurs_tag":"ga_ie",
- "commonvoice_hours":9.3,
+ "commonvoice_hours":8.3,
  "commonvoice_locale":"ga-IE",
  "in_benchmark":true
  },
@@ -3487,7 +3487,7 @@
  "family":"Indo-European",
  "flores_path":"lvs_Latn",
  "fleurs_tag":"lv_lv",
- "commonvoice_hours":263.0,
+ "commonvoice_hours":262.0,
  "commonvoice_locale":"lv",
  "in_benchmark":true
  },
@@ -3535,7 +3535,7 @@
  "family":null,
  "flores_path":"eus_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":453.0,
+ "commonvoice_hours":440.0,
  "commonvoice_locale":"eu",
  "in_benchmark":true
  },
@@ -3559,7 +3559,7 @@
  "family":"Abkhaz-Adyge",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":106.0,
+ "commonvoice_hours":83.0,
  "commonvoice_locale":"kbd",
  "in_benchmark":false
  },
@@ -3679,7 +3679,7 @@
  "family":"Indo-European",
  "flores_path":"ydd_Hebr",
  "fleurs_tag":null,
- "commonvoice_hours":1.8,
+ "commonvoice_hours":0.7,
  "commonvoice_locale":"yi",
  "in_benchmark":true
  },
@@ -3991,8 +3991,8 @@
  "family":"Atlantic-Congo",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"gaa",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -4099,8 +4099,8 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"pcd",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -4351,7 +4351,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":30.0,
+ "commonvoice_hours":29.0,
  "commonvoice_locale":"br",
  "in_benchmark":false
  },
@@ -4651,7 +4651,7 @@
  "family":"Abkhaz-Adyge",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":32.0,
+ "commonvoice_hours":30.0,
  "commonvoice_locale":"ady",
  "in_benchmark":false
  },
@@ -5011,7 +5011,7 @@
  "family":"Nakh-Daghestanian",
  "flores_path":"dar_Cyrl",
  "fleurs_tag":null,
- "commonvoice_hours":1.3,
+ "commonvoice_hours":0.0,
  "commonvoice_locale":"dar",
  "in_benchmark":true
  },
@@ -7879,7 +7879,7 @@
  "family":"Artificial Language",
  "flores_path":"epo_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":1437.0,
+ "commonvoice_hours":1436.0,
  "commonvoice_locale":"eo",
  "in_benchmark":true
  },
models.json CHANGED
@@ -20,15 +20,15 @@
  ]
  },
  {
- "id":"anthropic\/claude-3-haiku",
- "name":"Claude 3 Haiku",
+ "id":"anthropic\/claude-3.5-sonnet",
+ "name":"Claude 3.5 Sonnet",
  "provider_name":"Anthropic",
- "cost":1.25,
+ "cost":15.0,
  "hf_id":null,
  "size":null,
  "type":"closed-source",
  "license":null,
- "creation_date":1710288000000,
+ "creation_date":1729555200000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -79,51 +79,11 @@
  "mgsm"
  ]
  },
- {
- "id":"arliai\/qwq-32b-arliai-rpr-v1",
- "name":"QwQ 32B RpR v1",
- "provider_name":"ArliAI",
- "cost":0.0,
- "hf_id":"ArliAI\/QwQ-32B-ArliAI-RpR-v1",
- "size":32763876352.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1743984000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"cohere\/command-r-08-2024",
- "name":"Command R (08-2024)",
- "provider_name":"Cohere",
- "cost":0.6,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1724976000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"deepseek\/deepseek-chat",
  "name":"DeepSeek V3",
  "provider_name":"DeepSeek",
- "cost":0.8,
+ "cost":0.0,
  "hf_id":"deepseek-ai\/DeepSeek-V3",
  "size":684531386000.0,
  "type":"open-source",
@@ -159,26 +119,6 @@
  "mgsm"
  ]
  },
- {
- "id":"deepseek\/deepseek-chat-v3.1",
- "name":"DeepSeek V3.1",
- "provider_name":"DeepSeek",
- "cost":0.0,
- "hf_id":"deepseek-ai\/DeepSeek-V3.1",
- "size":684531386000.0,
- "type":"open-source",
- "license":"Mit",
- "creation_date":1755734400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"deepseek\/deepseek-r1",
  "name":"R1",
@@ -200,15 +140,15 @@
  ]
  },
  {
- "id":"deepseek\/deepseek-r1-0528-qwen3-8b",
- "name":"Deepseek R1 0528 Qwen3 8B",
+ "id":"deepseek\/deepseek-r1-0528",
+ "name":"R1 0528",
  "provider_name":"DeepSeek",
  "cost":0.0,
- "hf_id":"deepseek-ai\/DeepSeek-R1-0528-Qwen3-8B",
- "size":8190735360.0,
+ "hf_id":"deepseek-ai\/DeepSeek-R1-0528",
+ "size":684531386000.0,
  "type":"open-source",
  "license":"Mit",
- "creation_date":1748476800000,
+ "creation_date":1748390400000.0,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -280,15 +220,145 @@
  ]
  },
  {
- "id":"google\/gemma-3-12b-it",
- "name":"Gemma 3 12B",
+ "id":"google\/gemini-2.5-flash-lite-preview-06-17",
+ "name":"Gemini 2.5 Flash Lite Preview 06-17",
  "provider_name":"Google",
- "cost":0.0,
- "hf_id":"google\/gemma-3-12b-it",
- "size":12187325040.0,
- "type":"open-source",
- "license":"Gemma",
- "creation_date":1740787200000,
+ "cost":0.4,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1750118400000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-flash-preview",
+ "name":"Gemini 2.5 Flash Preview 04-17",
+ "provider_name":"Google",
+ "cost":0.6,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1744848000000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-flash-preview-05-20",
+ "name":"Gemini 2.5 Flash Preview 05-20",
+ "provider_name":"Google",
+ "cost":0.6,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1747699200000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro",
+ "name":"Gemini 2.5 Pro",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1750118400000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro-preview",
+ "name":"Gemini 2.5 Pro Preview 06-05",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1749081600000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro-preview-05-06",
+ "name":"Gemini 2.5 Pro Preview 05-06",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1746576000000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-flash-1.5",
+ "name":"Gemini 1.5 Flash ",
+ "provider_name":"Google",
+ "cost":0.3,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1715644800000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-flash-1.5-8b",
+ "name":"Gemini 1.5 Flash 8B",
+ "provider_name":"Google",
+ "cost":0.15,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1727913600000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -319,6 +389,41 @@
  "mgsm"
  ]
  },
+ {
+ "id":"google\/translate-v2",
+ "name":"Google Translate",
+ "provider_name":"Google",
+ "cost":20.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":null,
+ "tasks":[
+ "translation_from",
+ "translation_to"
+ ]
+ },
+ {
+ "id":"gryphe\/mythomax-l2-13b",
+ "name":"MythoMax 13B",
+ "provider_name":"MythoMax 13B",
+ "cost":0.07,
+ "hf_id":"Gryphe\/MythoMax-L2-13b",
+ "size":null,
+ "type":"open-source",
+ "license":"Other",
+ "creation_date":1691625600000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
  {
  "id":"meta-llama\/llama-3-70b-instruct",
  "name":"Llama 3 70B Instruct",
@@ -359,6 +464,30 @@
  "mgsm"
  ]
  },
+ {
+ "id":"meta-llama\/llama-3.1-8b-instruct",
+ "name":"Llama 3.1 8B Instruct",
+ "provider_name":"Meta",
+ "cost":0.0,
+ "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
+ "size":8030261248.0,
+ "type":"open-source",
+ "license":"Llama3.1",
+ "creation_date":1721260800000.0,
+ "tasks":null
+ },
+ {
+ "id":"meta-llama\/llama-3.2-1b-instruct",
+ "name":"Llama 3.2 1B Instruct",
+ "provider_name":"Meta",
+ "cost":0.0,
+ "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
+ "size":1235814400.0,
+ "type":"open-source",
+ "license":"Llama3.2",
+ "creation_date":1726617600000.0,
+ "tasks":null
+ },
  {
  "id":"meta-llama\/llama-3.3-70b-instruct",
  "name":"Llama 3.3 70B Instruct",
@@ -439,26 +568,6 @@
  "mgsm"
  ]
  },
- {
- "id":"mistralai\/mistral-7b-instruct-v0.3",
- "name":"Mistral 7B Instruct v0.3",
- "provider_name":"Mistral",
- "cost":0.05,
- "hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
- "size":7248023552.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1716336000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"mistralai\/mistral-nemo",
  "name":"Mistral Nemo",
@@ -520,55 +629,15 @@
  ]
  },
  {
- "id":"moonshotai\/kimi-k2",
- "name":"Kimi K2",
- "provider_name":"MoonshotAI",
- "cost":0.0,
- "hf_id":"moonshotai\/Kimi-K2-Instruct",
+ "id":"openai\/gpt-3.5-turbo-0613",
+ "name":"GPT-3.5 Turbo (older v0613)",
+ "provider_name":"OpenAI",
+ "cost":2.0,
+ "hf_id":null,
  "size":null,
- "type":"open-source",
- "license":"Other",
- "creation_date":1752192000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"neversleep\/llama-3-lumimaid-70b",
- "name":"Llama 3 Lumimaid 70B",
- "provider_name":"NeverSleep",
- "cost":6.0,
- "hf_id":"NeverSleep\/Llama-3-Lumimaid-70B-v0.1",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Cc By Nc 4.0",
- "creation_date":1714262400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"nvidia\/llama-3.1-nemotron-70b-instruct",
- "name":"Llama 3.1 Nemotron 70B Instruct",
- "provider_name":"NVIDIA",
- "cost":0.3,
- "hf_id":"nvidia\/Llama-3.1-Nemotron-70B-Instruct-HF",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Llama3.1",
- "creation_date":1728691200000,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1706140800000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -639,26 +708,6 @@
  "mgsm"
  ]
  },
- {
- "id":"openai\/gpt-4o-2024-11-20",
- "name":"GPT-4o (2024-11-20)",
- "provider_name":"OpenAI",
- "cost":10.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1732060800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"openai\/gpt-4o-mini",
  "name":"GPT-4o-mini",
@@ -679,86 +728,6 @@
  "mgsm"
  ]
  },
- {
- "id":"openai\/gpt-5",
- "name":"GPT-5",
- "provider_name":"OpenAI",
- "cost":10.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1754524800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"openai\/gpt-5-nano",
- "name":"GPT-5 Nano",
- "provider_name":"OpenAI",
- "cost":0.4,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1754524800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"openai\/gpt-oss-120b",
- "name":"gpt-oss-120b",
- "provider_name":"OpenAI",
- "cost":0.0,
- "hf_id":"openai\/gpt-oss-120b",
- "size":120412337472.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1754265600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"qwen\/qwen-2.5-coder-32b-instruct",
- "name":"Qwen2.5 Coder 32B Instruct",
- "provider_name":"Qwen2.5 Coder 32B Instruct (free)",
- "cost":0.0,
- "hf_id":"Qwen\/Qwen2.5-Coder-32B-Instruct",
- "size":32763876352.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1730851200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"qwen\/qwen3-235b-a22b",
  "name":"Qwen3 235B A22B",
@@ -803,7 +772,7 @@
  "id":"qwen\/qwen3-32b",
  "name":"Qwen3 32B",
  "provider_name":"Qwen",
- "cost":0.07,
+ "cost":0.0,
  "hf_id":"Qwen\/Qwen3-32B",
  "size":32762123264.0,
  "type":"open-source",
@@ -818,120 +787,5 @@
  "truthfulqa",
  "mgsm"
  ]
- },
- {
- "id":"scb10x\/llama3.1-typhoon2-70b-instruct",
- "name":"Typhoon2 70B Instruct",
- "provider_name":"Typhoon2 70B Instruct",
- "cost":0.88,
- "hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Llama3.1",
- "creation_date":1734220800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"tencent\/hunyuan-a13b-instruct",
- "name":"Hunyuan A13B Instruct",
- "provider_name":"Tencent",
- "cost":0.0,
- "hf_id":"tencent\/Hunyuan-A13B-Instruct",
- "size":80393183232.0,
- "type":"open-source",
- "license":"Other",
- "creation_date":1750809600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"thedrummer\/anubis-pro-105b-v1",
- "name":"Anubis Pro 105B V1",
- "provider_name":"TheDrummer",
- "cost":1.0,
- "hf_id":"TheDrummer\/Anubis-Pro-105B-v1",
- "size":104779882496.0,
- "type":"open-source",
- "license":"Other",
- "creation_date":1738454400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"x-ai\/grok-4",
- "name":"Grok 4",
- "provider_name":"xAI",
- "cost":15.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1752019200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"z-ai\/glm-4.5v",
- "name":"GLM 4.5V",
- "provider_name":"Z.AI",
- "cost":1.8,
- "hf_id":"zai-org\/GLM-4.5V",
- "size":107710933120.0,
- "type":"open-source",
- "license":"Mit",
- "creation_date":1754784000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"google\/translate-v2",
- "name":"Google Translate",
- "provider_name":"Google",
- "cost":20.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":null,
- "tasks":[
- "translation_from",
- "translation_to"
- ]
  }
  ]
pyproject.toml CHANGED
@@ -44,6 +44,3 @@ dev = [
  "scipy>=1.16.0",
  "seaborn>=0.13.2",
  ]
-
- [tool.uv]
- package = true
uv.lock CHANGED
The diff for this file is too large to render. See raw diff