davidpomerenke committed
Commit 68a93b5 · verified · parent 5102b0a

Upload from GitHub Actions: Add auto-translated datasets

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.github/workflows/nightly-evals.yml CHANGED
@@ -8,8 +8,6 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
-    # checking if this is working in case eval runs take longer than 6h github actions allowance
-    timeout-minutes: 1440 # 24 hours timeout
    steps:
      - uses: actions/checkout@v3

@@ -27,8 +25,6 @@ jobs:
      env:
        OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
-        N_SENTENCES: 20
-        MAX_LANGUAGES: 150
      run: |
        uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
        uv run evals/download_data.py
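For context, a minimal sketch (not part of this commit) of how workflow-level env vars like the two removed above are consumed on the Python side; the defaults mirror the values the old evals/main.py used, as visible in its diff at the bottom of this commit:

```python
import os

# Hedged sketch: reading optional workflow configuration with defaults.
# N_SENTENCES and MAX_LANGUAGES are the variables removed from the workflow above.
n_sentences = int(os.environ.get("N_SENTENCES", 20))       # sentences per task
max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # top languages to evaluate
print(n_sentences, max_languages)
```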
.gitignore CHANGED
@@ -20,5 +20,3 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
-
-
Dockerfile CHANGED
@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock README.md ./
+COPY --chown=user pyproject.toml uv.lock ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md CHANGED
@@ -45,7 +45,6 @@ _Tracking language proficiency of AI models for every language_

 ## Evaluate

-### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
@@ -56,7 +55,3 @@ uv run --extra dev evals/main.py
 uv run evals/backend.py
 cd frontend && npm i && npm start
 ```
-
-## System Architecture
-
-See [system_architecture_diagram.md](system_architecture_diagram.md) for the complete system architecture diagram and component descriptions.
datasets.json CHANGED
@@ -219,7 +219,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented": false,
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -256,7 +256,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented": false,
+    "implemented": true,
     "group": "Multitask Language Understanding"
   },
   {
@@ -360,7 +360,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": false,
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -375,7 +375,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": false,
+    "implemented": true,
     "group": "ARC Question Answering"
   },
   {
@@ -420,7 +420,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": false,
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
@@ -435,7 +435,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": false,
+    "implemented": true,
     "group": "Truthfulness"
   },
   {
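The six hunks above flip "implemented" from false to true for the auto-translated MMLU, ARC, and TruthfulQA variants. A minimal sketch of how this registry is consumed (the backend reads it with pd.read_json("datasets.json"), as shown in the evals/backend.py diff below); the filter expression is illustrative, not code from the repo:

```python
import pandas as pd

# Sketch: list the machine-translated entries that are now marked implemented.
# "translation", "implemented", "base", and "group" are fields visible in the hunks above.
datasets_df = pd.read_json("datasets.json")
machine = datasets_df[(datasets_df["translation"] == "machine") & datasets_df["implemented"]]
print(machine[["base", "group"]])
```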
evals/__init__.py CHANGED
@@ -0,0 +1 @@
+
evals/backend.py CHANGED
@@ -4,8 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-
-from evals.countries import make_country_table
+from countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
@@ -27,7 +26,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    "truthfulqa_accuracy",
+    # "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
@@ -40,77 +39,28 @@ def compute_normalized_average(df, metrics):
         col_min = normalized_df[col].min()
         col_max = normalized_df[col].max()
         if col_max > col_min:  # Avoid division by zero
-            normalized_df[col] = (normalized_df[col] - col_min) / (
-                col_max - col_min
-            )
+            normalized_df[col] = (normalized_df[col] - col_min) / (col_max - col_min)
         else:
             normalized_df[col] = 0  # If all values are the same, set to 0
     return normalized_df.mean(axis=1, skipna=False)


-def make_model_table(scores_df, models):
-    # Create a combined task_metric for origin
-    scores_df["task_metric_origin"] = (
-        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
-    )
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="model",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="model", columns="task_metric", values="score", aggfunc="mean"
+def make_model_table(df, models):
+    df = (
+        df.groupby(["model", "task", "metric"])
+        .agg({"score": "mean", "bcp_47": "nunique"})
+        .reset_index()
     )
-
-    # Merge the two pivots
-    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="model", columns="task_metric", values="score")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per model+metric
-    origin_presence = (
-        scores_df.groupby(["model", "task_metric", "origin"])
-        .size()
-        .unstack(fill_value=0)
-    )
-    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for model in df.index:
-                try:
-                    counts = origin_presence.loc[(model, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = (
-                    counts.get(machine_col_name, 0) if machine_col_name else 0
-                )
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
             "rank",
@@ -124,81 +74,27 @@ def make_model_table(scores_df, models):
             "license",
             "cost",
             "average",
-            *sorted(list(set(metric_cols))),
+            *task_metrics,
         ]
     ]
     return df


-def make_language_table(scores_df, languages):
-    # Create a combined task_metric for origin
-    scores_df["task_metric_origin"] = (
-        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
-    )
-
-    # Pivot to get scores for each origin-specific metric
-    scores_pivot = scores_df.pivot_table(
-        index="bcp_47",
-        columns="task_metric_origin",
-        values="score",
-        aggfunc="mean",
-    )
-
-    # Create the regular task_metric for the main average calculation
-    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-    main_pivot = scores_df.pivot_table(
-        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+def make_language_table(df, languages):
+    df = (
+        df.groupby(["bcp_47", "task", "metric"])
+        .agg({"score": "mean", "model": "nunique"})
+        .reset_index()
     )
-
-    # Merge the two pivots
-    df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
-
+    df["task_metric"] = df["task"] + "_" + df["metric"]
+    df = df.drop(columns=["task", "metric"])
+    df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
-
     df["average"] = compute_normalized_average(df, task_metrics)
-
-    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
-    origin_presence = (
-        scores_df.groupby(["bcp_47", "task_metric", "origin"])
-        .size()
-        .unstack(fill_value=0)
-    )
-    for metric in task_metrics:
-        human_col_name = "human" if "human" in origin_presence.columns else None
-        machine_col_name = "machine" if "machine" in origin_presence.columns else None
-        if human_col_name or machine_col_name:
-            flags = []
-            for bcp in df.index:
-                try:
-                    counts = origin_presence.loc[(bcp, metric)]
-                except KeyError:
-                    flags.append(False)
-                    continue
-                human_count = counts.get(human_col_name, 0) if human_col_name else 0
-                machine_count = (
-                    counts.get(machine_col_name, 0) if machine_col_name else 0
-                )
-                flags.append(machine_count > 0 and human_count == 0)
-            df[f"{metric}_is_machine"] = flags
-        else:
-            df[f"{metric}_is_machine"] = False
-
-    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
-    for metric in task_metrics:
-        machine_col = f"{metric}_machine"
-        if machine_col in df.columns:
-            df[f"{metric}_is_machine"] = df[machine_col].notna()
-        else:
-            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
-
-    # Dynamically find all metric columns to include
-    final_cols = df.columns
-    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
-
     df = df[
         [
             "bcp_47",
@@ -208,7 +104,7 @@ def make_language_table(scores_df, languages):
             "family",
             "average",
             "in_benchmark",
-            *sorted(list(set(metric_cols))),
+            *task_metrics,
         ]
     ]
     return df
@@ -229,22 +125,10 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df = (
-        scores.groupby(["model", "bcp_47", "task", "metric", "origin"])
-        .mean()
-        .reset_index()
-    )
+    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
-
-    # Identify which metrics have machine translations available
-    machine_translated_metrics = set()
-    for _, row in df.iterrows():
-        if row["origin"] == "machine":
-            metric_name = f"{row['task']}_{row['metric']}"
-            machine_translated_metrics.add(metric_name)
-
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
@@ -259,7 +143,6 @@
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
-        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)
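The net effect of the backend changes is that the origin-aware pivot_table machinery is dropped in favor of a plain groupby-then-pivot. A self-contained toy run of that aggregation (invented scores, not data from the repo):

```python
import pandas as pd

# Toy data standing in for the scores DataFrame.
scores = pd.DataFrame(
    {
        "model": ["m1", "m1", "m2"],
        "task": ["mmlu", "arc", "mmlu"],
        "metric": ["accuracy", "accuracy", "accuracy"],
        "score": [0.8, 0.6, 0.5],
        "bcp_47": ["sw", "sw", "yo"],
    }
)
# Same shape as the new make_model_table: mean score per (model, task, metric),
# then one pivoted column per task_metric.
df = (
    scores.groupby(["model", "task", "metric"])
    .agg({"score": "mean", "bcp_47": "nunique"})
    .reset_index()
)
df["task_metric"] = df["task"] + "_" + df["metric"]
table = df.pivot(index="model", columns="task_metric", values="score")
print(table)  # NaN where a model has no score for a task_metric
```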
evals/countries.py CHANGED
@@ -15,7 +15,6 @@ def population(bcp_47):
     }
     return items

-
 @cache
 def make_country_table(language_table):
     countries = defaultdict(list)
@@ -31,15 +30,10 @@ def make_country_table(language_table):
         )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
-
-        if speaker_pop < 1000:  # Grey out low-population countries
-            score = None  # This will make them appear grey on the map
-        else:
-            score = (
-                sum(entry["score"] * entry["population"] for entry in languages)
-                / speaker_pop
-            )
-
+        score = (
+            sum(entry["score"] * entry["population"] for entry in languages)
+            / speaker_pop
+        )
         countries[country] = {
             "score": score,
             "languages": languages,
evals/datasets_/arc.py CHANGED
@@ -1,10 +1,11 @@
 import random
+from collections import Counter, defaultdict

-from langcodes import standardize_tag
+from langcodes import Language, standardize_tag
 from rich import print
-from models import translate_google, get_google_supported_languages
+from models import translate_google, google_supported_languages
 from tqdm import tqdm
-from datasets import load_dataset, Dataset
+from datasets import Dataset, load_dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -13,33 +14,27 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }


 random.seed(42)
-id_sets_train = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_train = [set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [
-    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
-    for tag in tags_uhura_arc_easy.values()
-]
+id_sets_test = [set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)

 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }


+
+
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
@@ -50,40 +45,37 @@ def load_uhura_arc_easy(language_bcp_47, nr):
         ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
         ds = ds.map(add_choices)
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "masakhane/uhura-arc-easy", task, "human"
+        return "masakhane/uhura-arc-easy", examples, task
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(
-            slug_uhura_arc_easy_translated,
-            tags_uhura_arc_easy_translated[language_bcp_47],
-        )
+        ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
         ds = ds.rename_column("answerKey", "answer")
+        train_ids = common_ids_train[nr:nr+3]
+        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
+        # raise Exception(language_bcp_47)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated", task, "machine"
+        return "fair-forward/arc-easy-autotranslated", examples, task
     else:
         return None, None, None

-
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        if lang not in human_translated and lang in google_supported_languages
     ]
     n_samples = 10
-    train_ids = common_ids_train[: n_samples + 3]
-    en_train = _load_dataset(
-        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train"
-    )
+    train_ids = common_ids_train[:n_samples+3]
+    en_train = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train")
     en_train = en_train.filter(lambda x: x["id"] in train_ids)
     test_ids = common_ids_test[:n_samples]
-    en_test = _load_dataset(
-        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test"
-    )
+    en_test = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test")
     en_test = en_test.filter(lambda x: x["id"] in test_ids)
     data = {"train": en_train, "test": en_test}
-
+
     slug = "fair-forward/arc-easy-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
@@ -92,22 +84,16 @@ def translate_arc(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split, data_en in data.items():
-                questions_tr = [
-                    translate_google(q, "en", lang) for q in data_en["question"]
-                ]
+                questions_tr = [translate_google(q, "en", lang) for q in data_en["question"]]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                 choices_texts_concatenated = []
                 for choice in data_en["choices"]:
                     for option in choice["text"]:
                         choices_texts_concatenated.append(option)
-                choices_tr = [
-                    translate_google(c, "en", lang) for c in choices_texts_concatenated
-                ]
+                choices_tr = [translate_google(c, "en", lang) for c in choices_texts_concatenated]
                 choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                 # group into chunks of 4
-                choices_tr = [
-                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
-                ]
+                choices_tr = [choices_tr[i:i+4] for i in range(0, len(choices_tr), 4)]

                 ds_lang = Dataset.from_dict(
                     {
@@ -124,8 +110,5 @@ def translate_arc(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/arc/{lang}_{split}.json",
-                    lines=False,
-                    force_ascii=False,
-                    indent=2,
+                    f"data/translations/arc/{lang}_{split}.json", lines=False, force_ascii=False, indent=2
                 )
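The few-shot logic added to load_uhura_arc_easy leans on the shared ID lists computed at module load: intersecting IDs across all language configs guarantees that item nr refers to the same question everywhere, and the fixed seed keeps the shuffled order stable across runs. A minimal sketch of that idea with invented IDs:

```python
import random

# Only questions present in every language survive the intersection.
id_sets = [{"q1", "q2", "q3", "q4"}, {"q2", "q3", "q4", "q5"}]
common_ids = list(sorted(set.intersection(*id_sets)))
random.seed(42)               # fixed seed -> identical order on every run
random.shuffle(common_ids)
nr = 0
train_ids = common_ids[nr : nr + 3]  # the 3 few-shot examples for item nr
print(train_ids)
```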
evals/datasets_/fleurs.py CHANGED
@@ -11,7 +11,6 @@ fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
 )

-
 def download_file(url, path):
     response = requests.get(url)
     with open(path, "wb") as f:
@@ -35,4 +34,4 @@ def download_fleurs(transcription_langs_eval):
     if not tsv_path.exists():
         print(f"Downloading {tsv_url} to {tsv_path}")
         tsv_path.parent.mkdir(parents=True, exist_ok=True)
-        download_file(tsv_url, tsv_path)
+        download_file(tsv_url, tsv_path)
evals/datasets_/mgsm.py CHANGED
@@ -1,12 +1,10 @@
 import asyncio
 import os
-import random

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
-from langcodes import Language, standardize_tag
-from models import get_google_supported_languages, translate_google
-from rich import print
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import standardize_tag
+from models import google_supported_languages, translate_google
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -39,50 +37,31 @@ def parse_number(i):
     return None


-@cache
-def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
-    """Cache individual MGSM items efficiently"""
-    try:
-        ds = _load_dataset(
-            dataset_slug,
-            subset=subset_tag,
-            split="test",
-            trust_remote_code=trust_remote_code,
-        )
-        if nr >= len(ds):
-            return None
-
-        row = ds[nr]
-
-        # Post-process based on dataset type
-        if dataset_slug == slug_gsm8kx:
-            row["answer_number"] = row["answer"].split("####")[1].strip()
-
-        return row
-    except Exception:
-        # Dataset doesn't exist or doesn't have test split
-        return None
-
-
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
-        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
-        return slug_mgsm, item, "human" if item else (None, None, None)
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
     elif language_bcp_47 in tags_afrimgsm.keys():
-        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
-        return slug_afrimgsm, item, "human" if item else (None, None, None)
-    elif language_bcp_47 in tags_gsm8kx.keys():
-        item = _get_mgsm_item(
-            slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
         )
-        return slug_gsm8kx, item, "machine" if item else (None, None, None)
+        return slug_afrimgsm, ds[nr]
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        item = _get_mgsm_item(
-            slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr
+        ds = _load_dataset(
+            slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
         )
-        return slug_gsm_autotranslated, item, "machine" if item else (None, None, None)
+        return slug_gsm_autotranslated, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
     else:
-        return None, None, None
+        return None, None


 def translate_mgsm(languages):
@@ -90,7 +69,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        if lang not in human_translated and lang in google_supported_languages
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
@@ -117,8 +96,5 @@
         token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
     )
     ds_lang.to_json(
-        f"data/translations/mgsm/{lang}.json",
-        lines=False,
-        force_ascii=False,
-        indent=2,
+        f"data/translations/mgsm/{lang}.json", lines=False, force_ascii=False, indent=2
     )
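The gsm8kx branch derives answer_number from GSM8K-style solutions, where the final numeric answer follows a "####" marker. For illustration (made-up solution text):

```python
# GSM8K convention: the worked solution ends with "#### <answer>".
answer = "She sells 5 + 3 = 8 fruits in total. #### 8"
answer_number = answer.split("####")[1].strip()
assert answer_number == "8"
```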
evals/datasets_/mmlu.py CHANGED
@@ -4,9 +4,9 @@ import random
 from collections import Counter, defaultdict

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
+from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
-from models import get_google_supported_languages, translate_google
+from models import google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -111,7 +111,6 @@ def print_datasets_analysis():
 # MMLUX is translated using DeepL
 # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU

-
 # print_datasets_analysis()
@@ -144,61 +143,32 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated = {
-    standardize_tag(a, macro=True): a
-    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
-}
+tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")

 categories = sorted(
-    list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
-)
-
-
-@cache
-def _get_processed_mmlu_dataset(dataset_name, subset_tag):
-    """Cache processed datasets to avoid reprocessing"""
-    ds = _load_dataset(dataset_name, subset_tag)
-    if dataset_name == "masakhane/afrimmlu":
-        ds = ds.map(parse_choices)
-    elif dataset_name == "CohereForAI/Global-MMLU":
-        ds = ds.map(add_choices)
-    return ds
-
-
-@cache
-def _get_mmlu_item(dataset_name, subset_tag, category, nr):
-    """Cache individual MMLU items efficiently"""
-    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
-    if dataset_name in ["masakhane/afrimmlu", "CohereForAI/Global-MMLU"]:
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
-    else:  # fair-forward/mmlu-autotranslated
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        return filtered[nr] if nr < len(filtered) else None
+    list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
+)


-async def load_mmlu(language_bcp_47, nr):
+def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        task = _get_mmlu_item(
-            "masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr
-        )
-        return "masakhane/afrimmlu", task, "human" if task else (None, None, None)
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        ds = ds.map(parse_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
-        task = _get_mmlu_item(
-            "CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr
-        )
-        return "CohereForAI/Global-MMLU", task, "human" if task else (None, None, None)
-    # TODO: add in Okapi, MMLUX @Jonas
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        ds = ds.map(add_choices)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        task = _get_mmlu_item(
-            "fair-forward/mmlu-autotranslated", language_bcp_47, category, nr
-        )
-        return (
-            "fair-forward/mmlu-autotranslated",
-            task,
-            "machine" if task else (None, None, None),
-        )
+        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
+        examples = ds["dev"].filter(lambda x: x["subject"] == category)
+        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
+        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
@@ -207,10 +177,10 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:150]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 20
+    n_samples = 10

     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
@@ -226,10 +196,8 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
-                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-                filtered = ds.filter(lambda x: x["subject"] == category)
-                for i in range(min(n_samples, len(filtered))):
-                    task = filtered[i]
+                for i in range(n_samples):
+                    task = ds.filter(lambda x: x["subject"] == category)[i]
                     samples.append(task)
         questions_tr = [
             translate_google(s["question"], "en", lang) for s in samples
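load_mmlu keeps its category rotation: item nr cycles through the sorted subject list, so consecutive items sample different subjects. A tiny illustration with invented subject names (the real list comes from afrimmlu's dev split):

```python
categories = ["biology", "history", "law"]  # invented subjects
for nr in range(5):
    print(nr, categories[nr % len(categories)])
# 0 biology / 1 history / 2 law / 3 biology / 4 history
```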
evals/datasets_/truthfulqa.py CHANGED
@@ -8,29 +8,17 @@ import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os

-from datasets import Dataset, load_dataset, DatasetNotFoundError
-from models import translate_google, get_google_supported_languages
+from datasets import Dataset, load_dataset
+from models import translate_google, google_supported_languages

 from datasets_.util import _get_dataset_config_names, _load_dataset

 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
-slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
-
 tags_uhura_truthfulqa = {
-    standardize_tag(a.split("_")[0], macro=True): a
-    for a in _get_dataset_config_names(slug_uhura_truthfulqa)
+    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }

-# Get available auto-translated languages
-try:
-    tags_truthfulqa_autotranslated = {
-        standardize_tag(a, macro=True): a
-        for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
-    }
-except DatasetNotFoundError:
-    tags_truthfulqa_autotranslated = {}
-

 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
@@ -38,42 +26,26 @@ def add_choices(row):
     return row


-async def load_truthfulqa(language_bcp_47, nr):
+def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(
-            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
-        )
+        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
         ds = ds.map(add_choices)
+        examples = ds["train"]
         task = ds["test"][nr]
-        # Ensure there is a correct answer before returning the task
-        if 1 not in task["labels"]:
-            return None, None, None
-        return "masakhane/uhura-truthfulqa", task, "human"
-    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
-        # Load from auto-translated dataset (same samples as translation)
-        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
-        test_split = ds["test"] if "test" in ds else ds
-        task = test_split[nr]
-        # Ensure there is a correct answer before returning the task
-        if 1 not in task.get("labels", []):
-            return None, None, None
-        return slug_truthfulqa_autotranslated, task, "machine"
-    # TODO: add Okapi, TruthfulQA-X @Jonas
+        return "masakhane/uhura-truthfulqa", examples, task
     else:
         return None, None, None


+
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:150]
-        if lang not in human_translated and lang in get_google_supported_languages()
+        for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 20
-
-    # Set fixed seed for consistent sample selection across all languages
-    random.seed(42)
+    n_samples = 10

     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
@@ -83,47 +55,37 @@ def translate_truthfulqa(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split in ["train", "test"]:
-                ds = _load_dataset(
-                    slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split
-                )
+                ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split)
                 samples = []
                 if split == "train":
                     samples.extend(ds)
                 else:
-                    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-                    for i in range(min(n_samples, len(ds))):
+                    for i in range(n_samples):
                         task = ds[i]
                         samples.append(task)
-
-                # Translate questions
                 questions_tr = [
                     translate_google(s["question"], "en", lang) for s in samples
                 ]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-
-                # Translate choices for each sample
-                all_choices_tr = []
-                all_labels = []
-
+                choices_texts_concatenated = []
                 for s in samples:
-                    # Get choices from mc1_targets
-                    choices = s["mc1_targets"]["choices"]
-                    labels = s["mc1_targets"]["labels"]
-
-                    # Translate choices
-                    choices_tr = [
-                        translate_google(choice, "en", lang) for choice in choices
-                    ]
-                    choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
-
-                    all_choices_tr.append(choices_tr)
-                    all_labels.append(labels)
+                    for choice in eval(s["choices"]):
+                        choices_texts_concatenated.append(choice)
+                choices_tr = [
+                    translate_google(c, "en", lang) for c in choices_texts_concatenated
+                ]
+                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+                # group into chunks of 4
+                choices_tr = [
+                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+                ]

                 ds_lang = Dataset.from_dict(
                     {
+                        "subject": [s["subject"] for s in samples],
                         "question": questions_tr,
-                        "choices": all_choices_tr,
-                        "labels": all_labels,
+                        "choices": choices_tr,
+                        "answer": [s["answer"] for s in samples],
                     }
                 )
                 ds_lang.push_to_hub(
@@ -133,7 +95,7 @@ def translate_truthfulqa(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/truthfulqa/{lang}_{split}.json",
+                    f"data/translations/mmlu/{lang}_{split}.json",
                     lines=False,
                     force_ascii=False,
                     indent=2,
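One detail worth flagging: the new translation loop parses stringified choice lists with eval(). An editorial note, not part of the commit: ast.literal_eval accepts the same literal syntax while refusing arbitrary code, so it is the safer drop-in for data that should only ever be a Python literal:

```python
import ast

# Parses the literal list without executing arbitrary expressions.
choices = ast.literal_eval('["Yes", "No", "Maybe", "Unsure"]')
print(choices[1])  # "No"
```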
evals/datasets_/util.py CHANGED
@@ -12,11 +12,3 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
-
-
-# Cache individual dataset items to avoid reloading entire datasets
-@cache
-def _get_dataset_item(dataset, subset, split, index, **kwargs):
-    """Load a single item from a dataset efficiently"""
-    ds = load_dataset(dataset, subset, split=split, **kwargs)
-    return ds[index] if index < len(ds) else None
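After the revert, util.py keeps only the module-level @cache wrappers. A minimal sketch of the memoization pattern they rely on (functools.cache memoizes by argument tuple, so a dataset is fetched once per (dataset, subset) pair):

```python
from functools import cache

@cache
def load_once(dataset: str, subset: str) -> str:
    print(f"loading {dataset}/{subset}")  # executes only on the first call
    return f"{dataset}/{subset}"

load_once("masakhane/uhura-arc-easy", "eng")
load_once("masakhane/uhura-arc-easy", "eng")  # cache hit: no second print
```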
evals/download_data.py CHANGED
@@ -8,7 +8,6 @@ from pathlib import Path
 import sys
 import huggingface_hub
 from datasets import load_dataset, DatasetDict
-
 # Import fleurs DataFrame directly from its source module
 from datasets_.fleurs import fleurs
@@ -25,25 +24,22 @@ DATA_DIR = project_root / "data"
 FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
 FLEURS_TARGET_DIR = DATA_DIR / "fleurs"

-GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"  # Assumed direct link from https://glottolog.org/meta/downloads
+GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
 GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
 GLOTTOLOG_CSV_NAME = "languoid.csv"

-SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"  # This is HTML, need manual download or parsing
+SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html" # This is HTML, need manual download or parsing
 SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"

-SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"  # Assumed direct link
+SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm" # Assumed direct link
 SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
 SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
-SPBLEU_DICT_URL = (
-    "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
-)
+SPBLEU_DICT_URL = "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
 SPBLEU_DICT_NAME = "dictionary.txt"


 # --- Helper Functions ---

-
 def download_file(url, path: Path):
     """Downloads a file from a URL to a local path."""
     print(f"Downloading {url} to {path}...")
@@ -88,16 +84,11 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
                 break

         if target_zip_path:
-            with (
-                z.open(target_zip_path) as source,
-                open(extract_path / target_filename, "wb") as target,
-            ):
+            with z.open(target_zip_path) as source, open(extract_path / target_filename, "wb") as target:
                 target.write(source.read())
             print(f"Successfully extracted {target_filename}.")
         else:
-            print(
-                f"Error: Could not find {target_filename} within the zip archive."
-            )
+            print(f"Error: Could not find {target_filename} within the zip archive.")

     except zipfile.BadZipFile:
         print("Error: Downloaded file is not a valid zip archive.")
@@ -107,14 +98,13 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):

 # --- Download Functions ---

-
 def download_fleurs_data():
     """Downloads Fleurs audio and text data."""
     print("\n--- Downloading Fleurs Data ---")
     FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)

     # Use the fleurs_tag column from the imported DataFrame
-    fleurs_tags_list = fleurs["fleurs_tag"].tolist()
+    fleurs_tags_list = fleurs['fleurs_tag'].tolist()

     if not fleurs_tags_list:
         print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
@@ -127,9 +117,7 @@ def download_fleurs_data():
         audio_dir = lang_dir / "audio"
         dev_tsv_path = lang_dir / "dev.tsv"
         dev_audio_archive_path = audio_dir / "dev.tar.gz"
-        audio_extracted_marker = (
-            audio_dir / "dev"
-        )  # Check if extraction likely happened
+        audio_extracted_marker = audio_dir / "dev" # Check if extraction likely happened

         # Download TSV
         if not dev_tsv_path.exists():
@@ -141,15 +129,15 @@
         # Download and Extract Audio
         if not audio_extracted_marker.exists():
             if not dev_audio_archive_path.exists():
-                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
-                download_file(tar_url, dev_audio_archive_path)
+                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
+                download_file(tar_url, dev_audio_archive_path)

             if dev_audio_archive_path.exists():
-                extract_tar_gz(dev_audio_archive_path, audio_dir)
+                extract_tar_gz(dev_audio_archive_path, audio_dir)
             else:
                 print(f"Audio archive missing, cannot extract for {lang_tag}")
         else:
-            print(f"Found extracted audio: {audio_extracted_marker}")
+            print(f"Found extracted audio: {audio_extracted_marker}")


 def download_glottolog_data():
@@ -177,9 +165,7 @@ def download_scriptcodes_data():
     # The URL points to an HTML page, not a direct CSV link.
     # Manual download is likely required for ScriptCodes.csv.
     print(f"Cannot automatically download from {SCRIPTCODES_URL}")
-    print(
-        "Please manually download the ISO 15924 codes list (often available as a .txt file)"
-    )
+    print(f"Please manually download the ISO 15924 codes list (often available as a .txt file)")
     print("from the Unicode website or related sources and save it as:")
     print(f"{SCRIPTCODES_TARGET_FILE}")
     if SCRIPTCODES_TARGET_FILE.exists():
@@ -210,24 +196,21 @@

 # --- Main Execution ---

-
 def main():
     """Runs all download functions and the conversion step."""
     print("Starting data download process...")
     DATA_DIR.mkdir(exist_ok=True)

-    # download_fleurs_data()
+    #download_fleurs_data()
     download_glottolog_data()
     download_scriptcodes_data()
     download_spbleu_data()

     print("\nData download process finished.")
     print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
-    print(
-        "Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well"
-    )
+    print("Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well")
     print("in 'evals/datasets_/flores.py' to be read correctly.")


 if __name__ == "__main__":
-    main()
+    main()
evals/languages.py CHANGED
@@ -31,7 +31,6 @@ glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
 )

-
 @cache
 def language_family(bcp_47):
     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
@@ -40,7 +39,6 @@ def language_family(bcp_47):
     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
     return family["name"]

-
 languages["family"] = languages["bcp_47"].apply(language_family)

 # load script codes and names
@@ -48,7 +46,6 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
     columns={"Code": "iso15924", "English Name": "script_name"}
 )

-
 def script_name(iso15924):
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
evals/main.py CHANGED
@@ -1,190 +1,62 @@
 import asyncio
+
 import pandas as pd
-import time
-from datetime import datetime, timedelta
+from languages import languages
 from models import models
 from tasks import tasks
-from languages import languages
-import os
-
+from tqdm.asyncio import tqdm_asyncio
 
 
-async def evaluate():
-    # Configuration - easily adjustable defaults
-    n_sentences = int(
-        os.environ.get("N_SENTENCES", 20)
-    )  # Default: 20 sentences per task
-    max_languages = int(
-        os.environ.get("MAX_LANGUAGES", 150)
-    )  # Default: 150 top languages
-    single_model = os.environ.get(
-        "SINGLE_MODEL"
-    )  # Optional: run only one specific model
-    test_mode = os.environ.get("TEST", "").lower() in (
-        "1",
-        "true",
-        "yes",
-    )  # Optional: skip results loading/saving
-
-    # Keep original DataFrames for saving metadata - distinction added for single model test runs.
-    original_models_df = pd.DataFrame(models)
-    original_languages_df = pd.DataFrame(languages)
-
-    # Create working copies for single evaluation runs
-    models_df = original_models_df.copy()
-    languages_df = original_languages_df.copy()
-    top_languages = languages.head(max_languages)
-
-    # Filter to single model if specified (only affects evaluation, not saving)
-    if single_model:
-        models_df = models_df[models_df["id"] == single_model]
-        if len(models_df) == 0:
-            print(f"Error: Model '{single_model}' not found. Available models:")
-            for model_id in original_models_df["id"]:
-                print(f"  {model_id}")
-            return pd.DataFrame()
-
-    print(
-        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task"
-    )
-    if test_mode:
-        print("TEST MODE: Skipping results loading/saving")
-    start_time = time.time()
-
-    # Load existing results to avoid re-evaluation (skip in test mode)
-    if test_mode:
-        old_results = pd.DataFrame(
-            columns=["model", "bcp_47", "task", "metric", "origin", "score"]
-        )
-    else:
+# ===== config =====
+
+n_sentences = 10
+
+# ===== run evaluation and aggregate results =====
+
+
+async def evaluate():
+    # FIXME we should not need this for-loop, but it helps
+    for n_languages in range(10, 101, 10):
+        print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
-
-    # Get all combinations that need evaluation
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for model in models_df["id"]
-        for lang in top_languages.itertuples()
-        for task_name, task in tasks.items()
-        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-    ]
-
-    # Filter out already evaluated combinations
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    if not old_results.empty:
-        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
-        # set + combis is faster than merge (locally it made a difference for me when loading all data/tasks into memory)
-        mask = ~combis.apply(
-            lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1
-        )
-        combis = combis[mask]
-
-    # Create all evaluation tasks
-    all_tasks = []
-    for i in range(n_sentences):
-        for model, bcp_47, task_name in combis.itertuples(index=False):
-            all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-    print(f"Running {len(all_tasks)} evaluation tasks...")
-
-    # For single model runs, we stop immediately on first API error to inspect.
-    # For full evaluations, we continue despite errors to get maximum coverage.
-    stop_on_error = single_model is not None
-
-    # Process tasks in batches to avoid memory issues (for full evaluation locally that helped a lot)
-    batch_size = 1000
-    all_results = []
-
-    try:
-        for i in range(0, len(all_tasks), batch_size):
-            batch = all_tasks[i : i + batch_size]
-            batch_results = await asyncio.gather(
-                *[
-                    task_func(model, bcp_47, sentence_nr)
-                    for task_func, model, bcp_47, sentence_nr in batch
-                ],
-                return_exceptions=not stop_on_error,
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for model in models["id"]
+            for lang in languages.iloc[:n_languages].itertuples()
+            for task_name, task in tasks.items()
+            if task_name in models[models["id"] == model]["tasks"].iloc[0]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations
+        results = [
+            tasks[task_name](model, bcp_47, i)
+            for i in range(n_sentences)
+            for model, bcp_47, task_name in combis.itertuples(index=False)
+        ]
+        results = await tqdm_asyncio.gather(*results, miniters=1)
+        results = [r for group in results for r in group]
+        args = dict(orient="records", indent=2, force_ascii=False)
+        if results:
+            # aggregate results
+            results = pd.DataFrame(results)
+            results = (
+                results.groupby(["model", "bcp_47", "task", "metric"])
+                .agg({"score": "mean"})
+                .reset_index()
             )
-            all_results.extend(batch_results)
-
-        results = all_results
-
-        # Process results and logging API errors separately to understand what are the main issues.
-        valid_results = []
-        errors = []
-
-        for i, r in enumerate(results):
-            if isinstance(r, Exception):
-                if i < len(all_tasks):
-                    task_info = all_tasks[i]
-                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
-            elif isinstance(r, list):
-                valid_results.extend(r)
-            elif r is not None:
-                valid_results.append(r)
-
-        # log errors and store
-        if errors:
-            with open("errors.log", "w") as f:
-                f.write("model,task,error\n")
-                for error in errors:
-                    f.write(error + "\n")
-
-        # Track model completion (TO BE DELETED - was for local run only)
-        if valid_results:
-            completed_models = set()
-            for result in valid_results:
-                if isinstance(result, dict) and "model" in result:
-                    model = result["model"]
-                    if model not in completed_models:
-                        completed_models.add(model)
-                        print(f"Completed: {model}")
-
-        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")
-
-    # this is for local single model runs - for testing and development
-    except Exception as e:
-        print(f"EVALUATION STOPPED - API Error occurred:")
-        print(f"Error type: {type(e).__name__}")
-        print(f"Error message: {str(e)}")
-        return pd.DataFrame()
-
-    # Save results (skipped in test mode as we do not want to overwrite existing results)
-    if valid_results:
-        results_df = pd.DataFrame(valid_results)
-
-        # Aggregate results
-        results_df = (
-            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-
-        if not test_mode:
-            args = dict(orient="records", indent=2, force_ascii=False)
-
-            # Merge with existing results
-            if not old_results.empty:
-                results_df = pd.concat([old_results, results_df])
-                results_df = results_df.drop_duplicates(
-                    subset=["model", "bcp_47", "task", "metric", "origin"]
-                )
-
-            results_df = results_df.sort_values(
-                by=["model", "bcp_47", "task", "metric"]
-            )
-            results_df.to_json("results.json", **args)
-
-            # Save model and language info (always save complete metadata, not filtered)
-            original_models_df.to_json("models.json", **args)
-            original_languages_df.to_json("languages.json", **args)
-        else:
-            print("TEST MODE: Skipping results saving")
-
-    elapsed = time.time() - start_time
-    print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
-
-    return results_df
-
-    return pd.DataFrame()
+            # save results
+            results = pd.concat([old_results, results])
+            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results.to_json("results.json", **args)
+            # save up-to-date info on models and languages
+            all_models = pd.concat([pd.DataFrame(models), old_models])
+            all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+            all_models.to_json("models.json", **args)
+            pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 if __name__ == "__main__":
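
The resume logic in the new evaluate() is a left join followed by an isna() filter, i.e. a pandas anti-join: combinations that already have a result keep their metric value, while new combinations get NaN and are the only ones re-run. A self-contained illustration (the column values here are made up):

    import pandas as pd

    old_results = pd.DataFrame(
        [{"model": "m1", "bcp_47": "en", "task": "mmlu", "metric": "accuracy", "score": 0.9}]
    )
    combis = pd.DataFrame(
        [("m1", "en", "mmlu"), ("m1", "de", "mmlu")],
        columns=["model", "bcp_47", "task"],
    )
    # Rows without a match in old_results get NaN in "metric" after the left
    # join, which marks them as still to be evaluated.
    merged = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    todo = merged[merged["metric"].isna()][["model", "bcp_47", "task"]]
    print(todo)  # only ("m1", "de", "mmlu") remains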
evals/models.py CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import json
 import re
 from collections import defaultdict
@@ -8,6 +7,7 @@ from os import getenv
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
+from elevenlabs import AsyncElevenLabs
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
@@ -22,17 +22,14 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",
-    "openai/gpt-5-nano",  # include if/when available
     "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
-    "openai/gpt-4o-2024-11-20",  # 10$
-    "openai/gpt-oss-120b",
-    "anthropic/claude-3.7-sonnet",  # 15$ - added for full coverage
-    "anthropic/claude-sonnet-4",  # 15$ - added for full coverage
-    "anthropic/claude-opus-4.1",  # 15$ - added for full coverage
+    # "openai/gpt-4o-2024-11-20", # 10$
+    "openai/gpt-3.5-turbo-0613",  # 2$
+    # "openai/gpt-3.5-turbo", # 1.5$
+    # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
     "mistralai/mistral-saba",  # 0.6$
     "mistralai/mistral-nemo",  # 0.08$
@@ -51,13 +48,10 @@ important_models = [
     "microsoft/phi-4",  # 0.07$
     "microsoft/phi-4-multimodal-instruct",  # 0.1$
     "amazon/nova-micro-v1",  # 0.09$
-    "moonshotai/kimi-k2",  # 0.6$ - added to prevent missing from models.json
-    "x-ai/grok-4",
 ]
 
 blocklist = [
     "google/gemini-2.5-pro-preview",
-    "google/gemini-2.5-pro",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
     "google/gemini-2.5-flash-preview-04-17",
@@ -65,7 +59,6 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
-    "perplexity/sonar-deep-research",
 ]
 
 transcription_models = [
@@ -100,81 +93,28 @@ def get_model(permaslug):
 
 @cache
 def get_historical_popular_models(date: date):
-    try:
-        raw = get("https://openrouter.ai/rankings").text
-
-        # Extract model data from rankingData using regex
-        import re
-        import json
-
-        # Find all count and model_permaslug pairs in the data
-        # Format: "count":number,"model_permaslug":"model/name"
-        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
-        matches = re.findall(pattern, raw)
-
-        if matches:
-            # Aggregate model counts
-            model_counts = {}
-            for count_str, model_slug in matches:
-                count = float(count_str)
-                if not model_slug.startswith("openrouter") and model_slug != "Others":
-                    # Remove variant suffixes for aggregation
-                    base_model = model_slug.split(":")[0]
-                    model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-            # Sort by popularity and return top models
-            sorted_models = sorted(
-                model_counts.items(), key=lambda x: x[1], reverse=True
-            )
-            result = []
-            for model_slug, count in sorted_models[:20]:  # Top 20
-                result.append({"slug": model_slug, "count": int(count)})
-
-            return result
-        else:
-            return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings").text
+    data = re.search(r'{\\"data\\":(.*),\\"isPercentage\\"', raw).group(1)
+    data = json.loads(data.replace("\\", ""))
+    counts = defaultdict(int)
+    for day in data:
+        for model, count in day["ys"].items():
+            if model.startswith("openrouter") or model == "Others":
+                continue
+            counts[model.split(":")[0]] += count
+    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+    models = [get_model(model) for model, _ in counts]
+    return [m for m in models if m]
 
 
 @cache
 def get_current_popular_models(date: date):
-    try:
-        raw = get("https://openrouter.ai/rankings?view=day").text
-
-        # Extract model data from daily rankings
-        import re
-        import json
-
-        # Find all count and model_permaslug pairs in the daily data
-        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
-        matches = re.findall(pattern, raw)
-
-        if matches:
-            # Aggregate model counts
-            model_counts = {}
-            for count_str, model_slug in matches:
-                count = float(count_str)
-                if not model_slug.startswith("openrouter") and model_slug != "Others":
-                    # Remove variant suffixes for aggregation
-                    base_model = model_slug.split(":")[0]
-                    model_counts[base_model] = model_counts.get(base_model, 0) + count
-
-            # Sort by popularity and return top models
-            sorted_models = sorted(
-                model_counts.items(), key=lambda x: x[1], reverse=True
-            )
-            result = []
-            for model_slug, count in sorted_models[:10]:  # Top 10
-                result.append({"slug": model_slug, "count": int(count)})
-
-            return result
-        else:
-            return []
-
-    except Exception as e:
-        return []
+    raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
+    data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
+    data = json.loads(data)
+    data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
+    models = [get_model(model["model_permaslug"]) for model in data]
+    return [m for m in models if m]
 
 
 def get_translation_models():
@@ -221,10 +161,7 @@ async def complete(**kwargs) -> str | None:
 
 
 translate_client = translate.Client()
-
-
-def get_google_supported_languages():
-    return [l["language"] for l in translate_client.get_languages()]
+google_supported_languages = [l["language"] for l in translate_client.get_languages()]
 
 
 @cache
@@ -294,15 +231,12 @@ def get_hf_metadata(row):
         return empty
     try:
        info = api.model_info(id)
-        license = ""
-        if (
-            info.card_data
-            and hasattr(info.card_data, "license")
-            and info.card_data.license
-        ):
-            license = (
-                info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
-            )
+        license = (
+            (info.card_data.license or "")
+            .replace("-", " ")
+            .replace("mit", "MIT")
+            .title()
+        )
        return {
            "hf_id": info.id,
            "creation_date": info.created_at,
@@ -315,14 +249,8 @@ def get_hf_metadata(row):
 
 
 def get_cost(row):
-    """
-    row: a row from the OpenRouter models dataframe
-    """
-    try:
-        cost = float(row["endpoint"]["pricing"]["completion"])
-        return round(cost * 1_000_000, 2)
-    except (TypeError, KeyError):
-        return None
+    cost = float(row["endpoint"]["pricing"]["completion"])
+    return round(cost * 1_000_000, 2)
 
 
 @cache
@@ -332,17 +260,8 @@ def load_models(date: date):
         + get_current_popular_models(date.today())[:10]
     )
     popular_models = [m["slug"] for m in popular_models]
-    all_model_candidates = set(important_models + popular_models) - set(blocklist)
-
-    # Validate models exist on OpenRouter before including them
-    valid_models = []
-
-    for model_id in all_model_candidates:
-        metadata = get_or_metadata(model_id)
-        if metadata is not None:
-            valid_models.append(model_id)
-
-    models = pd.DataFrame(sorted(valid_models), columns=["id"])
+    models = set(important_models + popular_models) - set(blocklist)
+    models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
@@ -362,18 +281,9 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
-    # Filter out expensive models to keep costs reasonable
-    models = models[models["cost"] <= 15.0].reset_index(drop=True)
+    # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
-        [
-            "translation_from",
-            "translation_to",
-            "classification",
-            "mmlu",
-            "arc",
-            "truthfulqa",
-            "mgsm",
-        ]
+        ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
    ] * len(models)
    models = pd.concat([models, get_translation_models()])
    return models
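
get_historical_popular_models and get_current_popular_models are cached but take a date argument, so each new day produces a new cache key and the rankings page is re-scraped at most once per day. A sketch of the same pattern, with functools.lru_cache standing in for the joblib-backed @cache used in this file (the function and payload here are placeholders):

    from datetime import date
    from functools import lru_cache


    @lru_cache  # stand-in for the joblib-backed @cache in models.py
    def popular_models_for(day: date) -> list[str]:
        print(f"recomputing for {day}")
        return ["model-a", "model-b"]  # placeholder payload


    popular_models_for(date.today())  # computes and caches
    popular_models_for(date.today())  # cache hit until the date rolls over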
evals/plots.py CHANGED
@@ -9,33 +9,34 @@ df = pd.read_json("../results.json")
 df = df[df["metric"] != "chrf"]
 df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()
 
-
 # Apply logit transformation to classification scores to reduce skewness
 def transform_classification_scores(row):
-    if row["task"] == "classification":
+    if row['task'] == 'classification':
         # Avoid division by zero and infinite values by clipping
-        score = np.clip(row["score"], 0.001, 0.999)
+        score = np.clip(row['score'], 0.001, 0.999)
         # Apply logit transformation (log(p/(1-p)))
         return logit(score)
     else:
-        return row["score"]
-
+        return row['score']
 
-df["score"] = df.apply(transform_classification_scores, axis=1)
+df['score'] = df.apply(transform_classification_scores, axis=1)
 
 # Create a pivot table with tasks as columns and languages as rows
 pivot_df = df.pivot_table(
-    values="score", index="bcp_47", columns="task", aggfunc="mean"
+    values='score',
+    index='bcp_47',
+    columns='task',
+    aggfunc='mean'
 )
 
 # Sort and filter tasks
 ordered_tasks = [
-    "translation_from",
-    "translation_to",
-    "classification",
-    "mmlu",
-    "arc",
-    "mgsm",
+    'translation_from',
+    'translation_to',
+    'classification',
+    'mmlu',
+    'arc',
+    'mgsm',
 ]
 # Drop 'truthfulqa' if present and reindex columns
 pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
@@ -45,29 +46,29 @@ correlation_matrix = pivot_df.corr()
 
 # Create the correlation plot
 plt.figure(figsize=(8, 6))
 # Create mask for upper triangle including diagonal to show only lower triangle
 mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
 
 # Create a heatmap
 sns.heatmap(
-    correlation_matrix,
-    annot=True,
-    cmap="Blues",
+    correlation_matrix,
+    annot=True,
+    cmap='Blues',
     center=0,
     square=True,
     mask=mask,
-    cbar_kws={"shrink": 0.8},
-    fmt=".3f",
+    cbar_kws={"shrink": .8},
+    fmt='.3f'
 )
 
-plt.xlabel("Tasks", fontsize=12)
-plt.ylabel("Tasks", fontsize=12)
-plt.xticks(rotation=45, ha="right")
+plt.xlabel('Tasks', fontsize=12)
+plt.ylabel('Tasks', fontsize=12)
+plt.xticks(rotation=45, ha='right')
 plt.yticks(rotation=0)
 plt.tight_layout()
 
 # Save the plot
-plt.savefig("task_correlation_matrix.png", dpi=300, bbox_inches="tight")
+plt.savefig('task_correlation_matrix.png', dpi=300, bbox_inches='tight')
 plt.show()
 
 # Print correlation values for reference
@@ -76,91 +77,56 @@ print("Note: Classification scores have been logit-transformed to reduce skewnes
 print(correlation_matrix.round(3))
 
 # Also create a scatter plot matrix for pairwise relationships with highlighted languages
-highlighted_languages = ["en", "zh", "hi", "es", "ar"]
-
+highlighted_languages = ['en', 'zh', 'hi', 'es', 'ar']
 
 # Create color mapping
 def get_color_and_label(lang_code):
     if lang_code in highlighted_languages:
-        color_map = {
-            "en": "red",
-            "zh": "blue",
-            "hi": "green",
-            "es": "orange",
-            "ar": "purple",
-        }
+        color_map = {'en': 'red', 'zh': 'blue', 'hi': 'green', 'es': 'orange', 'ar': 'purple'}
         return color_map[lang_code], lang_code
     else:
-        return "lightgray", "Other"
-
+        return 'lightgray', 'Other'
 
 # Create custom scatter plot matrix
 tasks = pivot_df.columns.tolist()
 n_tasks = len(tasks)
 
 fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
-fig.suptitle("Pairwise Task Performance", fontsize=16, fontweight="bold")
+fig.suptitle('Pairwise Task Performance', fontsize=16, fontweight='bold')
 
 # Create legend elements
 legend_elements = []
 for lang in highlighted_languages:
     color, _ = get_color_and_label(lang)
-    legend_elements.append(
-        plt.Line2D(
-            [0],
-            [0],
-            marker="o",
-            color="w",
-            markerfacecolor=color,
-            markersize=8,
-            label=lang,
-        )
-    )
-legend_elements.append(
-    plt.Line2D(
-        [0],
-        [0],
-        marker="o",
-        color="w",
-        markerfacecolor="lightgray",
-        markersize=8,
-        label="Other",
-    )
-)
+    legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=8, label=lang))
+legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightgray', markersize=8, label='Other'))
 
 for i, task_y in enumerate(tasks):
     for j, task_x in enumerate(tasks):
         ax = axes[i, j]
-
+        
         if i == j:
             # Diagonal: histogram
             task_data = pivot_df[task_y].dropna()
             colors = [get_color_and_label(lang)[0] for lang in task_data.index]
-            ax.hist(task_data, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
-            ax.set_title(f"{task_y}", fontsize=10)
+            ax.hist(task_data, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
+            ax.set_title(f'{task_y}', fontsize=10)
         else:
             # Off-diagonal: scatter plot
             for lang_code in pivot_df.index:
-                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(
-                    pivot_df.loc[lang_code, task_y]
-                ):
+                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(pivot_df.loc[lang_code, task_y]):
                     color, _ = get_color_and_label(lang_code)
                     alpha = 0.8 if lang_code in highlighted_languages else 0.3
                     size = 50 if lang_code in highlighted_languages else 20
-                    ax.scatter(
-                        pivot_df.loc[lang_code, task_x],
-                        pivot_df.loc[lang_code, task_y],
-                        c=color,
-                        alpha=alpha,
-                        s=size,
-                    )
-
+                    ax.scatter(pivot_df.loc[lang_code, task_x], pivot_df.loc[lang_code, task_y],
+                             c=color, alpha=alpha, s=size)
+        
         # Set labels
         if i == n_tasks - 1:
             ax.set_xlabel(task_x, fontsize=10)
         if j == 0:
             ax.set_ylabel(task_y, fontsize=10)
-
+        
        # Remove tick labels except for edges
        if i != n_tasks - 1:
            ax.set_xticklabels([])
@@ -170,15 +136,15 @@ for i, task_y in enumerate(tasks):
 # Add legend
 fig.legend(
     handles=legend_elements,
-    loc="lower center",
+    loc='lower center',
     bbox_to_anchor=(0.5, -0.05),
     ncol=len(legend_elements),
     frameon=False,
     fontsize=10,
     handletextpad=0.5,
-    columnspacing=1.0,
+    columnspacing=1.0
 )
 
 plt.tight_layout()
-plt.savefig("task_scatter_matrix.png", dpi=300, bbox_inches="tight")
+plt.savefig('task_scatter_matrix.png', dpi=300, bbox_inches='tight')
 plt.show()
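
Why the logit transform on classification scores: accuracies cluster near 1.0, which compresses differences between languages; logit (the inverse sigmoid) stretches the top of the scale back out. A quick check with scipy:

    import numpy as np
    from scipy.special import logit

    scores = np.array([0.90, 0.95, 0.99, 1.00])
    clipped = np.clip(scores, 0.001, 0.999)  # avoid logit(1.0) = inf
    print(logit(clipped))  # approx. [2.20, 2.94, 4.60, 6.91]: gaps near 1.0 widen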
evals/tasks.py CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import random
 from functools import partial
 from textwrap import dedent
@@ -6,10 +5,10 @@ from textwrap import dedent
 import evaluate
 import pandas as pd
 import sentencepiece as spm
-from datasets_.arc import load_uhura_arc_easy
 from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
+from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
@@ -48,7 +47,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
-    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
     if model == "google/translate-v2":
         original_language = closest_supported_match(
             original_language, supported_languages
@@ -68,7 +66,7 @@
         messages=[
             {
                 "role": "user",
-                "content": translation_prompt,
+                "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
             }
         ],
         temperature=0,
@@ -93,7 +91,6 @@
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
-            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": sentence_nr,
         }
         for metric, score in (
@@ -115,33 +112,57 @@ async def classify_and_evaluate(model, bcp_47, nr):
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-    test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]
-
-    prompt = f"""Classify the following text into one of these topics: {", ".join(top_topics)}.
-Reply with only the topic name.
-
-Text:
-{test_paragraph.text}
-"""
-    response = await complete(
-        model=model,
-        messages=[{"role": "user", "content": prompt}],
-        temperature=0,
-        max_tokens=30,
+    examples = pd.concat(
+        [
+            paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
+            for t in top_topics
+        ]
+    ).sample(frac=1, random_state=nr)
+    test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
+        frac=1, random_state=42
     )
-
-    pred = response.lower().strip() if response else ""
-    true = test_paragraph.topic.lower().strip()
-    others = [t for t in top_topics if t != true]
-    acc = (
-        int(
-            pred.startswith(true)
-            or (true in pred and not any(o in pred for o in others))
-        )
-        if pred
-        else 0
-    )
+    test_paragraph = test_paragraphs.iloc[nr]
+
+    def format_prompt(text):
+        return f"{text}\n\nTopic: {'|'.join(top_topics)}?"
+
+    messages = []
+    for example in examples.itertuples():
+        messages += [
+            {"role": "user", "content": format_prompt(example.text)},
+            {"role": "assistant", "content": example.topic},
+        ]
+    # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
+    # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
+    try:
+        pred = await complete(
+            model=model,
+            messages=[
+                *messages,
+                {
+                    "role": "user",
+                    "content": format_prompt(test_paragraph.text),
+                },
+            ],
+            temperature=0,
+            max_tokens=30,
+        )
+        true = test_paragraph.topic
+        others = [t for t in top_topics if t != true]
+        acc = (
+            int(
+                pred.startswith(true)
+                or (true in pred and not any(o in pred for o in others))
+            )
+            if pred
+            else 0
+        )
+    except Exception as e:
+        if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
+            print(f"Max tokens exceeded for {model} in {bcp_47}")
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,
@@ -149,7 +170,6 @@
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
-            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": nr,
         }
     ]
@@ -212,38 +232,39 @@ def format_multiple_choice(item):
     A: {item["choices"][0]}
     B: {item["choices"][1]}
     C: {item["choices"][2]}
-    D: {item["choices"][3]}"""
+    D: {item["choices"][3]}
+
+    A|B|C|D?"""
 
 
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
+    ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
         return []
 
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-Response format: <reasoning> #### <letter>
-
----
-
-{format_multiple_choice(task)}""",
-        },
-    ]
-    response = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1024,
-    )
-    if response and "####" in response:
-        answer = response.split("####")[-1].strip()
-        acc = int(answer[:1] == task["answer"])
-    else:
-        acc = 0
-
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,
@@ -251,40 +272,39 @@
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,  # Add origin tag to results
             "sentence_nr": nr,
         }
     ]
 
 
 async def arc_and_evaluate(model, language_bcp_47, nr):
-    ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
+    ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
 
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-Response format: <reasoning> #### <letter>
-
----
-
-{format_multiple_choice(task)}""",
-        },
-    ]
-    response = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1024,
-    )
-    if response and "####" in response:
-        answer = response.split("####")[-1].strip()
-        acc = int(answer[:1] == task["answer"])
-    else:
-        acc = 0
+    messages = []
+    for example in examples:
+        messages += [
+            {"role": "user", "content": format_multiple_choice(example)},
+            {"role": "assistant", "content": example["answer"]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == task["answer"])
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,
@@ -292,7 +312,6 @@
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]
@@ -313,42 +332,40 @@ def format_multiple_choice_truthfulqa(item):
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
+    text += "|".join(letters[: len(item["choices"])]) + "?"
     return text
 
 
 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
-    ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
+    ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
-
-    # Find the correct answer
-    correct_choice_index = task["labels"].index(1)
-    answer = letters[correct_choice_index]
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
-
-Response format: <reasoning> #### <letter>
-
----
-
-{format_multiple_choice_truthfulqa(task)}""",
-        },
-    ]
-    response = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1024,  # Increased for reasoning
-    )
-    if response and "####" in response:
-        pred_answer = response.split("####")[-1].strip()
-        acc = int(pred_answer[:1].upper() == answer)
-    else:
-        acc = 0
-
+    task = shuffle_choices_and_labels(task)
+    answer = letters[task["labels"].index(1)]
+    messages = []
+    for example in examples:
+        example = shuffle_choices_and_labels(example)
+        messages += [
+            {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
+            {"role": "assistant", "content": letters[example["labels"].index(1)]},
+        ]
+    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
+    try:
+        response = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        if response:
+            acc = int(response[:1].strip() == answer)
+        else:
+            acc = 0
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
    return [
        {
            "model": model,
@@ -356,36 +373,30 @@
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]
 
 
 async def mgsm_and_evaluate(model, language_bcp_47, nr):
-    ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
+    system_prompt = """
+    Solve the math problem. Use reasoning, and finally give the answer as a number.
+    Response format: <reasoning> #### <number>
+    """
+    system_prompt = dedent(system_prompt).strip()
+    ds_slug, question = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
-
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
-
-Response format: <reasoning> #### <number>
-
----
-
-{question["question"]}""",
-        },
-    ]
     response = await complete(
         model=model,
-        messages=messages,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": question["question"]},
+        ],
         temperature=0,
         max_tokens=1024,
     )
-    if response and "####" in response:
+    if response and len(response.split("####")) == 2:
         number = response.split("####")[1].strip()
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
@@ -398,7 +409,6 @@
             "task": "mgsm",
             "metric": "accuracy",
             "score": accuracy,
-            "origin": origin,
             "sentence_nr": nr,
         }
     ]
@@ -439,8 +449,10 @@ tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
+    # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
+    # "asr": transcribe_and_evaluate,
 }
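
The rewritten multiple-choice tasks share one pattern: each few-shot example becomes a user/assistant message pair, the test item comes last, and max_tokens=1 forces the model to answer with a single option letter instead of free-form reasoning. A schematic of the message layout (the question data here is invented; complete() is this repo's rate-limited OpenRouter wrapper):

    examples = [{"question": "2 + 2?", "choices": ["3", "4"], "answer": "B"}]
    messages = []
    for ex in examples:
        # one demonstration per example: prompt, then the gold letter
        messages += [
            {"role": "user", "content": f"{ex['question']}\nA: {ex['choices'][0]}\nB: {ex['choices'][1]}\n\nA|B?"},
            {"role": "assistant", "content": ex["answer"]},
        ]
    # the actual test item goes last, unanswered
    messages += [{"role": "user", "content": "3 + 3?\nA: 6\nB: 7\n\nA|B?"}]
    # response = await complete(model=model, messages=messages, temperature=0, max_tokens=1)
    # acc = int(response[:1].strip() == "A")

Scoring only the first character keeps the metric robust to trailing whitespace or punctuation in the completion.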
evals/translate.py CHANGED
@@ -6,4 +6,4 @@ from datasets_.mmlu import translate_mmlu
 if __name__ == "__main__":
     translate_mmlu(languages)
     translate_mgsm(languages)
-    translate_arc(languages)
+    translate_arc(languages)
frontend/package-lock.json CHANGED
The diff for this file is too large to render. See raw diff
 
frontend/package.json CHANGED
@@ -6,12 +6,13 @@
     "@observablehq/plot": "^0.6.17",
     "@testing-library/dom": "^10.4.0",
     "@testing-library/jest-dom": "^6.6.3",
-    "@testing-library/react": "^15.0.0",
+    "@testing-library/react": "^16.2.0",
     "@testing-library/user-event": "^13.5.0",
     "primeicons": "^7.0.0",
     "primereact": "^10.9.3",
-    "react": "^18.2.0",
-    "react-dom": "^18.2.0",
+    "react": "^19.0.0",
+    "react-dom": "^19.0.0",
+    "react-scripts": "5.0.1",
     "topojson-simplify": "^3.0.3",
     "web-vitals": "^2.1.4"
   },
@@ -40,8 +41,5 @@
       "last 1 safari version"
     ]
   },
-  "proxy": "http://localhost:8000",
-  "devDependencies": {
-    "react-scripts": "^5.0.1"
-  }
+  "proxy": "http://localhost:8000"
 }
frontend/src/App.js CHANGED
@@ -19,14 +19,9 @@ function App () {
19
  const [loading, setLoading] = useState(true)
20
  const [error, setError] = useState(null)
21
  const [selectedLanguages, setSelectedLanguages] = useState([])
22
- const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
23
  const [dialogVisible, setDialogVisible] = useState(false)
24
  const [aboutVisible, setAboutVisible] = useState(false)
25
  const [contributeVisible, setContributeVisible] = useState(false)
26
-
27
- // Add state for carousel items
28
- const [carouselItems, setCarouselItems] = useState([])
29
- const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
30
 
31
  useEffect(() => {
32
  fetch('/api/data', {
@@ -41,7 +36,6 @@ function App () {
41
  })
42
  .then(jsonData => {
43
  setData(jsonData)
44
- setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
45
  setLoading(false)
46
  })
47
  .catch(err => {
@@ -50,27 +44,8 @@ function App () {
50
  })
51
  }, [selectedLanguages])
52
 
53
- // Create carousel items when data is loaded
54
- useEffect(() => {
55
- if (data) {
56
- // Add a small delay to ensure components are ready
57
- const timer = setTimeout(() => {
58
- setCarouselItems([
59
- <WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
60
- <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
61
- <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
62
- <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
63
- <CostPlot key="costplot-4" data={data} width={750} height={500} />
64
- ]);
65
- }, 100);
66
-
67
- return () => clearTimeout(timer);
68
- }
69
- }, [data])
70
-
71
  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
72
  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
73
-
74
  useEffect(() => {
75
  const handleResize = () => {
76
  setWindowWidth(window.innerWidth)
@@ -80,44 +55,6 @@ function App () {
80
  return () => window.removeEventListener('resize', handleResize)
81
  }, [])
82
 
83
- // Create full-screen carousel items when data or window size changes
84
- useEffect(() => {
85
- if (data) {
86
- const timer = setTimeout(() => {
87
- setFullScreenCarouselItems([
88
- <WorldMap
89
- key="fs-worldmap-0"
90
- data={data.countries}
91
- allLanguages={data.language_table}
92
- width={windowWidth * 0.7}
93
- height={windowHeight * 0.6}
94
- />,
95
- <LanguagePlot
96
- key="fs-langplot-1"
97
- data={data}
98
- width={windowWidth * 0.7}
99
- height={windowHeight * 0.6}
100
- />,
101
- <SpeakerPlot
102
- key="fs-speakerplot-2"
103
- data={data}
104
- width={windowWidth * 0.7}
105
- height={windowHeight * 0.6}
106
- />,
107
- <HistoryPlot
108
- key="fs-histplot-3"
109
- data={data}
110
- width={windowWidth * 0.7}
111
- height={windowHeight * 0.6}
112
- />,
113
- <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
114
- ]);
115
- }, 100);
116
-
117
- return () => clearTimeout(timer);
118
- }
119
- }, [data, windowWidth, windowHeight])
120
-
121
  return (
122
  <PrimeReactProvider>
123
  <div
@@ -132,50 +69,35 @@ function App () {
132
  style={{
133
  backgroundColor: '#fff3cd',
134
  color: '#856404',
135
- padding: '1rem 1.5rem',
136
  marginBottom: '1rem',
137
  border: '1px solid #ffeeba',
138
  borderRadius: '0.25rem',
139
- textAlign: 'center',
140
- lineHeight: '1.5',
141
- position: 'relative'
142
  }}
143
  >
144
  <strong>Work in Progress:</strong> This dashboard is currently under
145
- active development. Evaluation results are not yet final. More extensive evaluation runs will be released later this year.
146
- </div>
147
- <div
148
- style={{
149
- display: 'flex',
150
- justifyContent: 'flex-end',
151
- padding: '0 1.5rem',
152
- marginBottom: '1rem'
153
- }}
154
- >
155
  <a
156
  href='https://github.com/datenlabor-bmz/ai-language-monitor'
157
  target='_blank'
158
  rel='noopener noreferrer'
159
  style={{
160
  textDecoration: 'none',
161
- color: '#6c757d',
162
- fontSize: '1rem',
163
- fontWeight: '500',
164
- padding: '0.5rem 1rem',
165
- borderRadius: '0.375rem',
166
- backgroundColor: '#f8f9fa',
167
- border: '1px solid #e9ecef',
168
- display: 'flex',
169
- alignItems: 'center',
170
- gap: '0.5rem',
171
- transition: 'all 0.2s ease',
172
- ':hover': {
173
- backgroundColor: '#e9ecef',
174
- color: '#495057'
175
- }
176
  }}
177
  >
178
- <i className='pi pi-github' title='View on GitHub' />
 
 
 
 
179
  GitHub
180
  </a>
181
  </div>
@@ -227,88 +149,39 @@ function App () {
227
  <div
228
  style={{
229
  display: 'flex',
230
- gap: '0.75rem',
231
- marginBottom: '2rem',
232
  flexWrap: 'wrap',
233
  justifyContent: 'center'
234
  }}
235
  >
236
- <button
 
 
237
  onClick={() => setAboutVisible(true)}
238
  style={{
239
- background: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
240
- color: 'white',
241
- border: 'none',
- padding: '0.75rem 1.5rem',
- borderRadius: '12px',
- fontSize: '0.95rem',
- fontWeight: '500',
- cursor: 'pointer',
- display: 'flex',
- alignItems: 'center',
- gap: '0.5rem',
- boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
- transition: 'all 0.3s ease',
- ':hover': {
- transform: 'translateY(-2px)',
- boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
- }
- }}
- onMouseEnter={(e) => {
- e.target.style.transform = 'translateY(-2px)';
- e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
  }}
- onMouseLeave={(e) => {
- e.target.style.transform = 'translateY(0)';
- e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
- }}
- >
- <span style={{ fontSize: '1.1rem' }}>📚</span>
- About this tool
- </button>

- <button
  onClick={() => setContributeVisible(true)}
- title='This feature is on our roadmap and will be available soon.'
  style={{
- background: 'linear-gradient(135deg, #ff9a9e 0%, #fecfef 50%, #fecfef 100%)',
- color: '#6b46c1',
- border: 'none',
- padding: '0.75rem 1.5rem',
- borderRadius: '12px',
- fontSize: '0.95rem',
- fontWeight: '500',
- cursor: 'pointer',
- display: 'flex',
- alignItems: 'center',
- gap: '0.5rem',
- boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
- transition: 'all 0.3s ease',
- position: 'relative',
- overflow: 'hidden'
  }}
- onMouseEnter={(e) => {
- e.target.style.transform = 'translateY(-2px)';
- e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
- }}
- onMouseLeave={(e) => {
- e.target.style.transform = 'translateY(0)';
- e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
- }}
- >
- <span style={{ fontSize: '1.1rem' }}>🚀</span>
- Add your model
- <span style={{
- fontSize: '0.75rem',
- backgroundColor: 'rgba(107, 70, 193, 0.15)',
- padding: '0.2rem 0.5rem',
- borderRadius: '6px',
- marginLeft: '0.5rem',
- fontWeight: '600'
- }}>
- soon
- </span>
- </button>
  </div>

  {data && (
@@ -347,7 +220,6 @@ function App () {
  data={data.model_table}
  selectedLanguages={selectedLanguages}
  allLanguages={data.language_table || []}
- machineTranslatedMetrics={machineTranslatedMetrics}
  />
  <LanguageTable
  data={data.language_table}
@@ -376,18 +248,20 @@ function App () {
  color: '#666'
  }}
  />
- {carouselItems.length > 0 && (
- <Carousel
- key={`main-carousel-${carouselItems.length}-${Date.now()}`}
- value={carouselItems}
- numScroll={1}
- numVisible={1}
- itemTemplate={item => item}
- circular={false}
- activeIndex={0}
- style={{ width: '100%', minHeight: '650px' }}
- />
- )}
  </div>
  </>
  )}
@@ -535,16 +409,36 @@ function App () {
  modal
  header={null}
  >
- {fullScreenCarouselItems.length > 0 && (
  <div style={{ width: '100%', height: '100%' }}>
  <Carousel
- key={`fs-carousel-${fullScreenCarouselItems.length}-${Date.now()}`}
- value={fullScreenCarouselItems}
  numScroll={1}
  numVisible={1}
  itemTemplate={item => item}
- circular={false}
- activeIndex={0}
  style={{ width: '100%', height: 'calc(90vh - 120px)' }}
  />
  </div>
@@ -555,4 +449,4 @@ function App () {
  )
  }

- export default App
 
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState(null)
  const [selectedLanguages, setSelectedLanguages] = useState([])
  const [dialogVisible, setDialogVisible] = useState(false)
  const [aboutVisible, setAboutVisible] = useState(false)
  const [contributeVisible, setContributeVisible] = useState(false)

  useEffect(() => {
  fetch('/api/data', {

  })
  .then(jsonData => {
  setData(jsonData)
  setLoading(false)
  })
  .catch(err => {

  })
  }, [selectedLanguages])

  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
  useEffect(() => {
  const handleResize = () => {
  setWindowWidth(window.innerWidth)

  return () => window.removeEventListener('resize', handleResize)
  }, [])

  return (
  <PrimeReactProvider>
  <div

  style={{
  backgroundColor: '#fff3cd',
  color: '#856404',
+ padding: '0.75rem 1.25rem',
  marginBottom: '1rem',
  border: '1px solid #ffeeba',
  borderRadius: '0.25rem',
+ textAlign: 'center'
  }}
  >
  <strong>Work in Progress:</strong> This dashboard is currently under
+ active development. Evaluation results are not yet final.
  <a
  href='https://github.com/datenlabor-bmz/ai-language-monitor'
  target='_blank'
  rel='noopener noreferrer'
  style={{
  textDecoration: 'none',
+ color: '#856404',
+ float: 'right',
+ fontSize: '1.2rem',
+ fontWeight: 'bold',
+ padding: '0 0.5rem',
+ borderRadius: '3px',
+ backgroundColor: 'rgba(255,255,255,0.3)'
  }}
  >
+ <i
+ className='pi pi-github'
+ title='View on GitHub'
+ style={{ marginRight: '0.3rem' }}
+ />
  GitHub
  </a>
  </div>

  <div
  style={{
  display: 'flex',
+ gap: '1rem',
+ marginBottom: '1.5rem',
  flexWrap: 'wrap',
  justifyContent: 'center'
  }}
  >
+ <Button
+ label='📚 About this tool'
+ className='p-button-text'
  onClick={() => setAboutVisible(true)}
  style={{
+ color: '#666',
+ border: '1px solid #ddd',
+ padding: '0.5rem 1rem',
+ borderRadius: '4px',
+ fontSize: '0.9rem'
  }}
+ />

+ <Button
+ label='🚀 Add your model (soon)'
+ className='p-button-text'
  onClick={() => setContributeVisible(true)}
+ tooltip='This feature is on our roadmap and will be available soon.'
+ tooltipOptions={{ position: 'bottom' }}
  style={{
+ color: '#666',
+ border: '1px solid #ddd',
+ padding: '0.5rem 1rem',
+ borderRadius: '4px',
+ fontSize: '0.9rem'
  }}
+ />
  </div>

  {data && (

  data={data.model_table}
  selectedLanguages={selectedLanguages}
  allLanguages={data.language_table || []}
  />
  <LanguageTable
  data={data.language_table}

  color: '#666'
  }}
  />
+ <Carousel
+ value={[
+ <WorldMap data={data.countries} />,
+ <LanguagePlot data={data} />,
+ <SpeakerPlot data={data} />,
+ <HistoryPlot data={data} />,
+ <CostPlot data={data} />
+ ]}
+ numScroll={1}
+ numVisible={1}
+ itemTemplate={item => item}
+ circular
+ style={{ width: '100%', minHeight: '650px' }}
+ />
  </div>
  </>
  )}

  modal
  header={null}
  >
+ {data && (
  <div style={{ width: '100%', height: '100%' }}>
  <Carousel
+ value={[
+ <WorldMap
+ data={data.countries}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <LanguagePlot
+ data={data}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <SpeakerPlot
+ data={data}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <HistoryPlot
+ data={data}
+ width={windowWidth * 0.7}
+ height={windowHeight * 0.6}
+ />,
+ <CostPlot data={data} />
+ ]}
  numScroll={1}
  numVisible={1}
  itemTemplate={item => item}
+ circular
  style={{ width: '100%', height: 'calc(90vh - 120px)' }}
  />
  </div>

  )
  }

+ export default App
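
For orientation, the restored carousel pattern above: PrimeReact's Carousel receives an array of already-rendered plot elements, so the item template is just the identity function. A minimal sketch (the PlotCarousel wrapper and its plots prop are illustrative, not part of the codebase):

import { Carousel } from 'primereact/carousel'

// `plots` is an array of pre-rendered JSX elements,
// e.g. [<WorldMap data={...} />, <CostPlot data={...} />].
const PlotCarousel = ({ plots }) => (
  <Carousel
    value={plots}
    numScroll={1}
    numVisible={1}
    itemTemplate={item => item} // items are JSX already; render them as-is
    circular // shorthand for circular={true}: wrap past the last slide
  />
)

export default PlotCarousel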
frontend/src/components/HistoryPlot.js CHANGED
@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
  ...models.filter(d => d.newRecord),
  {
  creation_date: new Date(),
- maxAverage: models[models.length - 1]?.maxAverage || 0
+ maxAverage: models[models.length - 1].maxAverage
  }
  ],
  {
  x: d => d.creation_date,
- y: d => d.maxAverage || 0,
+ y: d => d.maxAverage,
  curve: 'step-after',
  strokeOpacity: 0.3
  }
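
For context on the reverted lines: the record line appends a synthetic point at the current date so the 'step-after' curve extends the latest record horizontally to today. A sketch of that construction (it assumes models is non-empty and sorted by creation date; the removed ?. and || 0 guards had tolerated an empty array):

// Extend the record line to "now" by reusing the newest model's maxAverage.
const recordLine = [
  ...models.filter(d => d.newRecord),
  {
    creation_date: new Date(),
    maxAverage: models[models.length - 1].maxAverage
  }
]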
frontend/src/components/LanguageTable.js CHANGED
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
  filterElement={familyRowFilterTemplate}
  style={{ minWidth: '10rem' }}
  />
- {ScoreColumns()}
+ {ScoreColumns}
  </DataTable>
  )
  }
frontend/src/components/ModelTable.js CHANGED
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
  import Medal from './Medal'
  import { Slider } from 'primereact/slider'
  import ScoreColumns from './ScoreColumns'
- const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
+ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
  const [filters, setFilters] = useState({
  type: { value: null, matchMode: FilterMatchMode.IN },
  size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  }

  const SliderWithLabel = ({ value, onChange, min, max }) => {
- const p = 10;
- const start = value === null || value[0] === null ? min : Math.log(value[0]) / Math.log(p);
- const stop = value === null || value[1] === null ? max : Math.log(value[1]) / Math.log(p);
- const [_value, _setValue] = useState([start, stop]);
+ const p = 10
+ const start = value === null ? min : Math.log(value[0]) / Math.log(p)
+ const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
+ const [_value, _setValue] = useState([start, stop])
  useEffect(() => {
  const timer = setTimeout(() => {
  onChange({
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  // set to "no filter" when (almost) the whole range is selected
  _value[0] <= min + 0.1 && _value[1] >= max - 0.1
  ? null
- : [p ** _value[0], p ** _value[1]],
- });
- }, 1000);
- return () => clearTimeout(timer);
- }, [_value, onChange, min, max]);
+ : [p ** _value[0], p ** _value[1]]
+ })
+ }, 1000)
+ return () => clearTimeout(timer)
+ }, [_value, onChange, min, max])
  return (
  <div style={{ minWidth: '20rem' }}>
  <div>{formatSize(p ** _value[0])}</div>
@@ -147,35 +147,21 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  }

  const costBodyTemplate = rowData => {
- return (
- <div style={{ textAlign: 'center' }}>
- {rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
- </div>
- )
+ return <div style={{ textAlign: 'center' }}>${rowData.cost?.toFixed(2)}</div>
  }

  const getHeaderText = () => {
- // Count languages that have any evaluation data (any task scores available)
- const evaluatedLanguagesCount = allLanguages.filter(lang => {
- // Check if language has any task scores (not just average)
- const hasAnyScores = [
- 'translation_from_bleu',
- 'translation_to_bleu',
- 'classification_accuracy',
- 'mmlu_accuracy',
- 'arc_accuracy',
- 'truthfulqa_accuracy',
- 'mgsm_accuracy'
- ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
- return hasAnyScores
- }).length
+ // Count languages that have evaluation data (average score available)
+ const evaluatedLanguagesCount = allLanguages.filter(lang =>
+ lang.average !== null && lang.average !== undefined
+ ).length

  if (selectedLanguages.length === 0) {
  return (
  <span>
  <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
  <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
- Performance across {evaluatedLanguagesCount} evaluated languages
+ Average performance across {evaluatedLanguagesCount} evaluated languages
  </span>
  </span>
  )
@@ -259,7 +245,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
  body={costBodyTemplate}
  style={{ minWidth: '5rem' }}
  />
- {ScoreColumns(machineTranslatedMetrics)}
+ {ScoreColumns}
  </DataTable>
  )
  }
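
The size filter works on a log10 scale: slider positions are exponents, and p ** position converts a position back to a parameter count. A sketch of the round-trip, assuming p = 10 as in the component:

const p = 10

// slider position -> parameter count, and back
const fromSlider = pos => p ** pos // 9 -> 1e9 (a 1B-parameter model)
const toSlider = size => Math.log(size) / Math.log(p) // 1e9 -> ~9

console.log(fromSlider(9)) // 1000000000
console.log(toSlider(1e9)) // ~9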
frontend/src/components/ScoreColumns.js CHANGED
@@ -2,28 +2,21 @@ import { Column } from 'primereact/column'
  import ScoreField from './ScoreField'

  const scoreBodyTemplate = (field, options = {}) => {
- const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options
+ const { minScore = 0, maxScore = 1 } = options

  return rowData => {
  const score = rowData[field]
- // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
- // otherwise fall back to global list
- const rowFlagKey = `${field}_is_machine`
- const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
- const isMachineTranslated = hasRowFlag
- ? !!rowData[rowFlagKey]
- : machineTranslatedMetrics.includes(field)
- return ScoreField(score, minScore, maxScore, isMachineTranslated)
+ return ScoreField(score, minScore, maxScore)
  }
  }

- const ScoreColumns = (machineTranslatedMetrics = []) => [
+ const ScoreColumns = [
  <Column
  field='average'
  header='Proficiency'
  headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
  sortable
- body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
+ body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
  <Column
@@ -33,8 +26,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('translation_from_bleu', {
  minScore: 0,
- maxScore: 0.5,
- machineTranslatedMetrics
+ maxScore: 0.5
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -45,8 +37,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('translation_to_bleu', {
  minScore: 0,
- maxScore: 0.5,
- machineTranslatedMetrics
+ maxScore: 0.5
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -57,8 +48,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('classification_accuracy', {
  minScore: 0,
- maxScore: 0.5,
- machineTranslatedMetrics
+ maxScore: 0.5
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -79,8 +69,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('mmlu_accuracy', {
  minScore: 0,
- maxScore: 1,
- machineTranslatedMetrics
+ maxScore: 1
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -91,8 +80,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('arc_accuracy', {
  minScore: 0,
- maxScore: 1,
- machineTranslatedMetrics
+ maxScore: 1
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
@@ -103,8 +91,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
  sortable
  body={scoreBodyTemplate('mgsm_accuracy', {
  minScore: 0,
- maxScore: 1,
- machineTranslatedMetrics
+ maxScore: 1
  })}
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
  />,
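
With ScoreColumns reverted from a factory function to a plain array of <Column> elements, callers interpolate the array instead of invoking it, as the ModelTable and LanguageTable hunks above show. A sketch of the consuming side (ResultsTable and rows are illustrative names):

import { DataTable } from 'primereact/datatable'
import { Column } from 'primereact/column'
import ScoreColumns from './ScoreColumns'

const ResultsTable = ({ rows }) => (
  <DataTable value={rows}>
    <Column field='name' header='Model' />
    {ScoreColumns} {/* previously {ScoreColumns(machineTranslatedMetrics)} */}
  </DataTable>
)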
frontend/src/components/ScoreField.js CHANGED
@@ -1,4 +1,4 @@
- const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
+ const ScoreField = (score, minScore, maxScore) => {
  let percentage = 100
  let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
  if (score !== null) {
@@ -50,7 +50,6 @@ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
  }}
  >
  {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
- {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
  </span>
  </div>
  )
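
The bar-width computation itself is elided from this hunk; judging from the 'min-max normalization' wording in the ScoreColumns tooltip, it presumably maps score into [minScore, maxScore] and clamps. A hedged sketch of that mapping, to be read as an assumption rather than the actual implementation:

// Assumed bar-width math: min-max normalize, then clamp to [0, 100].
const barWidth = (score, minScore, maxScore) =>
  score === null
    ? 100 // full-width light bar signals missing data, per the barColor fallback
    : Math.max(0, Math.min(100, ((score - minScore) / (maxScore - minScore)) * 100))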
frontend/src/components/SpeakerPlot.js CHANGED
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
  textStrokeOpacity: 0,
  textFillOpacity: 0
  }),
- ...(languages.length >= 40 ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
+ Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
  x: 40,
  y: languages[39].cumSpeakers / 1e6
- })] : [])
+ })
  ]
  })
  containerRef.current.append(plot)
frontend/src/components/WorldMap.js CHANGED
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
  a =>
  `${smoothProgressBar(a.population / pop)} ${
  a.name
- } – ${a.score === null || a.score === undefined ? "n/a" : a.score.toFixed(2)}`
+ } – ${a.score.toFixed(2)}`
  )
  .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
- return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
+ return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
  }

- const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
+ const WorldMap = ({ data, width = 750, height = 500 }) => {
  const containerRef = useRef()
  const [mapData, setMapData] = useState()

@@ -48,22 +48,8 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
  acc[country.iso2] = country
  return acc
  }, {})
- // Count languages that have any evaluation data
- const evaluatedLanguagesCount = allLanguages.filter(lang => {
- const hasAnyScores = [
- 'translation_from_bleu',
- 'translation_to_bleu',
- 'classification_accuracy',
- 'mmlu_accuracy',
- 'arc_accuracy',
- 'truthfulqa_accuracy',
- 'mgsm_accuracy'
- ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
- return hasAnyScores
- }).length
-
  const plot = Plot.plot({
- subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
+ subtitle: 'Language Proficiency Score by Country',
  width: width,
  height: height,
  projection: 'equal-earth',
@@ -75,12 +61,11 @@ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
  })
  ],
  color: {
- scheme: 'RdYlGn',
- unknown: '#d0d0d0',
+ scheme: 'Greens',
+ unknown: 'gray',
  label: 'Score',
  legend: true,
- domain: [0, 1],
- pivot: 0.5
+ domain: [0, 1]
  },
  style: {
  fontFamily: 'monospace'
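
The color change above swaps the diverging red-yellow-green scale (gray #d0d0d0 for missing countries, pivot at 0.5) back to a sequential 'Greens' ramp. A minimal sketch of the restored Observable Plot scale configuration:

import * as Plot from '@observablehq/plot'

const plot = Plot.plot({
  projection: 'equal-earth',
  color: {
    scheme: 'Greens', // sequential ramp instead of diverging RdYlGn
    unknown: 'gray', // countries without a score
    domain: [0, 1], // fixed domain keeps colors comparable across runs
    label: 'Score',
    legend: true
  },
  marks: [] // Plot.geo(...) marks as in the component above
})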
languages.json CHANGED
@@ -7,7 +7,7 @@
  "family":"Indo-European",
  "flores_path":"eng_Latn",
  "fleurs_tag":"en_us",
- "commonvoice_hours":2683.0,
+ "commonvoice_hours":2674.0,
  "commonvoice_locale":"en",
  "in_benchmark":true
  },
@@ -32,7 +32,7 @@
  "flores_path":"hin_Deva",
  "fleurs_tag":"hi_in",
  "commonvoice_hours":16.0,
- "commonvoice_locale":"hi",
+ "commonvoice_locale":"hi-IN",
  "in_benchmark":true
  },
  {
@@ -43,7 +43,7 @@
  "family":"Indo-European",
  "flores_path":"spa_Latn",
  "fleurs_tag":"es_419",
- "commonvoice_hours":449.0,
+ "commonvoice_hours":448.0,
  "commonvoice_locale":"es",
  "in_benchmark":true
  },
@@ -79,7 +79,7 @@
  "family":"Indo-European",
  "flores_path":"fra_Latn",
  "fleurs_tag":"fr_fr",
- "commonvoice_hours":1073.0,
+ "commonvoice_hours":1065.0,
  "commonvoice_locale":"fr",
  "in_benchmark":true
  },
@@ -103,7 +103,7 @@
  "family":"Indo-European",
  "flores_path":"por_Latn",
  "fleurs_tag":"pt_br",
- "commonvoice_hours":181.0,
+ "commonvoice_hours":180.0,
  "commonvoice_locale":"pt",
  "in_benchmark":true
  },
@@ -115,7 +115,7 @@
  "family":"Indo-European",
  "flores_path":"pan_Guru",
  "fleurs_tag":"pa_in",
- "commonvoice_hours":2.5,
+ "commonvoice_hours":2.3,
  "commonvoice_locale":"pa-IN",
  "in_benchmark":true
  },
@@ -127,7 +127,7 @@
  "family":"Indo-European",
  "flores_path":"rus_Cyrl",
  "fleurs_tag":"ru_ru",
- "commonvoice_hours":247.0,
+ "commonvoice_hours":245.0,
  "commonvoice_locale":"ru",
  "in_benchmark":true
  },
@@ -139,7 +139,7 @@
  "family":"Atlantic-Congo",
  "flores_path":"swh_Latn",
  "fleurs_tag":"sw_ke",
- "commonvoice_hours":412.0,
+ "commonvoice_hours":411.0,
  "commonvoice_locale":"sw",
  "in_benchmark":true
  },
@@ -151,7 +151,7 @@
  "family":"Austronesian",
  "flores_path":"ind_Latn",
  "fleurs_tag":"id_id",
- "commonvoice_hours":34.0,
+ "commonvoice_hours":33.0,
  "commonvoice_locale":"id",
  "in_benchmark":true
  },
@@ -163,7 +163,7 @@
  "family":"Indo-European",
  "flores_path":"deu_Latn",
  "fleurs_tag":"de_de",
- "commonvoice_hours":1372.0,
+ "commonvoice_hours":1369.0,
  "commonvoice_locale":"de",
  "in_benchmark":true
  },
@@ -379,7 +379,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":"ps_af",
- "commonvoice_hours":82.0,
+ "commonvoice_hours":81.0,
  "commonvoice_locale":"ps",
  "in_benchmark":false
  },
@@ -439,7 +439,7 @@
  "family":"Indo-European",
  "flores_path":"pol_Latn",
  "fleurs_tag":"pl_pl",
- "commonvoice_hours":176.0,
+ "commonvoice_hours":175.0,
  "commonvoice_locale":"pl",
  "in_benchmark":true
  },
@@ -619,7 +619,7 @@
  "family":"Indo-European",
  "flores_path":"nld_Latn",
  "fleurs_tag":"nl_nl",
- "commonvoice_hours":123.0,
+ "commonvoice_hours":120.0,
  "commonvoice_locale":"nl",
  "in_benchmark":true
  },
@@ -655,7 +655,7 @@
  "family":"Atlantic-Congo",
  "flores_path":"yor_Latn",
  "fleurs_tag":"yo_ng",
- "commonvoice_hours":6.4,
+ "commonvoice_hours":6.3,
  "commonvoice_locale":"yo",
  "in_benchmark":true
  },
@@ -979,7 +979,7 @@
  "family":"Turkic",
  "flores_path":"kaz_Cyrl",
  "fleurs_tag":"kk_kz",
- "commonvoice_hours":2.3,
+ "commonvoice_hours":2.2,
  "commonvoice_locale":"kk",
  "in_benchmark":true
  },
@@ -1027,7 +1027,7 @@
  "family":"Uralic",
  "flores_path":"hun_Latn",
  "fleurs_tag":"hu_hu",
- "commonvoice_hours":94.0,
+ "commonvoice_hours":93.0,
  "commonvoice_locale":"hu",
  "in_benchmark":true
  },
@@ -1099,7 +1099,7 @@
  "family":"Indo-European",
  "flores_path":"ckb_Arab",
  "fleurs_tag":"ckb_iq",
- "commonvoice_hours":136.0,
+ "commonvoice_hours":135.0,
  "commonvoice_locale":"ckb",
  "in_benchmark":true
  },
@@ -1183,7 +1183,7 @@
  "family":"Indo-European",
  "flores_path":"bel_Cyrl",
  "fleurs_tag":"be_by",
- "commonvoice_hours":1812.0,
+ "commonvoice_hours":1810.0,
  "commonvoice_locale":"be",
  "in_benchmark":true
  },
@@ -1207,7 +1207,7 @@
  "family":"Indo-European",
  "flores_path":"tgk_Cyrl",
  "fleurs_tag":"tg_tj",
- "commonvoice_hours":0.6,
+ "commonvoice_hours":0.4,
  "commonvoice_locale":"tg",
  "in_benchmark":true
  },
@@ -1243,7 +1243,7 @@
  "family":"Indo-European",
  "flores_path":"afr_Latn",
  "fleurs_tag":"af_za",
- "commonvoice_hours":0.6,
+ "commonvoice_hours":0.5,
  "commonvoice_locale":"af",
  "in_benchmark":true
  },
@@ -1291,7 +1291,7 @@
  "family":"Indo-European",
  "flores_path":"cat_Latn",
  "fleurs_tag":"ca_es",
- "commonvoice_hours":2883.0,
+ "commonvoice_hours":2863.0,
  "commonvoice_locale":"ca",
  "in_benchmark":true
  },
@@ -1303,7 +1303,7 @@
  "family":"Afro-Asiatic",
  "flores_path":"heb_Hebr",
  "fleurs_tag":"he_il",
- "commonvoice_hours":2.0,
+ "commonvoice_hours":1.4,
  "commonvoice_locale":"he",
  "in_benchmark":true
  },
@@ -1375,7 +1375,7 @@
  "family":"Turkic",
  "flores_path":"uig_Arab",
  "fleurs_tag":null,
- "commonvoice_hours":437.0,
+ "commonvoice_hours":411.0,
  "commonvoice_locale":"ug",
  "in_benchmark":true
  },
@@ -1519,7 +1519,7 @@
  "family":"Indo-European",
  "flores_path":"kmr_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":71.0,
+ "commonvoice_hours":69.0,
  "commonvoice_locale":"kmr",
  "in_benchmark":true
  },
@@ -1555,7 +1555,7 @@
  "family":"Indo-European",
  "flores_path":"slk_Latn",
  "fleurs_tag":"sk_sk",
- "commonvoice_hours":52.0,
+ "commonvoice_hours":51.0,
  "commonvoice_locale":"sk",
  "in_benchmark":true
  },
@@ -1675,7 +1675,7 @@
  "family":"Tupian",
  "flores_path":"gug_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":4.5,
+ "commonvoice_hours":4.0,
  "commonvoice_locale":"gn",
  "in_benchmark":true
  },
@@ -1747,7 +1747,7 @@
  "family":"Indo-European",
  "flores_path":"nob_Latn",
  "fleurs_tag":"nb_no",
- "commonvoice_hours":1.8,
+ "commonvoice_hours":0.5,
  "commonvoice_locale":"nb-NO",
  "in_benchmark":true
  },
@@ -2155,7 +2155,7 @@
  "family":"Kartvelian",
  "flores_path":"kat_Geor",
  "fleurs_tag":"ka_ge",
- "commonvoice_hours":167.0,
+ "commonvoice_hours":166.0,
  "commonvoice_locale":"ka",
  "in_benchmark":true
  },
@@ -2167,7 +2167,7 @@
  "family":"Indo-European",
  "flores_path":"glg_Latn",
  "fleurs_tag":"gl_es",
- "commonvoice_hours":164.0,
+ "commonvoice_hours":117.0,
  "commonvoice_locale":"gl",
  "in_benchmark":true
  },
@@ -2323,7 +2323,7 @@
  "family":"Dravidian",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":11.0,
+ "commonvoice_hours":1.2,
  "commonvoice_locale":"brh",
  "in_benchmark":false
  },
@@ -2623,7 +2623,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":11.0,
+ "commonvoice_hours":0.9,
  "commonvoice_locale":"haz",
  "in_benchmark":false
  },
@@ -2695,7 +2695,7 @@
  "family":"Indo-European",
  "flores_path":"oci_Latn",
  "fleurs_tag":"oc_fr",
- "commonvoice_hours":1.9,
+ "commonvoice_hours":1.8,
  "commonvoice_locale":"oc",
  "in_benchmark":true
  },
@@ -3175,8 +3175,8 @@
  "family":"Atlantic-Congo",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"seh",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -3319,8 +3319,8 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"mfe",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -3331,7 +3331,7 @@
  "family":"Indo-European",
  "flores_path":"gle_Latn",
  "fleurs_tag":"ga_ie",
- "commonvoice_hours":9.3,
+ "commonvoice_hours":8.3,
  "commonvoice_locale":"ga-IE",
  "in_benchmark":true
  },
@@ -3487,7 +3487,7 @@
  "family":"Indo-European",
  "flores_path":"lvs_Latn",
  "fleurs_tag":"lv_lv",
- "commonvoice_hours":263.0,
+ "commonvoice_hours":262.0,
  "commonvoice_locale":"lv",
  "in_benchmark":true
  },
@@ -3535,7 +3535,7 @@
  "family":null,
  "flores_path":"eus_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":453.0,
+ "commonvoice_hours":440.0,
  "commonvoice_locale":"eu",
  "in_benchmark":true
  },
@@ -3559,7 +3559,7 @@
  "family":"Abkhaz-Adyge",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":106.0,
+ "commonvoice_hours":83.0,
  "commonvoice_locale":"kbd",
  "in_benchmark":false
  },
@@ -3679,7 +3679,7 @@
  "family":"Indo-European",
  "flores_path":"ydd_Hebr",
  "fleurs_tag":null,
- "commonvoice_hours":1.8,
+ "commonvoice_hours":0.7,
  "commonvoice_locale":"yi",
  "in_benchmark":true
  },
@@ -3991,8 +3991,8 @@
  "family":"Atlantic-Congo",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"gaa",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -4099,8 +4099,8 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":0.0,
- "commonvoice_locale":"pcd",
+ "commonvoice_hours":null,
+ "commonvoice_locale":null,
  "in_benchmark":false
  },
  {
@@ -4351,7 +4351,7 @@
  "family":"Indo-European",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":30.0,
+ "commonvoice_hours":29.0,
  "commonvoice_locale":"br",
  "in_benchmark":false
  },
@@ -4651,7 +4651,7 @@
  "family":"Abkhaz-Adyge",
  "flores_path":null,
  "fleurs_tag":null,
- "commonvoice_hours":32.0,
+ "commonvoice_hours":30.0,
  "commonvoice_locale":"ady",
  "in_benchmark":false
  },
@@ -5011,7 +5011,7 @@
  "family":"Nakh-Daghestanian",
  "flores_path":"dar_Cyrl",
  "fleurs_tag":null,
- "commonvoice_hours":1.3,
+ "commonvoice_hours":0.0,
  "commonvoice_locale":"dar",
  "in_benchmark":true
  },
@@ -7879,7 +7879,7 @@
  "family":"Artificial Language",
  "flores_path":"epo_Latn",
  "fleurs_tag":null,
- "commonvoice_hours":1437.0,
+ "commonvoice_hours":1436.0,
  "commonvoice_locale":"eo",
  "in_benchmark":true
  },
models.json CHANGED
@@ -20,15 +20,15 @@
  ]
  },
  {
- "id":"anthropic\/claude-3-haiku",
- "name":"Claude 3 Haiku",
+ "id":"anthropic\/claude-3.5-sonnet",
+ "name":"Claude 3.5 Sonnet",
  "provider_name":"Anthropic",
- "cost":1.25,
+ "cost":15.0,
  "hf_id":null,
  "size":null,
  "type":"closed-source",
  "license":null,
- "creation_date":1710288000000,
+ "creation_date":1729555200000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -79,51 +79,11 @@
  "mgsm"
  ]
  },
- {
- "id":"arliai\/qwq-32b-arliai-rpr-v1",
- "name":"QwQ 32B RpR v1",
- "provider_name":"ArliAI",
- "cost":0.0,
- "hf_id":"ArliAI\/QwQ-32B-ArliAI-RpR-v1",
- "size":32763876352.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1743984000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"cohere\/command-r-08-2024",
- "name":"Command R (08-2024)",
- "provider_name":"Cohere",
- "cost":0.6,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1724976000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"deepseek\/deepseek-chat",
  "name":"DeepSeek V3",
  "provider_name":"DeepSeek",
- "cost":0.8,
+ "cost":0.0,
  "hf_id":"deepseek-ai\/DeepSeek-V3",
  "size":684531386000.0,
  "type":"open-source",
@@ -159,26 +119,6 @@
  "mgsm"
  ]
  },
- {
- "id":"deepseek\/deepseek-chat-v3.1",
- "name":"DeepSeek V3.1",
- "provider_name":"DeepSeek",
- "cost":0.0,
- "hf_id":"deepseek-ai\/DeepSeek-V3.1",
- "size":684531386000.0,
- "type":"open-source",
- "license":"Mit",
- "creation_date":1755734400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"deepseek\/deepseek-r1",
  "name":"R1",
@@ -200,15 +140,15 @@
  ]
  },
  {
- "id":"deepseek\/deepseek-r1-0528-qwen3-8b",
- "name":"Deepseek R1 0528 Qwen3 8B",
+ "id":"deepseek\/deepseek-r1-0528",
+ "name":"R1 0528",
  "provider_name":"DeepSeek",
  "cost":0.0,
- "hf_id":"deepseek-ai\/DeepSeek-R1-0528-Qwen3-8B",
- "size":8190735360.0,
+ "hf_id":"deepseek-ai\/DeepSeek-R1-0528",
+ "size":684531386000.0,
  "type":"open-source",
  "license":"Mit",
- "creation_date":1748476800000,
+ "creation_date":1748390400000.0,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -280,15 +220,145 @@
  ]
  },
  {
- "id":"google\/gemma-3-12b-it",
- "name":"Gemma 3 12B",
+ "id":"google\/gemini-2.5-flash-lite-preview-06-17",
+ "name":"Gemini 2.5 Flash Lite Preview 06-17",
  "provider_name":"Google",
- "cost":0.0,
- "hf_id":"google\/gemma-3-12b-it",
- "size":12187325040.0,
- "type":"open-source",
- "license":"Gemma",
- "creation_date":1740787200000,
+ "cost":0.4,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1750118400000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-flash-preview",
+ "name":"Gemini 2.5 Flash Preview 04-17",
+ "provider_name":"Google",
+ "cost":0.6,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1744848000000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-flash-preview-05-20",
+ "name":"Gemini 2.5 Flash Preview 05-20",
+ "provider_name":"Google",
+ "cost":0.6,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1747699200000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro",
+ "name":"Gemini 2.5 Pro",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1750118400000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro-preview",
+ "name":"Gemini 2.5 Pro Preview 06-05",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1749081600000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-2.5-pro-preview-05-06",
+ "name":"Gemini 2.5 Pro Preview 05-06",
+ "provider_name":"Google",
+ "cost":10.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1746576000000.0,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-flash-1.5",
+ "name":"Gemini 1.5 Flash ",
+ "provider_name":"Google",
+ "cost":0.3,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1715644800000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
+ {
+ "id":"google\/gemini-flash-1.5-8b",
+ "name":"Gemini 1.5 Flash 8B",
+ "provider_name":"Google",
+ "cost":0.15,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1727913600000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -319,6 +389,41 @@
  "mgsm"
  ]
  },
+ {
+ "id":"google\/translate-v2",
+ "name":"Google Translate",
+ "provider_name":"Google",
+ "cost":20.0,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":null,
+ "tasks":[
+ "translation_from",
+ "translation_to"
+ ]
+ },
+ {
+ "id":"gryphe\/mythomax-l2-13b",
+ "name":"MythoMax 13B",
+ "provider_name":"MythoMax 13B",
+ "cost":0.07,
+ "hf_id":"Gryphe\/MythoMax-L2-13b",
+ "size":null,
+ "type":"open-source",
+ "license":"Other",
+ "creation_date":1691625600000,
+ "tasks":[
+ "translation_from",
+ "translation_to",
+ "classification",
+ "mmlu",
+ "arc",
+ "truthfulqa",
+ "mgsm"
+ ]
+ },
  {
  "id":"meta-llama\/llama-3-70b-instruct",
  "name":"Llama 3 70B Instruct",
@@ -359,6 +464,30 @@
  "mgsm"
  ]
  },
+ {
+ "id":"meta-llama\/llama-3.1-8b-instruct",
+ "name":"Llama 3.1 8B Instruct",
+ "provider_name":"Meta",
+ "cost":0.0,
+ "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
+ "size":8030261248.0,
+ "type":"open-source",
+ "license":"Llama3.1",
+ "creation_date":1721260800000.0,
+ "tasks":null
+ },
+ {
+ "id":"meta-llama\/llama-3.2-1b-instruct",
+ "name":"Llama 3.2 1B Instruct",
+ "provider_name":"Meta",
+ "cost":0.0,
+ "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
+ "size":1235814400.0,
+ "type":"open-source",
+ "license":"Llama3.2",
+ "creation_date":1726617600000.0,
+ "tasks":null
+ },
  {
  "id":"meta-llama\/llama-3.3-70b-instruct",
  "name":"Llama 3.3 70B Instruct",
@@ -439,26 +568,6 @@
  "mgsm"
  ]
  },
- {
- "id":"mistralai\/mistral-7b-instruct-v0.3",
- "name":"Mistral 7B Instruct v0.3",
- "provider_name":"Mistral",
- "cost":0.05,
- "hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
- "size":7248023552.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1716336000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"mistralai\/mistral-nemo",
  "name":"Mistral Nemo",
@@ -520,55 +629,15 @@
  ]
  },
  {
- "id":"moonshotai\/kimi-k2",
- "name":"Kimi K2",
- "provider_name":"MoonshotAI",
- "cost":0.0,
- "hf_id":"moonshotai\/Kimi-K2-Instruct",
+ "id":"openai\/gpt-3.5-turbo-0613",
+ "name":"GPT-3.5 Turbo (older v0613)",
+ "provider_name":"OpenAI",
+ "cost":2.0,
+ "hf_id":null,
  "size":null,
- "type":"open-source",
- "license":"Other",
- "creation_date":1752192000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"neversleep\/llama-3-lumimaid-70b",
- "name":"Llama 3 Lumimaid 70B",
- "provider_name":"NeverSleep",
- "cost":6.0,
- "hf_id":"NeverSleep\/Llama-3-Lumimaid-70B-v0.1",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Cc By Nc 4.0",
- "creation_date":1714262400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"nvidia\/llama-3.1-nemotron-70b-instruct",
- "name":"Llama 3.1 Nemotron 70B Instruct",
- "provider_name":"NVIDIA",
- "cost":0.3,
- "hf_id":"nvidia\/Llama-3.1-Nemotron-70B-Instruct-HF",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Llama3.1",
- "creation_date":1728691200000,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1706140800000,
  "tasks":[
  "translation_from",
  "translation_to",
@@ -639,26 +708,6 @@
  "mgsm"
  ]
  },
- {
- "id":"openai\/gpt-4o-2024-11-20",
- "name":"GPT-4o (2024-11-20)",
- "provider_name":"OpenAI",
- "cost":10.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1732060800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"openai\/gpt-4o-mini",
  "name":"GPT-4o-mini",
@@ -679,86 +728,6 @@
  "mgsm"
  ]
  },
- {
- "id":"openai\/gpt-5",
- "name":"GPT-5",
- "provider_name":"OpenAI",
- "cost":10.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1754524800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"openai\/gpt-5-nano",
- "name":"GPT-5 Nano",
- "provider_name":"OpenAI",
- "cost":0.4,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1754524800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"openai\/gpt-oss-120b",
- "name":"gpt-oss-120b",
- "provider_name":"OpenAI",
- "cost":0.0,
- "hf_id":"openai\/gpt-oss-120b",
- "size":120412337472.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1754265600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"qwen\/qwen-2.5-coder-32b-instruct",
- "name":"Qwen2.5 Coder 32B Instruct",
- "provider_name":"Qwen2.5 Coder 32B Instruct (free)",
- "cost":0.0,
- "hf_id":"Qwen\/Qwen2.5-Coder-32B-Instruct",
- "size":32763876352.0,
- "type":"open-source",
- "license":"Apache 2.0",
- "creation_date":1730851200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
  {
  "id":"qwen\/qwen3-235b-a22b",
  "name":"Qwen3 235B A22B",
@@ -803,7 +772,7 @@
  "id":"qwen\/qwen3-32b",
  "name":"Qwen3 32B",
  "provider_name":"Qwen",
- "cost":0.07,
+ "cost":0.0,
  "hf_id":"Qwen\/Qwen3-32B",
  "size":32762123264.0,
  "type":"open-source",
@@ -818,120 +787,5 @@
  "truthfulqa",
  "mgsm"
  ]
- },
- {
- "id":"scb10x\/llama3.1-typhoon2-70b-instruct",
- "name":"Typhoon2 70B Instruct",
- "provider_name":"Typhoon2 70B Instruct",
- "cost":0.88,
- "hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
- "size":70553706496.0,
- "type":"open-source",
- "license":"Llama3.1",
- "creation_date":1734220800000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"tencent\/hunyuan-a13b-instruct",
- "name":"Hunyuan A13B Instruct",
- "provider_name":"Tencent",
- "cost":0.0,
- "hf_id":"tencent\/Hunyuan-A13B-Instruct",
- "size":80393183232.0,
- "type":"open-source",
- "license":"Other",
- "creation_date":1750809600000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"thedrummer\/anubis-pro-105b-v1",
- "name":"Anubis Pro 105B V1",
- "provider_name":"TheDrummer",
- "cost":1.0,
- "hf_id":"TheDrummer\/Anubis-Pro-105B-v1",
- "size":104779882496.0,
- "type":"open-source",
- "license":"Other",
- "creation_date":1738454400000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"x-ai\/grok-4",
- "name":"Grok 4",
- "provider_name":"xAI",
- "cost":15.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":1752019200000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"z-ai\/glm-4.5v",
- "name":"GLM 4.5V",
- "provider_name":"Z.AI",
- "cost":1.8,
- "hf_id":"zai-org\/GLM-4.5V",
- "size":107710933120.0,
- "type":"open-source",
- "license":"Mit",
- "creation_date":1754784000000,
- "tasks":[
- "translation_from",
- "translation_to",
- "classification",
- "mmlu",
- "arc",
- "truthfulqa",
- "mgsm"
- ]
- },
- {
- "id":"google\/translate-v2",
- "name":"Google Translate",
- "provider_name":"Google",
- "cost":20.0,
- "hf_id":null,
- "size":null,
- "type":"closed-source",
- "license":null,
- "creation_date":null,
- "tasks":[
- "translation_from",
- "translation_to"
- ]
  }
  ]
pyproject.toml CHANGED
@@ -44,6 +44,3 @@ dev = [
  "scipy>=1.16.0",
  "seaborn>=0.13.2",
  ]
-
- [tool.uv]
- package = true
uv.lock CHANGED
The diff for this file is too large to render. See raw diff