Commit e32fd78 · David Pomerenke committed · 1 parent: e9a19be

Filter UI setup

Files changed:
- app.py +233 −123
- pyproject.toml +1 −0
- uv.lock +14 −0
app.py
CHANGED
Old version (left-hand side of the side-by-side diff). Unchanged context is repeated in the new version below; only the hunk headers and the removed lines that survive the page export are kept here:

@@ -6,95 +6,98 @@ import pandas as pd
-languages_with_scores = [
-    lang for lang in languages if lang["t2t_score"] is not None
-]
-METRICS =  (previously a flat list of metric definitions; the list entries did not survive the export)
-]
@@ -136,7 +139,10 @@ def create_leaderboard_df(metric):
@@ -164,7 +170,8 @@ def create_leaderboard_df(metric):
@@ -218,7 +225,9 @@ def create_leaderboard_df(metric):
-    top_languages = sorted(
@@ -266,10 +275,14 @@ def create_language_stats_df(metric):
-        best_model =
@@ -340,7 +353,9 @@ def create_language_stats_df(metric):
-    filtered_results = [
@@ -602,25 +617,119 @@ def create_world_map(metric):
-    return gr.Markdown(metric["explanation"])
-    start_metric = METRICS[0]
@@ -639,63 +748,64 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
-    gr.Markdown("""## Methodology ...""", container=True)  (the Methodology text, moved unchanged into a collapsible Accordion in the new version below)
-    metric.change(...)  (four event-wiring calls; re-added below in commented-out form)
| 6 |
import plotly.express as px
|
| 7 |
import plotly.graph_objects as go
|
| 8 |
import pycountry
|
| 9 |
+
from gradio_rangeslider import RangeSlider
|
| 10 |
|
| 11 |
with open("results.json") as f:
|
| 12 |
languages = json.load(f)
|
| 13 |
|
| 14 |
+
languages_with_scores = [lang for lang in languages if lang["t2t_score"] is not None]
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# Global constants for metric mappings
|
| 17 |
+
METRICS = {
|
| 18 |
+
"t2t": [
|
| 19 |
+
{
|
| 20 |
+
"display_name": "Overall Text-to-Text Performance",
|
| 21 |
+
"field_name": "t2t_score",
|
| 22 |
+
"label": "Overall Score",
|
| 23 |
+
"explanation": """
|
| 24 |
**Overall Score for Text-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
|
| 25 |
Higher scores indicate better overall language capabilities.
|
| 26 |
""",
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"display_name": "Translation (BLEU)",
|
| 30 |
+
"field_name": "mt_bleu",
|
| 31 |
+
"label": "BLEU Score",
|
| 32 |
+
"explanation": """
|
| 33 |
**Translation BLEU**: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
|
| 34 |
It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
|
| 35 |
""",
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"display_name": "Translation (ChrF)",
|
| 39 |
+
"field_name": "mt_chrf",
|
| 40 |
+
"label": "ChrF Score",
|
| 41 |
+
"explanation": """
|
| 42 |
**Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level.
|
| 43 |
This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
|
| 44 |
Higher scores (0-1) indicate better translations.
|
| 45 |
""",
|
| 46 |
+
},
|
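The two translation metrics defined above can be reproduced with the sacrebleu package — an assumption for illustration; the Space's own scoring code is not part of this diff. sacrebleu reports on a 0–100 scale, so the sketch divides by 100 to match the 0–1 range described in the explanations:

```python
# Sketch of the two translation metrics using sacrebleu (assumed library,
# not necessarily the one used by this Space's evaluation pipeline).
from sacrebleu import sentence_bleu, sentence_chrf

hypothesis = "The cat sits on the mat."
reference = "The cat is sitting on the mat."

# sacrebleu scores are 0-100; divide by 100 for the 0-1 range used above.
mt_bleu = sentence_bleu(hypothesis, [reference]).score / 100
mt_chrf = sentence_chrf(hypothesis, [reference]).score / 100
print(f"mt_bleu={mt_bleu:.3f} mt_chrf={mt_chrf:.3f}")
```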
| 47 |
+
{
|
| 48 |
+
"display_name": "Classification (Accuracy)",
|
| 49 |
+
"field_name": "cls_acc",
|
| 50 |
+
"label": "Classification Accuracy",
|
| 51 |
+
"explanation": """
|
| 52 |
**Classification Accuracy**: Measures how accurately models can classify text into predefined categories.
|
| 53 |
This evaluates a model's understanding of content and context across different languages.
|
| 54 |
Reported as a percentage where higher values indicate better classification performance.
|
| 55 |
""",
|
| 56 |
+
},
|
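Classification accuracy as described above is simply the share of correctly labelled examples; a minimal sketch (the benchmark's own scoring code is not shown in this commit):

```python
# Accuracy = correct predictions / total predictions (sketch).
def accuracy(predictions, labels):
    assert len(predictions) == len(labels)
    return sum(p == t for p, t in zip(predictions, labels)) / len(labels)

print(accuracy([2, 0, 1, 1], [2, 0, 3, 1]))  # 3 of 4 correct -> 0.75
```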
| 57 |
+
{
|
| 58 |
+
"display_name": "Masked Language Modeling (ChrF)",
|
| 59 |
+
"field_name": "mlm_chrf",
|
| 60 |
+
"label": "MLM ChrF Score",
|
| 61 |
+
"explanation": """
|
| 62 |
**Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text.
|
| 63 |
This tests a model's understanding of language structure and semantics by measuring the character-level similarity
|
| 64 |
between predicted and actual text. Higher scores indicate better language understanding.
|
| 65 |
""",
|
| 66 |
+
},
|
| 67 |
+
],
|
| 68 |
+
"s2t": [
|
| 69 |
+
{
|
| 70 |
+
"display_name": "Overall Speech-to-Text Performance",
|
| 71 |
+
"field_name": "s2t_score",
|
| 72 |
+
"label": "Overall Score",
|
| 73 |
+
"explanation": """
|
| 74 |
**Overall Score for Speech-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
|
| 75 |
Higher scores indicate better overall language capabilities.
|
| 76 |
""",
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"display_name": "Automatic Speech Recognition (WER)",
|
| 80 |
+
"field_name": "asr_wer",
|
| 81 |
+
"label": "WER",
|
| 82 |
+
"explanation": """
|
| 83 |
**Automatic Speech Recognition Word Error Rate**: Measures the accuracy of speech-to-text transcription.
|
| 84 |
It calculates the minimum number of word edits (insertions, deletions, substitutions) needed to transform the
|
| 85 |
transcription into the reference text, divided by the number of words in the reference.
|
| 86 |
Lower scores indicate better performance, with 0 being perfect transcription.
|
| 87 |
""",
|
| 88 |
+
},
|
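Word error rate as defined above can be computed, for example, with the jiwer package (an assumption; the ASR scoring code is not part of this diff). With two substitutions against a four-word reference, WER = (2 + 0 + 0) / 4 = 0.5:

```python
# WER sketch using jiwer (assumed library, not confirmed by this commit).
import jiwer

reference = "turn the lights off"
hypothesis = "turn the light of"

# Two substitutions ("lights"->"light", "off"->"of"), no insertions/deletions:
# WER = (S + D + I) / N = (2 + 0 + 0) / 4 = 0.5
print(jiwer.wer(reference, hypothesis))
```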
| 89 |
+
{
|
| 90 |
+
"display_name": "Automatic Speech Recognition ChrF",
|
| 91 |
+
"field_name": "asr_chrf",
|
| 92 |
+
"label": "ChrF",
|
| 93 |
+
"explanation": """
|
| 94 |
**Automatic Speech Recognition ChrF**: Character n-gram F-score evaluates transcriptions at the character level rather than word level.
|
| 95 |
This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
|
| 96 |
Higher scores (0-1) indicate better transcriptions.
|
| 97 |
""",
|
| 98 |
+
},
|
| 99 |
+
],
|
| 100 |
+
}
|
| 101 |
|
| 102 |
|
| 103 |
def mean(lst):
|
|
|
|
| 139 |
"Mid-Resource": [],
|
| 140 |
"Low-Resource": [],
|
| 141 |
}
|
| 142 |
+
# Check if the metric field exists in the score dictionary before accessing it
|
| 143 |
+
if metric["field_name"] in score:
|
| 144 |
+
model_scores[model][category].append(score[metric["field_name"]])
|
| 145 |
+
# If the metric is missing, we'll skip this score
|
| 146 |
|
| 147 |
# Calculate average scores and create DataFrame
|
| 148 |
leaderboard_data = []
|
|
|
|
| 170 |
+ categories["Mid-Resource"]
|
| 171 |
+ categories["Low-Resource"]
|
| 172 |
)
|
| 173 |
+
# Check if all_scores is empty to avoid division by zero
|
| 174 |
+
overall_avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else 0
|
| 175 |
|
| 176 |
model_name = model.split("/")[-1]
|
| 177 |
leaderboard_data.append(
|
|
|
|
| 225 |
|
| 226 |
|
| 227 |
def create_model_comparison_plot(metric):
|
| 228 |
+
top_languages = sorted(
|
| 229 |
+
languages_with_scores, key=lambda x: x["speakers"], reverse=True
|
| 230 |
+
)[:10]
|
| 231 |
|
| 232 |
# Create appropriate title and y-axis label based on metric
|
| 233 |
title = f"{metric['display_name']} by Model and Language"
|
|
|
|
| 275 |
|
| 276 |
for lang in languages:
|
| 277 |
# Find the best model and its BLEU score
|
| 278 |
+
best_model = (
|
| 279 |
+
max(
|
| 280 |
+
lang["scores"] or [{"t2t_score": None, "model": None}],
|
| 281 |
+
key=lambda x: x.get("t2t_score", 0),
|
| 282 |
+
)
|
| 283 |
+
if lang["t2t_score"] is not None
|
| 284 |
+
else None
|
| 285 |
+
)
|
| 286 |
|
| 287 |
model = best_model["model"] if best_model else None
|
| 288 |
model_name = model.split("/")[-1] if model else "N/A"
|
|
|
|
| 353 |
|
| 354 |
def create_scatter_plot(metric):
|
| 355 |
# Filter results to include only languages with sufficient speakers
|
| 356 |
+
filtered_results = [
|
| 357 |
+
lang for lang in languages_with_scores if lang["speakers"] >= 10_000
|
| 358 |
+
]
|
| 359 |
|
| 360 |
# Create a list to store data for the scatter plot
|
| 361 |
scatter_data = []
|
|
|
|
| 617 |
|
| 618 |
|
| 619 |
def create_metric_explanation(metric):
|
| 620 |
+
return gr.Markdown(metric["explanation"], container=True)
|
| 621 |
+
|
| 622 |
|
| 623 |
|
| 624 |
# Create the visualization components
|
| 625 |
with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
|
| 626 |
gr.Markdown("# AI Language Proficiency Benchmark")
|
| 627 |
gr.Markdown("Comparing language proficiency across different models and languages.")
|
|
|
|
| 628 |
|
| 629 |
+
language_choices = [
|
| 630 |
+
f"{lang['language_name']} ({lang['bcp_47']})" for lang in languages
|
| 631 |
+
]
|
| 632 |
+
models = {score["model"] for lang in languages for score in lang["scores"]}
|
| 633 |
+
search = gr.Dropdown(
|
| 634 |
+
choices=list(models) + language_choices,
|
| 635 |
+
value=None,
|
| 636 |
+
label="Search for Language or Model",
|
| 637 |
interactive=True,
|
| 638 |
)
|
| 639 |
+
with gr.Row():
|
| 640 |
+
with gr.Column():
|
| 641 |
+
with gr.Accordion("Model Filters", open=False):
|
| 642 |
+
model_type = gr.Radio(
|
| 643 |
+
choices=["Text-to-Text", "Speech-to-Text"],
|
| 644 |
+
value="Text-to-Text",
|
| 645 |
+
label="Select Model Type",
|
| 646 |
+
interactive=True,
|
| 647 |
+
)
|
| 648 |
+
model_licenses = gr.CheckboxGroup(
|
| 649 |
+
choices=["open source", "commercial"],
|
| 650 |
+
value=["open source", "commercial"],
|
| 651 |
+
label="Filter by Model License",
|
| 652 |
+
interactive=True,
|
| 653 |
+
)
|
| 654 |
+
model_sizes = RangeSlider(
|
| 655 |
+
minimum=0,
|
| 656 |
+
maximum=1000,
|
| 657 |
+
value=(0, 1000),
|
| 658 |
+
label="Filter by Model Size (in Billion Parameters)",
|
| 659 |
+
interactive=True,
|
| 660 |
+
)
|
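The license and size filters above are pure UI in this commit; nothing reads their values yet. A sketch of how they might later be applied, assuming hypothetical per-model metadata fields license and size_b that do not appear in this diff:

```python
# Hypothetical filter over model metadata; "license" and "size_b" (parameter
# count in billions) are assumed fields, not part of results.json in this diff.
def filter_models(models_meta, allowed_licenses, size_range):
    low, high = size_range  # the RangeSlider value is a (min, max) tuple
    return [
        m
        for m in models_meta
        if m["license"] in allowed_licenses and low <= m["size_b"] <= high
    ]
```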
| 661 |
+
|
| 662 |
+
with gr.Column():
|
| 663 |
+
with gr.Accordion("Language Filters", open=False):
|
| 664 |
+
unit_of_analysis = gr.Radio(
|
| 665 |
+
choices=["Languages", "Language Families", "Regions"],
|
| 666 |
+
value="Languages",
|
| 667 |
+
label="Select Unit of Analysis",
|
| 668 |
+
interactive=True,
|
| 669 |
+
)
|
| 670 |
+
region_filter = gr.CheckboxGroup(
|
| 671 |
+
choices=[
|
| 672 |
+
"Africa",
|
| 673 |
+
"Asia",
|
| 674 |
+
"Europe",
|
| 675 |
+
"North America",
|
| 676 |
+
"South America",
|
| 677 |
+
"Oceania",
|
| 678 |
+
],
|
| 679 |
+
value=[
|
| 680 |
+
"Africa",
|
| 681 |
+
"Asia",
|
| 682 |
+
"Europe",
|
| 683 |
+
"North America",
|
| 684 |
+
"South America",
|
| 685 |
+
"Oceania",
|
| 686 |
+
],
|
| 687 |
+
label="Filter by Region",
|
| 688 |
+
interactive=True,
|
| 689 |
+
)
|
| 690 |
+
family_filter = gr.CheckboxGroup(
|
| 691 |
+
choices=[
|
| 692 |
+
"Indo-European",
|
| 693 |
+
"Sino-Tibetan",
|
| 694 |
+
"Afro-Asiatic",
|
| 695 |
+
"Dravidian",
|
| 696 |
+
"Uralic",
|
| 697 |
+
"Austronesian",
|
| 698 |
+
"Other",
|
| 699 |
+
],
|
| 700 |
+
value=[
|
| 701 |
+
"Indo-European",
|
| 702 |
+
"Sino-Tibetan",
|
| 703 |
+
"Afro-Asiatic",
|
| 704 |
+
"Dravidian",
|
| 705 |
+
"Uralic",
|
| 706 |
+
"Austronesian",
|
| 707 |
+
"Other",
|
| 708 |
+
],
|
| 709 |
+
label="Filter by Language Family",
|
| 710 |
+
interactive=True,
|
| 711 |
+
)
|
| 712 |
+
speakers_filter = RangeSlider(
|
| 713 |
+
minimum=0,
|
| 714 |
+
maximum=100_000_000,
|
| 715 |
+
value=(0, 100_000_000),
|
| 716 |
+
label="Filter by Number of Speakers",
|
| 717 |
+
interactive=True,
|
| 718 |
+
)
|
| 719 |
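The language filters are likewise not wired to any output yet. Since the RangeSlider passes its value as a (min, max) tuple — as the value=(0, 100_000_000) default above suggests — a later change handler could filter the existing languages_with_scores list like this (sketch; the component it should update is left open, mirroring the commented-out metric.change calls further down):

```python
# Sketch: filter languages by the speaker range coming from the RangeSlider.
def filter_by_speakers(speaker_range):
    low, high = speaker_range  # RangeSlider delivers a (min, max) tuple
    return [
        lang
        for lang in languages_with_scores
        if low <= lang["speakers"] <= high
    ]

# speakers_filter.change(fn=..., inputs=speakers_filter, outputs=...)  # wiring
# left out here, like the commented-out metric.change calls below.
```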
+
with gr.Row():
|
| 720 |
+
start_metric = METRICS["t2t"][0]
|
| 721 |
+
metric = gr.Dropdown(
|
| 722 |
+
choices=[metric["display_name"] for metric in METRICS["t2t"]],
|
| 723 |
+
value=start_metric["display_name"],
|
| 724 |
+
label="Main metric to display in figures and map",
|
| 725 |
+
interactive=True,
|
| 726 |
+
)
|
| 727 |
+
|
| 728 |
+
metric_explanation = create_metric_explanation(start_metric)
|
| 729 |
|
| 730 |
gr.Markdown("## Model Comparison")
|
| 731 |
+
create_leaderboard_df(start_metric)
|
| 732 |
+
|
| 733 |
model_comparison_plot = gr.Plot(
|
| 734 |
value=create_model_comparison_plot(start_metric),
|
| 735 |
label="Model Comparison",
|
|
|
|
| 748 |
elem_classes="fullwidth-plot",
|
| 749 |
)
|
| 750 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
def update_component(fn, metric_choice):
|
| 752 |
metric = [m for m in METRICS if m["display_name"] == metric_choice][0]
|
| 753 |
return fn(metric)
|
| 754 |
|
| 755 |
+
# metric.change(
|
| 756 |
+
# fn=partial(update_component, create_metric_explanation),
|
| 757 |
+
# inputs=metric,
|
| 758 |
+
# outputs=metric_explanation,
|
| 759 |
+
# )
|
| 760 |
+
# metric.change(
|
| 761 |
+
# fn=partial(update_component, create_model_comparison_plot),
|
| 762 |
+
# inputs=metric,
|
| 763 |
+
# outputs=model_comparison_plot,
|
| 764 |
+
# )
|
| 765 |
+
# metric.change(
|
| 766 |
+
# fn=partial(update_component, create_scatter_plot),
|
| 767 |
+
# inputs=metric,
|
| 768 |
+
# outputs=scatter_plot,
|
| 769 |
+
# )
|
| 770 |
+
# metric.change(
|
| 771 |
+
# fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
|
| 772 |
+
# )
|
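These handlers are left commented out, presumably until the new filters are hooked up. Before re-enabling them, note that update_component above still searches METRICS as if it were a flat list, while this commit turns METRICS into a dict keyed by "t2t" and "s2t". A sketch of a lookup that fits the new structure, plus one of the wiring calls restored (assuming functools.partial is imported at the top of the file):

```python
# Sketch: look up a metric across both groups of the new METRICS dict, then
# re-enable one of the commented-out handlers above.
def update_component(fn, metric_choice):
    metric = next(
        m
        for group in METRICS.values()  # iterate the "t2t" and "s2t" lists
        for m in group
        if m["display_name"] == metric_choice
    )
    return fn(metric)

metric.change(
    fn=partial(update_component, create_model_comparison_plot),
    inputs=metric,
    outputs=model_comparison_plot,
)
```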
| 773 |
+
|
| 774 |
+
with gr.Accordion("Methodology", open=False):
|
| 775 |
+
gr.Markdown(
|
| 776 |
+
"""
|
| 777 |
+
## Methodology
|
| 778 |
+
|
| 779 |
+
### Benchmark Data
|
| 780 |
+
We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
|
| 781 |
+
|
| 782 |
+
Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package.
|
| 783 |
+
|
| 784 |
+
### AI Models
|
| 785 |
+
We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API.
|
| 786 |
+
|
| 787 |
+
### Evaluation Tasks
|
| 788 |
+
Our benchmark includes three core tasks to assess different aspects of language understanding:
|
| 789 |
+
|
| 790 |
+
1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using:
|
| 791 |
+
- [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty
|
| 792 |
+
- [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations
|
| 793 |
+
|
| 794 |
+
2. **Text Classification**: Models classify text into predefined topics after being shown examples. We:
|
| 795 |
+
- Group sentences by URL into paragraphs with the same topic
|
| 796 |
+
- Use the 5 most common topics, encoded as numbers rather than English labels
|
| 797 |
+
- Provide 5 examples of each topic as few-shot examples
|
| 798 |
+
- Test the model's ability to classify new text
|
| 799 |
+
- Report accuracy as the primary metric
|
| 800 |
+
|
| 801 |
+
3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We:
|
| 802 |
+
- Mask approximately 5% of each sentence at a random position
|
| 803 |
+
- Provide 10 examples of complete sentences paired with masked versions in a few-shot setting
|
| 804 |
+
- Evaluate predictions using ChrF score against the original text
|
| 805 |
+
|
| 806 |
+
The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
|
| 807 |
+
""",
|
| 808 |
+
container=True,
|
| 809 |
+
)
|
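The masked-language-modelling task described in the Methodology above (mask roughly 5% of a sentence at a random position, then score the model's completion with ChrF against the original) can be illustrated with a small sketch; the benchmark's actual masking code is not part of this diff, so the word-level masking below is an assumption:

```python
# Sketch of the masking step from the Methodology: hide ~5% of a sentence at a
# random position and keep the original as the ChrF reference.
import random

def mask_sentence(sentence, fraction=0.05):
    words = sentence.split()
    n_masked = max(1, round(len(words) * fraction))
    start = random.randrange(len(words) - n_masked + 1)
    masked = words[:start] + ["<mask>"] + words[start + n_masked :]
    return " ".join(masked), sentence  # (model input, reference for ChrF)

masked, reference = mask_sentence(
    "The quick brown fox jumps over the lazy dog near the old river bank"
)
print(masked)
```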
| 810 |
|
| 811 |
demo.launch()
|
pyproject.toml
CHANGED
@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
+    "gradio-rangeslider>=0.0.8",
     "gradio>=5.16.2",
     "language-data>=1.3.0",
     "pandas>=2.2.3",
uv.lock
CHANGED
@@ -695,6 +695,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/16/52/4fe9dfc2239e7b748ad8dc3b80ad8755f5c9378432715193586c3ab74bf9/gradio_client-1.7.1-py3-none-any.whl", hash = "sha256:d7737bc473a2093549c06004379c42f0a3510a98095cf7cea9033837e252149f", size = 321994 },
 ]

+[[package]]
+name = "gradio-rangeslider"
+version = "0.0.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "gradio" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/88/93/aa1723076cc1056279e43f46d804f663c794f2e098c0650d150b46372f62/gradio_rangeslider-0.0.8.tar.gz", hash = "sha256:414fd4b093d4327cc86c51b4e81e53df19664023f95bed95e3ecc03071fa1ea0", size = 1263683 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/44/14f759678a76ffbd6e3fe8852a4f758bc5ad51b379c67b5d167cfde752f6/gradio_rangeslider-0.0.8-py3-none-any.whl", hash = "sha256:3728c44e58ec1bff0bdf236cc84f12b183fbd596fb4714d8b797585a0515f89e", size = 1224388 },
+]
+
 [[package]]
 name = "h11"
 version = "0.14.0"
@@ -958,6 +970,7 @@ version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
     { name = "gradio" },
+    { name = "gradio-rangeslider" },
     { name = "language-data" },
     { name = "pandas" },
     { name = "plotly" },
@@ -987,6 +1000,7 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "gradio", specifier = ">=5.16.2" },
+    { name = "gradio-rangeslider" },
     { name = "language-data", specifier = ">=1.3.0" },
     { name = "pandas", specifier = ">=2.2.3" },
     { name = "plotly", specifier = ">=6.0.0" },