|
|
from dataclasses import dataclass
from enum import Enum
from typing import Optional
|
|
|
|
|
|
|
|
@dataclass
class Task:
    """Metadata for a single leaderboard column.

    Attributes:
        benchmark: Identifier of the benchmark the score comes from
            (e.g. "glue", "blimp", "reading").
        task_name: Sub-task within the benchmark, or ``None`` when the
            entry aggregates the whole benchmark (every benchmark-level
            entry in the Tasks enums passes ``None``).
        metric: Display name of the metric reported for this column
            (e.g. "acc", "f1", "MSE").
        col_name: Column header shown on the leaderboard UI.
        displayed_by_default: Whether the column is visible without the
            user opting in; sub-task columns set this to ``False``.
    """

    benchmark: str
    # Fix: was annotated plain ``str``, but every benchmark-level entry
    # passes None — the honest type is Optional[str].
    task_name: Optional[str]
    metric: str
    col_name: str
    displayed_by_default: bool = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Tasks(Enum):
    """Tasks shown on the text-only leaderboard tracks.

    Each member's value is a :class:`Task` describing one leaderboard
    column. Members named ``benchmarkN`` are benchmark-level aggregates
    (task_name is None, shown by default); members named ``taskN`` are
    sub-task columns hidden by default.
    """

    # NOTE(review): there is no benchmark3 — presumably a benchmark was
    # removed at some point; confirm the gap in numbering is intentional.
    benchmark0 = Task("blimp", None, "acc", "BLiMP")
    benchmark1 = Task("blimp_supplement", None, "acc", "BLiMP Supplement")
    benchmark2 = Task("ewok", None, "acc", "EWoK")
    benchmark4 = Task("entity_tracking", None, "acc", "Entity Tracking")
    benchmark5 = Task("wug_adj", None, "Spearman Correlation", "WUG Adjective Nominalization")
    benchmark6 = Task("wug_past", None, "Spearman Correlation", "WUG Past Tense")
    benchmark7 = Task("comps", None, "acc", "COMPS")
    benchmark8 = Task("reading", None, "delta % R2", "Reading")
    # Reading sub-task columns (hidden unless the user opts in).
    task0 = Task("reading", "spr", "delta % R2", "Self-paced Reading Time", displayed_by_default=False)
    task1 = Task("reading", "rt", "delta % R2", "Eye Tracking", displayed_by_default=False)
    benchmark9 = Task("aoa", None, "MSE", "AoA")
    benchmark10 = Task("glue", None, "acc/f1", "(Super)GLUE")
    # (Super)GLUE sub-task columns (hidden unless the user opts in).
    task2 = Task("glue", "boolq", "acc", "BoolQ", displayed_by_default=False)
    task3 = Task("glue", "mnli", "acc", "MNLI", displayed_by_default=False)
    task4 = Task("glue", "mrpc", "f1", "MRPC", displayed_by_default=False)
    task5 = Task("glue", "multirc", "acc", "MultiRC", displayed_by_default=False)
    task6 = Task("glue", "qqp", "f1", "QQP", displayed_by_default=False)
    task7 = Task("glue", "rte", "acc", "RTE", displayed_by_default=False)
    task8 = Task("glue", "wsc", "acc", "WSC", displayed_by_default=False)
|
|
|
|
|
|
|
|
class TasksMultimodal(Enum):
    """Tasks shown on the multimodal leaderboard track.

    Duplicates every member of ``Tasks`` (text-only evaluations) and
    appends the vision-and-language benchmarks (VQA, Winoground,
    DevBench). Member naming follows the same convention:
    ``benchmarkN`` = aggregate column, ``taskN`` = sub-task column
    hidden by default.
    """

    # NOTE(review): benchmark3 is absent here as well, mirroring the
    # numbering gap in ``Tasks`` — confirm it is intentional.
    benchmark0 = Task("blimp", None, "acc", "BLiMP")
    benchmark1 = Task("blimp_supplement", None, "acc", "BLiMP Supplement")
    benchmark2 = Task("ewok", None, "acc", "EWoK")
    benchmark4 = Task("entity_tracking", None, "acc", "Entity Tracking")
    benchmark5 = Task("wug_adj", None, "Spearman Correlation", "WUG Adjective Nominalization")
    benchmark6 = Task("wug_past", None, "Spearman Correlation", "WUG Past Tense")
    benchmark7 = Task("comps", None, "acc", "COMPS")
    benchmark8 = Task("reading", None, "delta % R2", "Reading")
    # Reading sub-task columns (hidden unless the user opts in).
    task0 = Task("reading", "spr", "delta % R2", "Self-paced Reading Time", displayed_by_default=False)
    task1 = Task("reading", "rt", "delta % R2", "Eye Tracking", displayed_by_default=False)
    benchmark9 = Task("aoa", None, "MSE", "AoA")
    benchmark10 = Task("glue", None, "acc/f1", "(Super)GLUE")
    # (Super)GLUE sub-task columns (hidden unless the user opts in).
    task2 = Task("glue", "boolq", "acc", "BoolQ", displayed_by_default=False)
    task3 = Task("glue", "mnli", "acc", "MNLI", displayed_by_default=False)
    task4 = Task("glue", "mrpc", "f1", "MRPC", displayed_by_default=False)
    task5 = Task("glue", "multirc", "acc", "MultiRC", displayed_by_default=False)
    task6 = Task("glue", "qqp", "f1", "QQP", displayed_by_default=False)
    task7 = Task("glue", "rte", "acc", "RTE", displayed_by_default=False)
    task8 = Task("glue", "wsc", "acc", "WSC", displayed_by_default=False)
    # Multimodal-only aggregate benchmarks.
    benchmark11 = Task("vqa", None, "acc", "VQA")
    benchmark12 = Task("winoground", None, "acc", "Winoground")
    benchmark13 = Task("devbench", None, "acc", "DevBench")
|
|
|
|
|
|
|
|
# Few-shot example count for the reported evaluations (0 = zero-shot).
# NOTE(review): the consumer of this constant is not visible in this file —
# confirm it is still read by the evaluation/display code.
NUM_FEWSHOT = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HTML heading rendered at the top of the leaderboard page.
TITLE = """<h1 align="center" id="space-title">BabyLM 2025 Leaderboards</h1>"""

# Markdown blurb displayed directly under the title.
INTRODUCTION_TEXT = """
This leaderboard displays scores from the 2025 BabyLM Challenge. Each track has its own tab.
"""

# Markdown shown on the "About" tab. Currently identical to
# INTRODUCTION_TEXT — presumably a placeholder to be expanded later.
LLM_BENCHMARKS_TEXT = """
This leaderboard displays scores from the 2025 BabyLM Challenge. Each track has its own tab.
"""
|
|
|
|
|
# Markdown shown on the submission tab, explaining how to validate and
# submit predictions. Fix: user-facing typo "fillout" -> "fill out".
EVALUATION_QUEUE_TEXT = """
## Submission

If you want your submission to go towards the challenge, make sure to fill all the textboxes below.

If you use one of the BabyLM datasets there is no need to fill out the custom dataset information, and leave dropdowns on not applicable.

The other hyperparameter JSON file is not required for submission.

The ⚠️ represents components that need to be filled out for a submission, the 👶 represents the components to fill out for a valid submission to the BabyLM Challenge, and the 🔹 represents optional information to submit.

## Some good practices before submitting a predictions upload:

Make sure you can get scores from your predictions file using the `collate_preds.py` script.

```bash

git clone https://github.com/babylm/evaluation-pipeline-2025/

cd evaluation-pipeline-2025

python -m evaluation_pipeline.collate_preds --model_path_or_name=NAME_OF_YOUR_MODEL --backend=BACKEND

```

If this step fails, follow the error messages to debug your predictions before getting in touch. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.

Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model.

Once these steps have been followed, get in touch with the organizers with your predictions file(s), and the scores you've obtained.

We'll verify that we can match your scores, and then upload to the leaderboard. Optionally, you can give us your preferred model display name for the leaderboard, and a link to your model on HuggingFace.
"""
|
|
|
|
|
# Label rendered above the citation textbox in the UI.
CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2025 BabyLM Call for Papers paper, as well as the authors of the model(s) whose results you cite!"

# BibTeX entry for the 2025 BabyLM call for papers (arXiv:2502.10645).
# Raw string so BibTeX braces/backslashes are preserved verbatim.
CITATION_BUTTON_TEXT = r"""
@misc{charpentier2025babylmturns3papers,
title={BabyLM Turns 3: Call for papers for the 2025 BabyLM workshop},
author={Lucas Charpentier and Leshem Choshen and Ryan Cotterell and Mustafa Omer Gul and Michael Hu and Jaap Jumelet and Tal Linzen and Jing Liu and Aaron Mueller and Candace Ross and Raj Sanjay Shah and Alex Warstadt and Ethan Wilcox and Adina Williams},
year={2025},
eprint={2502.10645},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.10645},
}
"""
|
|
|