# Spaces: Running  (status banner captured together with the page; not part of the program)
import pandas as pd | |
def load_data(path: str = "leaderboard.csv") -> pd.DataFrame:
    """Load the leaderboard table and drop incomplete rows.

    Args:
        path: CSV source handed straight to ``pandas.read_csv``. Defaults to
            ``"leaderboard.csv"`` so existing zero-argument calls keep
            reading the same file.

    Returns:
        The parsed table with every row that contains at least one missing
        value removed.
    """
    # dropna() with its default arguments discards a row as soon as any
    # single column in it is NaN.
    return pd.read_csv(path).dropna()
# Leaderboard rows, loaded once when the module is imported.
df = load_data()

# Distinct model names in first-seen order. Whitespace is stripped *before*
# de-duplicating so that names differing only by surrounding spaces collapse
# into a single entry (dict.fromkeys keeps the first occurrence of each key).
MODELS = list(dict.fromkeys(name.strip() for name in df["Model"]))
# Shared <style> block declaring the CSS custom properties (backgrounds, text
# colors, borders, gradients, shadow) for the dark and the light color scheme.
# HEADER_CONTENT prepends this string to its own markup.
COMMON = """
<style>
@media (prefers-color-scheme: dark) {
:root {
--bg-primary: #0B0B19;
--bg-secondary: rgba(19, 19, 37, 0.4);
--bg-hover: rgba(30, 30, 45, 0.95);
--text-primary: #ffffff;
--text-secondary: #e2e8f0;
--text-tertiary: #e2e8f0;
--card-bg: rgba(17, 17, 27, 0.4);
--border-color: rgba(31, 41, 55, 0.5);
--border-hover: rgba(79, 70, 229, 0.4);
--accent-color: #ffffff;
--accent-bg: rgba(79, 70, 229, 0.1);
--blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);
--orange-gradient: linear-gradient(45deg, #E05205, #FAD8D2);
--green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
--shadow-color: rgba(0, 0, 0, 0.2);
}
}
@media (prefers-color-scheme: light) {
:root {
--bg-primary: #ffffff;
--bg-secondary: rgba(243, 244, 246, 0.4);
--bg-hover: rgba(229, 231, 235, 0.95);
--text-primary: #1F2937;
--text-secondary: #4B5563;
--text-tertiary: #6B7280;
--card-bg: rgba(249, 250, 251, 0.4);
--border-color: rgba(209, 213, 219, 0.5);
--border-hover: rgba(79, 70, 229, 0.4);
--accent-color: #4F46E5;
--accent-bg: rgba(79, 70, 229, 0.1);
--blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);
--orange-gradient: linear-gradient(45deg, #E05205, #FF8340);
--green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
--shadow-color: rgba(0, 0, 0, 0.1);
}
}
</style>
"""
# External resources linked from the header's action buttons.
PAPER_LINK = "https://arxiv.org/abs/2503.02972"
CODE_LINK = "https://github.com/jkhouja/L2"
BLOG_LINK = "https://www.lesswrong.com/posts/pbt8GYpdip7NkuwGy/are-recent-llms-better-at-reasoning-or-better-at-memorizing"
DATASET_LINK = "https://huggingface.co/datasets/jkhouja/LingOly-TOO"

# Gmail "compose" URL with a pre-filled, percent-encoded subject and body,
# written one query parameter per literal (adjacent literals are concatenated).
# NOTE(review): the "[email protected]" segment looks like an e-mail
# obfuscation placeholder rather than a real recipient parameter - confirm the
# intended address before relying on this link.
ADD_MODEL_LINK = (
    "https://mail.google.com/mail/"
    "?view=cm"
    "&fs=1"
    "&[email protected]"
    "&su=Get%20Model%20Added%20to%20Leaderboard"
    "&body=Hi%20there%2C%0A%0AI%20would%20like%20to%20add%20my%20model%20to%20the%20Lingoly-TOO%20Leaderboard."
    "%0A%0AModel%20Name%3A%0AModel%20URL%3A%0A%0ABest%20regards"
)
# Header markup: the shared COMMON variables, the header's own stylesheet,
# then the title, the benchmark description and the row of action links.
# Inside the f-string, doubled braces ({{ }}) are literal CSS braces; the
# single-brace fields interpolate the *_LINK constants into the href values.
HEADER_CONTENT = (
    COMMON
    + f"""
<style>
.header-wrapper {{
position: relative;
background: var(--bg-primary);
border-radius: 16px;
margin-bottom: 0;
transition: all 0.3s ease;
}}
.header-content {{
max-width: 72rem;
margin: 0 auto;
}}
.title-section {{
position: relative;
display: flex;
align-items: center;
justify-content: center;
margin-bottom: 3rem;
}}
.title-gradient {{
font-size: 5rem;
font-weight: 800;
line-height: 1.25;
background: var(--orange-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}}
.title-image {{
position: absolute;
top: 30px;
left: 30px;
width: 100px;
height: 100px;
/* To make it look ok on dark mode */
background-color: #ffffffd0;
padding: 10px;
border-radius: 6px;
}}
.subtitle-white {{
font-size: 5rem;
font-weight: 800;
line-height: 1.1;
color: var(--text-primary);
margin-bottom: 3rem;
transition: color 0.3s ease;
}}
.description {{
color: var(--text-secondary);
font-size: 1.25rem;
line-height: 1.75;
max-width: 800px;
margin: 0 auto;
text-align: center;
transition: color 0.3s ease;
}}
.highlight-question {{
background: var(--blue-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
display: block;
margin-top: 1rem;
font-size: 1.5rem;
font-weight: 500;
}}
.metrics-grid {{
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 1.5rem;
margin-top: 4rem;
}}
.metric-card {{
background: var(--bg-secondary);
border: 1px solid var(--border-color);
text-align: center;
border-radius: 1rem;
padding: 2rem;
transition: all 0.3s ease;
align-items: center;
}}
.metric-card:hover {{
transform: translateY(-5px);
border-color: var(--border-hover);
box-shadow: 0 4px 20px var(--shadow-color);
}}
.metric-number {{
font-size: 4rem;
font-weight: 800;
margin-bottom: 1rem;
}}
.metric-blue {{
background: var(--blue-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}}
.metric-purple {{
background: var(--orange-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}}
.metric-green {{
background: var(--green-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}}
.metric-label {{
color: var(--text-secondary);
font-size: 1.5rem;
margin-bottom: 1.5rem;
transition: color 0.3s ease;
}}
.metric-detail {{
font-size: 1.125rem;
line-height: 1.75;
margin-top: 0.5rem;
transition: color 0.3s ease;
}}
.metric-detail.primary {{
color: var(--accent-color);
}}
.metric-detail.secondary {{
color: var(--text-secondary);
}}
.actions {{
display: flex;
gap: 1rem;
justify-content: center;
margin-top: 3rem;
}}
.action-button {{
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.75rem 1.5rem;
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 100px;
color: var(--text-primary) !important;
text-decoration: none !important;
font-size: 0.95rem;
transition: all 0.3s ease;
}}
.action-button:hover {{
transform: translateY(-2px);
border-color: var(--accent-color);
background: var(--accent-bg);
}}
@media (max-width: 1024px) {{
.title-image {{
top: 20px;
left: 20px;
width: 80px;
height: 80px;
}}
.title-gradient, .subtitle-white {{
font-size: 3rem;
}}
}}
@media (max-width: 620px) {{
.title-image {{
position: relative;
margin-top: -30px !important;
margin-bottom: 20px !important;
top: 0;
left: 0;
}}
}}
</style>
<div class="header-wrapper">
<div class="header-content">
<div class="title-section">
<div class="title-gradient">LingOly-TOO</div>
</div>
<div class="description">
LingOly-TOO (L2) is a challenging linguistics reasoning benchmark designed to counteract answering without reasoning (e.g. by guessing or memorizing answers).
We accomplish this by permuting <b>Ling</b>uistics <b>Oly</b>mpiad problems with <b>T</b>emplates and <b>O</b>rthographic <b>O</b>bfuscations. By rewriting (obfuscating) parts of questions and answers, the chance of benchmark leakage in training data is minimized.
<div class="highlight-question">
"How do top LLMs reason on unseen linguistic questions?"
</div>
</div>
</div>
<div class="actions">
<a href="{PAPER_LINK}" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
<line x1="8" y1="12" x2="16" y2="12"/>
</svg>
Paper
</a>
<a href="{CODE_LINK}" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
</svg>
Code
</a>
<a href="{BLOG_LINK}" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
Blog
</a>
<a href="{DATASET_LINK}" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
Dataset
</a>
<a href="{ADD_MODEL_LINK}" class="action-button" target="_blank" rel="noopener noreferrer">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M19 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V5a2 2 0 0 0-2-2z"/>
<line x1="12" y1="8" x2="12" y2="16"/>
<line x1="8" y1="12" x2="16" y2="12"/>
</svg>
Add Your Model
</a>
</div>
</div>
"""
)
def _metric_card(number, gradient_class, label, details):
    """Render one ``.metric-card`` element, terminated by a newline.

    Args:
        number: Headline figure shown in the card.
        gradient_class: CSS class applied next to ``metric-number``.
        label: Caption placed under the headline figure.
        details: Texts rendered as ``metric-detail primary`` rows, in order.
    """
    detail_rows = "".join(
        f'<div class="metric-detail primary">{text}</div>\n' for text in details
    )
    return (
        '<div class="metric-card">\n'
        f'<div class="metric-number {gradient_class}">{number}</div>\n'
        f'<div class="metric-label">{label}</div>\n'
        f"{detail_rows}"
        "</div>\n"
    )


# Three-card summary grid. The figures are fixed text, not derived from the
# loaded leaderboard data. The leading space and the missing trailing newline
# are part of the string value.
CARDS = (
    ' <div class="metrics-grid">\n'
    + _metric_card(
        "11",
        "metric-blue",
        "Total Models",
        ["4 Reasoning Models", "4 Open Source Models"],
    )
    + _metric_card(
        "82",
        "metric-purple",
        "Linguistics Problems",
        ["6 Permutations per problem", "Problems from Low-resource Languages"],
    )
    + _metric_card(
        "1.2k",
        "metric-green",
        "Total Questions",
        ["Includes Match-Up, Multiple Choice and Completion"],
    )
    + "</div>"
)
# Stylesheet for the dataset table / section titles, followed by the
# "Citation" heading and the BibTeX entry. The citation <div> is rendered with
# `white-space: pre`, so the line breaks inside it are significant.
# NOTE(review): this stylesheet re-declares several :root variables that COMMON
# also declares (e.g. --text-secondary, with a different dark-mode value), so
# the two blocks can override each other - confirm this is intended.
METHODOLOGY = """
<style>
@media (prefers-color-scheme: dark) {
:root {
--bg-primary: #0B0B19;
--bg-secondary: rgba(19, 19, 37, 0.4);
--text-primary: #ffffff;
--text-secondary: #94A3B8;
--border-primary: rgba(31, 41, 55, 0.5);
--accent-blue: #60A5FA;
--accent-purple: #A78BFA;
--card-hover-bg: rgba(79, 70, 229, 0.1);
--shadow-color: rgba(79, 70, 229, 0.1);
}
}
@media (prefers-color-scheme: light) {
:root {
--bg-primary: #ffffff;
--bg-secondary: rgba(243, 244, 246, 0.4);
--text-primary: #111827;
--text-secondary: #4B5563;
--border-primary: rgba(209, 213, 219, 0.5);
--accent-blue: #3B82F6;
--accent-purple: #8B5CF6;
--card-hover-bg: rgba(243, 244, 246, 0.8);
--shadow-color: rgba(0, 0, 0, 0.1);
}
}
.dataset-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
margin: 2rem 0;
background: var(--bg-secondary);
border-radius: 1rem;
overflow: hidden;
box-shadow: 0 4px 20px var(--shadow-color);
}
.dataset-table thead {
background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
}
.dataset-table th {
padding: 1.25rem 1rem;
text-align: left;
color: white;
font-weight: 600;
font-size: 1rem;
}
.dataset-table td {
padding: 1rem;
border-bottom: 1px solid var(--border-primary);
color: var(--text-secondary);
transition: all 0.2s ease;
}
.dataset-table tbody tr:hover td {
background: var(--card-hover-bg);
color: var(--text-primary);
}
.methodology-content {
max-width: 1200px;
margin: 0 auto;
padding: 2rem;
color: var(--text-secondary);
line-height: 1.7;
font-size: 1rem;
}
.section-title {
font-size: 2.5rem;
font-weight: 700;
margin: 3rem 0 1.5rem;
color: var(--text-primary);
background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
letter-spacing: -0.02em;
}
</style>
<div class="section-divider"></div>
<h1 class="section-title">Citation</h1>
<div class="bibtex-citation" style="font-family: monospace; white-space: pre; padding: 1em; background-color: rgba(128, 128, 128, 0.1); border: 1px solid rgba(128, 128, 128, 0.2); border-radius: 4px; color: currentColor;">
@misc{khouja2025lingolytoodisentanglingmemorisationreasoning,
title={LINGOLY-TOO: Disentangling Memorisation from Reasoning with Linguistic Templatisation and Orthographic Obfuscation},
author={Jude Khouja and Karolina Korgul and Simi Hellsten
and Lingyi Yang and Vlad Neacsu and Harry Mayne and Ryan Kearns and Andrew Bean and Adam Mahdi},
year={2025},
eprint={2503.02972},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2503.02972},
}
</div>
"""
# "Key insights" section markup (heading plus one summary paragraph). As the
# name says, it is kept as a spare constant; nothing in the visible part of the
# file references it.
UNUSED = """
<!-- Insights Section -->
<h1 class="section-title">Key insights</h1>
<p>
We use orthographic templatisation on Linguistics Olympiad problems to create obfuscated variants
that maintain the same reasoning steps. Through extensive experiments, we show that obfuscation
reduces measurement bias from data exposure and provides reasoning estimates that correlate with
the ability to solve linguistic reasoning problems. Additionally, we find that state-of-the-art
models exhibit inconsistent reasoning abilities and that simple fine-tuning does not necessarily
equip models with context-free and robust problem-solving skills. This work establishes a reasoning
measure that is resilient to data exposure effects and supports ongoing efforts to fully understand
response generation in advanced models.
</p>
"""