import pandas as pd


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("leaderboard.csv").dropna()
    return df


df = load_data()
MODELS = [x.strip() for x in df["Model"].unique().tolist()]

COMMON = """
"""

# Define constants for the links
PAPER_LINK = "https://arxiv.org/abs/2503.02972"
CODE_LINK = "https://github.com/jkhouja/L2"
BLOG_LINK = "https://www.lesswrong.com/posts/pbt8GYpdip7NkuwGy/are-recent-llms-better-at-reasoning-or-better-at-memorizing"
DATASET_LINK = "https://huggingface.co/datasets/jkhouja/LingOly-TOO"
ADD_MODEL_LINK = (
    "https://mail.google.com/mail/?view=cm&fs=1&to=jude.khouja@oii.ox.ac.uk"
    "&su=Get%20Model%20Added%20to%20Leaderboard&body=Hi%20there%2C%0A%0AI%20"
    "would%20like%20to%20add%20my%20model%20to%20the%20Lingoly-TOO%20Leaderboard.%0A%0AModel%20Name%3A%0AModel%20URL%3A%0A%0ABest%20regards"
)

HEADER_CONTENT = (
    COMMON
    + f"""
We use orthographic templatisation on Linguistics Olympiad problems to create obfuscated variants that maintain the same reasoning steps. Through extensive experiments, we show that obfuscation reduces measurement bias from data exposure and provides reasoning estimates that correlate with the ability to solve linguistic reasoning problems. Additionally, we find that state-of-the-art models exhibit inconsistent reasoning abilities and that simple fine-tuning does not necessarily equip models with context-free and robust problem-solving skills. This work establishes a reasoning measure that is resilient to data exposure effects and supports ongoing efforts to fully understand response generation in advanced models.
"""