Commit 92ec2a2 · 1 parent: 838067a
rusticluftig committed

Update LB for INSTRUCT_8B comp

Files changed:
- app.py +47 -56
- competitions.py +10 -0
- utils.py +21 -7
app.py CHANGED

@@ -4,10 +4,10 @@ import datetime
 import os
 
 import gradio as gr
+import matplotlib.pyplot as plt
 from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
-import matplotlib.pyplot as plt
 
 import competitions
 import utils

@@ -57,7 +57,6 @@ def main():
     validator_df = state_vars["validator_df"]
     benchmarks_df = state_vars["benchmarks_df"]
     benchmarks_targets = state_vars["benchmarks_targets"]
-    losses_2 = state_vars["losses_2"]
 
     demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
     with demo:

@@ -75,58 +74,41 @@ def main():
                 num_top_classes=10,
             )
 
+        comp_ids = [2, 3]
         with gr.Accordion("Competition Results"):
             gr.HTML(EVALUATION_HEADER)
             show_stale = gr.Checkbox(label="Show Stale", interactive=True)
             competition_leaderboards = []
+            for comp_id in comp_ids:
+                details = competitions.COMPETITION_DETAILS[comp_id]
+                with gr.Accordion(f"{details.name} Competition"):
+                    gr.HTML(details.html_description)
+                    competition_leaderboards.append(
+                        gr.components.Dataframe(
+                            value=utils.leaderboard_data(
+                                model_data, scores, comp_id, show_stale.value
+                            ),
+                            headers=[
+                                "Name",
+                                "Win Rate",
+                                "Score",
+                                "Weight",
+                                "UID",
+                                "Block",
+                            ],
+                            datatype=[
+                                "markdown",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                            ],
+                            elem_id=f"comp{comp_id}-table",
+                            interactive=False,
+                            visible=True,
+                        )
                     )
-            )
-            gr.LinePlot(
-                losses_2,
-                x="timestamp",
-                x_title="Date",
-                y="losses",
-                y_title="Score",
-                interactive=True,
-                visible=True,
-                width=1024,
-                title="Best Score Over Time",
-            )
-            gr.HTML(
-                """
-                The definition of score changes over time as new evaluation tasks are added in releases.
-                <ul>
-                <li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
-                <li><b>Oct 27-Now</b>: + word sorting eval</li>
-                """
-            )
             gr.HTML(
                 """
                 <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>

@@ -137,17 +119,23 @@ def main():
                 <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
             )
             show_stale.change(
+                lambda stale: [
+                    utils.leaderboard_data(model_data, scores, id, stale)
+                    for id in comp_ids
+                ],
                 inputs=[show_stale],
                 outputs=competition_leaderboards,
             )
 
         if benchmarks_df is not None:
 
-            def create_benchmark_plot(benchmark: str):
+            def create_benchmark_plot(benchmark: str, comp_id: int):
                 fig = plt.figure(figsize=(10, 8))
 
+                # Filter to just entries for this competition.
+                df = benchmarks_df[benchmarks_df["competition_id"] == comp_id]
+
+                plt.plot(df["timestamp"], df[benchmark])
 
                 # Adding horizontal dotted lines for various benchmark targets (well-known models)
                 for model, score in benchmarks_targets[benchmark].items():

@@ -169,10 +157,13 @@ def main():
                 return fig
 
             with gr.Accordion("Top Model Benchmarks"):
+                for comp_id in comp_ids:
+                    details = competitions.COMPETITION_DETAILS[comp_id]
+                    with gr.Accordion(f"{details.name} Benchmarks"):
+                        mmlu = create_benchmark_plot("mmlu", comp_id)
+                        mmlu_pro = create_benchmark_plot("mmlu_pro", comp_id)
+                        gr.Plot(mmlu)
+                        gr.Plot(mmlu_pro)
                 gr.HTML(
                     """<div>Benchmarks computed using <a href='https://github.com/EleutherAI/lm-evaluation-harness'>lm-eval harness</a></div>"""
                 )
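For reference, a minimal self-contained sketch (not the Space's actual code) of the pattern app.py now follows: one leaderboard table per competition, all refreshed by a single "Show Stale" checkbox. fake_leaderboard_data and its row values are invented stand-ins for utils.leaderboard_data(model_data, scores, comp_id, stale).

import gradio as gr

COMP_IDS = [2, 3]

def fake_leaderboard_data(comp_id: int, show_stale: bool) -> list[list]:
    # Invented rows; the real app builds these from chain metadata and wandb runs.
    rows = [[f"org/model-{comp_id}", 0.75, 0.42, 0.90, 7, 4_200_000]]
    if show_stale:
        rows.append([f"org/stale-model-{comp_id}", 0.10, 0.99, 0.00, 99, 3_900_000])
    return rows

with gr.Blocks() as demo:
    show_stale = gr.Checkbox(label="Show Stale", interactive=True)
    leaderboards = []
    for comp_id in COMP_IDS:
        with gr.Accordion(f"Competition {comp_id}"):
            leaderboards.append(
                gr.Dataframe(
                    value=fake_leaderboard_data(comp_id, show_stale.value),
                    headers=["Name", "Win Rate", "Score", "Weight", "UID", "Block"],
                    interactive=False,
                )
            )
    # One callback returns one table per competition, matched positionally to `outputs`.
    show_stale.change(
        lambda stale: [fake_leaderboard_data(c, stale) for c in COMP_IDS],
        inputs=[show_stale],
        outputs=leaderboards,
    )

if __name__ == "__main__":
    demo.launch()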
competitions.py CHANGED

@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+import html
 from typing import Dict
 
 

@@ -21,5 +22,14 @@ COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
         name="General Knowledge Chat-bot",
         # TODO: Add link to SN1 dataset details.
         html_description="""<b>Competition ID 2</b><br/>Produce the best general knowledge chat-bot. Models are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
+    ),
+    3: CompetitionDetails(
+        name="General Knowledge Chat-bot (BYO tokenizer)",
+        html_description="""<b>Competition ID 3</b><br/>Produce the best general knowledge chat-bot. Models bring their own tokenizer and are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
     )
 }
+
+COMP_NAME_TO_ID = {
+    "B7_MULTI_CHOICE": 2,
+    "INSTRUCT_8B": 3,
+}
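The new COMP_NAME_TO_ID mapping lets utils.py translate the competition name logged by a benchmark run into a numeric competition ID, falling back to 2 for older runs that predate the field. A small illustration (the sample dicts are invented):

COMP_NAME_TO_ID = {
    "B7_MULTI_CHOICE": 2,
    "INSTRUCT_8B": 3,
}

samples = [
    {"competition_id": "INSTRUCT_8B", "mmlu": 0.62},  # run from the new competition
    {"mmlu": 0.58},                                   # older run with no competition_id
]

for sample in samples:
    comp_name = sample.get("competition_id", "B7_MULTI_CHOICE")
    print(comp_name, "->", COMP_NAME_TO_ID.get(comp_name, 2))
# INSTRUCT_8B -> 3
# B7_MULTI_CHOICE -> 2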
utils.py CHANGED

@@ -15,7 +15,9 @@ import pandas as pd
 import wandb
 from bittensor.extrinsics.serving import get_metadata
 from dotenv import load_dotenv
-from wandb.apis.public.history import SampledHistoryScan
+from wandb.apis.public.history import HistoryScan, SampledHistoryScan
+
+from competitions import COMP_NAME_TO_ID
 
 NETUID = 37
 DELAY_SECS = 3

@@ -331,17 +333,16 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
     runs = get_wandb_runs(
         project=BENCHMARK_WANDB_PROJECT, filters=None, order="+created_at"
     )
-    timestamps, uids, models, mmlu, mmlu_pro = [], [], [], [], []
+    timestamps, uids, models, comp_ids, mmlu, mmlu_pro = [], [], [], [], [], []
     for run in runs:
         uid = run.config.get("uid", None)
         model = run.config.get("model", None)
         if not uid or not model:
             continue
         samples = list(
-            SampledHistoryScan(
+            HistoryScan(
                 run.client,
                 run,
-                ["_timestamp", "mmlu.acc,none", "mmlu_pro"],
                 0,
                 1,
             )

@@ -349,6 +350,19 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
         if not samples:
             continue
         sample = samples[0]
+
+        # Make sure we have all the required keys.
+        has_all_keys = True
+        for required_key in ["mmlu.acc,none", "mmlu_pro", "_timestamp"]:
+            if required_key not in sample:
+                has_all_keys = False
+                break
+        if not has_all_keys:
+            continue
+
+        # Any run without a competition ID was for competition 2.
+        comp_name = sample.get("competition_id", "B7_MULTI_CHOICE")
+        comp_ids.append(COMP_NAME_TO_ID.get(comp_name, 2))
         timestamps.append(datetime.datetime.fromtimestamp(sample["_timestamp"]))
         mmlu.append(sample["mmlu.acc,none"])
         mmlu_pro.append(sample["mmlu_pro"])

@@ -360,6 +374,7 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
             "timestamp": timestamps,
             "uid": uids,
             "model": models,
+            "competition_id": comp_ids,
             "mmlu": mmlu,
             "mmlu_pro": mmlu_pro,
         }

@@ -463,8 +478,8 @@ def load_state_vars() -> dict[Any]:
     print("Loaded validator weights")
 
     # Compute loss over time for all competitions.
-    losses_2 = get_losses_over_time(vali_runs, 2)
-    print("Loaded losses over time for comp 2")
+    # losses_2 = get_losses_over_time(vali_runs, 2)
+    # print("Loaded losses over time for comp 2")
 
     benchmarks_df, benchmarks_targets = get_benchmarks()
     print("Loaded benchmarks")

@@ -486,5 +501,4 @@ def load_state_vars() -> dict[Any]:
         "validator_df": validator_df,
         "benchmarks_df": benchmarks_df,
         "benchmarks_targets": benchmarks_targets,
-        "losses_2": losses_2,
     }
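After this change, get_benchmarks() returns a DataFrame with a competition_id column, which is what app.py's create_benchmark_plot filters on. A rough sketch of the resulting shape, with invented rows:

import datetime
import pandas as pd

benchmarks_df = pd.DataFrame(
    {
        "timestamp": [datetime.datetime(2024, 10, 1), datetime.datetime(2024, 10, 8)],
        "uid": [7, 11],
        "model": ["org/model-a", "org/model-b"],
        "competition_id": [2, 3],
        "mmlu": [0.58, 0.62],
        "mmlu_pro": [0.31, 0.36],
    }
)

# Per-competition slice, as used when plotting each benchmark over time.
instruct_8b = benchmarks_df[benchmarks_df["competition_id"] == 3]
print(instruct_8b[["timestamp", "mmlu", "mmlu_pro"]])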