Commit 92ec2a2 · 1 parent: 838067a
rusticluftig committed

Update LB for INSTRUCT_8B comp

Files changed:
- app.py +47 -56
- competitions.py +10 -0
- utils.py +21 -7
app.py CHANGED

@@ -4,10 +4,10 @@ import datetime
 import os
 
 import gradio as gr
+import matplotlib.pyplot as plt
 from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
-import matplotlib.pyplot as plt
 
 import competitions
 import utils

@@ -57,7 +57,6 @@ def main():
     validator_df = state_vars["validator_df"]
     benchmarks_df = state_vars["benchmarks_df"]
     benchmarks_targets = state_vars["benchmarks_targets"]
-    losses_2 = state_vars["losses_2"]
 
     demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
     with demo:

@@ -75,58 +74,41 @@ def main():
                 num_top_classes=10,
             )
 
+        comp_ids = [2, 3]
         with gr.Accordion("Competition Results"):
             gr.HTML(EVALUATION_HEADER)
             show_stale = gr.Checkbox(label="Show Stale", interactive=True)
             competition_leaderboards = []
+            for comp_id in comp_ids:
+                details = competitions.COMPETITION_DETAILS[comp_id]
+                with gr.Accordion(f"{details.name} Competition"):
+                    gr.HTML(details.html_description)
+                    competition_leaderboards.append(
+                        gr.components.Dataframe(
+                            value=utils.leaderboard_data(
+                                model_data, scores, comp_id, show_stale.value
+                            ),
+                            headers=[
+                                "Name",
+                                "Win Rate",
+                                "Score",
+                                "Weight",
+                                "UID",
+                                "Block",
+                            ],
+                            datatype=[
+                                "markdown",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                            ],
+                            elem_id=f"comp{comp_id}-table",
+                            interactive=False,
+                            visible=True,
+                        )
                     )
-            )
-            gr.LinePlot(
-                losses_2,
-                x="timestamp",
-                x_title="Date",
-                y="losses",
-                y_title="Score",
-                interactive=True,
-                visible=True,
-                width=1024,
-                title="Best Score Over Time",
-            )
-            gr.HTML(
-                """
-                The definition of score changes over time as new evaluation tasks are added in releases.
-                <ul>
-                <li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
-                <li><b>Oct 27-Now</b>: + word sorting eval</li>
-                """
-            )
             gr.HTML(
                 """
                 <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>

@@ -137,17 +119,23 @@ def main():
                 <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
             )
             show_stale.change(
+                lambda stale: [
+                    utils.leaderboard_data(model_data, scores, id, stale)
+                    for id in comp_ids
+                ],
                 inputs=[show_stale],
                 outputs=competition_leaderboards,
             )
 
         if benchmarks_df is not None:
 
-            def create_benchmark_plot(benchmark: str):
+            def create_benchmark_plot(benchmark: str, comp_id: int):
                 fig = plt.figure(figsize=(10, 8))
 
+                # Filter to just entries for this competition.
+                df = benchmarks_df[benchmarks_df["competition_id"] == comp_id]
+
+                plt.plot(df["timestamp"], df[benchmark])
 
                 # Adding horizontal dotted lines for various benchmark targets (well-known models)
                 for model, score in benchmarks_targets[benchmark].items():

@@ -169,10 +157,13 @@ def main():
                 return fig
 
             with gr.Accordion("Top Model Benchmarks"):
+                for comp_id in comp_ids:
+                    details = competitions.COMPETITION_DETAILS[comp_id]
+                    with gr.Accordion(f"{details.name} Benchmarks"):
+                        mmlu = create_benchmark_plot("mmlu", comp_id)
+                        mmlu_pro = create_benchmark_plot("mmlu_pro", comp_id)
+                        gr.Plot(mmlu)
+                        gr.Plot(mmlu_pro)
                 gr.HTML(
                     """<div>Benchmarks computed using <a href='https://github.com/EleutherAI/lm-evaluation-harness'>lm-eval harness</a></div>"""
                 )
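For reference, a minimal self-contained sketch (not the Space's actual code) of the pattern app.py now follows: one leaderboard table per competition, all refreshed by a single "Show Stale" checkbox. fake_leaderboard_data and its row values are invented stand-ins for utils.leaderboard_data(model_data, scores, comp_id, stale).

import gradio as gr

COMP_IDS = [2, 3]

def fake_leaderboard_data(comp_id: int, show_stale: bool) -> list[list]:
    # Invented rows; the real app builds these from chain metadata and wandb runs.
    rows = [[f"org/model-{comp_id}", 0.75, 0.42, 0.90, 7, 4_200_000]]
    if show_stale:
        rows.append([f"org/stale-model-{comp_id}", 0.10, 0.99, 0.00, 99, 3_900_000])
    return rows

with gr.Blocks() as demo:
    show_stale = gr.Checkbox(label="Show Stale", interactive=True)
    leaderboards = []
    for comp_id in COMP_IDS:
        with gr.Accordion(f"Competition {comp_id}"):
            leaderboards.append(
                gr.Dataframe(
                    value=fake_leaderboard_data(comp_id, show_stale.value),
                    headers=["Name", "Win Rate", "Score", "Weight", "UID", "Block"],
                    interactive=False,
                )
            )
    # One callback returns one table per competition, matched positionally to `outputs`.
    show_stale.change(
        lambda stale: [fake_leaderboard_data(c, stale) for c in COMP_IDS],
        inputs=[show_stale],
        outputs=leaderboards,
    )

if __name__ == "__main__":
    demo.launch()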
competitions.py CHANGED

@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+import html
 from typing import Dict
 
 

@@ -21,5 +22,14 @@ COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
         name="General Knowledge Chat-bot",
         # TODO: Add link to SN1 dataset details.
         html_description="""<b>Competition ID 2</b><br/>Produce the best general knowledge chat-bot. Models are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
+    ),
+    3: CompetitionDetails(
+        name="General Knowledge Chat-bot (BYO tokenizer)",
+        html_description="""<b>Competition ID 3</b><br/>Produce the best general knowledge chat-bot. Models bring their own tokenizer and are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
     )
 }
+
+COMP_NAME_TO_ID = {
+    "B7_MULTI_CHOICE": 2,
+    "INSTRUCT_8B": 3,
+}
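The new COMP_NAME_TO_ID mapping lets utils.py translate the competition name logged by a benchmark run into a numeric competition ID, falling back to 2 for older runs that predate the field. A small illustration (the sample dicts are invented):

COMP_NAME_TO_ID = {
    "B7_MULTI_CHOICE": 2,
    "INSTRUCT_8B": 3,
}

samples = [
    {"competition_id": "INSTRUCT_8B", "mmlu": 0.62},  # run from the new competition
    {"mmlu": 0.58},                                   # older run with no competition_id
]

for sample in samples:
    comp_name = sample.get("competition_id", "B7_MULTI_CHOICE")
    print(comp_name, "->", COMP_NAME_TO_ID.get(comp_name, 2))
# INSTRUCT_8B -> 3
# B7_MULTI_CHOICE -> 2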
utils.py CHANGED

@@ -15,7 +15,9 @@ import pandas as pd
 import wandb
 from bittensor.extrinsics.serving import get_metadata
 from dotenv import load_dotenv
-from wandb.apis.public.history import SampledHistoryScan
+from wandb.apis.public.history import HistoryScan, SampledHistoryScan
+
+from competitions import COMP_NAME_TO_ID
 
 NETUID = 37
 DELAY_SECS = 3

@@ -331,17 +333,16 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
     runs = get_wandb_runs(
         project=BENCHMARK_WANDB_PROJECT, filters=None, order="+created_at"
     )
-    timestamps, uids, models, mmlu, mmlu_pro = [], [], [], [], []
+    timestamps, uids, models, comp_ids, mmlu, mmlu_pro = [], [], [], [], [], []
     for run in runs:
         uid = run.config.get("uid", None)
         model = run.config.get("model", None)
         if not uid or not model:
             continue
         samples = list(
-            SampledHistoryScan(
+            HistoryScan(
                 run.client,
                 run,
-                ["_timestamp", "mmlu.acc,none", "mmlu_pro"],
                 0,
                 1,
             )

@@ -349,6 +350,19 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
         if not samples:
             continue
         sample = samples[0]
+
+        # Make sure we have all the required keys.
+        has_all_keys = True
+        for required_key in ["mmlu.acc,none", "mmlu_pro", "_timestamp"]:
+            if required_key not in sample:
+                has_all_keys = False
+                break
+        if not has_all_keys:
+            continue
+
+        # Any run without a competition ID was for competition 2.
+        comp_name = sample.get("competition_id", "B7_MULTI_CHOICE")
+        comp_ids.append(COMP_NAME_TO_ID.get(comp_name, 2))
         timestamps.append(datetime.datetime.fromtimestamp(sample["_timestamp"]))
         mmlu.append(sample["mmlu.acc,none"])
         mmlu_pro.append(sample["mmlu_pro"])

@@ -360,6 +374,7 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
             "timestamp": timestamps,
             "uid": uids,
             "model": models,
+            "competition_id": comp_ids,
             "mmlu": mmlu,
             "mmlu_pro": mmlu_pro,
         }

@@ -463,8 +478,8 @@ def load_state_vars() -> dict[Any]:
     print("Loaded validator weights")
 
     # Compute loss over time for all competitions.
-    losses_2 = get_losses_over_time(vali_runs, 2)
-    print("Loaded losses over time for comp 2")
+    # losses_2 = get_losses_over_time(vali_runs, 2)
+    # print("Loaded losses over time for comp 2")
 
     benchmarks_df, benchmarks_targets = get_benchmarks()
     print("Loaded benchmarks")

@@ -486,5 +501,4 @@ def load_state_vars() -> dict[Any]:
         "validator_df": validator_df,
         "benchmarks_df": benchmarks_df,
         "benchmarks_targets": benchmarks_targets,
-        "losses_2": losses_2,
     }
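After this change, get_benchmarks() returns a DataFrame with a competition_id column, which is what app.py's create_benchmark_plot filters on. A rough sketch of the resulting shape, with invented rows:

import datetime
import pandas as pd

benchmarks_df = pd.DataFrame(
    {
        "timestamp": [datetime.datetime(2024, 10, 1), datetime.datetime(2024, 10, 8)],
        "uid": [7, 11],
        "model": ["org/model-a", "org/model-b"],
        "competition_id": [2, 3],
        "mmlu": [0.58, 0.62],
        "mmlu_pro": [0.31, 0.36],
    }
)

# Per-competition slice, as used when plotting each benchmark over time.
instruct_8b = benchmarks_df[benchmarks_df["competition_id"] == 3]
print(instruct_8b[["timestamp", "mmlu", "mmlu_pro"]])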