Speech-IQ-leaderboard

Running

App Files Files Community

huckiyang committed on Jul 26

Commit

97984bb

1 Parent(s): 97dd638

[release] speechIQ

Browse files

Files changed (3) hide show

SpeechIQ_table.csv +14 -0
app.py +148 -169
src/about.py +71 -47

SpeechIQ_table.csv ADDED Viewed

	@@ -0,0 +1,14 @@

+Model Type,Setup,Audio Encoder,Remember,Understand,Apply,Speech IQ
+Agentic: ASR + LLM,Whisper_v2-1.5B + Qwen2_7B,Whisper_v2-1.5B,0.554,0.499,0.481,107.43
+Agentic: ASR + LLM,Whisper_v3-1.5B + Qwen2_7B,Whisper_v3-1.5B,0.553,0.433,0.432,106.49
+Agentic: ASR + LLM,Canary_1B + Qwen2_7B,Canary_1B,0.559,0.566,0.504,107.78
+Agentic: ASR + LLM,OWSM-CTC_v3.1-1B + Qwen2_7B,OWSM-CTC_v3.1-1B,0.534,0.151,0.353,103.05
+Agentic: ASR + GER + LLM,Whisper_v2-1.5B + GPT-4o + Qwen2_7B,Whisper_v2-1.5B,0.543,0.632,0.487,108.64
+End2End,Qwen2-Audio_7B ,1.5B Whisper,-0.187,0.366,0.011,103.88
+End2End,Qwen2.5-Omni_7B ,1.5B Whisper,0.472,0.41,0.509,105.74
+End2End,Salmonn_13B ,1.5B Whisper,0.508,0.381,-1.146,101.03
+End2End,Desta2_8B,1.5B Whisper,-2.575,-1.604,-0.233,79.69
+End2End,AnyGPT_7B,SpeechTokenizer,0.314,-2.718,-2.893,60.02
+End2End,Baichuan-omni-1.5_7B,1.5B Whisper,0.448,0.184,0.546,104.02
+End2End,Gemini-1.5-flash,Google_USM,-1.885,0.641,0.673,107.85
+End2End,Gemini-1.5-pro,Google_USM,0.492,0.409,0.71,107.08

app.py CHANGED Viewed

@@ -1,8 +1,6 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -13,192 +11,173 @@ from src.about import (
     TITLE,
 )
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
-demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
             with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
-                lines=20,
                 elem_id="citation-button",
                 show_copy_button=True,
             )
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

 import gradio as gr
 import pandas as pd
+import numpy as np
 from src.about import (
     CITATION_BUTTON_LABEL,
     TITLE,
 )
 from src.display.css_html_js import custom_css
def load_speechiq_data(csv_path="SpeechIQ_table.csv"):
    """Load the SpeechIQ results table and prepare it for display.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            ``SpeechIQ_table.csv`` relative to the working directory.

    Returns:
        A DataFrame whose score columns are numeric and rounded to three
        decimals, ordered by ``Speech IQ`` from highest to lowest. If the
        file cannot be read or parsed, or has no ``Speech IQ`` column, the
        error is printed and an empty DataFrame with the expected column
        names is returned instead, so callers never have to handle a failure.
    """
    expected_cols = ['Model Type', 'Setup', 'Audio Encoder', 'Remember', 'Understand', 'Apply', 'Speech IQ']
    score_cols = ['Remember', 'Understand', 'Apply', 'Speech IQ']
    try:
        df = pd.read_csv(csv_path)
        for col in score_cols:
            if col in df.columns:
                # Coerce before rounding: a stray non-numeric cell becomes NaN
                # rather than breaking the rounding/sorting of the whole column.
                df[col] = pd.to_numeric(df[col], errors="coerce").round(3)
        # Best score first; rows without a score sort to the bottom.
        return df.sort_values('Speech IQ', ascending=False)
    except (OSError, ValueError, KeyError) as e:
        # OSError: missing/unreadable file; ValueError: empty or malformed CSV;
        # KeyError: no 'Speech IQ' column to sort on.
        print(f"Error loading SpeechIQ data: {e}")
        return pd.DataFrame(columns=expected_cols)
def create_leaderboard_table(df):
    """Build the read-only Gradio table that shows the leaderboard.

    An empty frame yields a bare table labelled with the seven standard
    leaderboard headers. Otherwise the table takes its headers from the
    frame's own columns and adds text wrapping, fixed column widths and
    ``height=600``.
    """
    # Options shared by both the empty and the populated table.
    table_options = {"value": df, "interactive": False}
    if df.empty:
        table_options["headers"] = [
            'Model Type', 'Setup', 'Audio Encoder', 'Remember', 'Understand', 'Apply', 'Speech IQ',
        ]
    else:
        table_options.update(
            headers=df.columns.tolist(),
            wrap=True,
            column_widths=["15%", "25%", "15%", "11%", "11%", "11%", "12%"],
            height=600,
        )
    return gr.Dataframe(**table_options)
def get_top_performers(df):
    """Summarise the leaderboard as a Markdown statistics block.

    Args:
        df: Leaderboard table with ``Model Type``, ``Setup`` and
            ``Speech IQ`` columns.

    Returns:
        Markdown text naming the setup with the highest ``Speech IQ``, the
        best score among rows whose model type contains "Agentic", the best
        score among rows whose model type contains "End2End" (0 when a group
        has no rows), and the total row count. Returns
        ``"No data available."`` when the table is empty or no row has a
        ``Speech IQ`` value.
    """
    # With no usable score, idxmax has nothing to select; report "no data"
    # instead of failing while the page is being built.
    if df.empty or df['Speech IQ'].isna().all():
        return "No data available."

    def best_score(label):
        # Highest Speech IQ among rows whose model type mentions `label`;
        # 0 when no row matches.
        matching = df.loc[df['Model Type'].str.contains(label, na=False), 'Speech IQ']
        return matching.max() if not matching.empty else 0

    top_score = df['Speech IQ'].max()
    top_model = df.loc[df['Speech IQ'].idxmax()]
    agentic_best = best_score('Agentic')
    end2end_best = best_score('End2End')
    stats_text = f"""
    ### 📊 Leaderboard Statistics
    **🏆 Top Performer:** {top_model['Setup']} (Score: {top_score})
    **🤖 Best Agentic Model:** {agentic_best}
    **🔄 Best End2End Model:** {end2end_best}
    **📈 Total Models Evaluated:** {len(df)}
    """
    return stats_text
# Load the leaderboard once at start-up; the refresh button re-reads it on demand.
speechiq_df = load_speechiq_data()

# Build the Gradio interface: title, introduction, four tabs, citation box
# and a refresh button.
demo = gr.Blocks(css=custom_css, title="SpeechIQ Leaderboard")
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 SpeechIQ Leaderboard", elem_id="speechiq-leaderboard-tab", id=0):
            # Statistics section. The component is kept in a variable so the
            # refresh handler below can update it in place.
            with gr.Row():
                stats_markdown = gr.Markdown(get_top_performers(speechiq_df), elem_classes="markdown-text")
            # Main leaderboard table
            with gr.Row():
                leaderboard_table = create_leaderboard_table(speechiq_df)
            # Legend and explanation
            with gr.Row():
                gr.Markdown("""
                ### 📋 Column Explanations
                - **Model Type**: Architecture approach (Agentic vs End2End)
                - **Setup**: Specific model configuration and components
                - **Audio Encoder**: The audio processing component used
                - **Remember**: Verbatim accuracy score (WER-based)
                - **Understand**: Semantic interpretation similarity score
                - **Apply**: Downstream task performance score
                - **Speech IQ**: Overall intelligence quotient combining all dimensions
                *Higher scores indicate better performance across all metrics.*
                """, elem_classes="markdown-text")
        with gr.TabItem("📊 Analysis", elem_id="analysis-tab", id=1):
            with gr.Row():
                # NOTE: this summary is computed once, while the page is being
                # built, from the data loaded at start-up; the refresh button
                # does not recompute it.
                if not speechiq_df.empty:
                    # Group by model type for comparison
                    agentic_models = speechiq_df[speechiq_df['Model Type'].str.contains('Agentic', na=False)]
                    end2end_models = speechiq_df[speechiq_df['Model Type'].str.contains('End2End', na=False)]
                    comparison_text = f"""
                    ### 🔍 Model Type Comparison
                    **Agentic Models (ASR + LLM):**
                    - Count: {len(agentic_models)}
                    - Average Speech IQ: {agentic_models['Speech IQ'].mean():.2f}
                    - Best Score: {agentic_models['Speech IQ'].max():.2f}
                    **End-to-End Models:**
                    - Count: {len(end2end_models)}
                    - Average Speech IQ: {end2end_models['Speech IQ'].mean():.2f}
                    - Best Score: {end2end_models['Speech IQ'].max():.2f}
                    ### 🎯 Cognitive Dimension Analysis
                    **Remember (Verbatim Accuracy):**
                    - Best performer: {speechiq_df.loc[speechiq_df['Remember'].idxmax(), 'Setup']} ({speechiq_df['Remember'].max():.3f})
                    **Understand (Semantic Similarity):**
                    - Best performer: {speechiq_df.loc[speechiq_df['Understand'].idxmax(), 'Setup']} ({speechiq_df['Understand'].max():.3f})
                    **Apply (Task Performance):**
                    - Best performer: {speechiq_df.loc[speechiq_df['Apply'].idxmax(), 'Setup']} ({speechiq_df['Apply'].max():.3f})
                    """
                    gr.Markdown(comparison_text, elem_classes="markdown-text")
                else:
                    gr.Markdown("No data available for analysis.", elem_classes="markdown-text")
        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
        with gr.TabItem("🚀 Submit", elem_id="submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
    # Citation section
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=6,
                elem_id="citation-button",
                show_copy_button=True,
            )
    # Refresh button: re-reads the CSV and updates the table and statistics.
    with gr.Row():
        refresh_button = gr.Button("🔄 Refresh Data", variant="secondary")

        def refresh_data():
            """Re-read the CSV and return the new table value and statistics text."""
            updated_df = load_speechiq_data()
            return updated_df, get_top_performers(updated_df)

        # Target the existing statistics component. Constructing a new
        # gr.Markdown() here would add a separate blank component to the page
        # and leave the statistics panel showing stale numbers.
        refresh_button.click(
            refresh_data,
            outputs=[leaderboard_table, stats_markdown],
        )

if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)

src/about.py CHANGED Viewed

@@ -1,72 +1,96 @@
 from dataclasses import dataclass
 from enum import Enum
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-# Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
-Intro text
 """
 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-## Reproducibility
-To reproduce our results, here is the commands you can run:
 """
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""

 from dataclasses import dataclass
 from enum import Enum
+# Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">🎙️ Speech Intelligence Quotient (SpeechIQ) Leaderboard</h1>"""
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+## 🎯 Welcome to the Speech Intelligence Quotient (SpeechIQ) Leaderboard!
+This leaderboard presents evaluation results for voice understanding large language models (LLM<sub>Voice</sub>) using our novel SpeechIQ evaluation framework.
+**SpeechIQ** is a human cognition-inspired evaluation pipeline that assesses voice understanding abilities across three cognitive levels based on Bloom's Taxonomy:
+- **🧠 Remembering**: Verbatim accuracy (WER-based)
+- **💡 Understanding**: Similarity of LLM's interpretations
+- **🚀 Application**: QA accuracy for downstream tasks
+The **Speech IQ Score** provides a unified metric for comparing both cascaded methods (ASR+LLM) and end-to-end models.
 """
 # Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = """
+## 📊 About SpeechIQ Evaluation
+**Speech Intelligence Quotient (SpeechIQ)** represents a first-of-its-kind intelligence examination that bridges cognitive principles with voice-oriented benchmarks. Our framework moves beyond traditional metrics like Word Error Rate (WER) to provide comprehensive evaluation of voice understanding capabilities.
+### 🎯 Evaluation Framework
+SpeechIQ evaluates models across three cognitive dimensions inspired by Bloom's Taxonomy:
+1. **Remember** (Verbatim Accuracy): Tests the model's ability to accurately capture spoken content
+2. **Understand** (Interpretation Similarity): Evaluates how well the model comprehends the meaning of speech
+3. **Apply** (Downstream Performance): Measures the model's ability to use speech understanding for practical tasks
+### 🏆 Model Categories
+- **Agentic (ASR + LLM)**: Cascaded approaches using separate ASR and LLM components
+- **End2End**: Direct speech-to-text models that process audio end-to-end
+### 🔬 Key Benefits
+- **Unified Comparisons**: Compare cascaded and end-to-end approaches on equal footing
+- **Error Detection**: Identify annotation errors in existing benchmarks
+- **Hallucination Detection**: Detect and quantify hallucinations in voice LLMs
+- **Cognitive Assessment**: Map model capabilities to human cognitive principles
+### 📈 Speech IQ Score
+The final Speech IQ Score combines performance across all three dimensions to provide a comprehensive measure of voice understanding intelligence.
+## 🔄 Reproducibility
+For detailed methodology and reproduction instructions, please refer to our paper and codebase.
 """
 EVALUATION_QUEUE_TEXT = """
+## 🚀 Submit Your Model for SpeechIQ Evaluation
+To submit your voice understanding model for SpeechIQ evaluation:
+### 1) Ensure Model Compatibility
+Make sure your model can process audio inputs and generate text outputs in one of these formats:
+- **ASR + LLM**: Separate ASR and LLM components
+- **End-to-End**: Direct audio-to-text processing
+### 2) Model Requirements
+- Model must be publicly accessible
+- Provide clear documentation of audio input format and expected outputs
+- Include information about audio encoder specifications
+### 3) Evaluation Domains
+Your model will be evaluated across:
+- **Remember**: Transcription accuracy
+- **Understand**: Semantic understanding
+- **Apply**: Task-specific performance
+### 4) Documentation
+Please provide:
+- Model architecture details
+- Training data information
+- Audio preprocessing requirements
+- Expected input/output formats
+## 📧 Contact
+For questions about SpeechIQ evaluation or to submit your model, please contact the research team.
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@article{speechiq2024,
+  title={Speech Intelligence Quotient (SpeechIQ): A Human Cognition-Inspired Evaluation Framework for Voice Understanding Large Language Models},
+  author={[Authors]},
+  journal={[Journal/Conference]},
+  year={2024}
+}"""