# import gradio as gr # import pandas as pd # from datasets import load_dataset # from jiwer import wer, cer # import os # from datetime import datetime # import re # from huggingface_hub import login # # Login to Hugging Face Hub (if token is available) # token = os.environ.get("HG_TOKEN") # if token: # login(token) # try: # dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"] # references = {row["id"]: row["text"] for row in dataset} # print(f"Loaded {len(references)} reference transcriptions") # except Exception as e: # print(f"Error loading dataset: {str(e)}") # references = {} # leaderboard_file = "leaderboard.csv" # if not os.path.exists(leaderboard_file): # sample_data = [ # ["test_1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"], # ["test_2", 0.3264, 0.1094, 0.1922, "2025-03-15 10:30:45"], # ] # pd.DataFrame(sample_data, # columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False) # print(f"Created new leaderboard file with sample data") # else: # leaderboard_df = pd.read_csv(leaderboard_file) # if "Combined_Score" not in leaderboard_df.columns: # leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3 # leaderboard_df.to_csv(leaderboard_file, index=False) # print(f"Added Combined_Score column to existing leaderboard") # print(f"Loaded leaderboard with {len(leaderboard_df)} entries") # def normalize_text(text): # """Normalize text for WER/CER calculation""" # if not isinstance(text, str): # text = str(text) # text = text.lower() # text = re.sub(r'[^\w\s]', '', text) # text = re.sub(r'\s+', ' ', text).strip() # return text # def calculate_metrics(predictions_df): # """Calculate WER and CER for predictions.""" # results = [] # total_ref_words = 0 # total_ref_chars = 0 # for _, row in predictions_df.iterrows(): # id_val = row["id"] # if id_val not in references: # continue # reference = normalize_text(references[id_val]) # 
hypothesis = normalize_text(row["text"]) # if not reference or not hypothesis: # continue # reference_words = reference.split() # hypothesis_words = hypothesis.split() # reference_chars = list(reference) # try: # sample_wer = wer(reference, hypothesis) # sample_cer = cer(reference, hypothesis) # sample_wer = min(sample_wer, 2.0) # sample_cer = min(sample_cer, 2.0) # total_ref_words += len(reference_words) # total_ref_chars += len(reference_chars) # results.append({ # "id": id_val, # "reference": reference, # "hypothesis": hypothesis, # "ref_word_count": len(reference_words), # "ref_char_count": len(reference_chars), # "wer": sample_wer, # "cer": sample_cer # }) # except Exception as e: # print(f"Error processing sample {id_val}: {str(e)}") # pass # if not results: # raise ValueError("No valid samples for WER/CER calculation") # avg_wer = sum(item["wer"] for item in results) / len(results) # avg_cer = sum(item["cer"] for item in results) / len(results) # weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words # weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars # return avg_wer, avg_cer, weighted_wer, weighted_cer, results # def format_as_percentage(value): # """Convert decimal to percentage with 2 decimal places""" # return f"{value * 100:.2f}%" # def prepare_leaderboard_for_display(df, sort_by="Combined_Score"): # """Format leaderboard for display with ranking and percentages""" # if df is None or len(df) == 0: # return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"]) # display_df = df.copy() # display_df = display_df.sort_values(sort_by) # display_df.insert(0, "Rank", range(1, len(display_df) + 1)) # for col in ["WER", "CER", "Combined_Score"]: # if col in display_df.columns: # display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}") # return display_df # def update_ranking(method): # """Update leaderboard 
ranking based on selected method""" # try: # current_lb = pd.read_csv(leaderboard_file) # if "Combined_Score" not in current_lb.columns: # current_lb["Combined_Score"] = current_lb["WER"] * 0.7 + current_lb["CER"] * 0.3 # sort_column = "Combined_Score" # if method == "WER Only": # sort_column = "WER" # elif method == "CER Only": # sort_column = "CER" # return prepare_leaderboard_for_display(current_lb, sort_column) # except Exception as e: # print(f"Error updating ranking: {str(e)}") # return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"]) # def process_submission(model_name, csv_file): # """Process a new model submission""" # if not model_name or not model_name.strip(): # return "Error: Please provide a model name.", None # if not csv_file: # return "Error: Please upload a CSV file.", None # try: # df = pd.read_csv(csv_file) # if len(df) == 0: # return "Error: Uploaded CSV is empty.", None # if set(df.columns) != {"id", "text"}: # return f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None # if df["id"].duplicated().any(): # dup_ids = df[df["id"].duplicated()]["id"].unique() # return f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None # missing_ids = set(references.keys()) - set(df["id"]) # extra_ids = set(df["id"]) - set(references.keys()) # if missing_ids: # return f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}", None # if extra_ids: # return f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}", None # try: # avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df) # # Check for suspiciously low values # if avg_wer < 0.001: # return "Error: WER calculation yielded suspicious results (near-zero). 
Please check your submission CSV.", None # except Exception as e: # return f"Error calculating metrics: {str(e)}", None # leaderboard = pd.read_csv(leaderboard_file) # timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # combined_score = avg_wer * 0.7 + avg_cer * 0.3 # if model_name in leaderboard["Model_Name"].values: # idx = leaderboard[leaderboard["Model_Name"] == model_name].index # leaderboard.loc[idx, "WER"] = avg_wer # leaderboard.loc[idx, "CER"] = avg_cer # leaderboard.loc[idx, "Combined_Score"] = combined_score # leaderboard.loc[idx, "timestamp"] = timestamp # updated_leaderboard = leaderboard # else: # new_entry = pd.DataFrame( # [[model_name, avg_wer, avg_cer, combined_score, timestamp]], # columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"] # ) # updated_leaderboard = pd.concat([leaderboard, new_entry]) # updated_leaderboard = updated_leaderboard.sort_values("Combined_Score") # updated_leaderboard.to_csv(leaderboard_file, index=False) # display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard) # return f"Submission processed successfully! 
WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard # except Exception as e: # return f"Error processing submission: {str(e)}", None # def get_current_leaderboard(): # """Get the current leaderboard data for display""" # try: # if os.path.exists(leaderboard_file): # current_leaderboard = pd.read_csv(leaderboard_file) # if "Combined_Score" not in current_leaderboard.columns: # current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3 # current_leaderboard.to_csv(leaderboard_file, index=False) # return current_leaderboard # else: # return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]) # except Exception as e: # print(f"Error getting leaderboard: {str(e)}") # return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]) # def create_leaderboard_table(): # """Create and format the leaderboard table for display""" # leaderboard_data = get_current_leaderboard() # return prepare_leaderboard_for_display(leaderboard_data) # with gr.Blocks(title="Bambara ASR Leaderboard") as demo: # gr.Markdown( # """ # # 🇲🇱 Bambara ASR Leaderboard # This leaderboard tracks and evaluates speech recognition models for the Bambara language. # Models are ranked based on Word Error Rate (WER), Character Error Rate (CER), and a combined score. 
# ## Current Models Performance # """ # ) # current_data = get_current_leaderboard() # if len(current_data) > 0: # best_model = current_data.sort_values("Combined_Score").iloc[0] # gr.Markdown(f""" # ### 🏆 Current Best Model: **{best_model['Model_Name']}** # * WER: **{best_model['WER']*100:.2f}%** # * CER: **{best_model['CER']*100:.2f}%** # * Combined Score: **{best_model['Combined_Score']*100:.2f}%** # """) # with gr.Tabs() as tabs: # with gr.TabItem("🏅 Model Rankings"): # initial_leaderboard = create_leaderboard_table() # ranking_method = gr.Radio( # ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"], # label="Ranking Method", # value="Combined Score (WER 70%, CER 30%)" # ) # leaderboard_view = gr.DataFrame( # value=initial_leaderboard, # interactive=False, # label="Models are ranked by selected metric - lower is better" # ) # ranking_method.change( # fn=update_ranking, # inputs=[ranking_method], # outputs=[leaderboard_view] # ) # with gr.Accordion("Metrics Explanation", open=False): # gr.Markdown( # """ # ## Understanding ASR Metrics # ### Word Error Rate (WER) # WER measures how accurately the ASR system recognizes whole words: # * Lower values indicate better performance # * Calculated as: (Substitutions + Insertions + Deletions) / Total Words # * A WER of 0% means perfect transcription # * A WER of 20% means approximately 1 in 5 words contains an error # ### Character Error Rate (CER) # CER measures accuracy at the character level: # * More fine-grained than WER # * Better at capturing partial word matches # * Particularly useful for agglutinative languages like Bambara # ### Combined Score # * Weighted average: 70% WER + 30% CER # * Provides a balanced evaluation of model performance # * Used as the primary ranking metric # """ # ) # with gr.TabItem("📊 Submit New Results"): # gr.Markdown( # """ # ### Submit a new model for evaluation # Upload a CSV file with the following format: # * Must contain exactly two columns: 'id' and 'text' # * The 'id' 
column should match the reference dataset IDs # * The 'text' column should contain your model's transcriptions # """ # ) # with gr.Row(): # model_name_input = gr.Textbox( # label="Model Name", # placeholder="e.g., MALIBA-AI/bambara-asr" # ) # gr.Markdown("*Use a descriptive name to identify your model*") # with gr.Row(): # csv_upload = gr.File( # label="Upload CSV File", # file_types=[".csv"] # ) # gr.Markdown("*CSV with columns: id, text*") # submit_btn = gr.Button("Submit", variant="primary") # output_msg = gr.Textbox(label="Status", interactive=False) # leaderboard_display = gr.DataFrame( # label="Updated Leaderboard", # value=initial_leaderboard, # interactive=False # ) # submit_btn.click( # fn=process_submission, # inputs=[model_name_input, csv_upload], # outputs=[output_msg, leaderboard_display] # ) # with gr.TabItem("📝 Benchmark Dataset"): # gr.Markdown( # """ # ## About the Benchmark Dataset # This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/MALIBA-AI/bambara-speech-recognition-leaderboard)** dataset: # * Contains diverse Bambara speech samples # * Includes various speakers, accents, and dialects # * Covers different speech styles and recording conditions # * Transcribed and validated # ### How to Generate Predictions # To submit results to this leaderboard: # 1. Download the audio files from the benchmark dataset # 2. Run your ASR model on the audio files # 3. Generate a CSV file with 'id' and 'text' columns # 4. 
Submit your results using the form in the "Submit New Results" tab # ### Evaluation Guidelines # * Text is normalized (lowercase, punctuation removed) before metrics calculation # * Extreme outliers are capped to prevent skewing results # * All submissions are validated for format and completeness # NB: This work is a collaboration between MALIBA-AI, RobotsMali AI4D-LAB and Djelia # """ # ) # gr.Markdown( # """ # --- # ### About MALIBA-AI # **MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation** # *"No Malian Language Left Behind"* # This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology. # For more information, visit [MALIBA-AI on Hugging Face](https://huggingface.co/MALIBA-AI). # """ # ) # if __name__ == "__main__": # demo.launch() import gradio as gr import pandas as pd from datasets import load_dataset from jiwer import wer, cer import os from datetime import datetime import re import plotly.express as px import plotly.graph_objects as go from huggingface_hub import login import numpy as np # Custom CSS inspired by Sahara leaderboard custom_head_html = """ """ # Header with MALIBA-AI branding new_header_html = """


🇲🇱

Bambara ASR Leaderboard

Powered by MALIBA-AI • "No Malian Language Left Behind"

🎙️
""" # Advanced CSS styling inspired by Sahara sahara_style_css = """ /* Global Styles */ div[class*="gradio-container"] { background: #FFFBF5 !important; color: #000 !important; font-family: 'Inter', sans-serif !important; } div.svelte-1nguped { background: white !important; } .fillable.svelte-15jxnnn.svelte-15jxnnn:not(.fill_width) { max-width: 1580px !important; } /* Navigation Buttons */ .nav-button { background-color: #117b75 !important; color: #fff !important; font-weight: bold !important; border-radius: 8px !important; border: none !important; box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important; transition: all 0.3s ease !important; } .nav-button:hover { background-color: #0f6b66 !important; color: #e8850e !important; transform: translateY(-1px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; } /* Content Cards */ .content-section { padding: 40px 0; } .content-card { background-color: #fff !important; border-radius: 16px !important; box-shadow: 0 10px 25px -5px rgba(0,0,0,0.1), 0 8px 10px -6px rgba(0,0,0,0.1) !important; padding: 40px !important; margin-bottom: 30px !important; border: 1px solid rgba(0,0,0,0.05) !important; } /* Typography */ .content-card h2 { font-family: "Rubik", sans-serif !important; font-size: 32px !important; font-weight: 700 !important; line-height: 1.25 !important; letter-spacing: -1px !important; color: #2f3b7d !important; margin-bottom: 20px !important; text-align: center !important; } .content-card h3 { font-size: 22px !important; color: #2f3b7d !important; font-weight: 600 !important; margin-bottom: 15px !important; } .content-card h4 { font-family: "Rubik", sans-serif !important; color: #7d3561 !important; font-weight: 600 !important; margin-bottom: 10px !important; } .title { color: #7d3561 !important; font-weight: 600 !important; } /* Tab Styling */ .tab-wrapper.svelte-1tcem6n.svelte-1tcem6n { display: flex; align-items: center; justify-content: space-between; position: relative; height: auto !important; 
padding-bottom: 0 !important; } .selected.svelte-1tcem6n.svelte-1tcem6n { background-color: #7d3561 !important; color: #fff !important; border-radius: 8px 8px 0 0 !important; } button.svelte-1tcem6n.svelte-1tcem6n { color: #7d3561 !important; font-weight: 600 !important; font-size: 16px !important; padding: 12px 20px !important; background-color: #fff !important; border-radius: 8px 8px 0 0 !important; border: 2px solid #e9ecef !important; border-bottom: none !important; transition: all 0.3s ease !important; } button.svelte-1tcem6n.svelte-1tcem6n:hover { background-color: #f8f9fa !important; border-color: #7d3561 !important; } .tab-container.svelte-1tcem6n.svelte-1tcem6n:after { content: ""; position: absolute; bottom: 0; left: 0; right: 0; height: 3px; background: linear-gradient(90deg, #7d3561 0%, #2f3b7d 100%) !important; } /* Table Styling */ div[class*="gradio-container"] .prose table { color: #000 !important; border: 2px solid #dca02a !important; border-radius: 12px !important; margin-bottom: 20px !important; margin-left: auto !important; margin-right: auto !important; width: 100% !important; border-collapse: separate !important; border-spacing: 0 !important; overflow: hidden !important; box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important; } div[class*="gradio-container"] .prose thead tr { background: linear-gradient(90deg, #7d3561 0%, #2f3b7d 100%) !important; } div[class*="gradio-container"] .prose th { color: #fff !important; font-weight: 700 !important; font-size: 14px !important; padding: 15px 10px !important; text-align: center !important; border: none !important; } div[class*="gradio-container"] .prose td { font-size: 14px !important; padding: 12px 10px !important; border: none !important; text-align: center !important; color: #000 !important; border-bottom: 1px solid #f8f9fa !important; } div[class*="gradio-container"] .prose tbody tr:nth-child(even) { background-color: #f8f9fa !important; } div[class*="gradio-container"] .prose tbody tr:hover { 
background-color: #e3f2fd !important; transition: background-color 0.2s ease !important; } /* First column (model names) styling */ div[class*="gradio-container"] .prose th:first-child, div[class*="gradio-container"] .prose td:first-child { text-align: left !important; min-width: 250px !important; font-weight: 600 !important; } /* Performance badges */ .performance-badge { display: inline-block; padding: 4px 8px; border-radius: 12px; font-size: 12px; font-weight: 600; margin-left: 8px; } .badge-excellent { background: #d4edda; color: #155724; } .badge-good { background: #fff3cd; color: #856404; } .badge-fair { background: #f8d7da; color: #721c24; } /* Stats cards */ .stats-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin: 20px 0; } .stat-card { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 12px; text-align: center; box-shadow: 0 4px 6px rgba(0,0,0,0.1); } .stat-number { font-size: 2em; font-weight: 700; margin-bottom: 5px; } .stat-label { font-size: 0.9em; opacity: 0.9; } /* Form styling */ .form-section { background: #f8f9fa; border-radius: 12px; padding: 25px; margin: 20px 0; border-left: 4px solid #7d3561; } /* Citation block */ .citation-block { background-color: #FDF6E3 !important; border-radius: 12px !important; padding: 25px !important; border-left: 4px solid #D97706 !important; margin: 20px 0 !important; } /* Dropdown styling */ .gradio-dropdown { border-radius: 8px !important; border: 2px solid #e9ecef !important; } .gradio-dropdown:focus { border-color: #7d3561 !important; box-shadow: 0 0 0 3px rgba(125, 53, 97, 0.1) !important; } /* Button styling */ .gradio-button { border-radius: 8px !important; font-weight: 600 !important; transition: all 0.3s ease !important; } .gradio-button.primary { background: linear-gradient(135deg, #7d3561 0%, #2f3b7d 100%) !important; border: none !important; color: white !important; } .gradio-button.primary:hover { 
transform: translateY(-2px) !important; box-shadow: 0 4px 12px rgba(125, 53, 97, 0.3) !important; } /* Responsive design */ @media (max-width: 768px) { .content-card { padding: 20px !important; margin-bottom: 20px !important; } .content-card h2 { font-size: 24px !important; } .stats-grid { grid-template-columns: 1fr !important; } } """ # Login to Hugging Face Hub (if token is available) token = os.environ.get("HG_TOKEN") if token: login(token) # Load dataset try: dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"] references = {row["id"]: row["text"] for row in dataset} print(f"Loaded {len(references)} reference transcriptions") except Exception as e: print(f"Error loading dataset: {str(e)}") references = {} # Initialize leaderboard leaderboard_file = "leaderboard.csv" if not os.path.exists(leaderboard_file): sample_data = [ ["MALIBA-AI/bambara-whisper-small", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45", "Whisper-based", "Mali", "ASR"], ["OpenAI/whisper-base", 0.3264, 0.1094, 0.1922, "2025-03-15 10:30:45", "Foundation", "USA", "ASR"], ] pd.DataFrame(sample_data, columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]).to_csv(leaderboard_file, index=False) print(f"Created new leaderboard file with sample data") else: leaderboard_df = pd.read_csv(leaderboard_file) # Add new columns if they don't exist required_columns = ["Combined_Score", "Type", "Origin", "Task"] for col in required_columns: if col not in leaderboard_df.columns: if col == "Combined_Score": leaderboard_df[col] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3 else: default_val = "Unknown" if col != "Task" else "ASR" leaderboard_df[col] = default_val leaderboard_df.to_csv(leaderboard_file, index=False) print(f"Loaded leaderboard with {len(leaderboard_df)} entries") def normalize_text(text): """Normalize text for WER/CER calculation""" if not isinstance(text, str): text = str(text) text = text.lower() text = 
# Compiled once: normalize_text runs for every reference/hypothesis pair.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_text(text):
    """Normalize text for WER/CER calculation.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, and collapses whitespace runs to one space.

    Missing values (``None`` or a float NaN, which is what pandas yields for
    an empty CSV cell) normalize to the empty string; previously they were
    stringified and scored as the literal word "nan". Any other non-string
    value is converted with ``str()`` first.
    """
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = _PUNCTUATION_RE.sub('', text.lower())
    return _WHITESPACE_RE.sub(' ', text).strip()


def calculate_metrics(predictions_df):
    """Calculate WER and CER for predictions.

    Args:
        predictions_df: DataFrame with an "id" and a "text" column. Rows whose
            id is not in the module-level ``references`` mapping are ignored.

    Returns:
        Tuple ``(avg_wer, avg_cer, weighted_wer, weighted_cer, results)``.
        The averages are per-sample means, the weighted values are weighted
        by reference word / character count, and ``results`` is a list with
        one dict per scored sample.

    Raises:
        ValueError: if no sample could be scored.
    """
    results = []
    total_ref_words = 0
    total_ref_chars = 0

    for _, row in predictions_df.iterrows():
        id_val = row["id"]
        if id_val not in references:
            continue

        reference = normalize_text(references[id_val])
        if not reference:
            # Error rates are undefined against an empty reference.
            continue
        hypothesis = normalize_text(row["text"])

        ref_word_count = len(reference.split())
        ref_char_count = len(reference)

        if not hypothesis:
            # An empty prediction deletes every reference token, i.e. an
            # error rate of exactly 1.0. Skipping such rows (the previous
            # behavior) let a submission drop hard samples from its average.
            sample_wer = 1.0
            sample_cer = 1.0
        else:
            try:
                # Cap extreme outliers so one runaway hypothesis cannot
                # dominate the mean.
                sample_wer = min(wer(reference, hypothesis), 2.0)
                sample_cer = min(cer(reference, hypothesis), 2.0)
            except Exception as e:
                print(f"Error processing sample {id_val}: {str(e)}")
                continue

        total_ref_words += ref_word_count
        total_ref_chars += ref_char_count
        results.append({
            "id": id_val,
            "reference": reference,
            "hypothesis": hypothesis,
            "ref_word_count": ref_word_count,
            "ref_char_count": ref_char_count,
            "wer": sample_wer,
            "cer": sample_cer,
        })

    if not results:
        raise ValueError("No valid samples for WER/CER calculation")

    avg_wer = sum(item["wer"] for item in results) / len(results)
    avg_cer = sum(item["cer"] for item in results) / len(results)
    # Every scored reference is non-empty, so both totals are positive here.
    weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
    weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars

    return avg_wer, avg_cer, weighted_wer, weighted_cer, results


def format_as_percentage(value):
    """Convert a decimal fraction to a percentage string with 2 decimal places."""
    return f"{value * 100:.2f}%"


def get_performance_badge(score):
    """Return the performance label for a combined score (lower is better).

    Below 0.15 is "Excellent", below 0.30 is "Good", anything else is "Fair".
    """
    if score < 0.15:
        return "🏆 Excellent"
    elif score < 0.30:
        return "🥉 Good"
    else:
        return "📈 Fair"
def add_medals_to_models(df, score_col="Combined_Score"):
    """Prefix the model names holding the three best scores with a medal.

    Scores are error rates, so lower is better. Rows are returned sorted by
    ``score_col`` ascending with a fresh index; rows whose score is not
    numeric sort last and get no medal. Models that share a score share the
    medal. The input frame is not modified; an empty frame, or one without
    ``score_col``, is returned unchanged.
    """
    if df.empty or score_col not in df.columns:
        return df

    numeric_col = f"{score_col}_float"
    ranked = df.copy()
    ranked[numeric_col] = pd.to_numeric(ranked[score_col], errors='coerce')
    ranked = ranked.sort_values(by=numeric_col, ascending=True, na_position='last').reset_index(drop=True)

    # The three lowest distinct scores, each mapped to its medal prefix.
    podium_scores = np.sort(ranked[numeric_col].dropna().unique())[:3]
    medal_by_score = {
        score: f"{medal} "
        for score, medal in zip(podium_scores.tolist(), ("🏆", "🥈", "🥉"))
    }

    ranked["Medal"] = ranked[numeric_col].map(
        lambda score: "" if pd.isna(score) else medal_by_score.get(score, "")
    )
    ranked["Model_Name"] = ranked["Medal"] + ranked["Model_Name"].astype(str)

    # Drop the helper columns so the caller sees the original schema.
    return ranked.drop(columns=[numeric_col, "Medal"])
def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
    """Format leaderboard for display with ranking and percentages.

    Args:
        df: Raw leaderboard frame (``Model_Name``, ``WER``, ``CER``,
            ``Combined_Score`` and optionally ``timestamp`` / ``Type``), or
            ``None``.
        sort_by: Column to rank by, ascending (lower error is better).

    Returns:
        A frame containing, in order, whichever of these columns can be
        built: Rank, Model, WER (%), CER (%), Combined Score (%),
        Performance, Type, Date. An empty frame with all of these columns is
        returned when ``df`` is ``None`` or empty.
    """
    display_columns = ["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"]
    if df is None or len(df) == 0:
        return pd.DataFrame(columns=display_columns)

    display_df = df.copy()

    # Shorten "org/model" to "model" BEFORE the medals are added. The medal
    # is prefixed to the name, so shortening afterwards (taking the last "/"
    # segment) threw the medal away for every namespaced model.
    display_df["Model_Name"] = display_df["Model_Name"].astype(str).str.split("/").str[-1]
    display_df = add_medals_to_models(display_df, sort_by)

    # Rank by the numeric value of the chosen metric; unparsable values last.
    sort_key = f"{sort_by}_float"
    display_df[sort_key] = pd.to_numeric(display_df[sort_by], errors='coerce')
    display_df = display_df.sort_values(sort_key, ascending=True, na_position='last')
    display_df = display_df.drop(columns=[sort_key])
    display_df.insert(0, "Rank", range(1, len(display_df) + 1))

    # Source column -> display header. The combined score used to be written
    # to "Combined_Score (%)", which is not the header selected below, so the
    # column never reached the displayed table.
    percent_headers = {
        "WER": "WER (%)",
        "CER": "CER (%)",
        "Combined_Score": "Combined Score (%)",
    }
    for source_col, header in percent_headers.items():
        if source_col in display_df.columns:
            values = pd.to_numeric(display_df[source_col], errors='coerce')
            display_df[header] = values.apply(lambda x: f"{x * 100:.2f}" if pd.notna(x) else "---")

    combined = pd.to_numeric(display_df["Combined_Score"], errors='coerce')
    display_df["Performance"] = combined.apply(lambda x: get_performance_badge(x) if pd.notna(x) else "---")

    display_df["Model"] = display_df["Model_Name"]

    if "timestamp" in display_df.columns:
        display_df["Date"] = pd.to_datetime(display_df["timestamp"], errors='coerce').dt.strftime("%Y-%m-%d")
    else:
        display_df["Date"] = "---"

    # Optional columns (e.g. "Type" in older files) are simply left out.
    available_columns = [col for col in display_columns if col in display_df.columns]
    return display_df[available_columns]
def create_performance_chart():
    """Create performance visualization chart.

    Reads the leaderboard CSV and draws WER and CER as bars and the combined
    score as a line, one x position per model, ordered by combined score.

    Returns:
        A Plotly figure, or ``None`` when the leaderboard is empty or the
        chart cannot be built (the error is printed).
    """
    try:
        df = pd.read_csv(leaderboard_file)
        if len(df) == 0:
            return None

        df = df.sort_values("Combined_Score")

        # Computed once for all three traces; astype(str) keeps a non-string
        # name (e.g. a purely numeric one) from breaking the split.
        short_names = df["Model_Name"].astype(str).str.split("/").str[-1]

        fig = go.Figure()

        # NOTE(review): in this copy of the file the hover templates were
        # split by a bare line break inside the string literal (a syntax
        # error). "<br>" - Plotly's hover-text line break - is used in its
        # place; confirm against the original markup.
        fig.add_trace(go.Bar(
            name="WER",
            x=short_names,
            y=df["WER"] * 100,
            marker_color='#ff7f0e',
            hovertemplate='%{x}<br>WER: %{y:.2f}%',
        ))

        fig.add_trace(go.Bar(
            name="CER",
            x=short_names,
            y=df["CER"] * 100,
            marker_color='#2ca02c',
            hovertemplate='%{x}<br>CER: %{y:.2f}%',
        ))

        fig.add_trace(go.Scatter(
            name="Combined Score",
            x=short_names,
            y=df["Combined_Score"] * 100,
            mode='lines+markers',
            line=dict(color='#d62728', width=3),
            marker=dict(size=8),
            hovertemplate='%{x}<br>Combined Score: %{y:.2f}%',
        ))

        fig.update_layout(
            title={
                'text': "📊 Model Performance Comparison",
                'x': 0.5,
                'font': {'size': 18, 'family': 'Rubik'},
            },
            xaxis_title="Model",
            yaxis_title="Error Rate (%)",
            hovermode='x unified',
            height=500,
            showlegend=True,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Inter", size=12),
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1,
            ),
        )

        return fig
    except Exception as e:
        print(f"Error creating chart: {str(e)}")
        return None
def get_leaderboard_stats():
    """Get summary statistics for the leaderboard.

    Returns a text snippet with the number of models, the best combined
    score, the mean WER and CER, and the name of the best model. On failure
    a snippet containing the error message is returned instead of raising.

    NOTE(review): the snippets below contain only text lines; the markup that
    presumably wrapped them is not present in this copy of the file - confirm
    against the original before shipping.
    """
    # Local import: keeps this block self-contained.
    from html import escape

    try:
        df = pd.read_csv(leaderboard_file)
        if len(df) == 0:
            return "\n0\nModels Submitted\n"

        best_model = df.loc[df["Combined_Score"].idxmin()]
        total_models = len(df)
        avg_wer = df["WER"].mean()
        avg_cer = df["CER"].mean()
        # Model names come from user submissions; escape before embedding.
        champion = escape(str(best_model["Model_Name"]))

        return f"""
{total_models}
Models Evaluated
{format_as_percentage(best_model['Combined_Score'])}
Best Combined Score
{format_as_percentage(avg_wer)}
Average WER
{format_as_percentage(avg_cer)}
Average CER

🏆 Current Champion: {champion}

"""
    except Exception as e:
        return f"\n\nError loading stats: {escape(str(e))}\n\n"


def update_ranking(method):
    """Update leaderboard ranking based on selected method.

    Args:
        method: Label of the selected ranking option. "WER Only" and
            "CER Only" rank by that metric; any other value ranks by the
            combined score.

    Returns:
        The display-ready leaderboard frame, or an empty frame with the
        display columns if the leaderboard cannot be read.
    """
    sort_columns = {"WER Only": "WER", "CER Only": "CER"}
    try:
        current_lb = pd.read_csv(leaderboard_file)

        if "Combined_Score" not in current_lb.columns:
            current_lb["Combined_Score"] = current_lb["WER"] * 0.7 + current_lb["CER"] * 0.3

        return prepare_leaderboard_for_display(current_lb, sort_columns.get(method, "Combined_Score"))
    except Exception as e:
        print(f"Error updating ranking: {str(e)}")
        return pd.DataFrame(columns=["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"])


def compare_models(model_1_name, model_2_name):
    """Compare two models performance.

    Returns:
        A frame with one row per metric (WER, CER, Combined Score), one
        column per model and a signed "Difference" column (model 1 minus
        model 2). A one-row "Info" or "Error" frame is returned when the
        comparison cannot be made.
    """
    try:
        df = pd.read_csv(leaderboard_file)

        if model_1_name == model_2_name:
            return pd.DataFrame([{"Info": "Please select two different models to compare."}])

        model_1 = df[df["Model_Name"] == model_1_name]
        model_2 = df[df["Model_Name"] == model_2_name]

        if model_1.empty or model_2.empty:
            return pd.DataFrame([{"Info": "One or both models not found in leaderboard."}])

        m1 = model_1.iloc[0]
        m2 = model_2.iloc[0]

        # Column headers are the short names. If both models share one
        # (e.g. "a/whisper" vs "b/whisper") the second dict key would replace
        # the first and a column would vanish, so use full names instead.
        label_1 = str(model_1_name).split("/")[-1]
        label_2 = str(model_2_name).split("/")[-1]
        if label_1 == label_2:
            label_1, label_2 = str(model_1_name), str(model_2_name)

        metric_columns = ["WER", "CER", "Combined_Score"]
        comparison_data = {
            "Metric": ["WER", "CER", "Combined Score"],
            label_1: [f"{m1[col]*100:.2f}%" for col in metric_columns],
            label_2: [f"{m2[col]*100:.2f}%" for col in metric_columns],
            "Difference": [f"{(m1[col] - m2[col])*100:+.2f}%" for col in metric_columns],
        }

        return pd.DataFrame(comparison_data)
    except Exception as e:
        return pd.DataFrame([{"Error": f"Error comparing models: {str(e)}"}])
def _validate_submission_frame(df):
    """Return an error message if ``df`` is not a complete submission, else None."""
    if len(df) == 0:
        return "❌ **Error:** Uploaded CSV is empty."

    if set(df.columns) != {"id", "text"}:
        # map(str, ...) so a header that pandas parsed as a number cannot
        # break the error message itself.
        return f"❌ **Error:** CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(map(str, df.columns))}"

    if df["id"].duplicated().any():
        dup_ids = df[df["id"].duplicated()]["id"].unique()
        return f"❌ **Error:** Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}"

    if not references:
        # Without references every submitted ID would be reported as "extra",
        # which blames the submitter for a server-side loading failure.
        return "❌ **Error:** Reference dataset is not available, so submissions cannot be scored right now."

    submitted_ids = set(df["id"])
    missing_ids = set(references.keys()) - submitted_ids
    extra_ids = submitted_ids - set(references.keys())

    if missing_ids:
        return f"❌ **Error:** Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}"

    if extra_ids:
        return f"❌ **Error:** Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}"

    return None


def process_submission(model_name, csv_file, model_type, origin_country):
    """Process a new model submission with enhanced metadata.

    Validates the uploaded CSV, scores it against the references, then
    inserts or updates the model's row in the leaderboard CSV.

    Args:
        model_name: Name under which the model is listed.
        csv_file: Uploaded predictions CSV with exactly the columns "id" and
            "text"; passed straight to ``pandas.read_csv``.
        model_type: Stored in the "Type" column.
        origin_country: Stored in the "Origin" column.

    Returns:
        Tuple ``(status_message, display_leaderboard, chart)``. On any error
        the message describes the problem and the other two are ``None``.
    """
    if not model_name or not model_name.strip():
        return "❌ **Error:** Please provide a model name.", None, None

    if not csv_file:
        return "❌ **Error:** Please upload a CSV file.", None, None

    # Store the same stripped name that was validated, so "model" and
    # "model " do not become two separate leaderboard rows.
    model_name = model_name.strip()

    try:
        df = pd.read_csv(csv_file)

        validation_error = _validate_submission_frame(df)
        if validation_error:
            return validation_error, None, None

        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)

            # A near-zero WER is treated as a sign the references were
            # submitted back rather than real model output.
            if avg_wer < 0.001:
                return "❌ **Error:** WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None, None
        except Exception as e:
            return f"❌ **Error calculating metrics:** {str(e)}", None, None

        # Update leaderboard
        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        combined_score = avg_wer * 0.7 + avg_cer * 0.3

        existing = leaderboard["Model_Name"] == model_name
        if existing.any():
            # Re-submission: overwrite the model's previous entry in place.
            leaderboard.loc[existing, "WER"] = avg_wer
            leaderboard.loc[existing, "CER"] = avg_cer
            leaderboard.loc[existing, "Combined_Score"] = combined_score
            leaderboard.loc[existing, "timestamp"] = timestamp
            leaderboard.loc[existing, "Type"] = model_type
            leaderboard.loc[existing, "Origin"] = origin_country
            updated_leaderboard = leaderboard
        else:
            new_entry = pd.DataFrame(
                [[model_name, avg_wer, avg_cer, combined_score, timestamp, model_type, origin_country, "ASR"]],
                columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"],
            )
            # ignore_index: the new row would otherwise reuse index label 0.
            updated_leaderboard = pd.concat([leaderboard, new_entry], ignore_index=True)

        updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
        updated_leaderboard.to_csv(leaderboard_file, index=False)

        display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
        chart = create_performance_chart()

        badge = get_performance_badge(combined_score)
        success_msg = f"""
✅ **Submission processed successfully!**

**{model_name}** ({model_type} from {origin_country})
- **WER:** {format_as_percentage(avg_wer)}
- **CER:** {format_as_percentage(avg_cer)}
- **Combined Score:** {format_as_percentage(combined_score)}
- **Performance:** {badge}
"""

        return success_msg, display_leaderboard, chart

    except Exception as e:
        return f"❌ **Error processing submission:** {str(e)}", None, None
col in required_columns: if col not in current_leaderboard.columns: if col == "Combined_Score": current_leaderboard[col] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3 else: current_leaderboard[col] = "Unknown" if col != "Task" else "ASR" current_leaderboard.to_csv(leaderboard_file, index=False) return current_leaderboard else: return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]) except Exception as e: print(f"Error getting leaderboard: {str(e)}") return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]) def create_leaderboard_table(): """Create and format the leaderboard table for display""" leaderboard_data = get_current_leaderboard() return prepare_leaderboard_for_display(leaderboard_data) def df_to_html(df): """Convert DataFrame to HTML with custom styling""" if df.empty: return "

No data available

" # Convert DataFrame to HTML html = df.to_html(index=False, escape=False, classes="leaderboard-table") # Add custom styling html = html.replace('Main Leaderboard") initial_leaderboard = create_leaderboard_table() with gr.Row(): ranking_method = gr.Radio( ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"], label="🔄 Ranking Method", value="Combined Score (WER 70%, CER 30%)", info="Choose how to rank the models" ) leaderboard_view = gr.DataFrame( value=initial_leaderboard, interactive=False, label="📋 Leaderboard Rankings - Lower scores indicate better performance", wrap=True, height=400 ) # Performance chart gr.Markdown("### 📊 Visual Performance Comparison") performance_chart = gr.Plot( value=create_performance_chart(), label="Model Performance Visualization" ) ranking_method.change( fn=update_ranking, inputs=[ranking_method], outputs=[leaderboard_view] ) with gr.Accordion("📖 Understanding ASR Metrics", open=False): gr.Markdown(""" ## 🎯 Automatic Speech Recognition Evaluation Metrics ### Word Error Rate (WER) **WER** measures transcription accuracy at the word level: - **Formula:** `(Substitutions + Insertions + Deletions) / Total Reference Words` - **Range:** 0% (perfect) to 100%+ (very poor) - **Interpretation:** - 0-5%: 🏆 Excellent performance - 5-15%: 🥉 Good performance - 15-30%: 📈 Fair performance - 30%+: Poor performance ### Character Error Rate (CER) **CER** measures transcription accuracy at the character level: - **Advantage:** More granular than WER, captures partial matches - **Benefit for Bambara:** Particularly valuable for agglutinative languages - **Typical Range:** Usually lower than WER values ### Combined Score (Primary Ranking Metric) **Formula:** `Combined Score = 0.7 × WER + 0.3 × CER` - **Rationale:** Balanced evaluation emphasizing word-level accuracy - **Usage:** Primary metric for model ranking ### 🎯 Performance Categories - 🏆 **Excellent**: < 15% Combined Score - 🥉 **Good**: 15-30% Combined Score - 📈 **Fair**: > 30% Combined Score 
""") with gr.TabItem("📤 Submit New Model", id="submit"): gr.HTML("

Submit Your Bambara ASR Model

") gr.Markdown(""" ### 🚀 Ready to benchmark your model? Submit your results and join the leaderboard! Follow these steps to submit your Bambara ASR model for evaluation. """) with gr.Group(elem_classes="form-section"): with gr.Row(): with gr.Column(scale=2): model_name_input = gr.Textbox( label="🤖 Model Name", placeholder="e.g., MALIBA-AI/bambara-whisper-large", info="Use a descriptive name (organization/model format preferred)" ) model_type = gr.Dropdown( label="🏷️ Model Type", choices=["Whisper-based", "Wav2Vec2", "Foundation", "Custom", "Fine-tuned", "Multilingual", "Other"], value="Custom", info="Select the type/architecture of your model" ) origin_country = gr.Dropdown( label="🌍 Origin/Institution", choices=["Mali", "Senegal", "Burkina Faso", "Niger", "Guinea", "Ivory Coast", "USA", "France", "Canada", "UK", "Other"], value="Mali", info="Country or region of the developing institution" ) with gr.Column(scale=1): gr.Markdown(""" #### 📋 Submission Requirements **CSV Format:** - Columns: `id`, `text` - Match all reference dataset IDs - No duplicate IDs - Text transcriptions in Bambara **Data Quality:** - Clean, normalized text - Consistent formatting - Complete coverage of test set """) csv_upload = gr.File( label="📁 Upload Predictions CSV", file_types=[".csv"], info="Upload your model's transcriptions in the required CSV format" ) submit_btn = gr.Button("🚀 Submit Model", variant="primary", size="lg", elem_classes=['gradio-button', 'primary']) output_msg = gr.Markdown(label="📢 Submission Status") with gr.Row(): leaderboard_display = gr.DataFrame( label="📊 Updated Leaderboard", value=initial_leaderboard, interactive=False, wrap=True, height=400 ) updated_chart = gr.Plot( label="📈 Updated Performance Chart" ) submit_btn.click( fn=process_submission, inputs=[model_name_input, csv_upload, model_type, origin_country], outputs=[output_msg, leaderboard_display, updated_chart] ) with gr.TabItem("🔍 Compare Models", id="compare"): gr.HTML("

Compare Two Models

") gr.Markdown("### Select two models to compare their performance side-by-side") with gr.Row(): current_data = get_current_leaderboard() model_names = current_data["Model_Name"].tolist() if not current_data.empty else [] model_1_dropdown = gr.Dropdown( choices=model_names, label="🤖 Model 1", info="Select the first model for comparison" ) model_2_dropdown = gr.Dropdown( choices=model_names, label="🤖 Model 2", info="Select the second model for comparison" ) compare_btn = gr.Button("⚡ Compare Models", variant="primary", elem_classes=['gradio-button', 'primary']) comparison_note = gr.Markdown(""" **Note on Comparison Results:** - Positive difference values (🟢) indicate Model 1 performed better - Negative difference values (🔴) indicate Model 2 performed better - Lower error rates indicate better performance """, visible=False) comparison_output = gr.DataFrame( label="📊 Model Comparison Results", value=pd.DataFrame([{"Info": "Select two models and click Compare to see the results."}]), interactive=False ) def update_comparison_table(m1, m2): if not m1 or not m2: return gr.update(visible=False), pd.DataFrame([{"Info": "Please select both models before clicking Compare."}]) if m1 == m2: return gr.update(visible=False), pd.DataFrame([{"Info": "Please select two different models to compare."}]) df = compare_models(m1, m2) return gr.update(visible=True), df compare_btn.click( fn=update_comparison_table, inputs=[model_1_dropdown, model_2_dropdown], outputs=[comparison_note, comparison_output] ) with gr.TabItem("📊 Dataset & Methodology", id="dataset"): gr.HTML("

Dataset & Methodology

") gr.Markdown(""" ## 🎯 About the Bambara Speech Recognition Benchmark ### 📈 Dataset Overview Our benchmark is built on the **`sudoping01/bambara-speech-recognition-benchmark`** dataset, featuring: - **🎙️ Diverse Audio Samples:** Various speakers, dialects, and recording conditions - **🗣️ Speaker Variety:** Multiple native Bambara speakers from different regions - **🎵 Acoustic Diversity:** Different recording environments and quality levels - **✅ Quality Assurance:** Manually validated transcriptions - **📚 Content Variety:** Multiple domains and speaking styles ### 🔬 Evaluation Methodology #### Text Normalization Process 1. **Lowercase conversion** for consistency 2. **Punctuation removal** to focus on linguistic content 3. **Whitespace normalization** for standardized formatting 4. **Unicode normalization** for proper character handling #### Quality Controls - **Outlier Detection:** Extreme error rates are capped to prevent skewing - **Data Validation:** Comprehensive format and completeness checks - **Duplicate Prevention:** Automatic detection of duplicate submissions - **Missing Data Handling:** Identification of incomplete submissions ### 🚀 How to Participate #### Step 1: Access the Dataset ```python from datasets import load_dataset dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark") ``` #### Step 2: Generate Predictions - Process the audio files with your ASR model - Generate transcriptions for each audio sample - Ensure your model outputs text in Bambara language #### Step 3: Format Results Create a CSV file with exactly these columns: - **`id`**: Sample identifier (must match dataset IDs) - **`text`**: Your model's transcription #### Step 4: Submit & Evaluate - Upload your CSV using the submission form - Your model will be automatically evaluated - Results appear on the leaderboard immediately ### 🏆 Recognition & Impact **Top-performing models will be:** - Featured prominently on our leaderboard - Highlighted in MALIBA-AI 
communications - Considered for inclusion in production systems - Invited to present at community events ### 🤝 Community Guidelines - **Reproducibility:** Please provide model details and methodology - **Fair Play:** No data leakage or unfair advantages - **Collaboration:** Share insights and learnings with the community - **Attribution:** Properly cite the benchmark in publications ### 📚 Technical Specifications | Aspect | Details | |--------|---------| | **Audio Format** | WAV, various sample rates | | **Language** | Bambara (bam) | | **Evaluation Metrics** | WER, CER, Combined Score | | **Text Encoding** | UTF-8 | | **Submission Format** | CSV with id, text columns | """) # Citation and Footer with gr.Group(elem_classes="content-card"): gr.HTML("""

📚 Citation

If you use the Bambara ASR Leaderboard for your scientific publication, or if you find the resources useful, please cite our work:

@misc{bambara_asr_leaderboard_2025,
  title={Bambara Speech Recognition Leaderboard},
  author={MALIBA-AI Team},
  year={2025},
  url={https://huggingface.co/spaces/MALIBA-AI/bambara-asr-leaderboard},
  note={A community initiative for advancing Bambara speech recognition technology}
}
            
""") gr.HTML("""

About MALIBA-AI

MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation
"No Malian Language Left Behind"

This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology. For more information, visit MALIBA-AI or our Hugging Face page.

🇲🇱 🤝 🚀
""") if __name__ == "__main__": demo.launch()