"
def update_ranking(method):
"""Update leaderboard ranking based on selected method"""
try:
current_lb = pd.read_csv(leaderboard_file)
if "Combined_Score" not in current_lb.columns:
current_lb["Combined_Score"] = current_lb["WER"] * 0.7 + current_lb["CER"] * 0.3
sort_column = "Combined_Score"
if method == "WER Only":
sort_column = "WER"
elif method == "CER Only":
sort_column = "CER"
return prepare_leaderboard_for_display(current_lb, sort_column)
except Exception as e:
print(f"Error updating ranking: {str(e)}")
return pd.DataFrame(columns=["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"])
def compare_models(model_1_name, model_2_name):
"""Compare two models performance"""
try:
df = pd.read_csv(leaderboard_file)
if model_1_name == model_2_name:
return pd.DataFrame([{"Info": "Please select two different models to compare."}])
model_1 = df[df["Model_Name"] == model_1_name]
model_2 = df[df["Model_Name"] == model_2_name]
if model_1.empty or model_2.empty:
return pd.DataFrame([{"Info": "One or both models not found in leaderboard."}])
m1 = model_1.iloc[0]
m2 = model_2.iloc[0]
comparison_data = {
"Metric": ["WER", "CER", "Combined Score"],
model_1_name.split("/")[-1]: [
f"{m1['WER']*100:.2f}%",
f"{m1['CER']*100:.2f}%",
f"{m1['Combined_Score']*100:.2f}%"
],
model_2_name.split("/")[-1]: [
f"{m2['WER']*100:.2f}%",
f"{m2['CER']*100:.2f}%",
f"{m2['Combined_Score']*100:.2f}%"
],
"Difference": [
f"{(m1['WER'] - m2['WER'])*100:+.2f}%",
f"{(m1['CER'] - m2['CER'])*100:+.2f}%",
f"{(m1['Combined_Score'] - m2['Combined_Score'])*100:+.2f}%"
]
}
return pd.DataFrame(comparison_data)
except Exception as e:
return pd.DataFrame([{"Error": f"Error comparing models: {str(e)}"}])
def process_submission(model_name, csv_file, model_type, origin_country):
"""Process a new model submission with enhanced metadata"""
if not model_name or not model_name.strip():
return "❌ **Error:** Please provide a model name.", None, None
if not csv_file:
return "❌ **Error:** Please upload a CSV file.", None, None
try:
df = pd.read_csv(csv_file)
if len(df) == 0:
return "❌ **Error:** Uploaded CSV is empty.", None, None
if set(df.columns) != {"id", "text"}:
return f"❌ **Error:** CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None, None
if df["id"].duplicated().any():
dup_ids = df[df["id"].duplicated()]["id"].unique()
return f"❌ **Error:** Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None, None
missing_ids = set(references.keys()) - set(df["id"])
extra_ids = set(df["id"]) - set(references.keys())
if missing_ids:
return f"❌ **Error:** Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}", None, None
if extra_ids:
return f"❌ **Error:** Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}", None, None
try:
avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)
if avg_wer < 0.001:
return "❌ **Error:** WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None, None
except Exception as e:
return f"❌ **Error calculating metrics:** {str(e)}", None, None
# Update leaderboard
leaderboard = pd.read_csv(leaderboard_file)
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
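# Primary ranking metric: weight word-level accuracy at 70% and character-level accuracy at 30% (lower is better)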
combined_score = avg_wer * 0.7 + avg_cer * 0.3
if model_name in leaderboard["Model_Name"].values:
idx = leaderboard[leaderboard["Model_Name"] == model_name].index
leaderboard.loc[idx, "WER"] = avg_wer
leaderboard.loc[idx, "CER"] = avg_cer
leaderboard.loc[idx, "Combined_Score"] = combined_score
leaderboard.loc[idx, "timestamp"] = timestamp
leaderboard.loc[idx, "Type"] = model_type
leaderboard.loc[idx, "Origin"] = origin_country
updated_leaderboard = leaderboard
else:
new_entry = pd.DataFrame(
[[model_name, avg_wer, avg_cer, combined_score, timestamp, model_type, origin_country, "ASR"]],
columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]
)
updated_leaderboard = pd.concat([leaderboard, new_entry])
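# Lower combined score is better, so rank by sorting ascending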
updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
updated_leaderboard.to_csv(leaderboard_file, index=False)
display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
chart = create_performance_chart()
badge = get_performance_badge(combined_score)
success_msg = f"""
✅ **Submission processed successfully!**
**{model_name}** ({model_type} from {origin_country})
- **WER:** {format_as_percentage(avg_wer)}
- **CER:** {format_as_percentage(avg_cer)}
- **Combined Score:** {format_as_percentage(combined_score)}
- **Performance:** {badge}
"""
return success_msg, display_leaderboard, chart
except Exception as e:
return f"❌ **Error processing submission:** {str(e)}", None, None
def get_current_leaderboard():
"""Get the current leaderboard data for display"""
try:
if os.path.exists(leaderboard_file):
current_leaderboard = pd.read_csv(leaderboard_file)
# Ensure all required columns exist
required_columns = ["Combined_Score", "Type", "Origin", "Task"]
for col in required_columns:
if col not in current_leaderboard.columns:
if col == "Combined_Score":
current_leaderboard[col] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
else:
current_leaderboard[col] = "Unknown" if col != "Task" else "ASR"
current_leaderboard.to_csv(leaderboard_file, index=False)
return current_leaderboard
else:
return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"])
except Exception as e:
print(f"Error getting leaderboard: {str(e)}")
return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"])
def create_leaderboard_table():
"""Create and format the leaderboard table for display"""
leaderboard_data = get_current_leaderboard()
return prepare_leaderboard_for_display(leaderboard_data)
def df_to_html(df):
    """Convert a DataFrame to HTML with custom styling"""
    if df.empty:
        return "<p>No data available</p>"
    # Convert the DataFrame to HTML; visual styling is attached via the "leaderboard-table" CSS class
    html = df.to_html(index=False, escape=False, classes="leaderboard-table")
    return html
Main Leaderboard")
initial_leaderboard = create_leaderboard_table()
with gr.Row():
ranking_method = gr.Radio(
["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
label="🔄 Ranking Method",
value="Combined Score (WER 70%, CER 30%)",
info="Choose how to rank the models"
)
leaderboard_view = gr.DataFrame(
value=initial_leaderboard,
interactive=False,
label="📋 Leaderboard Rankings - Lower scores indicate better performance",
wrap=True,
height=400
)
# Performance chart
gr.Markdown("### 📊 Visual Performance Comparison")
performance_chart = gr.Plot(
value=create_performance_chart(),
label="Model Performance Visualization"
)
ranking_method.change(
fn=update_ranking,
inputs=[ranking_method],
outputs=[leaderboard_view]
)
with gr.Accordion("📖 Understanding ASR Metrics", open=False):
gr.Markdown("""
## 🎯 Automatic Speech Recognition Evaluation Metrics
### Word Error Rate (WER)
**WER** measures transcription accuracy at the word level:
- **Formula:** `(Substitutions + Insertions + Deletions) / Total Reference Words`
- **Range:** 0% (perfect) to 100%+ (very poor)
- **Interpretation:**
- 0-5%: 🏆 Excellent performance
- 5-15%: 🥉 Good performance
- 15-30%: 📈 Fair performance
- 30%+: Poor performance
### Character Error Rate (CER)
**CER** measures transcription accuracy at the character level:
- **Advantage:** More granular than WER, captures partial matches
- **Benefit for Bambara:** Particularly valuable for agglutinative languages
- **Typical Range:** Usually lower than WER values
### Combined Score (Primary Ranking Metric)
**Formula:** `Combined Score = 0.7 × WER + 0.3 × CER`
- **Rationale:** Balanced evaluation emphasizing word-level accuracy
- **Usage:** Primary metric for model ranking
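As a rough sketch (not the leaderboard's exact evaluation code), the metrics for a single reference/hypothesis pair can be computed with the `jiwer` library:
```python
# Illustrative only: the leaderboard's calculate_metrics pipeline also applies
# text normalization and averages over the full test set.
import jiwer

reference = "ne bɛ taa sugu la"
hypothesis = "ne bɛ taa sugu"

wer = jiwer.wer(reference, hypothesis)   # word-level edits / reference words
cer = jiwer.cer(reference, hypothesis)   # character-level edits / reference characters
combined = 0.7 * wer + 0.3 * cer         # ranking metric (lower is better)
print(f"WER={wer:.2%}  CER={cer:.2%}  Combined={combined:.2%}")
```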
### 🎯 Performance Categories
- 🏆 **Excellent**: < 15% Combined Score
- 🥉 **Good**: 15-30% Combined Score
- 📈 **Fair**: > 30% Combined Score
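The badge shown next to each model follows these bands. A minimal helper mirroring them (the app's own `get_performance_badge` may word things differently) looks like:
```python
# Thresholds taken from the categories above; scores are stored as fractions (0.15 = 15%).
def badge_for(combined_score):
    if combined_score < 0.15:
        return "🏆 Excellent"
    elif combined_score <= 0.30:
        return "🥉 Good"
    return "📈 Fair"
```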
""")
with gr.TabItem("📤 Submit New Model", id="submit"):
gr.HTML("
Submit Your Bambara ASR Model
")
gr.Markdown("""
### 🚀 Ready to benchmark your model? Submit your results and join the leaderboard!
Follow these steps to submit your Bambara ASR model for evaluation.
""")
with gr.Group(elem_classes="form-section"):
with gr.Row():
with gr.Column(scale=2):
model_name_input = gr.Textbox(
label="🤖 Model Name",
placeholder="e.g., MALIBA-AI/bambara-whisper-large",
info="Use a descriptive name (organization/model format preferred)"
)
model_type = gr.Dropdown(
label="🏷️ Model Type",
choices=["Whisper-based", "Wav2Vec2", "Foundation", "Custom", "Fine-tuned", "Multilingual", "Other"],
value="Custom",
info="Select the type/architecture of your model"
)
origin_country = gr.Dropdown(
label="🌍 Origin/Institution",
choices=["Mali", "Senegal", "Burkina Faso", "Niger", "Guinea", "Ivory Coast", "USA", "France", "Canada", "UK", "Other"],
value="Mali",
info="Country or region of the developing institution"
)
with gr.Column(scale=1):
gr.Markdown("""
#### 📋 Submission Requirements
**CSV Format:**
- Columns: `id`, `text`
- Match all reference dataset IDs
- No duplicate IDs
- Text transcriptions in Bambara
**Data Quality:**
- Clean, normalized text
- Consistent formatting
- Complete coverage of test set
""")
csv_upload = gr.File(
label="📁 Upload Predictions CSV",
file_types=[".csv"],
info="Upload your model's transcriptions in the required CSV format"
)
submit_btn = gr.Button("🚀 Submit Model", variant="primary", size="lg", elem_classes=['gradio-button', 'primary'])
output_msg = gr.Markdown(label="📢 Submission Status")
with gr.Row():
leaderboard_display = gr.DataFrame(
label="📊 Updated Leaderboard",
value=initial_leaderboard,
interactive=False,
wrap=True,
height=400
)
updated_chart = gr.Plot(
label="📈 Updated Performance Chart"
)
submit_btn.click(
fn=process_submission,
inputs=[model_name_input, csv_upload, model_type, origin_country],
outputs=[output_msg, leaderboard_display, updated_chart]
)
with gr.TabItem("🔍 Compare Models", id="compare"):
gr.HTML("
Compare Two Models
")
gr.Markdown("### Select two models to compare their performance side-by-side")
with gr.Row():
current_data = get_current_leaderboard()
model_names = current_data["Model_Name"].tolist() if not current_data.empty else []
model_1_dropdown = gr.Dropdown(
choices=model_names,
label="🤖 Model 1",
info="Select the first model for comparison"
)
model_2_dropdown = gr.Dropdown(
choices=model_names,
label="🤖 Model 2",
info="Select the second model for comparison"
)
compare_btn = gr.Button("⚡ Compare Models", variant="primary", elem_classes=['gradio-button', 'primary'])
comparison_note = gr.Markdown("""
**Note on Comparison Results:**
- Differences are computed as Model 1 minus Model 2 for each error metric
- Negative values mean Model 1 has the lower error rate (performed better)
- Positive values mean Model 2 performed better
- Lower error rates always indicate better performance
""", visible=False)
comparison_output = gr.DataFrame(
label="📊 Model Comparison Results",
value=pd.DataFrame([{"Info": "Select two models and click Compare to see the results."}]),
interactive=False
)
def update_comparison_table(m1, m2):
if not m1 or not m2:
return gr.update(visible=False), pd.DataFrame([{"Info": "Please select both models before clicking Compare."}])
if m1 == m2:
return gr.update(visible=False), pd.DataFrame([{"Info": "Please select two different models to compare."}])
df = compare_models(m1, m2)
return gr.update(visible=True), df
compare_btn.click(
fn=update_comparison_table,
inputs=[model_1_dropdown, model_2_dropdown],
outputs=[comparison_note, comparison_output]
)
with gr.TabItem("📊 Dataset & Methodology", id="dataset"):
gr.HTML("
Dataset & Methodology
")
gr.Markdown("""
## 🎯 About the Bambara Speech Recognition Benchmark
### 📈 Dataset Overview
Our benchmark is built on the **`sudoping01/bambara-speech-recognition-benchmark`** dataset, featuring:
- **🎙️ Diverse Audio Samples:** Various speakers, dialects, and recording conditions
- **🗣️ Speaker Variety:** Multiple native Bambara speakers from different regions
- **🎵 Acoustic Diversity:** Different recording environments and quality levels
- **✅ Quality Assurance:** Manually validated transcriptions
- **📚 Content Variety:** Multiple domains and speaking styles
### 🔬 Evaluation Methodology
#### Text Normalization Process
1. **Lowercase conversion** for consistency
2. **Punctuation removal** to focus on linguistic content
3. **Whitespace normalization** for standardized formatting
4. **Unicode normalization** for proper character handling
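In code, the normalization applied to both references and hypotheses is roughly the following (a sketch; the benchmark's exact rules, e.g. which punctuation set it strips, live in the evaluation script):
```python
import string
import unicodedata

def normalize_text(text):
    # 1. lowercase  2. strip (ASCII) punctuation  3. collapse whitespace  4. unicode NFC
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = " ".join(text.split())
    return unicodedata.normalize("NFC", text)
```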
#### Quality Controls
- **Outlier Detection:** Extreme error rates are capped to prevent skewing
- **Data Validation:** Comprehensive format and completeness checks
- **Duplicate Prevention:** Automatic detection of duplicate submissions
- **Missing Data Handling:** Identification of incomplete submissions
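Outlier capping, for instance, can be as simple as clamping each per-sample error rate before averaging. The cap below is an assumed value for illustration, not the benchmark's actual threshold:
```python
MAX_ERROR_RATE = 2.0  # assumed cap for illustration only

def capped(rate):
    # Prevent a few degenerate transcriptions from dominating the average WER/CER
    return min(rate, MAX_ERROR_RATE)
```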
### 🚀 How to Participate
#### Step 1: Access the Dataset
```python
from datasets import load_dataset
dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark")
```
#### Step 2: Generate Predictions
- Process the audio files with your ASR model
- Generate transcriptions for each audio sample
- Ensure your model outputs text in Bambara
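For example, with a 🤗 Transformers model the prediction loop might look like the sketch below. The model id is a placeholder, and the `split`, `id`, and `audio` field names are assumed to match the benchmark:
```python
from datasets import load_dataset
from transformers import pipeline

dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", split="test")  # split name assumed
asr = pipeline("automatic-speech-recognition", model="your-org/your-bambara-asr-model")  # placeholder model id

predictions = []
for sample in dataset:
    audio = sample["audio"]
    result = asr({"raw": audio["array"], "sampling_rate": audio["sampling_rate"]})
    predictions.append({"id": sample["id"], "text": result["text"]})
```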
#### Step 3: Format Results
Create a CSV file with exactly these columns:
- **`id`**: Sample identifier (must match dataset IDs)
- **`text`**: Your model's transcription
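Continuing the sketch above, pandas keeps the required column names and order when writing the file:
```python
import pandas as pd

submission = pd.DataFrame(predictions, columns=["id", "text"])
submission.to_csv("my_model_predictions.csv", index=False)
```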
#### Step 4: Submit & Evaluate
- Upload your CSV using the submission form
- Your model will be automatically evaluated
- Results appear on the leaderboard immediately
### 🏆 Recognition & Impact
**Top-performing models will be:**
- Featured prominently on our leaderboard
- Highlighted in MALIBA-AI communications
- Considered for inclusion in production systems
- Invited to present at community events
### 🤝 Community Guidelines
- **Reproducibility:** Please provide model details and methodology
- **Fair Play:** No data leakage or unfair advantages
- **Collaboration:** Share insights and learnings with the community
- **Attribution:** Properly cite the benchmark in publications
### 📚 Technical Specifications
| Aspect | Details |
|--------|---------|
| **Audio Format** | WAV, various sample rates |
| **Language** | Bambara (bam) |
| **Evaluation Metrics** | WER, CER, Combined Score |
| **Text Encoding** | UTF-8 |
| **Submission Format** | CSV with id, text columns |
""")
# Citation and Footer
with gr.Group(elem_classes="content-card"):
gr.HTML("""
📚 Citation
If you use the Bambara ASR Leaderboard for your scientific publication, or if you find the resources useful, please cite our work:
@misc{bambara_asr_leaderboard_2025,
title={Bambara Speech Recognition Leaderboard},
author={MALIBA-AI Team},
year={2025},
url={https://huggingface.co/spaces/MALIBA-AI/bambara-asr-leaderboard},
note={A community initiative for advancing Bambara speech recognition technology}
}
""")
gr.HTML("""
About MALIBA-AI
MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation "No Malian Language Left Behind"
This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
For more information, visit MALIBA-AI or
our Hugging Face page.