Spaces:

Omartificial-Intelligence-Space
/

EgypTalk-ASR-v2-demo

Running on Zero

App Files Community

Omartificial-Intelligence-Space committed on Aug 9

Commit

6639d45

verified ·

1 Parent(s): b0a28f2

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -101

app.py CHANGED Viewed

@@ -1,8 +1,6 @@
 import gradio as gr
-import nemo.collections.asr as nemo_asr
 from nemo.core import ModelPT
 import torch
-from evaluate import load as hf_load
 import os
 import spaces
@@ -19,9 +17,6 @@ MODEL_NAMES = [
 # Cache loaded models
 LOADED_MODELS = {}
-# Load WER and CER metrics from HuggingFace evaluate
-hf_wer = hf_load("wer")
-hf_cer = hf_load("cer")
 def get_model(model_name):
     if model_name not in LOADED_MODELS:
@@ -38,127 +33,56 @@ def get_model(model_name):
     return LOADED_MODELS[model_name]
 @spaces.GPU(duration=120)
-def transcribe_and_score(audio, ground_truth, model_name):
-    if audio is None or ground_truth is None or model_name not in MODEL_NAMES:
-        return "", None, None
-    model = get_model(model_name)
     # Use the correct transcribe API
     predictions = model.transcribe([audio])
     pred = predictions[0] if isinstance(predictions, list) else predictions
-    # Ensure both are strings and not empty
-    if not isinstance(ground_truth, str):
-        ground_truth = str(ground_truth)
     if not isinstance(pred, str):
         pred = str(pred)
-    # Strip whitespace
-    ground_truth = ground_truth.strip()
-    pred = pred.strip()
-    # Debug output
-    print(f"[DEBUG] Model: {model_name}")
-    print(f"[DEBUG] Ground truth: '{ground_truth}' (length: {len(ground_truth)})")
-    print(f"[DEBUG] Prediction: '{pred}' (length: {len(pred)})")
-    print(f"[DEBUG] Are they equal? {ground_truth == pred}")
-    print(f"[DEBUG] Ground truth bytes: {repr(ground_truth)}")
-    print(f"[DEBUG] Prediction bytes: {repr(pred)}")
-    if not ground_truth or not pred:
-        print("[DEBUG] Empty ground truth or prediction, returning 1.0")
-        return pred, 1.0, 1.0
-    # Calculate WER and CER
-    wer_score = hf_wer.compute(predictions=[pred], references=[ground_truth])
-    cer_score = hf_cer.compute(predictions=[pred], references=[ground_truth])
-    print(f"[DEBUG] WER: {wer_score}, CER: {cer_score}")
-    return pred, wer_score, cer_score
 @spaces.GPU(duration=120)
-def batch_transcribe_and_score(audio_files, ground_truths, model_name):
-    if not audio_files or not ground_truths or model_name not in MODEL_NAMES:
-        return [], [], [], None, None
-    if len(audio_files) != len(ground_truths):
-        return [], [], [], None, None
-    model = get_model(model_name)
     # Use the correct transcribe API for batch
     predictions = model.transcribe(audio_files)
-    results = []
-    pred_texts = []
-    for i, (audio_file, gt) in enumerate(zip(audio_files, ground_truths)):
-        pred = predictions[i] if isinstance(predictions, list) else predictions
-        if not isinstance(gt, str):
-            gt = str(gt)
-        if not isinstance(pred, str):
-            pred = str(pred)
-        gt = gt.strip()
-        pred = pred.strip()
-        if not gt or not pred:
-            wer_score = 1.0
-            cer_score = 1.0
-        else:
-            wer_score = hf_wer.compute(predictions=[pred], references=[gt])
-            cer_score = hf_cer.compute(predictions=[pred], references=[gt])
-        results.append([pred, wer_score, cer_score])
-        pred_texts.append(pred)
-    # Calculate average WER and CER
-    if pred_texts and ground_truths:
-        avg_wer = hf_wer.compute(predictions=pred_texts, references=ground_truths)
-        avg_cer = hf_cer.compute(predictions=pred_texts, references=ground_truths)
     else:
-        avg_wer = None
-        avg_cer = None
-    return results, avg_wer, avg_cer
 with gr.Blocks(title="EgypTalk-ASR-v2") as demo:
     gr.Markdown("""
     # EgypTalk-ASR-v2
-    Upload an audio file and its ground truth text. This app transcribes and evaluates WER/CER using EgypTalk-ASR-v2.
     """)
     with gr.Tab("Single Test"):
         with gr.Row():
             audio_input = gr.Audio(type="filepath", label="Audio File")
-            gt_input = gr.Textbox(label="Ground Truth Text")
-        model_choice = gr.Radio(MODEL_NAMES, value=MODEL_NAMES[0], label="Model")
-        transcribe_btn = gr.Button("Transcribe & Evaluate")
-        with gr.Row():
-            pred_output = gr.Textbox(label="Transcription")
-            wer_output = gr.Number(label="WER")
-            cer_output = gr.Number(label="CER")
-        transcribe_btn.click(transcribe_and_score, inputs=[audio_input, gt_input, model_choice], outputs=[pred_output, wer_output, cer_output])
     with gr.Tab("Batch Test"):
-        gr.Markdown("Upload multiple audio files and a text file with ground truth (one line per audio, same order). Batch size is limited by GPU/CPU memory.")
         audio_files = gr.Files(label="Audio Files (wav)")
-        gt_file = gr.File(label="Ground Truths (txt, one per line)")
-        model_choice_batch = gr.Radio(MODEL_NAMES, value=MODEL_NAMES[0], label="Model")
-        batch_btn = gr.Button("Batch Transcribe & Evaluate")
-        preds_output = gr.Dataframe(headers=["Prediction", "WER", "CER"], label="Results")
-        avg_wer_output = gr.Number(label="Average WER")
-        avg_cer_output = gr.Number(label="Average CER")
-        def batch_wrapper(audio_files, gt_file, model_name):
-            if not audio_files or not gt_file:
-                return [], None, None
-            with open(gt_file, 'r', encoding='utf-8') as f:
-                gts = [line.strip() for line in f if line.strip()]
-            audio_files_sorted = sorted(audio_files, key=lambda x: os.path.basename(x))
-            results, avg_wer, avg_cer = batch_transcribe_and_score(audio_files_sorted, gts, model_name)
-            return results, avg_wer, avg_cer
-        batch_btn.click(batch_wrapper, inputs=[audio_files, gt_file, model_choice_batch], outputs=[preds_output, avg_wer_output, avg_cer_output])
 demo.launch(share=True)

 import gradio as gr
 from nemo.core import ModelPT
 import torch
 import os
 import spaces
 # Cache loaded models
 LOADED_MODELS = {}
 def get_model(model_name):
     if model_name not in LOADED_MODELS:
     return LOADED_MODELS[model_name]
 @spaces.GPU(duration=120)
+def transcribe_and_score(audio):
+    if audio is None:
+        return ""
+    model = get_model(MODEL_NAMES[0])
     # Use the correct transcribe API
     predictions = model.transcribe([audio])
     pred = predictions[0] if isinstance(predictions, list) else predictions
     if not isinstance(pred, str):
         pred = str(pred)
+    return pred.strip()
 @spaces.GPU(duration=120)
+def batch_transcribe(audio_files):
+    if not audio_files:
+        return []
+    model = get_model(MODEL_NAMES[0])
     # Use the correct transcribe API for batch
     predictions = model.transcribe(audio_files)
+    if isinstance(predictions, list):
+        texts = [p if isinstance(p, str) else str(p) for p in predictions]
     else:
+        texts = [str(predictions)]
+    # Return as rows for a single-column dataframe
+    return [[t.strip()] for t in texts]
 with gr.Blocks(title="EgypTalk-ASR-v2") as demo:
     gr.Markdown("""
     # EgypTalk-ASR-v2
+    Upload an audio file. This app transcribes audio using EgypTalk-ASR-v2.
     """)
     with gr.Tab("Single Test"):
         with gr.Row():
             audio_input = gr.Audio(type="filepath", label="Audio File")
+        transcribe_btn = gr.Button("Transcribe")
+        pred_output = gr.Textbox(label="Transcription")
+        transcribe_btn.click(transcribe_and_score, inputs=[audio_input], outputs=[pred_output])
     with gr.Tab("Batch Test"):
+        gr.Markdown("Upload multiple audio files. Batch size is limited by GPU/CPU memory.")
         audio_files = gr.Files(label="Audio Files (wav)")
+        batch_btn = gr.Button("Batch Transcribe")
+        preds_output = gr.Dataframe(headers=["Transcription"], label="Results")
+        batch_btn.click(batch_transcribe, inputs=[audio_files], outputs=[preds_output])
 demo.launch(share=True)