Spaces:

sevda-tatlih
/

uyghur-speech-2-text

Sleeping

App Files Files Community

sevda committed on Mar 5

Commit

e80497f

verified ·

1 Parent(s): e4cfdaf

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -28

app.py CHANGED Viewed

@@ -1,15 +1,15 @@
 import torch
 import torchaudio
-# from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
 import gradio as gr
 DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 WHISPER_SAMPLE_RATE = 16000
 model_id = "ixxan/whisper-small-thugy20"
 processor = AutoProcessor.from_pretrained("ixxan/whisper-small-thugy20")
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
@@ -26,37 +26,48 @@ pipe = pipeline(
 )
 def preprocess_audio(audio_path: str) -> torch.Tensor:
-    audio, sample_rate = torchaudio.load(audio_path)
-    # Resample if necessary
-    if sample_rate != WHISPER_SAMPLE_RATE:
-        resampler = torchaudio.transforms.Resample(
-            orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
-        )
-        audio = resampler(audio)
-    # Convert to mono
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0)
-    return audio.squeeze()
-def transcribe(audio_path: str) -> str:
-    audio_input = preprocess_audio(audio_path)
-    input_features = processor(
-        audio_input,
-        sampling_rate=WHISPER_SAMPLE_RATE,
-        return_tensors="pt",
-        # language="Chinese",
-    ).input_features.to(DEVICE)
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    return transcription
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.Audio(type="filepath"),
     outputs="text",
     title="Uyghur Speech Recognition",
 )
-iface.launch()

 import torch
 import torchaudio
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
 import gradio as gr
+# Select the compute device; half precision is only used when CUDA is available.
 DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Sample rate (Hz) that audio is resampled to before feature extraction.
 WHISPER_SAMPLE_RATE = 16000
+# Load the processor from the same checkpoint id as the model so the two cannot drift apart.
 model_id = "ixxan/whisper-small-thugy20"
+processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
 )
 def preprocess_audio(audio_path: str) -> torch.Tensor:
+    """Load an audio file and return it as a mono waveform at WHISPER_SAMPLE_RATE.
+
+    Args:
+        audio_path: Path of the audio file, handed to ``torchaudio.load``.
+
+    Returns:
+        The waveform after resampling (only when the file's rate differs),
+        averaging over dim 0 (only when that dim is larger than 1) and a
+        final ``squeeze()``.
+
+    Raises:
+        RuntimeError: If loading or converting fails; the original exception
+            is attached as ``__cause__``.
+    """
+    try:
+        audio, sample_rate = torchaudio.load(audio_path)
+        # Resample if necessary
+        if sample_rate != WHISPER_SAMPLE_RATE:
+            resampler = torchaudio.transforms.Resample(
+                orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
+            )
+            audio = resampler(audio)
+        # Convert to mono: dim 0 is presumably the channel axis returned by
+        # torchaudio.load (its default layout) - averaged away when > 1.
+        if audio.shape[0] > 1:
+            audio = torch.mean(audio, dim=0)
+        return audio.squeeze()
+    except Exception as e:
+        # Chain explicitly so the underlying decoder/IO error is reported as
+        # the direct cause instead of an unrelated "during handling" error.
+        raise RuntimeError(f"Error processing audio file: {e}") from e
+def transcribe(audio_path):
+    """Transcribe the audio file at ``audio_path`` and return the text.
+
+    Args:
+        audio_path: Path to the audio file, or ``None`` when no audio was
+            supplied.
+
+    Returns:
+        The decoded transcription. When ``audio_path`` is ``None`` or any
+        step fails, a human-readable message is returned instead of raising.
+    """
+    # Guard clause kept outside the try: nothing here can raise.
+    if audio_path is None:
+        return "No audio provided. Please record or upload an audio file."
+    try:
+        audio_input = preprocess_audio(audio_path)
+        input_features = processor(
+            audio_input,
+            sampling_rate=WHISPER_SAMPLE_RATE,
+            return_tensors="pt",
+        ).input_features
+        # The model is loaded with torch_dtype (float16 on CUDA) while the
+        # extracted features are float32; cast as well as move, otherwise
+        # generate() fails on GPU with an input/weight dtype mismatch.
+        # On CPU torch_dtype is float32, so the cast is a no-op.
+        input_features = input_features.to(DEVICE, dtype=torch_dtype)
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        return transcription
+    except Exception as e:
+        # UI boundary: report the failure as the output text rather than
+        # letting the request crash.
+        return f"Error transcribing audio: {e}"
+# Build the Gradio UI: one audio input configured with type="filepath", plain-text output from transcribe().
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.Audio(type="filepath"),
     outputs="text",
     title="Uyghur Speech Recognition",
+    description="Upload or record audio in Uyghur to get its transcription.",
+    examples=[],  # Intentionally empty for now; add example audio files here if you have them
 )
+# Start the app only when this file is run as a script, not when it is imported.
+if __name__ == "__main__":
+    iface.launch()