Spaces:

sevda-tatlih
/

uyghur-speech-2-text

Sleeping

File size: 2,435 Bytes

3f95fab
 
 
1134893
 
e80497f
3f95fab
 
 
1134893
e80497f
3f95fab
 
 
 
 
 
 
 
 
 
 
 
 
e4cfdaf
3f95fab
 
 
e80497f
 
 
 
 
 
 
 
 
 
 
 
 
 
3f95fab
e80497f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f95fab
e80497f
3f95fab
 
 
 
 
e80497f
 
3f95fab
e80497f

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
import gradio as gr

# Runtime configuration: use the first CUDA GPU with half-precision weights
# when one is available, otherwise fall back to the CPU in full precision.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
    torch_dtype = torch.float16
else:
    DEVICE = "cpu"
    torch_dtype = torch.float32

# Sample rate (Hz) that audio is resampled to before feature extraction.
WHISPER_SAMPLE_RATE = 16000

# Load model and processor
# Both are loaded from the same identifier so the processor (tokenizer and
# feature extractor) cannot drift from the checkpoint the weights come from
# when model_id is changed.
model_id = "ixxan/whisper-small-thugy20"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(DEVICE)

# High-level speech-recognition pipeline assembled from the model and the
# processor's tokenizer / feature extractor loaded above, using the same
# dtype and device.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=DEVICE,
)

def preprocess_audio(audio_path: str) -> torch.Tensor:
    """Load an audio file as a mono waveform sampled at WHISPER_SAMPLE_RATE.

    Args:
        audio_path: Path of the audio file to read with ``torchaudio.load``.

    Returns:
        A 1-D tensor of samples: the channels are averaged together and the
        result is resampled to ``WHISPER_SAMPLE_RATE`` when the file uses a
        different sample rate.

    Raises:
        RuntimeError: If loading, downmixing or resampling fails. The
            original exception is attached as ``__cause__``.
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        # Downmix before resampling: averaging over the channel axis always
        # yields a 1-D tensor (a single channel keeps its values), and the
        # resampler then processes one channel instead of every channel.
        waveform = waveform.mean(dim=0)
        # Resample if necessary
        if sample_rate != WHISPER_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
            )
            waveform = resampler(waveform)
        return waveform
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise RuntimeError(f"Error processing audio file: {e}") from e

def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* and return the text.

    Args:
        audio_path: Path to an audio file, or ``None`` when no audio was
            supplied.

    Returns:
        The decoded transcription string. When ``audio_path`` is ``None`` a
        prompt asking for audio is returned instead, and when any step fails
        an ``"Error transcribing audio: ..."`` message is returned; errors
        are reported as text rather than raised.
    """
    if audio_path is None:
        return "No audio provided. Please record or upload an audio file."

    try:
        audio_input = preprocess_audio(audio_path)
        input_features = processor(
            audio_input,
            sampling_rate=WHISPER_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features
        # The model is loaded with torch_dtype (float16 on CUDA), while the
        # processor's features are float32 by default. Cast as well as move,
        # otherwise generation fails on GPU with a dtype mismatch.
        input_features = input_features.to(DEVICE, dtype=torch_dtype)
        # NOTE(review): Whisper feature extraction presumably pads/truncates
        # to a fixed-length window, so long recordings may be cut off - confirm.
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcription
    except Exception as e:
        # UI boundary: surface the failure as text in the output box.
        return f"Error transcribing audio: {e}"

# Build the Gradio UI: a single audio input whose value reaches the callback
# as a file path, and a plain-text output for the transcription.
audio_component = gr.Audio(type="filepath")
iface = gr.Interface(
    fn=transcribe,
    inputs=audio_component,
    outputs="text",
    title="Uyghur Speech Recognition",
    description="Upload or record audio in Uyghur to get its transcription.",
    # No bundled examples; list example audio file paths here to add some.
    examples=[],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()