"""Gradio demo app: Uyghur speech recognition with a fine-tuned Whisper model.

Importing this module loads the model and processor and builds the Gradio
interface; running it as a script launches the web app.
"""

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
import gradio as gr

# Setup device
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
WHISPER_SAMPLE_RATE = 16000

# Load model and processor (both from the same checkpoint id so they cannot
# drift apart).
model_id = "ixxan/whisper-small-thugy20"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(DEVICE)

# NOTE(review): `pipe` is not used by `transcribe` below; it is kept because
# it is a module-level name that other code may reference.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=DEVICE,
)


def preprocess_audio(audio_path: str) -> torch.Tensor:
    """Load an audio file and prepare its waveform for the model.

    The waveform is resampled to ``WHISPER_SAMPLE_RATE`` when the file uses a
    different rate, averaged across channels when it has more than one, and
    returned with singleton dimensions squeezed out.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The processed waveform tensor.

    Raises:
        RuntimeError: If loading or processing fails; the original exception
            is attached as ``__cause__``.
    """
    try:
        audio, sample_rate = torchaudio.load(audio_path)

        # Resample if necessary
        if sample_rate != WHISPER_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate,
                new_freq=WHISPER_SAMPLE_RATE,
            )
            audio = resampler(audio)

        # Convert to mono
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0)

        return audio.squeeze()
    except Exception as e:
        # Chain the cause so the underlying failure stays in the traceback.
        raise RuntimeError(f"Error processing audio file: {e}") from e


def transcribe(audio_path):
    """Transcribe the audio file at ``audio_path`` and return the text.

    This is the Gradio callback, so failures are reported as a message string
    rather than raised.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or ``None`` when
            no audio was provided.

    Returns:
        The transcription, or a human-readable message when no audio was
        given or transcription failed.
    """
    if audio_path is None:
        return "No audio provided. Please record or upload an audio file."

    try:
        audio_input = preprocess_audio(audio_path)

        # The feature extractor is documented to take NumPy arrays / lists,
        # so hand it a NumPy view of the waveform instead of a torch tensor.
        input_features = processor(
            audio_input.numpy(),
            sampling_rate=WHISPER_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features

        # The processor emits float32 features; cast to the model's dtype as
        # well as moving to its device, otherwise a float16 model on GPU
        # receives float32 input and fails with a dtype mismatch.
        input_features = input_features.to(DEVICE, dtype=torch_dtype)

        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]
        return transcription
    except Exception as e:
        # Top-level UI boundary: surface the error to the user as text.
        return f"Error transcribing audio: {e}"


# Create Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Uyghur Speech Recognition",
    description="Upload or record audio in Uyghur to get its transcription.",
    examples=[],  # You can add example audio files here if you have them
)

# Launch the app
if __name__ == "__main__":
    iface.launch()