Spaces:

sevda-tatlih
/

uyghur-speech-2-text

Running

sevda

Update app.py

e80497f verified 6 months ago

2.44 kB

	import torch
	import torchaudio
	from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
	import gradio as gr

	# Pick the compute device and the matching tensor precision together:
	# half precision on the first CUDA device when CUDA is available,
	# full precision on the CPU otherwise.
	DEVICE, torch_dtype = (
	    ("cuda:0", torch.float16)
	    if torch.cuda.is_available()
	    else ("cpu", torch.float32)
	)
	# Sample rate, in Hz, used for audio throughout this script.
	WHISPER_SAMPLE_RATE = 16000

	# Load the speech-to-text model and its processor. Both are fetched by the
	# same identifier, so it is written once and reused to keep them in sync.
	model_id = "ixxan/whisper-small-thugy20"
	processor = AutoProcessor.from_pretrained(model_id)
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	    model_id,
	    torch_dtype=torch_dtype,
	    low_cpu_mem_usage=True,
	    use_safetensors=True,
	)
	model.to(DEVICE)

	# Speech-recognition pipeline built from the model and processor above.
	# NOTE(review): nothing in the visible code calls `pipe`; confirm whether it
	# is still needed before keeping or removing it.
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model=model,
	    tokenizer=processor.tokenizer,
	    feature_extractor=processor.feature_extractor,
	    torch_dtype=torch_dtype,
	    device=DEVICE,
	)

	def preprocess_audio(audio_path: str) -> torch.Tensor:
	    """Load an audio file as a mono waveform at ``WHISPER_SAMPLE_RATE``.

	    Args:
	        audio_path: Path of the audio file, read with ``torchaudio.load``.

	    Returns:
	        The waveform, averaged over channels when the file has more than
	        one, resampled to ``WHISPER_SAMPLE_RATE`` when the file's rate
	        differs, and with singleton dimensions squeezed away.

	    Raises:
	        RuntimeError: If loading or converting the audio fails. The
	            underlying exception is attached as ``__cause__``.
	    """
	    try:
	        audio, sample_rate = torchaudio.load(audio_path)
	        # Downmix before resampling so only one channel has to be resampled.
	        if audio.shape[0] > 1:
	            audio = torch.mean(audio, dim=0)
	        if sample_rate != WHISPER_SAMPLE_RATE:
	            resampler = torchaudio.transforms.Resample(
	                orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
	            )
	            audio = resampler(audio)
	        return audio.squeeze()
	    except Exception as e:
	        # Chain explicitly so the original error is recorded as the cause.
	        raise RuntimeError(f"Error processing audio file: {e}") from e

	def transcribe(audio_path):
	    """Transcribe the audio file at ``audio_path`` and return the text.

	    Args:
	        audio_path: Path of the audio file, or ``None`` when no audio was
	            supplied.

	    Returns:
	        The decoded transcription on success; a prompt asking for audio
	        when ``audio_path`` is ``None``; or a message starting with
	        ``"Error transcribing audio: "`` when any step fails. Failures are
	        returned as text rather than raised.
	    """
	    if audio_path is None:
	        return "No audio provided. Please record or upload an audio file."

	    try:
	        audio_input = preprocess_audio(audio_path)
	        input_features = processor(
	            audio_input,
	            sampling_rate=WHISPER_SAMPLE_RATE,
	            return_tensors="pt",
	        ).input_features
	        # The model is loaded with `torch_dtype` (float16 when CUDA is
	        # available), so the features are cast to the same dtype as well as
	        # moved to the device; otherwise generation sees mismatched dtypes.
	        input_features = input_features.to(DEVICE, dtype=torch_dtype)
	        predicted_ids = model.generate(input_features)
	        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
	    except Exception as e:
	        # Report the failure as the returned text instead of raising.
	        return f"Error transcribing audio: {e}"

	# Build the web UI: one audio input, passed to `transcribe` as a file path,
	# with the returned string shown as text.
	iface = gr.Interface(
	    title="Uyghur Speech Recognition",
	    description="Upload or record audio in Uyghur to get its transcription.",
	    fn=transcribe,
	    inputs=gr.Audio(type="filepath"),
	    outputs="text",
	    examples=[],  # No example clips yet; add audio files here if available.
	)

	# Start the app only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    iface.launch()