Spaces:

AvtnshM
/

Indic_ASR

Sleeping

App Files Files Community

Indic_ASR / app.py

AvtnshM

V10.0

ef348e0 verified 13 days ago

raw

history blame contribute delete

17.3 kB

	import gradio as gr
	import torch
	import torchaudio
	from transformers import AutoModel
	import numpy as np
	import tempfile
	import os

	# Identifier of the pretrained checkpoint that load_model() passes to
	# AutoModel.from_pretrained().
	model_name = "ai4bharat/indic-conformer-600m-multilingual"

	def load_model():
	    """Load the AI4Bharat IndicConformer model, retrying once with defaults.

	    The first attempt requests float32 weights with low CPU memory usage.
	    If that raises, a second attempt is made with only
	    ``trust_remote_code=True``.

	    Returns:
	        The loaded model, or ``None`` when both attempts fail. Progress and
	        errors are reported with ``print``.
	    """
	    # Attempt 1: memory-friendly loading.
	    try:
	        print("Loading AI4Bharat IndicConformer model...")
	        print("This may take 2-3 minutes for first time download...")

	        primary_options = {
	            "trust_remote_code": True,
	            "torch_dtype": torch.float32,
	            "low_cpu_mem_usage": True,
	        }
	        loaded = AutoModel.from_pretrained(model_name, **primary_options)
	        print("Model loaded successfully!")
	        return loaded
	    except Exception as primary_error:
	        print(f"Error loading model: {primary_error}")
	        print("Trying alternative loading method...")

	    # Attempt 2: plain loading without the low-memory optimisation.
	    try:
	        loaded = AutoModel.from_pretrained(model_name, trust_remote_code=True)
	        print("Model loaded with fallback method!")
	        return loaded
	    except Exception as fallback_error:
	        print(f"All loading methods failed: {fallback_error}")
	        return None

	# Load the model once, at import time. load_model() returns None when every
	# loading attempt fails, so code using `model` must handle the None case.
	model = load_model()

	# Language options: maps the label shown in the language dropdown to the
	# language code that is handed to the model. Labels are "<native name>
	# (<English name>)". The Urdu label is written entirely in Devanagari; it
	# previously contained a stray Bengali letter RA (U+09B0) in place of the
	# Devanagari RA (U+0930).
	LANGUAGE_OPTIONS = {
	    "हिंदी (Hindi)": "hi",
	    "বাংলা (Bengali)": "bn",
	    "ગુજરાતી (Gujarati)": "gu",
	    "मराठी (Marathi)": "mr",
	    "ଓଡ଼ିଆ (Odia)": "or",
	    "ਪੰਜਾਬੀ (Punjabi)": "pa",
	    "தமிழ் (Tamil)": "ta",
	    "తెలుగు (Telugu)": "te",
	    "ಕನ್ನಡ (Kannada)": "kn",
	    "മലയാളം (Malayalam)": "ml",
	    "অসমীয়া (Assamese)": "as",
	    "उर्दू (Urdu)": "ur",
	    "नेपाली (Nepali)": "ne",
	    "संस्कृत (Sanskrit)": "sa",
	}

	def transcribe_audio(audio_file, language_choice, decoding_method):
	    """
	    Transcribe audio to text using the AI4Bharat IndicConformer model.

	    Args:
	        audio_file: Either a ``(sample_rate, audio_data)`` tuple (the form
	            produced by a Gradio audio component with ``type="numpy"``) or a
	            path that ``torchaudio.load`` can read. ``None`` yields a
	            "please provide audio" message.
	        language_choice: A key of ``LANGUAGE_OPTIONS``; unknown values fall
	            back to Hindi (``"hi"``).
	        decoding_method: ``"CTC"`` selects CTC decoding; any other value
	            selects RNNT decoding.

	    Returns:
	        A user-facing string: the transcription prefixed with the decoding
	        method, or an explanatory message when there is no model, no audio,
	        no recognised speech, or when processing raised an exception.
	    """
	    if model is None:
	        return "Error: Model not loaded properly. Please check the logs."

	    try:
	        # Handle different input types
	        if audio_file is None:
	            return "कृपया एक ऑडियो फ़ाइल प्रदान करें (Please provide an audio file)"

	        # Get language code
	        lang_code = LANGUAGE_OPTIONS.get(language_choice, "hi")  # Default to Hindi

	        if isinstance(audio_file, tuple):
	            # (sample_rate, audio_data) pair
	            sample_rate, audio_data = audio_file
	            if isinstance(audio_data, np.ndarray):
	                wav = torch.from_numpy(audio_data.astype(np.float32))
	            else:
	                wav = torch.tensor(audio_data, dtype=torch.float32)
	            # Gradio's numpy audio is laid out as (samples,) or
	            # (samples, channels), while the steps below expect
	            # (channels, samples). Without this reorientation the mono mix
	            # would average over time and destroy the signal. The size
	            # comparison leaves an already channel-first tensor untouched.
	            if wav.dim() == 2 and wav.size(0) > wav.size(1):
	                wav = wav.transpose(0, 1).contiguous()
	        else:
	            # File path: torchaudio returns a (channels, samples) tensor
	            wav, sample_rate = torchaudio.load(audio_file)

	        # Ensure audio is not empty
	        if wav.numel() == 0:
	            return "ऑडियो फ़ाइल खाली है (Audio file is empty)"

	        # Convert to mono, always ending with a leading channel dimension
	        if wav.dim() == 1:
	            wav = wav.unsqueeze(0)
	        elif wav.size(0) > 1:
	            wav = torch.mean(wav, dim=0, keepdim=True)

	        # Resample to 16kHz if needed
	        target_sample_rate = 16000
	        if sample_rate != target_sample_rate:
	            resampler = torchaudio.transforms.Resample(
	                orig_freq=sample_rate, new_freq=target_sample_rate
	            )
	            wav = resampler(wav)

	        # Peak-normalize; skipped for all-zero audio to avoid dividing by zero
	        peak = torch.max(torch.abs(wav))
	        if peak > 0:
	            wav = wav / peak

	        print(f"Processing audio with language: {lang_code}, method: {decoding_method}")
	        print(f"Audio shape: {wav.shape}, Sample rate: {target_sample_rate}")

	        # Perform ASR with selected decoding method (anything but "CTC" means RNNT)
	        decoder = "ctc" if decoding_method == "CTC" else "rnnt"
	        with torch.no_grad():
	            transcription = model(wav, lang_code, decoder)

	        # Clean up transcription: a list result is reduced to its first item
	        if isinstance(transcription, list):
	            transcription = transcription[0] if transcription else ""

	        transcription = str(transcription).strip()

	        if not transcription:
	            return "कोई भाषण नहीं मिला या ट्रांस्क्रिप्शन खाली है। कृपया अधिक स्पष्ट रूप से बोलने का प्रयास करें। (No speech detected or transcription is empty. Please try speaking more clearly.)"

	        return f"ट्रांस्क्रिप्शन ({decoding_method}): {transcription}"

	    except Exception as e:
	        # UI boundary: report the failure to the user instead of raising
	        error_msg = str(e)
	        print(f"Error during transcription: {error_msg}")
	        return f"ऑडियो प्रसंस्करण त्रुटि (Audio processing error): {error_msg}"

	def transcribe_microphone(audio, language_choice, decoding_method):
	    """Transcribe a microphone recording.

	    Returns a "no audio recorded" message when ``audio`` is ``None``;
	    otherwise forwards all three arguments to ``transcribe_audio`` and
	    returns its result.
	    """
	    if audio is not None:
	        return transcribe_audio(audio, language_choice, decoding_method)

	    nothing_recorded = "कोई ऑडियो रिकॉर्ड नहीं हुआ। कृपया माइक्रोफ़ोन पर क्लिक करें और बोलें। (No audio recorded. Please click the microphone and speak.)"
	    return nothing_recorded

	def transcribe_file(audio_file, language_choice, decoding_method):
	    """Transcribe an uploaded audio file.

	    Returns a "no file uploaded" message when ``audio_file`` is ``None``;
	    otherwise forwards all three arguments to ``transcribe_audio`` and
	    returns its result.
	    """
	    if audio_file is not None:
	        return transcribe_audio(audio_file, language_choice, decoding_method)

	    nothing_uploaded = "कोई फ़ाइल अपलोड नहीं हुई। कृपया एक ऑडियो फ़ाइल चुनें। (No file uploaded. Please select an audio file.)"
	    return nothing_uploaded

	def create_shareable_text(text, method="Voice"):
	    """Wrap a transcription result in a fixed, shareable text template.

	    Args:
	        text: Result text; the "ट्रांस्क्रिप्शन (CTC):" / "ट्रांस्क्रिप्शन (RNNT):"
	            labels are removed from it and the remainder is stripped.
	        method: Label written on the "Input Method" line.

	    Returns:
	        The filled-in template, or a "nothing to share" message when
	        ``text`` is empty, ``None`` or whitespace only.
	    """
	    if not text or not text.strip():
	        return "कोई टेक्स्ट शेयर करने के लिए उपलब्ध नहीं है।"

	    # Drop the result labels that transcribe_audio-style output carries
	    body = text
	    for label in ("ट्रांस्क्रिप्शन (CTC):", "ट्रांस्क्रिप्शन (RNNT):"):
	        body = body.replace(label, "")
	    body = body.strip()

	    # Fill in the shareable template
	    return f"""Hindi Speech-to-Text Result

	Input Method: {method}
	Transcription: {body}

	Generated by: AI4Bharat IndicConformer
	App: Hindi ASR - Hugging Face Spaces

	---
	Share this Hindi transcription with others!"""

	def share_microphone_result(text):
	    """Format ``text`` for sharing, labelled as a "Voice Recording" result."""
	    input_method = "Voice Recording"
	    return create_shareable_text(text, method=input_method)

	def share_file_result(text):
	    """Format ``text`` for sharing, labelled as a "File Upload" result."""
	    input_method = "File Upload"
	    return create_shareable_text(text, method=input_method)

	# Model status message, interpolated into the page header below.
	# `model` is None only when load_model() has already given up.
	model_status = "Model loaded successfully!" if model is not None else "Model failed to load"

	# Create Gradio interface: a header, shared language/decoder controls, a
	# microphone tab, a file-upload tab and a tips footer.
	# NOTE(review): the indentation of this block is missing in this copy of the
	# file, so the nesting of rows, columns and tabs cannot be read off the code;
	# restore it before running.
	with gr.Blocks(title="भारतीय भाषा स्पीच टू टेक्स्ट (Indic Speech to Text)", theme=gr.themes.Soft()) as demo:
	# Header text. NOTE(review): the "first time loading" hint is rendered when
	# `model is None`, i.e. after loading has already failed — confirm wording.
	gr.Markdown(
	f"""
	# भारतीय भाषा स्पीच टू टेक्स्ट कनवर्टर (Indic Speech to Text Converter)

	मॉडल स्थिति (Model Status): {model_status}

	{'पहली बार लोड हो रहा है - कृपया 2-3 मिनट प्रतीक्षा करें (First time loading - please wait 2-3 minutes)' if model is None else ''}

	AI4Bharat के बहुभाषी मॉडल का उपयोग करके भाषण को टेक्स्ट में बदलें।
	(Convert speech to text using AI4Bharat's multilingual model.)

	### समर्थित भाषाएं (Supported Languages):
	हिंदी, बंगाली, गुजराती, मराठी, ओडिया, पंजाबी, तमिल, तेलुगू, कन्नड़, मलयालम, असमिया, उर्दू, नेपाली, संस्कृत और अधिक।

	### उपयोग कैसे करें (How to use):
	1. भाषा चुनें (Select Language): अपनी भाषा चुनें
	2. डिकोडिंग विधि (Decoding Method): CTC (तेज़) या RNNT (अधिक सटीक) चुनें
	3. वॉइस इनपुट: माइक्रोफ़ोन बटन दबाएं और स्पष्ट रूप से बोलें
	4. फ़ाइल अपलोड: एक ऑडियो फ़ाइल अपलोड करें

	नोट: सर्वोत्तम परिणामों के लिए स्पष्ट ऑडियो का उपयोग करें।
	"""
	)

	# Language and method selection (shared across tabs): both event handlers
	# below read these two components as inputs.
	with gr.Row():
	language_dropdown = gr.Dropdown(
	choices=list(LANGUAGE_OPTIONS.keys()),
	value="हिंदी (Hindi)",
	label="भाषा चुनें (Select Language)",
	interactive=True
	)
	decoding_method = gr.Radio(
	choices=["CTC", "RNNT"],
	value="CTC",
	label="डिकोडिंग विधि (Decoding Method)",
	info="CTC: तेज़ (Fast), RNNT: अधिक सटीक (More Accurate)"
	)

	# Tab 1: microphone recording.
	with gr.Tab("वॉइस इनपुट (Voice Input)"):
	gr.Markdown("### अपनी आवाज़ रिकॉर्ड करें और तत्काल ट्रांस्क्रिप्शन प्राप्त करें")

	with gr.Row():
	with gr.Column(scale=1):
	# type="numpy": the handler receives the recording as in-memory data
	# rather than a file path.
	microphone_input = gr.Audio(
	sources=["microphone"],
	type="numpy",
	label="रिकॉर्ड करने के लिए क्लिक करें",
	show_download_button=False,
	interactive=True,
	streaming=False,
	autoplay=False,
	show_label=True,
	container=True,
	scale=None,
	min_width=160
	)

	with gr.Row():
	mic_submit_btn = gr.Button("ट्रांसक्राइब करें", variant="primary", size="lg")
	clear_mic_btn = gr.Button("साफ़ करें", variant="secondary")
	share_mic_btn = gr.Button("शेयर करें", variant="secondary")

	with gr.Column(scale=1):
	mic_output = gr.Textbox(
	label="ट्रांस्क्रिप्शन परिणाम (Transcription Result)",
	placeholder="रिकॉर्डिंग के बाद आपका ट्रांस्क्रिप्शन यहाँ दिखाई देगा...",
	lines=8,
	max_lines=15,
	interactive=True
	)

	# Button actions for microphone tab
	mic_submit_btn.click(
	fn=transcribe_microphone,
	inputs=[microphone_input, language_dropdown, decoding_method],
	outputs=mic_output
	)

	# Auto-transcribe when audio is recorded (same handler as the button)
	microphone_input.stop_recording(
	fn=transcribe_microphone,
	inputs=[microphone_input, language_dropdown, decoding_method],
	outputs=mic_output
	)

	# Clear: reset the audio widget to None and the result box to "".
	clear_mic_btn.click(
	lambda: (None, ""),
	outputs=[microphone_input, mic_output]
	)

	# Share functionality for microphone tab: the result box is both input and
	# output, so its content is replaced by the formatted share text.
	share_mic_btn.click(
	fn=share_microphone_result,
	inputs=mic_output,
	outputs=mic_output
	)

	# Tab 2: audio file upload.
	with gr.Tab("फ़ाइल अपलोड (File Upload)"):
	gr.Markdown("### ट्रांस्क्रिप्शन के लिए एक ऑडियो फ़ाइल अपलोड करें")

	with gr.Row():
	with gr.Column(scale=1):
	# type="filepath": the handler receives a path to the uploaded file.
	file_input = gr.Audio(
	sources=["upload"],
	type="filepath",
	label="ऑडियो फ़ाइल अपलोड करें",
	show_download_button=False,
	interactive=True
	)

	with gr.Row():
	file_submit_btn = gr.Button("फ़ाइल ट्रांसक्राइब करें", variant="primary", size="lg")
	clear_file_btn = gr.Button("साफ़ करें", variant="secondary")
	share_file_btn = gr.Button("शेयर करें", variant="secondary")

	with gr.Column(scale=1):
	file_output = gr.Textbox(
	label="ट्रांस्क्रिप्शन परिणाम (Transcription Result)",
	placeholder="एक ऑडियो फ़ाइल अपलोड करें और ट्रांसक्राइब पर क्लिक करें...",
	lines=8,
	max_lines=15,
	interactive=True
	)

	# Button actions for file tab
	file_submit_btn.click(
	fn=transcribe_file,
	inputs=[file_input, language_dropdown, decoding_method],
	outputs=file_output
	)

	# Clear: reset the upload widget to None and the result box to "".
	clear_file_btn.click(
	lambda: (None, ""),
	outputs=[file_input, file_output]
	)

	# Share functionality for file tab: the result box is both input and
	# output, so its content is replaced by the formatted share text.
	share_file_btn.click(
	fn=share_file_result,
	inputs=file_output,
	outputs=file_output
	)

	# Static footer: usage tips and model/deployment information.
	gr.Markdown(
	"""
	---
	### बेहतर ट्रांस्क्रिप्शन के लिए टिप्स (Tips for better transcription):

	वॉइस रिकॉर्डिंग के लिए (For Voice Recording):
	- स्पष्ट और मध्यम गति से बोलें (Speak clearly and at moderate pace)
	- डिवाइस को अपने मुंह के पास रखें (15-30 सेमी) (Keep device close to mouth)
	- शांत वातावरण में रिकॉर्ड करें (Record in quiet environment)
	- बोलने से पहले और बाद में 2-3 सेकंड की चुप्पी रखें (Keep 2-3 seconds silence before/after speaking)

	फ़ाइल अपलोड के लिए (For File Upload):
	- समर्थित प्रारूप (Supported formats): WAV, MP3, M4A, FLAC, OGG
	- अनुशंसित (Recommended): 16kHz sample rate, mono channel
	- फ़ाइल आकार सीमा (File size limit): आमतौर पर 10-50MB
	- ऑडियो लंबाई (Audio length): 1-60 सेकंड के साथ सर्वोत्तम परिणाम

	भाषा विकल्प (Language Options):
	- सटीकता के लिए सही भाषा चुनें (Select correct language for accuracy)
	- मिश्रित भाषा के लिए हिंदी का उपयोग करें (Use Hindi for mixed languages)

	डिकोडिंग विधियां (Decoding Methods):
	- CTC: तेज़ प्रसंस्करण, वास्तविक समय के लिए अच्छा (Fast processing, good for real-time)
	- RNNT: धीमा लेकिन अधिक सटीक (Slower but more accurate)

	---
	मॉडल (Model): AI4Bharat IndicConformer-600M-Multi
	फ्रेमवर्क (Framework): Transformers + Gradio
	परिनियोजन (Deployment): Hugging Face Spaces (CPU)
	समर्थित भाषाएं: भारत की 22 आधिकारिक भाषाएं
	"""
	)

	# Launch configuration: start the server only when run as a script.
	if __name__ == "__main__":
	    launch_options = {
	        "server_name": "0.0.0.0",  # listen on all network interfaces
	        "server_port": 7860,
	        "show_error": True,
	        "share": False,
	    }
	    demo.launch(**launch_options)