Spaces:

AvtnshM
/

Indic_ASR

Sleeping

File size: 17,311 Bytes

5ac41d5
 
 
fdd6a0c
5ac41d5
 
 
 
fdd6a0c
5ac41d5
 
a2a9fb8
fdd6a0c
a2a9fb8
fdd6a0c
96f2af5
 
 
 
 
 
 
 
 
ef348e0
fdd6a0c
a2a9fb8
ef348e0
96f2af5
 
 
 
 
ef348e0
96f2af5
 
ef348e0
96f2af5
a2a9fb8
 
fdd6a0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef348e0
fdd6a0c
 
 
5ac41d5
fdd6a0c
5ac41d5
fdd6a0c
5ac41d5
fdd6a0c
ef348e0
5ac41d5
 
 
 
ef348e0
fdd6a0c
 
 
5ac41d5
 
 
 
fdd6a0c
5ac41d5
fdd6a0c
5ac41d5
fdd6a0c
5ac41d5
 
fdd6a0c
5ac41d5
a2a9fb8
fdd6a0c
ef348e0
a2a9fb8
fdd6a0c
 
 
 
 
5ac41d5
 
fdd6a0c
 
 
 
5ac41d5
 
fdd6a0c
 
5ac41d5
fdd6a0c
 
5ac41d5
fdd6a0c
5ac41d5
fdd6a0c
 
 
 
5ac41d5
a2a9fb8
fdd6a0c
 
 
 
a2a9fb8
 
ef348e0
a2a9fb8
ef348e0
5ac41d5
 
fdd6a0c
 
ef348e0
5ac41d5
fdd6a0c
 
a2a9fb8
ef348e0
fdd6a0c
5ac41d5
ef348e0
 
 
 
 
 
1602163
 
 
ef348e0
1602163
 
ef348e0
1602163
 
 
ef348e0
1602163
ef348e0
 
1602163
ef348e0
 
1602163
 
 
 
 
5ac41d5
ef348e0
 
 
 
 
 
 
 
a2a9fb8
ef348e0
a2a9fb8
5ac41d5
fdd6a0c
5ac41d5
a2a9fb8
ef348e0
5ac41d5
fdd6a0c
a2a9fb8
ef348e0
96f2af5
fdd6a0c
 
5ac41d5
fdd6a0c
 
5ac41d5
fdd6a0c
 
 
 
 
5ac41d5
fdd6a0c
5ac41d5
 
 
fdd6a0c
 
 
 
 
ef348e0
fdd6a0c
 
 
 
 
ef348e0
fdd6a0c
 
 
ef348e0
fdd6a0c
a2a9fb8
5ac41d5
a2a9fb8
5ac41d5
 
 
ef348e0
a2a9fb8
ffdd72d
 
 
 
 
 
 
5ac41d5
a2a9fb8
 
ef348e0
 
 
5ac41d5
a2a9fb8
5ac41d5
ef348e0
fdd6a0c
a2a9fb8
 
ef348e0
5ac41d5
 
a2a9fb8
5ac41d5
 
fdd6a0c
5ac41d5
 
a2a9fb8
ffdd72d
 
 
 
 
 
 
a2a9fb8
 
 
 
96f2af5
1602163
 
 
96f2af5
 
 
5ac41d5
ef348e0
fdd6a0c
a2a9fb8
5ac41d5
a2a9fb8
5ac41d5
 
 
ef348e0
a2a9fb8
 
5ac41d5
a2a9fb8
 
ef348e0
 
 
5ac41d5
a2a9fb8
5ac41d5
ef348e0
fdd6a0c
a2a9fb8
 
ef348e0
5ac41d5
 
a2a9fb8
5ac41d5
 
fdd6a0c
5ac41d5
 
a2a9fb8
 
 
 
 
96f2af5
1602163
 
 
96f2af5
 
 
5ac41d5
 
ef348e0
5ac41d5
ef348e0
fdd6a0c
 
 
 
 
 
a2a9fb8
fdd6a0c
 
 
 
 
5ac41d5
fdd6a0c
 
 
a2a9fb8
fdd6a0c
 
 
a2a9fb8
 
fdd6a0c
ef348e0
fdd6a0c
 
5ac41d5
 
 
a2a9fb8
5ac41d5
 
 
 
a2a9fb8
fdd6a0c
5ac41d5

import gradio as gr
import torch
import torchaudio
from transformers import AutoModel
import numpy as np
import tempfile
import os

# Hugging Face Hub identifier of the checkpoint that load_model() passes to
# AutoModel.from_pretrained.
model_name = "ai4bharat/indic-conformer-600m-multilingual"

def load_model():
    """Load the IndicConformer checkpoint, retrying once with default options.

    The first attempt asks for float32 weights with ``low_cpu_mem_usage``;
    if that raises, a second attempt is made with only
    ``trust_remote_code=True``. Progress and errors are printed.

    Returns:
        The object returned by ``AutoModel.from_pretrained``, or ``None``
        when both attempts fail.
    """
    print("Loading AI4Bharat IndicConformer model...")
    print("This may take 2-3 minutes for first time download...")

    try:
        # Preferred path: explicit dtype plus the low-memory loading option.
        primary = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
    except Exception as primary_error:
        print(f"Error loading model: {primary_error}")
        print("Trying alternative loading method...")
    else:
        print("Model loaded successfully!")
        return primary

    try:
        # Fallback path: same checkpoint without the extra loading options.
        fallback = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    except Exception as fallback_error:
        print(f"All loading methods failed: {fallback_error}")
        return None

    print("Model loaded with fallback method!")
    return fallback

# Load the model once at import time; the request handlers below read this
# module-level name. It is None when load_model() could not load the checkpoint.
model = load_model()

# Language options: maps the label shown in the language dropdown to the
# language code handed to the model. Insertion order is the dropdown order.
# The Urdu label is written entirely in Devanagari (it previously contained a
# stray Bengali letter, which rendered as a mixed-script word).
LANGUAGE_OPTIONS = {
    "हिंदी (Hindi)": "hi",
    "বাংলা (Bengali)": "bn",
    "ગુજરાતી (Gujarati)": "gu",
    "मराठी (Marathi)": "mr",
    "ଓଡ଼ିଆ (Odia)": "or",
    "ਪੰਜਾਬੀ (Punjabi)": "pa",
    "தமிழ் (Tamil)": "ta",
    "తెలుగు (Telugu)": "te",
    "ಕನ್ನಡ (Kannada)": "kn",
    "മലയാളം (Malayalam)": "ml",
    "অসমীয়া (Assamese)": "as",
    "उर्दू (Urdu)": "ur",
    "नेपाली (Nepali)": "ne",
    "संस्कृत (Sanskrit)": "sa",
}

def transcribe_audio(audio_file, language_choice, decoding_method):
    """
    Transcribe audio to text using the AI4Bharat IndicConformer model.

    Args:
        audio_file: Either a ``(sample_rate, samples)`` tuple (the form the
            ``type="numpy"`` audio component passes in) or a value accepted
            by ``torchaudio.load`` (the ``type="filepath"`` component passes
            a path). ``None`` yields a prompt message.
        language_choice: Label looked up in ``LANGUAGE_OPTIONS``; an unknown
            label falls back to Hindi (``"hi"``).
        decoding_method: ``"CTC"`` selects CTC decoding; any other value
            selects RNNT.

    Returns:
        str: The formatted transcription, or a user-facing message when the
        model is not loaded, the input is missing or empty, nothing was
        recognised, or processing raised. Exceptions raised during
        processing are caught and reported in the returned string.
    """
    if model is None:
        return "Error: Model not loaded properly. Please check the logs."

    try:
        if audio_file is None:
            return "कृपया एक ऑडियो फ़ाइल प्रदान करें (Please provide an audio file)"

        # Unknown labels default to Hindi.
        lang_code = LANGUAGE_OPTIONS.get(language_choice, "hi")

        if isinstance(audio_file, tuple):
            # In-memory recording: (sample_rate, samples).
            sample_rate, audio_data = audio_file
            if isinstance(audio_data, np.ndarray):
                wav = torch.from_numpy(audio_data.astype(np.float32))
            else:
                wav = torch.tensor(audio_data, dtype=torch.float32)
            # Gradio delivers multi-channel audio channel-last, i.e.
            # (samples, channels), while the rest of this function works
            # channel-first like torchaudio.load. Without this transpose the
            # mono down-mix below would average over time instead of over
            # channels. The size comparison leaves an input that is already
            # channel-first untouched.
            if wav.dim() == 2 and wav.size(0) > wav.size(1):
                wav = wav.transpose(0, 1).contiguous()
        else:
            # Path on disk: torchaudio returns (channels, samples).
            wav, sample_rate = torchaudio.load(audio_file)

        if wav.numel() == 0:
            return "ऑडियो फ़ाइल खाली है (Audio file is empty)"

        # Down-mix to a single channel, or add the channel axis for 1-D input.
        if wav.dim() > 1 and wav.size(0) > 1:
            wav = torch.mean(wav, dim=0, keepdim=True)
        elif wav.dim() == 1:
            wav = wav.unsqueeze(0)

        # Resample to 16 kHz if needed.
        target_sample_rate = 16000
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=target_sample_rate
            )
            wav = resampler(wav)

        # Peak-normalise; silent input (peak 0) is left as is to avoid
        # dividing by zero.
        peak = torch.max(torch.abs(wav))
        if peak > 0:
            wav = wav / peak

        print(f"Processing audio with language: {lang_code}, method: {decoding_method}")
        print(f"Audio shape: {wav.shape}, Sample rate: {target_sample_rate}")

        # The model is called with the waveform, the language code and the
        # lower-case decoder name.
        decoder = "ctc" if decoding_method == "CTC" else "rnnt"
        with torch.no_grad():
            transcription = model(wav, lang_code, decoder)

        # A list result is reduced to its first element (or "" when empty).
        if isinstance(transcription, list):
            transcription = transcription[0] if transcription else ""

        transcription = str(transcription).strip()

        if not transcription:
            return "कोई भाषण नहीं मिला या ट्रांस्क्रिप्शन खाली है। कृपया अधिक स्पष्ट रूप से बोलने का प्रयास करें। (No speech detected or transcription is empty. Please try speaking more clearly.)"

        return f"**ट्रांस्क्रिप्शन ({decoding_method}):** {transcription}"

    except Exception as e:
        # Top-level handler boundary: report the failure to the user instead
        # of letting the exception escape.
        error_msg = str(e)
        print(f"Error during transcription: {error_msg}")
        return f"ऑडियो प्रसंस्करण त्रुटि (Audio processing error): {error_msg}"

def transcribe_microphone(audio, language_choice, decoding_method):
    """Handle the microphone tab: transcribe a recording, or prompt if none.

    Returns the result of ``transcribe_audio`` for a recording, and a
    "no audio recorded" message when ``audio`` is ``None``.
    """
    if audio is not None:
        return transcribe_audio(audio, language_choice, decoding_method)
    return "कोई ऑडियो रिकॉर्ड नहीं हुआ। कृपया माइक्रोफ़ोन पर क्लिक करें और बोलें। (No audio recorded. Please click the microphone and speak.)"

def transcribe_file(audio_file, language_choice, decoding_method):
    """Handle the upload tab: transcribe the uploaded file, or prompt if none.

    Returns the result of ``transcribe_audio`` for an upload, and a
    "no file uploaded" message when ``audio_file`` is ``None``.
    """
    if audio_file is not None:
        return transcribe_audio(audio_file, language_choice, decoding_method)
    return "कोई फ़ाइल अपलोड नहीं हुई। कृपया एक ऑडियो फ़ाइल चुनें। (No file uploaded. Please select an audio file.)"

def create_shareable_text(text, method="Voice"):
    """Wrap a transcription result in a plain-text block for sharing.

    Args:
        text: Contents of the result textbox. The markdown prefix that
            ``transcribe_audio`` adds for CTC and RNNT results is removed.
        method: Label written on the "Input Method" line.

    Returns:
        str: The share block, or a "nothing to share" message when ``text``
        is empty or whitespace-only. Text that is already a share block is
        returned unchanged (stripped), so pressing the share button
        repeatedly does not nest one block inside another.
    """
    if not text or not text.strip():
        return "कोई टेक्स्ट शेयर करने के लिए उपलब्ध नहीं है।"

    # NOTE(review): the wording says "Hindi" although the app offers several
    # languages; left unchanged here because it is user-facing text.
    header = "Hindi Speech-to-Text Result"
    clean_text = text.strip()

    # The share buttons write the result back into the textbox they read
    # from, so a second click passes an already formatted block in.
    if clean_text.startswith(header):
        return clean_text

    # Remove the markdown result prefix for either decoding method.
    for decoder in ("CTC", "RNNT"):
        clean_text = clean_text.replace(f"**ट्रांस्क्रिप्शन ({decoder}):**", "")
    clean_text = clean_text.strip()

    return "\n".join(
        [
            header,
            "",
            f"Input Method: {method}",
            f"Transcription: {clean_text}",
            "",
            "Generated by: AI4Bharat IndicConformer",
            "App: Hindi ASR - Hugging Face Spaces",
            "",
            "---",
            "Share this Hindi transcription with others!",
        ]
    )

def share_microphone_result(text):
    """Build the share block for the microphone tab ("Voice Recording")."""
    return create_shareable_text(text, method="Voice Recording")

def share_file_result(text):
    """Build the share block for the upload tab ("File Upload")."""
    return create_shareable_text(text, method="File Upload")

# Status line shown in the page header; reflects whether `model` was loaded.
model_status = "Model failed to load" if model is None else "Model loaded successfully!"

# Create Gradio interface
# Layout: intro markdown, a shared row with the language and decoding
# controls, one tab for microphone input, one tab for file upload, and a
# closing tips/markdown section. The Blocks object is bound to `demo`.
with gr.Blocks(title="भारतीय भाषा स्पीच टू टेक्स्ट (Indic Speech to Text)", theme=gr.themes.Soft()) as demo:
    # Intro text. The f-string embeds `model_status` and, only when `model`
    # is None, an extra "please wait" line.
    # NOTE(review): `model` is assigned from load_model() before this point,
    # so None here means loading already failed; confirm the "first time
    # loading" wording is intended.
    gr.Markdown(
        f"""
        # भारतीय भाषा स्पीच टू टेक्स्ट कनवर्टर (Indic Speech to Text Converter)
        
        **मॉडल स्थिति (Model Status):** {model_status}
        
        {'पहली बार लोड हो रहा है - कृपया 2-3 मिनट प्रतीक्षा करें (First time loading - please wait 2-3 minutes)' if model is None else ''}
        
        AI4Bharat के बहुभाषी मॉडल का उपयोग करके भाषण को टेक्स्ट में बदलें।
        (Convert speech to text using AI4Bharat's multilingual model.)
        
        ### समर्थित भाषाएं (Supported Languages):
        हिंदी, बंगाली, गुजराती, मराठी, ओडिया, पंजाबी, तमिल, तेलुगू, कन्नड़, मलयालम, असमिया, उर्दू, नेपाली, संस्कृत और अधिक।
        
        ### उपयोग कैसे करें (How to use):
        1. **भाषा चुनें (Select Language)**: अपनी भाषा चुनें
        2. **डिकोडिंग विधि (Decoding Method)**: CTC (तेज़) या RNNT (अधिक सटीक) चुनें
        3. **वॉइस इनपुट**: माइक्रोफ़ोन बटन दबाएं और स्पष्ट रूप से बोलें
        4. **फ़ाइल अपलोड**: एक ऑडियो फ़ाइल अपलोड करें
        
        **नोट**: सर्वोत्तम परिणामों के लिए स्पष्ट ऑडियो का उपयोग करें।
        """
    )
    
    # Language and method selection (shared across tabs): both controls are
    # passed as inputs to the transcription handlers of each tab below.
    with gr.Row():
        # Choices are the keys of LANGUAGE_OPTIONS; the default must be one
        # of those keys.
        language_dropdown = gr.Dropdown(
            choices=list(LANGUAGE_OPTIONS.keys()),
            value="हिंदी (Hindi)",
            label="भाषा चुनें (Select Language)",
            interactive=True
        )
        # The handlers compare this value against "CTC"; anything else is
        # treated as RNNT by transcribe_audio.
        decoding_method = gr.Radio(
            choices=["CTC", "RNNT"], 
            value="CTC", 
            label="डिकोडिंग विधि (Decoding Method)",
            info="CTC: तेज़ (Fast), RNNT: अधिक सटीक (More Accurate)"
        )
    
    # ---- Tab 1: microphone recording ----
    with gr.Tab("वॉइस इनपुट (Voice Input)"):
        gr.Markdown("### अपनी आवाज़ रिकॉर्ड करें और तत्काल ट्रांस्क्रिप्शन प्राप्त करें")
        
        with gr.Row():
            with gr.Column(scale=1):
                # type="numpy": the handler receives the recording in memory
                # rather than as a file path.
                microphone_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="रिकॉर्ड करने के लिए क्लिक करें",
                    show_download_button=False,
                    interactive=True,
                    streaming=False,
                    autoplay=False,
                    show_label=True,
                    container=True,
                    scale=None,
                    min_width=160
                )
                
                # Transcribe / clear / share buttons.
                with gr.Row():
                    mic_submit_btn = gr.Button("ट्रांसक्राइब करें", variant="primary", size="lg")
                    clear_mic_btn = gr.Button("साफ़ करें", variant="secondary")
                    share_mic_btn = gr.Button("शेयर करें", variant="secondary")
            
            with gr.Column(scale=1):
                # Result box; it is both the output of transcription and the
                # input/output of the share action.
                mic_output = gr.Textbox(
                    label="ट्रांस्क्रिप्शन परिणाम (Transcription Result)",
                    placeholder="रिकॉर्डिंग के बाद आपका ट्रांस्क्रिप्शन यहाँ दिखाई देगा...",
                    lines=8,
                    max_lines=15,
                    interactive=True
                )
        
        # Button actions for microphone tab
        mic_submit_btn.click(
            fn=transcribe_microphone,
            inputs=[microphone_input, language_dropdown, decoding_method],
            outputs=mic_output
        )
        
        # Auto-transcribe when audio is recorded (same handler and wiring as
        # the transcribe button).
        microphone_input.stop_recording(
            fn=transcribe_microphone,
            inputs=[microphone_input, language_dropdown, decoding_method],
            outputs=mic_output
        )
        
        # Clear: reset the audio component to None and the textbox to "".
        clear_mic_btn.click(
            lambda: (None, ""),
            outputs=[microphone_input, mic_output]
        )
        
        # Share functionality for microphone tab: the textbox content is
        # replaced by the share block built from it.
        share_mic_btn.click(
            fn=share_microphone_result,
            inputs=mic_output,
            outputs=mic_output
        )
    
    # ---- Tab 2: audio file upload ----
    with gr.Tab("फ़ाइल अपलोड (File Upload)"):
        gr.Markdown("### ट्रांस्क्रिप्शन के लिए एक ऑडियो फ़ाइल अपलोड करें")
        
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath": the handler receives a path to the upload.
                file_input = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="ऑडियो फ़ाइल अपलोड करें",
                    show_download_button=False,
                    interactive=True
                )
                
                # Transcribe / clear / share buttons.
                with gr.Row():
                    file_submit_btn = gr.Button("फ़ाइल ट्रांसक्राइब करें", variant="primary", size="lg")
                    clear_file_btn = gr.Button("साफ़ करें", variant="secondary")
                    share_file_btn = gr.Button("शेयर करें", variant="secondary")
            
            with gr.Column(scale=1):
                # Result box; also the input/output of the share action.
                file_output = gr.Textbox(
                    label="ट्रांस्क्रिप्शन परिणाम (Transcription Result)",
                    placeholder="एक ऑडियो फ़ाइल अपलोड करें और ट्रांसक्राइब पर क्लिक करें...",
                    lines=8,
                    max_lines=15,
                    interactive=True
                )
        
        # Button actions for file tab (no auto-transcribe here; only the
        # button triggers transcription).
        file_submit_btn.click(
            fn=transcribe_file,
            inputs=[file_input, language_dropdown, decoding_method],
            outputs=file_output
        )
        
        # Clear: reset the audio component to None and the textbox to "".
        clear_file_btn.click(
            lambda: (None, ""),
            outputs=[file_input, file_output]
        )
        
        # Share functionality for file tab: the textbox content is replaced
        # by the share block built from it.
        share_file_btn.click(
            fn=share_file_result,
            inputs=file_output,
            outputs=file_output
        )
    
    # Static footer: usage tips and model/deployment information.
    gr.Markdown(
        """
        ---
        ### बेहतर ट्रांस्क्रिप्शन के लिए टिप्स (Tips for better transcription):
        
        **वॉइस रिकॉर्डिंग के लिए (For Voice Recording):**
        - स्पष्ट और मध्यम गति से बोलें (Speak clearly and at moderate pace)
        - डिवाइस को अपने मुंह के पास रखें (15-30 सेमी) (Keep device close to mouth)
        - शांत वातावरण में रिकॉर्ड करें (Record in quiet environment)
        - बोलने से पहले और बाद में 2-3 सेकंड की चुप्पी रखें (Keep 2-3 seconds silence before/after speaking)
        
        **फ़ाइल अपलोड के लिए (For File Upload):**
        - समर्थित प्रारूप (Supported formats): WAV, MP3, M4A, FLAC, OGG
        - अनुशंसित (Recommended): 16kHz sample rate, mono channel
        - फ़ाइल आकार सीमा (File size limit): आमतौर पर 10-50MB
        - ऑडियो लंबाई (Audio length): 1-60 सेकंड के साथ सर्वोत्तम परिणाम
        
        **भाषा विकल्प (Language Options):**
        - सटीकता के लिए सही भाषा चुनें (Select correct language for accuracy)
        - मिश्रित भाषा के लिए हिंदी का उपयोग करें (Use Hindi for mixed languages)
        
        **डिकोडिंग विधियां (Decoding Methods):**
        - **CTC**: तेज़ प्रसंस्करण, वास्तविक समय के लिए अच्छा (Fast processing, good for real-time)
        - **RNNT**: धीमा लेकिन अधिक सटीक (Slower but more accurate)
        
        ---
        **मॉडल (Model)**: AI4Bharat IndicConformer-600M-Multi  
        **फ्रेमवर्क (Framework)**: Transformers + Gradio  
        **परिनियोजन (Deployment)**: Hugging Face Spaces (CPU)  
        **समर्थित भाषाएं**: भारत की 22 आधिकारिक भाषाएं
        """
    )

# Start the web server only when this file is executed as a script: listen on
# all interfaces at port 7860, surface handler errors in the UI, and do not
# request a public share link.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "share": False,
    }
    demo.launch(**launch_options)