AvtnshM committed
Commit 40d87de · verified · 1 Parent(s): c41cc32

Upload 2 files

Files changed (2):
  1. app.py +208 -0
  2. requirements.txt +23 -0
app.py ADDED
@@ -0,0 +1,208 @@
+ import gradio as gr
+ import torch
+ import librosa
+ import numpy as np
+ from transformers import pipeline
+ import gc
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ class OptimizedShukaASR:
+     def __init__(self):
+         self.pipe = None
+         self.load_model()
+
+     def load_model(self):
+         """Load model with optimizations for CPU inference"""
+         try:
+             # Force CPU usage and optimize for inference.
+             # float32 is used here: fp16 kernels are poorly supported on CPU
+             # and can fail with "not implemented for 'Half'" errors.
+             self.pipe = pipeline(
+                 model='sarvamai/shuka_v1',
+                 trust_remote_code=True,
+                 device=-1,  # Force CPU
+                 torch_dtype=torch.float32,
+                 model_kwargs={
+                     # Note: torch_dtype is passed once above; repeating it in
+                     # model_kwargs conflicts with the pipeline kwarg.
+                     "low_cpu_mem_usage": True,
+                     "use_cache": True,
+                 }
+             )
+
+             # Set to eval mode
+             if hasattr(self.pipe.model, 'eval'):
+                 self.pipe.model.eval()
+
+             # Compile for faster inference (PyTorch 2.0+)
+             try:
+                 self.pipe.model = torch.compile(self.pipe.model, mode="reduce-overhead")
+             except Exception:
+                 pass  # Skip if torch.compile is not available
+
+             print("Model loaded successfully with optimizations")
+
+         except Exception as e:
+             print(f"Error loading model: {e}")
+             self.pipe = None
+
+     def preprocess_audio(self, audio_input, target_sr=16000, max_duration=30):
+         """Preprocess audio: mono, normalized, length-limited, resampled"""
+         try:
+             if isinstance(audio_input, tuple):
+                 sr, audio_data = audio_input
+                 audio_data = audio_data.astype(np.float32)
+                 if len(audio_data.shape) > 1:
+                     audio_data = audio_data.mean(axis=1)  # Convert to mono
+                 peak = np.max(np.abs(audio_data))
+                 if peak > 0:  # Guard against division by zero on silent clips
+                     audio_data = audio_data / peak  # Normalize
+             else:
+                 audio_data, sr = librosa.load(audio_input, sr=target_sr)
+
+             # Limit audio duration to reduce processing time
+             max_samples = int(max_duration * target_sr)
+             if len(audio_data) > max_samples:
+                 audio_data = audio_data[:max_samples]
+                 print(f"Audio truncated to {max_duration} seconds")
+
+             # Resample if needed (only the tuple path can still be off-rate)
+             if sr != target_sr:
+                 audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=target_sr)
+
+             return audio_data, target_sr
+
+         except Exception as e:
+             raise Exception(f"Audio preprocessing failed: {e}")
+
+     def transcribe(self, audio_input, language="auto"):
+         """Transcribe audio to text"""
+         if self.pipe is None:
+             return "Model not loaded. Please check the setup."
+
+         try:
+             # Preprocess audio
+             audio, sr = self.preprocess_audio(audio_input)
+
+             # Prepare system prompt for ASR only
+             if language == "auto":
+                 system_prompt = "Transcribe the following audio accurately. Only provide the transcription, nothing else."
+             else:
+                 system_prompt = f"Transcribe the following audio in {language}. Only provide the transcription, nothing else."
+
+             turns = [
+                 {'role': 'system', 'content': system_prompt},
+                 {'role': 'user', 'content': '<|audio|>'}
+             ]
+
+             # Run inference with memory optimization
+             with torch.no_grad():
+                 result = self.pipe(
+                     {
+                         'audio': audio,
+                         'turns': turns,
+                         'sampling_rate': sr
+                     },
+                     max_new_tokens=256,  # ASR needs far fewer tokens than chat
+                     do_sample=False,  # Deterministic output; a temperature would be ignored with sampling off
+                     pad_token_id=self.pipe.tokenizer.eos_token_id
+                 )
+
+             # Clean up memory
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             gc.collect()
+
+             # Extract transcription
+             if isinstance(result, list) and len(result) > 0:
+                 transcription = result[0].get('generated_text', '').strip()
+             elif isinstance(result, dict):
+                 transcription = result.get('generated_text', '').strip()
+             else:
+                 transcription = str(result).strip()
+
+             return transcription
+
+         except Exception as e:
+             return f"Transcription failed: {str(e)}"
+
+ # Initialize the ASR system
+ asr_system = OptimizedShukaASR()
+
+ def transcribe_audio(audio, language):
+     """Gradio interface function"""
+     if audio is None:
+         return "Please provide an audio file."
+
+     result = asr_system.transcribe(audio, language)
+     return result
+
+ # Language options
+ languages = [
+     ("Auto-detect", "auto"),
+     ("English", "english"),
+     ("Hindi", "hindi"),
+     ("Bengali", "bengali"),
+     ("Gujarati", "gujarati"),
+     ("Kannada", "kannada"),
+     ("Malayalam", "malayalam"),
+     ("Marathi", "marathi"),
+     ("Oriya", "oriya"),
+     ("Punjabi", "punjabi"),
+     ("Tamil", "tamil"),
+     ("Telugu", "telugu")
+ ]
+
+ # Create Gradio interface
+ with gr.Blocks(title="Shuka v1 ASR - Multilingual Speech Recognition") as demo:
+     gr.Markdown("# 🎙️ Shuka v1 ASR - Fast Multilingual Transcription")
+     gr.Markdown("Upload an audio file or record directly to get a transcription in multiple Indic languages.")
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 label="Audio Input",
+                 type="filepath",
+                 format="wav"
+             )
+             language_dropdown = gr.Dropdown(
+                 choices=languages,
+                 value="auto",
+                 label="Language (optional)"
+             )
+             transcribe_btn = gr.Button("🚀 Transcribe", variant="primary")
+
+         with gr.Column():
+             output_text = gr.Textbox(
+                 label="Transcription",
+                 placeholder="Transcription will appear here...",
+                 lines=10
+             )
+
+     # Event handlers
+     transcribe_btn.click(
+         fn=transcribe_audio,
+         inputs=[audio_input, language_dropdown],
+         outputs=output_text
+     )
+
+     # Auto-transcribe on audio upload
+     audio_input.change(
+         fn=transcribe_audio,
+         inputs=[audio_input, language_dropdown],
+         outputs=output_text
+     )
+
+     # Tips section
+     gr.Markdown("## 📝 Tips for best results:")
+     gr.Markdown("""
+     - Audio files are automatically limited to 30 seconds for faster processing
+     - Supported formats: WAV, MP3, M4A, WEBM
+     - For best accuracy, use clear audio with minimal background noise
+     - The model supports 11 Indic languages + English
+     """)
+
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )
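
A quick way to sanity-check the pipeline without launching the UI is to import the module and call the shared instance directly. This is a minimal sketch, not part of the commit; "sample.wav" is an illustrative path standing in for any short local recording:

    # smoke_test.py (hypothetical helper, not part of this commit)
    # Importing app builds the pipeline once (asr_system) but does not
    # start Gradio, since demo.launch() is guarded by __main__.
    from app import asr_system

    # Any short local clip works here; the filename is an assumption.
    print(asr_system.transcribe("sample.wav", language="auto"))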
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ # Core ML libraries
+ torch==2.1.0
+ transformers==4.41.2
+ peft==0.11.1
+
+ # Audio processing
+ librosa==0.10.2
+ soundfile==0.12.1
+
+ # Gradio for web interface
+ gradio==4.20.0
+
+ # Utilities
+ numpy==1.24.3
+ scipy==1.11.1
+ torchaudio==2.1.0
+
+ # Optional optimizations
+ accelerate==0.28.0
+ bitsandbytes==0.43.0
+
+ # System utilities
+ psutil==5.9.5
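
For local testing, the usual flow should apply: `pip install -r requirements.txt`, then `python app.py` to serve the interface on port 7860, matching the `demo.launch()` settings above. Note that `accelerate` and `bitsandbytes` are listed under optional optimizations; the CPU-only path that `app.py` forces (`device=-1`) never invokes `bitsandbytes` quantization, so that pin can likely be dropped on CPU-only hardware.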