Spaces:

ElvisTata2024
/

wakanda-asr-live

Sleeping

App Files Files Community

ElvisTata2024 committed on Jul 28

Commit

16451ff

verified ·

1 Parent(s): acadcf9

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

.gitattributes +4 -0
README.md +63 -5
app.py +268 -0
app_demo.py +175 -0
requirements.txt +11 -0
sample_1.wav +3 -0
sample_2.wav +3 -0
sample_3.wav +3 -0
sample_4.wav +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sample_1.wav filter=lfs diff=lfs merge=lfs -text
+sample_2.wav filter=lfs diff=lfs merge=lfs -text
+sample_3.wav filter=lfs diff=lfs merge=lfs -text
+sample_4.wav filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,12 +1,70 @@
 ---
-title: Wakanda Asr Live
-emoji: 🏢
-colorFrom: gray
-colorTo: blue
 sdk: gradio
 sdk_version: 5.38.2
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Wakanda Kinyarwanda ASR
+emoji: 🎤
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 5.38.2
 app_file: app.py
 pinned: false
+license: apache-2.0
+tags:
+  - speech-recognition
+  - kinyarwanda
+  - whisper
+  - wakanda-ai
+  - audio-to-text
+models:
+  - WakandaAI/wakanda-whisper-small-rw-v1
+languages:
+  - rw
 ---
+# 🎤 Wakanda Whisper - Kinyarwanda ASR
+A state-of-the-art automatic speech recognition system fine-tuned specifically for the Kinyarwanda language, built on OpenAI's Whisper architecture.
+## 🌟 Features
+- **High Accuracy**: Fine-tuned specifically for Kinyarwanda speech patterns
+- **Multiple Input Methods**: Upload audio files or record directly through the microphone
+- **Format Support**: Supports WAV, MP3, M4A, FLAC, and other common audio formats
+- **Real-time Processing**: Fast inference with optimized performance
+- **User-friendly Interface**: Beautiful and intuitive web interface
+## 🚀 Model Details
+- **Base Architecture**: OpenAI Whisper Small
+- **Language**: Kinyarwanda (rw)
+- **Parameters**: ~244M
+- **Training Data**: Curated Kinyarwanda speech dataset
+- **Model Repository**: [WakandaAI/wakanda-whisper-small-rw-v1](https://huggingface.co/WakandaAI/wakanda-whisper-small-rw-v1)
+## 🎯 How to Use
+### Option 1: Upload Audio File
+1. Click on the "Upload Audio File" tab
+2. Select your Kinyarwanda audio file
+3. Click "Transcribe Audio" to get the text
+### Option 2: Record Audio
+1. Click on the "Record Audio" tab
+2. Click the microphone button to start recording
+3. Speak in Kinyarwanda
+4. Stop recording and click "Transcribe Recording"
+## 📊 Performance
+This model has been optimized for:
+- Clear speech recognition in various acoustic conditions
+- Multiple Kinyarwanda dialects and accents
+- Noise robustness for real-world audio
+- Fast processing suitable for real-time applications
+## 🤝 About WakandaAI
+WakandaAI is dedicated to advancing AI technologies for African languages and communities. This project is part of our mission to make speech recognition accessible in Kinyarwanda.
+---
+*Built with ❤️ for the Kinyarwanda-speaking community*

app.py ADDED Viewed

	@@ -0,0 +1,268 @@

+import gradio as gr
+import torch
+import numpy as np
+import tempfile
+import os
+from pathlib import Path
+# Try to import wakanda_whisper, fallback to transformers if not available
+try:
+    import wakanda_whisper
+    USE_WAKANDA_WHISPER = True
+    print("✅ Using wakanda_whisper package")
+except ImportError:
+    print("⚠️ wakanda_whisper not found, falling back to transformers...")
+    try:
+        from transformers import WhisperProcessor, WhisperForConditionalGeneration
+        import librosa
+        USE_WAKANDA_WHISPER = False
+        print("✅ Using transformers as fallback")
+    except ImportError:
+        print("❌ Neither wakanda_whisper nor transformers available")
+        USE_WAKANDA_WHISPER = None
+# Initialize the model
+def load_model():
+    """Load the Wakanda Whisper model from Hugging Face."""
+    try:
+        if USE_WAKANDA_WHISPER:
+            # Use wakanda_whisper if available
+            print("📥 Loading model with wakanda_whisper...")
+            model = wakanda_whisper.from_pretrained("WakandaAI/wakanda-whisper-small-rw-v1")
+            return model, None
+        elif USE_WAKANDA_WHISPER is False:
+            # Fallback to transformers
+            print("📥 Loading model with transformers...")
+            processor = WhisperProcessor.from_pretrained("WakandaAI/wakanda-whisper-small-rw-v1")
+            model = WhisperForConditionalGeneration.from_pretrained("WakandaAI/wakanda-whisper-small-rw-v1")
+            return model, processor
+        else:
+            print("❌ No compatible libraries available")
+            return None, None
+    except Exception as e:
+        print(f"❌ Error loading model: {e}")
+        return None, None
+# Global model variables
+MODEL = None
+PROCESSOR = None
+def initialize_model():
+    """Initialize model on first use"""
+    global MODEL, PROCESSOR
+    if MODEL is None:
+        print("🚀 Initializing model...")
+        MODEL, PROCESSOR = load_model()
+    return MODEL, PROCESSOR
+def transcribe_audio(audio_file):
+    """
+    Transcribe audio using the Wakanda Whisper model.
+    """
+    if audio_file is None:
+        return "Please upload an audio file."
+    try:
+        # Initialize model if needed
+        model, processor = initialize_model()
+        if model is None:
+            return "❌ Error: Could not load the model. Please try again later."
+        print(f"🎵 Processing audio file: {Path(audio_file).name}")
+        # Check if using mock model
+        if model == "mock_model":
+            filename = Path(audio_file).name
+            if "sample_1" in filename:
+                return "Muraho, witwa gute?"
+            elif "sample_2" in filename:
+                return "Ndashaka kwiga Ikinyarwanda."
+            elif "sample_3" in filename:
+                return "Urakoze cyane kubafasha."
+            elif "sample_4" in filename:
+                return "Tugiye gutangiza ikiganiro mu Kinyarwanda."
+            else:
+                return f"Mock transcription for {filename}: [This would be the actual Kinyarwanda transcription]"
+        # Real model processing
+        elif USE_WAKANDA_WHISPER:
+            # Use wakanda_whisper
+            result = model.transcribe(audio_file)
+            transcribed_text = result['text'].strip()
+        elif USE_WAKANDA_WHISPER is False:
+            # Use transformers
+            import librosa
+            audio, sr = librosa.load(audio_file, sr=16000)
+            input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
+            with torch.no_grad():
+                predicted_ids = model.generate(input_features)
+            transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
+        else:
+            return "❌ Error: No compatible transcription library available."
+        if not transcribed_text:
+            return "🔇 No speech detected in the audio file. Please try with a clearer audio recording."
+        print(f"✅ Transcription completed: {len(transcribed_text)} characters")
+        return transcribed_text
+    except Exception as e:
+        print(f"❌ Transcription error: {e}")
+        return f"❌ Error during transcription: {str(e)}"
+def transcribe_microphone(audio_data):
+    """
+    Transcribe audio from microphone input.
+    Args:
+        audio_data: Audio data from microphone
+    Returns:
+        str: Transcribed text
+    """
+    if audio_data is None:
+        return "Please record some audio first."
+    try:
+        # Save the audio data to a temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+            # audio_data is a tuple (sample_rate, audio_array)
+            sample_rate, audio_array = audio_data
+            print(f"🎙️ Processing microphone input: {len(audio_array)} samples at {sample_rate}Hz")
+            # Convert to float32 and normalize if needed
+            if audio_array.dtype != np.float32:
+                audio_array = audio_array.astype(np.float32)
+                if audio_array.max() > 1.0:
+                    # Normalize based on the original dtype
+                    if audio_array.max() > 32767:
+                        audio_array = audio_array / 32768.0
+                    else:
+                        audio_array = audio_array / audio_array.max()
+            # Save using soundfile
+            import soundfile as sf
+            sf.write(tmp_file.name, audio_array, sample_rate)
+            # Transcribe the temporary file
+            result = transcribe_audio(tmp_file.name)
+            # Clean up
+            os.unlink(tmp_file.name)
+            return result
+    except Exception as e:
+        print(f"❌ Microphone processing error: {e}")
+        return f"❌ Error processing microphone input: {str(e)}"
+# Create a simple Gradio interface
+def create_interface():
+    """Create a clean, simple Gradio interface."""
+    with gr.Blocks(title="Wakanda Whisper - Kinyarwanda ASR") as interface:
+        gr.Markdown("# 🎤 Wakanda Whisper")
+        gr.Markdown("### Kinyarwanda Automatic Speech Recognition")
+        gr.Markdown("Upload an audio file or record your voice to get Kinyarwanda transcription")
+        with gr.Tabs():
+            # File Upload Tab
+            with gr.TabItem("📁 Upload Audio File"):
+                with gr.Row():
+                    with gr.Column():
+                        audio_input = gr.Audio(
+                            label="Choose Audio File",
+                            type="filepath"
+                        )
+                        # Sample audio files
+                        gr.Markdown("**Try these sample Kinyarwanda audio files:**")
+                        with gr.Row():
+                            sample_1 = gr.Button("Sample 1", size="sm")
+                            sample_2 = gr.Button("Sample 2", size="sm")
+                            sample_3 = gr.Button("Sample 3", size="sm")
+                            sample_4 = gr.Button("Sample 4", size="sm")
+                        upload_btn = gr.Button("🎯 Transcribe Audio", variant="primary")
+                    with gr.Column():
+                        upload_output = gr.Textbox(
+                            label="Transcription Result",
+                            placeholder="Your Kinyarwanda transcription will appear here...",
+                            lines=6,
+                            show_copy_button=True
+                        )
+            # Microphone Tab
+            with gr.TabItem("🎙️ Record Audio"):
+                with gr.Row():
+                    with gr.Column():
+                        mic_input = gr.Audio(
+                            label="Record Your Voice",
+                            type="numpy"
+                        )
+                        mic_btn = gr.Button("🎯 Transcribe Recording", variant="primary")
+                    with gr.Column():
+                        mic_output = gr.Textbox(
+                            label="Transcription Result",
+                            placeholder="Your Kinyarwanda transcription will appear here...",
+                            lines=6,
+                            show_copy_button=True
+                        )
+        # Set up event handlers
+        upload_btn.click(
+            fn=transcribe_audio,
+            inputs=audio_input,
+            outputs=upload_output,
+            show_progress=True
+        )
+        # Sample audio button handlers
+        sample_1.click(
+            fn=lambda: "sample_1.wav",
+            outputs=audio_input
+        )
+        sample_2.click(
+            fn=lambda: "sample_2.wav",
+            outputs=audio_input
+        )
+        sample_3.click(
+            fn=lambda: "sample_3.wav",
+            outputs=audio_input
+        )
+        sample_4.click(
+            fn=lambda: "sample_4.wav",
+            outputs=audio_input
+        )
+        mic_btn.click(
+            fn=transcribe_microphone,
+            inputs=mic_input,
+            outputs=mic_output,
+            show_progress=True
+        )
+        gr.Markdown("---")
+        gr.Markdown("**Powered by WakandaAI** | Model: [wakanda-whisper-small-rw-v1](https://huggingface.co/WakandaAI/wakanda-whisper-small-rw-v1)")
+    return interface
+# Launch the app
+if __name__ == "__main__":
+    print("🚀 Starting Wakanda Whisper ASR Demo...")
+    # Create and launch the interface
+    demo = create_interface()
+    # Launch configuration for Hugging Face Spaces
+    demo.launch(
+        server_name="0.0.0.0",
+        share=False,  # Set to False for Hugging Face Spaces
+        show_error=True
+    )

app_demo.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import gradio as gr
+import numpy as np
+import tempfile
+import os
+from pathlib import Path
+# Mock model for testing when real model can't load
+USE_MOCK_MODEL = True
+def initialize_model():
+    """Initialize model - using mock for testing"""
+    global USE_MOCK_MODEL
+    if USE_MOCK_MODEL:
+        print("🧪 Using mock model for testing (real model has PyTorch compatibility issues)")
+        return "mock_model", None
+    return None, None
+def transcribe_audio(audio_file):
+    """
+    Transcribe audio using mock model for testing.
+    """
+    if audio_file is None:
+        return "Please upload an audio file."
+    try:
+        # Initialize model if needed
+        model, processor = initialize_model()
+        if model is None:
+            return "❌ Error: Could not load the model. Please try again later."
+        filename = Path(audio_file).name
+        print(f"🎵 Processing audio file: {filename}")
+        # Mock transcription based on sample files
+        if "sample_1" in filename:
+            return "Muraho, witwa gute?"
+        elif "sample_2" in filename:
+            return "Ndashaka kwiga Ikinyarwanda."
+        elif "sample_3" in filename:
+            return "Urakoze cyane kubafasha."
+        elif "sample_4" in filename:
+            return "Tugiye gutangiza ikiganiro mu Kinyarwanda."
+        else:
+            return f"Mock transcription for {filename}: [This would be the actual Kinyarwanda transcription]"
+    except Exception as e:
+        print(f"❌ Transcription error: {e}")
+        return f"❌ Error during transcription: {str(e)}"
+def transcribe_microphone(audio_data):
+    """
+    Transcribe audio from microphone input.
+    """
+    if audio_data is None:
+        return "Please record some audio first."
+    try:
+        sample_rate, audio_array = audio_data
+        duration = len(audio_array) / sample_rate
+        print(f"🎙️ Processing microphone input: {duration:.1f} seconds at {sample_rate}Hz")
+        return f"Mock transcription for {duration:.1f}s audio: [This would be the actual Kinyarwanda transcription]"
+    except Exception as e:
+        print(f"❌ Microphone processing error: {e}")
+        return f"❌ Error processing microphone input: {str(e)}"
+# Create a simple Gradio interface
+def create_interface():
+    """Create a clean, simple Gradio interface."""
+    with gr.Blocks(title="Wakanda Whisper - Kinyarwanda ASR") as interface:
+        gr.Markdown("# 🎤 Wakanda Whisper")
+        gr.Markdown("### Kinyarwanda Automatic Speech Recognition")
+        gr.Markdown("Upload an audio file or record your voice to get Kinyarwanda transcription")
+        with gr.Tabs():
+            # File Upload Tab
+            with gr.TabItem("📁 Upload Audio File"):
+                with gr.Row():
+                    with gr.Column():
+                        audio_input = gr.Audio(
+                            label="Choose Audio File",
+                            type="filepath"
+                        )
+                        # Sample audio files
+                        gr.Markdown("**Try these sample Kinyarwanda audio files:**")
+                        with gr.Row():
+                            sample_1 = gr.Button("Sample 1", size="sm")
+                            sample_2 = gr.Button("Sample 2", size="sm")
+                            sample_3 = gr.Button("Sample 3", size="sm")
+                            sample_4 = gr.Button("Sample 4", size="sm")
+                        upload_btn = gr.Button("🎯 Transcribe Audio", variant="primary")
+                    with gr.Column():
+                        upload_output = gr.Textbox(
+                            label="Transcription Result",
+                            placeholder="Your Kinyarwanda transcription will appear here...",
+                            lines=6,
+                            show_copy_button=True
+                        )
+            # Microphone Tab
+            with gr.TabItem("🎙️ Record Audio"):
+                with gr.Row():
+                    with gr.Column():
+                        mic_input = gr.Audio(
+                            label="Record Your Voice",
+                            type="numpy"
+                        )
+                        mic_btn = gr.Button("🎯 Transcribe Recording", variant="primary")
+                    with gr.Column():
+                        mic_output = gr.Textbox(
+                            label="Transcription Result",
+                            placeholder="Your Kinyarwanda transcription will appear here...",
+                            lines=6,
+                            show_copy_button=True
+                        )
+        # Set up event handlers
+        upload_btn.click(
+            fn=transcribe_audio,
+            inputs=audio_input,
+            outputs=upload_output,
+            show_progress=True
+        )
+        # Sample audio button handlers
+        sample_1.click(
+            fn=lambda: "sample_1.wav",
+            outputs=audio_input
+        )
+        sample_2.click(
+            fn=lambda: "sample_2.wav",
+            outputs=audio_input
+        )
+        sample_3.click(
+            fn=lambda: "sample_3.wav",
+            outputs=audio_input
+        )
+        sample_4.click(
+            fn=lambda: "sample_4.wav",
+            outputs=audio_input
+        )
+        mic_btn.click(
+            fn=transcribe_microphone,
+            inputs=mic_input,
+            outputs=mic_output,
+            show_progress=True
+        )
+        gr.Markdown("---")
+        gr.Markdown("**Powered by WakandaAI** | Model: [wakanda-whisper-small-rw-v1](https://huggingface.co/WakandaAI/wakanda-whisper-small-rw-v1)")
+    return interface
+# Launch the app
+if __name__ == "__main__":
+    print("🚀 Starting Wakanda Whisper ASR (Mock Mode for Testing)...")
+    # Create and launch the interface
+    demo = create_interface()
+    # Launch configuration - let Gradio find an available port
+    demo.launch(
+        server_name="127.0.0.1",
+        share=False,
+        show_error=True
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio>=4.0.0
+torch>=2.0.0
+torchaudio>=2.0.0
+transformers>=4.30.0
+librosa>=0.10.0
+soundfile>=0.12.0
+numpy>=1.21.0
+accelerate>=0.20.0
+datasets>=2.10.0
+huggingface_hub>=0.15.0
+wakanda_whisper

sample_1.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f984a4e5d499a43df335d3ee4ee9868b438437aae6254b87098da139fc3538e
+size 554958

sample_2.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e64e1dd59d4e029637c91857b4e19684b5adda1c2fe381b03619b7a80cc138ba
+size 658638

sample_3.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:373957079f0abba083733a03c83de1b71769b901da508573166de6fc155975a0
+size 524238

sample_4.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d7fb90ffc9fc7a17863099464895299d48173cb0b35b3e2dc8c2ae78a145876
+size 745038