import gradio as gr
import requests
import os
from typing import Optional

# Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"


def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """
    Query the Hugging Face Inference API for speech transcription.

    Error messages are prefixed with "❌" so callers can detect failures.
    """
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"

    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()

        response = requests.post(API_URL, headers=headers, data=data, timeout=60)

        if response.status_code == 200:
            result = response.json()
            if isinstance(result, dict) and 'text' in result:
                return result['text']
            elif isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', str(result))
            else:
                return str(result)
        else:
            return f"❌ API Error {response.status_code}: {response.text}"

    except requests.exceptions.Timeout:
        return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        return f"❌ Error: {str(e)}"


def transcribe_with_local_processing(audio_file_path: str) -> str:
    """
    Fallback: simple local audio analysis without heavy models
    """
    try:
        import soundfile as sf

        # Read audio file info
        data, samplerate = sf.read(audio_file_path)
        duration = len(data) / samplerate

        return f"""
📊 **Audio File Analysis:**
- Duration: {duration:.2f} seconds
- Sample Rate: {samplerate} Hz
- Channels: {'Mono' if len(data.shape) == 1 else 'Stereo'}

⚠️ **For actual transcription**: This demo shows the file was processed successfully.
For full transcription, you would need:
1. A Hugging Face token (free to get)
2. Or run this on hardware with more resources

The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
"""
    except Exception as e:
        return f"❌ Error processing audio: {str(e)}"


def process_audio(audio_file, hf_token):
    """Main processing function"""
    if audio_file is None:
        return "❌ Please upload an audio file."

    # Try the Inference API first if a token was provided
    if hf_token and hf_token.strip():
        result = query_inference_api(audio_file, hf_token.strip())
        # Only return the API result if it isn't an error message
        if not result.startswith("❌"):
            return f"🎤 **Transcription Result:**\n\n{result}"

    # Fall back to local processing
    return transcribe_with_local_processing(audio_file)


def create_interface():
    """Create the Gradio interface"""

    with gr.Blocks(
        title="Granite Speech Demo",
        theme=gr.themes.Soft(),
        css="footer {visibility: hidden}"
    ) as demo:

        gr.Markdown("""
        # 🎤 IBM Granite Speech 3.3-2B Demo

        **Two ways to use this demo:**
        1. **With HF Token** (recommended): Get a free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
        2. **Without Token**: Basic audio file analysis

        **Supported Languages**: English, French, German, Spanish, Portuguese
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Token input
                hf_token = gr.Textbox(
                    label="🔑 Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",
                    info="Paste your free HF token for full transcription"
                )

                # Audio input
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    format="wav"
                )

                # Process button
                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")

                # Example info
                gr.Markdown("""
                ### 💡 Tips:
                - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → "New token" → "Read" access
                - **Audio format**: WAV, MP3, M4A supported
                - **Length**: Keep under 1 minute for best results
                - **Quality**: Clear speech works best
                """)

            with gr.Column(scale=2):
                # Output
                output = gr.Textbox(
                    label="📝 Results",
                    lines=12,
                    interactive=False,
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )

        # Event handler
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )

        # Footer info
        gr.Markdown("""
        ---
        **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
        The model supports multilingual transcription and translation to English.
        """)

    return demo


# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)