import gradio as gr
import requests
import os
from typing import Optional

# Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"


def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """
    Query the Hugging Face Inference API for speech transcription.

    Error messages are prefixed with "❌" so callers can detect failures.
    """
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"

    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()

        response = requests.post(API_URL, headers=headers, data=data, timeout=60)

        if response.status_code == 200:
            result = response.json()
            if isinstance(result, dict) and 'text' in result:
                return result['text']
            elif isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', str(result))
            else:
                return str(result)
        else:
            return f"❌ API Error {response.status_code}: {response.text}"

    except requests.exceptions.Timeout:
        return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        return f"❌ Error: {str(e)}"


def transcribe_with_local_processing(audio_file_path: str) -> str:
    """
    Fallback: simple local audio analysis without heavy models
    """
    try:
        import soundfile as sf

        # Read audio file info
        data, samplerate = sf.read(audio_file_path)
        duration = len(data) / samplerate

        return f"""
📊 **Audio File Analysis:**
- Duration: {duration:.2f} seconds
- Sample Rate: {samplerate} Hz
- Channels: {'Mono' if len(data.shape) == 1 else 'Stereo'}

⚠️ **For actual transcription**: This demo shows the file was processed successfully.
For full transcription, you would need:
1. A Hugging Face token (free to get)
2. Or run this on hardware with more resources

The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
"""
    except Exception as e:
        return f"❌ Error processing audio: {str(e)}"


def process_audio(audio_file, hf_token):
    """Main processing function"""
    if audio_file is None:
        return "❌ Please upload an audio file."

    # Try the Inference API first if a token was provided
    if hf_token and hf_token.strip():
        result = query_inference_api(audio_file, hf_token.strip())
        # Only return the API result if it isn't an error message
        if not result.startswith("❌"):
            return f"🎤 **Transcription Result:**\n\n{result}"

    # Fall back to local processing
    return transcribe_with_local_processing(audio_file)


def create_interface():
    """Create the Gradio interface"""

    with gr.Blocks(
        title="Granite Speech Demo",
        theme=gr.themes.Soft(),
        css="footer {visibility: hidden}"
    ) as demo:

        gr.Markdown("""
        # 🎤 IBM Granite Speech 3.3-2B Demo

        **Two ways to use this demo:**
        1. **With HF Token** (recommended): Get a free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
        2. **Without Token**: Basic audio file analysis

        **Supported Languages**: English, French, German, Spanish, Portuguese
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Token input
                hf_token = gr.Textbox(
                    label="🔑 Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",
                    info="Paste your free HF token for full transcription"
                )

                # Audio input
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    format="wav"
                )

                # Process button
                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")

                # Example info
                gr.Markdown("""
                ### 💡 Tips:
                - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → "New token" → "Read" access
                - **Audio format**: WAV, MP3, M4A supported
                - **Length**: Keep under 1 minute for best results
                - **Quality**: Clear speech works best
                """)

            with gr.Column(scale=2):
                # Output
                output = gr.Textbox(
                    label="📝 Results",
                    lines=12,
                    interactive=False,
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )

        # Event handler
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )

        # Footer info
        gr.Markdown("""
        ---
        **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
        The model supports multilingual transcription and translation to English.
        """)

    return demo


# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)