import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np
import tempfile

@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file, email=None):
    """Run the Shuka audio model on an audio file and return its text output.

    Args:
        audio_file: Path to the audio file to process (anything
            ``librosa.load`` accepts). ``None`` or empty when the user
            submitted without providing audio.
        email: Optional email address. When non-blank it is prepended to the
            transcript. Defaults to ``None`` so the function can be called
            with the audio input alone.

    Returns:
        A ``(text, file_path)`` tuple. On success ``text`` is the transcript
        and ``file_path`` is the path of a temporary ``.txt`` file containing
        it. On failure ``text`` is an ``"Error: ..."`` message and
        ``file_path`` is ``None`` so no download is offered.
    """
    # Fail fast before the (expensive) model load when there is nothing to do.
    if not audio_file:
        return "Error: No audio file provided.", None

    try:
        # NOTE(review): trust_remote_code=True executes code shipped in the
        # model repository; consider pinning a revision.
        # NOTE(review): the pipeline is rebuilt on every call; hoisting it may
        # be possible depending on how the hosting runtime manages the GPU.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Load at 16 kHz. mono=True (librosa's default, made explicit here)
        # already downmixes multi-channel input, so the result is always 1-D
        # and no manual channel averaging is needed.
        audio, sr = librosa.load(audio_file, sr=16000, mono=True)
        # Ensure a contiguous float32 buffer for the model.
        audio = np.ascontiguousarray(audio, dtype=np.float32)

        # Debug: print audio properties
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        # Prompt: the system turn asks the model to echo the input verbatim;
        # the '<|audio|>' user turn is the slot for the audio content.
        turns = [
            {'role': 'system', 'content': 'You are an exact echo assistant. Output the previous text exactly as given, without any modifications.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        print(f"Initial turns: {turns}")

        # Run the model inference (this call is synchronous)
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=10000)
        print(f"Model output: {output}")

        # Use the string form of whatever the pipeline returned as the transcript.
        transcript = str(output)
        if email and email.strip():
            transcript = f"Email provided: {email}\n\n{transcript}"

        # Persist the transcript for download. delete=False keeps the file
        # after the handle closes; UTF-8 is set explicitly so non-ASCII text
        # is written correctly regardless of the platform's default encoding.
        with tempfile.NamedTemporaryFile(
            delete=False, mode='w', suffix='.txt', encoding='utf-8'
        ) as tmp:
            tmp.write(transcript)
            transcript_file = tmp.name

        # Return transcript text and file download path
        return transcript, transcript_file

    except Exception as e:
        # UI boundary: surface the message to the user instead of crashing,
        # but log it so the failure is visible in the server output too.
        print(f"Transcription failed: {e!r}")
        # None (not "") for the file output: an empty string is not a valid path.
        return f"Error: {str(e)}", None

# Input widgets: one audio source (file upload or microphone recording),
# configured with type="filepath".
# NOTE(review): transcribe_and_respond also declares an `email` parameter;
# while the Textbox below stays disabled, only the audio value is passed to it.
_input_components = [
    gr.Audio(type="filepath", sources=["upload", "microphone"]),
    # gr.Textbox(label="Email", placeholder="Enter your email address (optional)")
]

# Output widgets, in the order of the callback's return tuple:
# the transcript text, then the downloadable transcript file.
_output_components = [
    gr.Textbox(label="Transcript"),
    gr.File(label="Download Transcript"),
]

# Assemble the web UI around the transcription callback.
iface = gr.Interface(
    title="ShukaNotesApp",
    description="Upload or record your meeting audio, and download the transcript.",
    fn=transcribe_and_respond,
    inputs=_input_components,
    outputs=_output_components,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()