"""Gradio app that sends uploaded or recorded audio to the Shuka model.

The model's output is shown in a textbox and also written to a text file
that is offered for download.
"""

import tempfile

import gradio as gr
import librosa
import numpy as np
import spaces
import torch
import transformers

MODEL_ID = 'sarvamai/shuka_v1'
TARGET_SAMPLE_RATE = 16000


def _load_audio(path):
    """Load *path* as contiguous mono float32 samples at TARGET_SAMPLE_RATE.

    Args:
        path: Filesystem path of the audio file to read.

    Returns:
        A ``(samples, sample_rate)`` tuple.
    """
    # mono=True asks librosa to downmix multi-channel input, so no manual
    # channel averaging is needed afterwards.
    samples, sample_rate = librosa.load(path, sr=TARGET_SAMPLE_RATE, mono=True)
    return np.ascontiguousarray(samples, dtype=np.float32), sample_rate


def _write_transcript_file(text):
    """Write *text* to a persistent temporary ``.txt`` file and return its path."""
    # delete=False keeps the file on disk after the handle closes so that it
    # can be served as a download; UTF-8 is set explicitly so non-ASCII
    # transcripts do not depend on the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        delete=False, mode='w', suffix='.txt', encoding='utf-8'
    ) as tmp:
        tmp.write(text)
    return tmp.name


@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file, email=None):
    """Run the Shuka pipeline on an audio file and return its output.

    Args:
        audio_file: Path to the audio file (the Audio input uses
            ``type="filepath"``). May be empty/None if nothing was provided.
        email: Optional email address. When non-blank it is prepended to the
            transcript text. Defaults to None so the function also works
            while the email textbox is not part of the interface inputs.

    Returns:
        A ``(transcript_text, transcript_file_path)`` tuple. On failure the
        first element is an ``"Error: ..."`` message and the second is None.
    """
    # Fail fast before any expensive model loading when there is no audio.
    if not audio_file:
        return "Error: no audio file provided.", None

    try:
        # Load audio first so unreadable files are rejected before the model
        # pipeline is built.
        audio, sr = _load_audio(audio_file)

        # Debug: Print audio properties
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        # NOTE(review): the pipeline is rebuilt on every call; consider
        # loading it once if that is compatible with spaces.GPU - confirm.
        pipe = transformers.pipeline(
            model=MODEL_ID,
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Set up the prompt: the system turn instructs the model to echo the
        # audio content verbatim.
        turns = [
            {'role': 'system', 'content': 'You are an exact echo assistant. Output the previous text exactly as given, without any modifications.'},
            {'role': 'user', 'content': '<|audio|>'},
        ]
        print(f"Initial turns: {turns}")

        # Run the model inference (this call is synchronous)
        output = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': sr},
            max_new_tokens=10000,
        )
        print(f"Model output: {output}")

        # The pipeline output is used as-is, converted to text.
        transcript = str(output)
        if email and email.strip():
            transcript = f"Email provided: {email}\n\n{transcript}"

        # Return transcript text and file download path
        return transcript, _write_transcript_file(transcript)
    except Exception as e:
        # Top-level UI boundary: report the failure in the textbox instead of
        # crashing, and log it so it is visible in the server output.
        print(f"Transcription failed: {e!r}")
        # None (not "") leaves the File output empty.
        return f"Error: {e}", None


iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath"),
        # Optional second input; `email` defaults to None while this is disabled.
        # gr.Textbox(label="Email", placeholder="Enter your email address (optional)")
    ],
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.File(label="Download Transcript"),
    ],
    title="ShukaNotesApp",
    description="Upload or record your meeting audio, and download the transcript.",
)

if __name__ == "__main__":
    iface.launch()