|
import transformers |
|
import gradio as gr |
|
import librosa |
|
import torch |
|
import spaces |
|
import numpy as np |
|
import tempfile |
|
|
|
@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file, email=""):
    """Transcribe an uploaded/recorded audio file with the Shuka v1 model.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the audio clip (Gradio ``type="filepath"``).
    email : str, optional
        If non-empty, prepended to the transcript. Defaults to ``""`` so the
        function also works when the UI wires up only the audio input.

    Returns
    -------
    tuple
        ``(transcript_text, transcript_file_path)`` on success, or
        ``("Error: ...", None)`` on failure (``None`` so the ``gr.File``
        output component shows nothing instead of choking on a fake path).
    """
    try:
        # Build the pipeline inside the function: on HF Spaces ZeroGPU the
        # GPU (device=0) is only guaranteed inside @spaces.GPU-decorated calls.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Resample to 16 kHz; librosa returns float32 (mono by default).
        audio, sr = librosa.load(audio_file, sr=16000)

        # Down-mix first so the final buffer handed to the pipeline is a
        # contiguous 1-D float32 array (defensive: librosa is mono by default).
        if audio.ndim > 1:
            audio = np.mean(audio, axis=-1)
        audio = np.ascontiguousarray(audio, dtype=np.float32)

        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        turns = [
            {'role': 'system', 'content': 'You are an exact echo assistant. Output the previous text exactly as given, without any modifications.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        print(f"Initial turns: {turns}")

        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=10000)
        print(f"Model output: {output}")

        # NOTE(review): the exact output structure comes from the model's
        # remote code; str() keeps whatever it returns verbatim.
        transcript = str(output)
        if email and email.strip():
            transcript = f"Email provided: {email}\n\n{transcript}"

        # Persist the transcript so gr.File can offer it for download.
        # delete=False: Gradio reads the file after this context closes.
        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as tmp:
            tmp.write(transcript)
            transcript_file = tmp.name

        return transcript, transcript_file

    except Exception as e:
        # Top-level UI boundary: surface the error text; None (not "") for the
        # file slot, since Gradio treats a non-None string as a real path.
        return f"Error: {str(e)}", None
|
|
|
# Two input components to match transcribe_and_respond's (audio_file, email)
# signature — with only the audio component, Gradio would invoke the handler
# with a single argument.
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath"),
        gr.Textbox(label="Email (optional)", placeholder="you@example.com"),
    ],
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.File(label="Download Transcript"),
    ],
    title="ShukaNotesApp",
    description="Upload or record your meeting audio, and download the transcript."
)
|
|
|
# Launch the Gradio UI only when executed as a script (not when imported).
if __name__ == "__main__":

    iface.launch()
|
|