import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np
|


# Request a ZeroGPU slot for up to 60 seconds per call
@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file):
    try:
        # Build the Shuka v1 pipeline; device=0 targets the GPU that
        # @spaces.GPU attaches for the duration of this call
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )
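        # Note: this rebuilds the pipeline on every request. A common ZeroGPU
        # pattern (a sketch under assumptions, not from the original code) is to
        # construct it once at module scope, without device=0, and keep only
        # inference inside the GPU context:
        #
        #   pipe = transformers.pipeline(model='sarvamai/shuka_v1',
        #                                trust_remote_code=True,
        #                                torch_dtype=torch.bfloat16)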
|

        # Load the recording and resample to the 16 kHz mono input the model expects
        audio, sr = librosa.load(audio_file, sr=16000)
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
|

        # Chat-style turns; the <|audio|> placeholder marks where the audio is injected
        turns = [
            {'role': 'system', 'content': 'Repeat the following text exactly, without any changes'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        print(f"Initial turns: {turns}")
|

        # Run inference on the waveform plus the conversation turns
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
        print(f"Model output: {output}")

        return output
|

    except Exception as e:
        # Surface the error in the UI instead of crashing the Space
        return f"Error: {str(e)}"
|


iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will transcribe what you say by repeating it back.",
    live=True
)
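# Note: with live=True the handler fires automatically once recording stops;
# drop live=True to require an explicit Submit click instead.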
|

if __name__ == "__main__":
    iface.launch()
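    # For local debugging a public link can be requested (not needed on Spaces):
    # iface.launch(share=True)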