File size: 2,524 Bytes
e2f65f6
8c679c2
e2f65f6
 
8c679c2
86fab4a
a9b9492
98333ca
8c679c2
a9b9492
fbc6758
8c679c2
 
 
 
 
 
 
66c2b05
8c679c2
a9b9492
 
 
 
 
 
 
8c679c2
a9b9492
 
8c679c2
7638418
8c679c2
 
 
a9b9492
 
 
8c679c2
a9b9492
 
 
 
 
 
 
 
 
 
 
 
 
8c679c2
fbc6758
a9b9492
e2f65f6
 
a0b460e
57740a6
 
7638418
57740a6
66c2b05
02b1ff9
 
57740a6
289e5e4
7638418
e2f65f6
 
 
a9b9492
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np
import tempfile

@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file, email=""):
    """Transcribe an audio clip with the Shuka v1 model and return it as text + file.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the audio clip (gr.Audio with type="filepath").
    email : str, optional
        Optional email address; when non-empty it is prepended to the transcript.
        Defaults to "" so the function also works when the interface supplies
        only the audio input (the email textbox in the UI is commented out —
        without this default every call raised TypeError for the missing arg).

    Returns
    -------
    tuple
        (transcript text, path to a downloadable .txt file). On failure the
        first element is an error message and the second is None, which is the
        valid "no file" value for a gr.File output (an empty string is not).
    """
    try:
        # NOTE(review): the pipeline is rebuilt on every request; this keeps the
        # model load inside the @spaces.GPU window but adds per-call latency.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )

        # Load the audio resampled to the 16 kHz rate the model expects.
        audio, sr = librosa.load(audio_file, sr=16000)
        # Ensure a contiguous float32 buffer for the pipeline.
        audio = np.ascontiguousarray(audio, dtype=np.float32)
        # Defensive mono downmix (librosa.load already returns mono by default).
        if audio.ndim > 1:
            audio = np.mean(audio, axis=-1)

        # Debug: surface audio properties in the server log.
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        # Conversation turns: system prompt plus the audio placeholder token.
        turns = [
            {'role': 'system', 'content': 'You are an exact echo assistant. Output the previous text exactly as given, without any modifications.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        print(f"Initial turns: {turns}")

        # Run the model inference (this call is synchronous).
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=10000)
        print(f"Model output: {output}")

        # Extract transcript text from the output.
        transcript = str(output)
        if email and email.strip():
            transcript = f"Email provided: {email}\n\n{transcript}"

        # Persist the transcript so the UI can offer it as a download.
        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt') as tmp:
            tmp.write(transcript)
            transcript_file = tmp.name

        # Return transcript text and file download path.
        return transcript, transcript_file

    except Exception as e:
        # Return None (not "") for the file slot so gr.File receives a valid
        # "no file" value instead of an empty path string.
        return f"Error: {str(e)}", None

# UI wiring: one audio input; transcript text plus a downloadable file output.
_audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath")
# gr.Textbox(label="Email", placeholder="Enter your email address (optional)")

iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=[_audio_input],
    outputs=[gr.Textbox(label="Transcript"), gr.File(label="Download Transcript")],
    title="ShukaNotesApp",
    description="Upload or record your meeting audio, and download the transcript.",
)

# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()