import os

import gradio as gr
import numpy as np
import soundfile as sf

from groq import Groq
from openai import OpenAI

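# Both clients read their API keys from environment variables.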
groq_client = Groq(api_key=os.getenv('GROQ_API_KEY'))
openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def transcribe(audio_path):
    """
    Transcribe the audio segment using Whisper.
    """
    with open(audio_path, 'rb') as audio_file:
        transcription = openai_client.audio.transcriptions.create(
            file=audio_file,
            language="en",
            model="whisper-1"
        )
    return transcription.text

def autocomplete(text):
    """
    Autocomplete the text using Gemma.
    """
    if text == "":
        return ""

    response = groq_client.chat.completions.create(
        model='gemma-7b-it',
        messages=[{"role": "system", "content": "You are a friendly assistant."},
                  {"role": "user", "content": text}]
    )

    return response.choices[0].message.content

def process_audio(input_audio):
    """
    Process the audio input by transcribing it and completing the transcribed text.
    """
    # Gradio passes a filepath that soundfile can read directly
    audio_data, sample_rate = sf.read(input_audio)

    # Ensure mono audio
    if len(audio_data.shape) > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Transcribe the audio in one-second segments
    transcription_list = []
    for start in range(0, len(audio_data), sample_rate):
        end = start + sample_rate
        segment = audio_data[start:end]

        # Temporarily save each segment to a file (Whisper requires a file input)
        segment_filename = f"/tmp/audio_segment_{start}.wav"
        sf.write(segment_filename, segment, sample_rate)

        # Transcribe the audio segment, then clean up the temporary file
        transcription_list.append(transcribe(segment_filename))
        os.remove(segment_filename)

    # Join the per-segment transcriptions and send the full text for completion
    transcription = " ".join(transcription_list)
    completion_result = autocomplete(transcription)

    return f"Qn: {transcription}\n\nAns: {completion_result}"

# Define the Gradio interface
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources="microphone", streaming=True, type="filepath"),
    outputs=gr.Markdown(),
    title="Dear Gemma",
    description="Talk to the AI assistant. It completes your sentences in real time.",
    live=True,
    allow_flagging="never"
)

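# To run locally (assuming GROQ_API_KEY and OPENAI_API_KEY are set in the
# environment), execute this script, open the printed Gradio URL, and speak
# into the microphone.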
if __name__ == "__main__":
    interface.launch()