|
|
|
import os |
|
import gradio as gr |
|
from groq import Groq |
|
from gtts import gTTS |
|
import tempfile |
|
import whisper |
|
|
|
|
|
|
|
|
|
# SECURITY: never hard-code API keys in source — the key previously embedded
# here must be treated as compromised and rotated. Read it from the
# environment instead (set GROQ_API_KEY before launching the app).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Single Groq client reused for all chat-completion calls below.
client = Groq(api_key=GROQ_API_KEY)

# Load the Whisper "base" model once at import time; loading per request
# would dominate latency.
whisper_model = whisper.load_model("base")
|
|
|
|
|
def audio_to_text(audio_file):
    """Transcribe the audio at *audio_file* with the module-level Whisper model.

    The clip is padded or trimmed to Whisper's fixed 30-second window before
    decoding, so longer recordings are truncated to their first 30 seconds.
    """
    waveform = whisper.pad_or_trim(whisper.load_audio(audio_file))
    spectrogram = whisper.log_mel_spectrogram(waveform).to(whisper_model.device)
    # fp16=False keeps decoding in full precision (needed when running on CPU).
    decoding = whisper.decode(whisper_model, spectrogram, whisper.DecodingOptions(fp16=False))
    return decoding.text
|
|
|
|
|
def interact_with_groq(user_input):
    """Send *user_input* as a single user message to the Groq chat API.

    Returns the model's reply text; on any API failure, returns an error
    string rather than raising so the UI can display it.
    """
    try:
        completion = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[{"role": "user", "content": user_input}],
            stream=False,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error interacting with Groq API: {e}"
|
|
|
|
|
def text_to_audio(response_text):
    """Synthesize *response_text* with gTTS and return the path to an MP3 file.

    The file is created with ``delete=False`` semantics: it persists after
    this call and the caller (here, Gradio) is responsible for cleanup.
    """
    # mkstemp returns an open OS-level handle; close it before gTTS writes.
    # The original NamedTemporaryFile(delete=False, ...).name relied on GC
    # closing the handle in time — on Windows the still-open handle locks
    # the file and tts.save() would fail.
    fd, output_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    tts = gTTS(response_text)
    tts.save(output_path)
    return output_path
|
|
|
|
|
def voice_to_voice(audio_file):
    """Full pipeline: recorded speech in, transcript plus spoken reply out.

    Returns ``(transcribed_text, response_audio_path)``; on any failure
    returns ``(error_message, None)`` so Gradio can still render both
    output components.
    """
    try:
        print("Transcribing audio...")
        transcribed_text = audio_to_text(audio_file)
        print(f"Transcribed Text: {transcribed_text}")

        print("Getting LLM response...")
        response_text = interact_with_groq(transcribed_text)
        print(f"LLM Response: {response_text}")

        print("Generating audio response...")
        return transcribed_text, text_to_audio(response_text)
    except Exception as e:
        # Broad catch is the UI boundary: surface the error as text output.
        return f"Error processing request: {e}", None
|
|
|
|
|
# Gradio UI: one microphone/file audio input; the transcript and the
# synthesized reply come back as two separate output components.
transcript_output = gr.Textbox(label="Transcribed Text")
reply_output = gr.Audio(label="Response Audio")

interface = gr.Interface(
    fn=voice_to_voice,
    inputs=gr.Audio(type="filepath"),
    outputs=[transcript_output, reply_output],
    title="Real-Time Voice-to-Voice Chatbot",
    description="A real-time voice-to-voice chatbot using Whisper for transcription, Groq API for LLM, and gTTS for audio response.",
)
|
|
|
|
|
# Start the Gradio web server only when run as a script (not on import).
if __name__ == "__main__":

    interface.launch()
|
|