import os
import google.generativeai as genai
import speech_recognition as sr
from gtts import gTTS  # gTTS replaces pyttsx3 for text-to-speech
from dotenv import load_dotenv
import gradio as gr
import tempfile
from pydub import AudioSegment  # Used to convert uploaded audio to WAV

# Load environment variables from a .env file
load_dotenv()
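
# Dependency note (an assumption based on the imports above, not from the original
# project docs): this script expects google-generativeai, SpeechRecognition, gTTS,
# python-dotenv, gradio, and pydub to be installed; pydub additionally relies on
# ffmpeg being available for audio conversion.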

def speak_and_save(text):
    """Use gTTS to speak the given text and save it as an MP3 file."""
    tts = gTTS(text)
    # Reserve a temporary file path, then write the synthesized speech to it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        audio_path = fp.name
    tts.save(audio_path)
    return audio_path

def recognize_speech_from_audio(audio_file):
    """Capture and recognize speech from the given audio file."""
    recognizer = sr.Recognizer()
    # Debug print to confirm which audio file is being processed
    print(f"Processing audio file: {audio_file}")
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        command = recognizer.recognize_google(audio)
        print(f"You said: {command}")
        return command
    except sr.UnknownValueError:
        print("Could not understand the audio.")
        return None
    except sr.RequestError:
        print("Error reaching the speech recognition service.")
        return None

def process_command(command):
    """Generate a response based on the voice command using the AI model."""
    if not command:
        return None
    response = model.generate_content([command])
    reply = response.text.strip()
    print(f"AI Response: {reply}")
    return reply

# Convert any audio file to WAV so it is compatible with speech_recognition
def convert_to_wav(audio_path):
    """Convert any audio file to WAV format."""
    audio = AudioSegment.from_file(audio_path)
    # NamedTemporaryFile is used instead of the deprecated tempfile.mktemp
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        wav_path = fp.name
    audio.export(wav_path, format="wav")
    return wav_path

# Main code: Generative AI setup
api_key = os.getenv("MY_API_KEY")
if api_key is None:
    raise ValueError("API key not found in environment variables")

# Configure the AI model
genai.configure(api_key=api_key)

generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-8b-exp-0827",
    generation_config=generation_config,
)

wake_word = "hello"

def assistant(audio):
    """Handle one Gradio request: transcribe the audio, check the wake word, and reply."""
    # Log the incoming audio file path
    print(f"Audio file received: {audio}")
    # Make sure a valid file path was provided before processing
    if not audio or not os.path.exists(audio):
        print(f"Audio file does not exist or was not provided: {audio}")
        return "No audio provided.", None
    # Convert to WAV before passing the file to speech_recognition
    audio_wav = convert_to_wav(audio)
    # Transcribe the speech from the audio
    command = recognize_speech_from_audio(audio_wav)
    if command and wake_word in command.lower():
        response_text = process_command(command)
        audio_response = speak_and_save(response_text)
        return response_text, audio_response
    return "Wake word not detected.", None

# Gradio interface
gr.Interface(
    fn=assistant,                      # Function called for each request
    inputs=gr.Audio(type="filepath"),  # Microphone/file input, passed as a file path
    outputs=[
        gr.Textbox(),
        gr.Audio(type="filepath", label="Response Audio"),
    ],
    title="Sema Voice Assistant",
).launch(share=True)
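
# Usage note (an assumption; the original Space does not document this explicitly):
# run the script directly (e.g. `python app.py`) with MY_API_KEY set in the
# environment or in a .env file, then open the local or shared Gradio URL.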