import os
import google.generativeai as genai
import speech_recognition as sr
from gtts import gTTS  # Replacing pyttsx3 with gTTS for text-to-speech
from dotenv import load_dotenv
import gradio as gr
import tempfile
from pydub import AudioSegment  # Importing for audio conversion

# Load environment variables
load_dotenv()


def speak_and_save(text):
    """Use gTTS to speak the given text and save it as an audio file."""
    tts = gTTS(text)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        audio_path = fp.name
    tts.save(audio_path)
    return audio_path


def recognize_speech_from_audio(audio_file):
    """Capture and recognize speech from the audio file."""
    recognizer = sr.Recognizer()

    # Debug print to check if audio file exists
    print(f"Processing audio file: {audio_file}")

    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)

    try:
        command = recognizer.recognize_google(audio)
        print(f"You said: {command}")
        return command
    except sr.UnknownValueError:
        print("Could not understand audio.")
        return None
    except sr.RequestError:
        print("Error with the speech recognition service.")
        return None


def process_command(command):
    """Generate a response based on the voice command using the AI model."""
    if command:
        response = model.generate_content([command])
        reply = response.text.strip()
        print(f"AI Response: {reply}")
        return reply


# Convert any audio file to WAV format to ensure compatibility with speech_recognition
def convert_to_wav(audio_path):
    """Convert any audio file to a WAV format."""
    audio = AudioSegment.from_file(audio_path)
    wav_path = tempfile.mktemp(suffix=".wav")
    audio.export(wav_path, format="wav")
    return wav_path


# Main Code with Generative AI Setup
api_key = os.getenv("MY_API_KEY")
if api_key is None:
    raise ValueError("API key not found in environment variables")

# Configure the AI model
genai.configure(api_key=api_key)

generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-8b-exp-0827",
    generation_config=generation_config,
)

wake_word = "hello"


def assistant(audio):
    # Check if the audio file path is provided
    print(f"Audio file received: {audio}")

    # Check if the file exists before processing
    if not audio or not os.path.exists(audio):
        print(f"Audio file does not exist or is not provided: {audio}")
        return "No audio provided.", None

    # Convert to WAV format before processing
    audio_wav = convert_to_wav(audio)

    # Process the speech from the audio
    command = recognize_speech_from_audio(audio_wav)

    if command and wake_word in command.lower():
        response_text = process_command(command)
        audio_response = speak_and_save(response_text)
        return response_text, audio_response
    else:
        return "Wake word not detected.", None


# Gradio Interface
gr.Interface(
    fn=assistant,  # Function to call when the interface is run
    inputs=gr.Audio(type="filepath"),  # Audio input, expecting a file path from the microphone
    outputs=[gr.Textbox(), gr.Audio(type="filepath", label="Response Audio")],  # Outputs text and the response audio
    title="Sema Voice Assistant",
).launch(share=True)
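
# ---------------------------------------------------------------------------
# Setup notes (a minimal sketch, not part of the original script; package
# names below are the usual PyPI names for the imports above, and the .env
# key follows the os.getenv("MY_API_KEY") call):
#
#   pip install google-generativeai SpeechRecognition gTTS python-dotenv gradio pydub
#
# pydub relies on ffmpeg for format conversion, so ffmpeg should be available
# on PATH. A .env file in the working directory is expected to contain the
# Gemini API key, for example:
#
#   MY_API_KEY=<your API key>
# ---------------------------------------------------------------------------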