import os
import google.generativeai as genai
import speech_recognition as sr
from gtts import gTTS  # gTTS replaces pyttsx3 for text-to-speech
from dotenv import load_dotenv
import gradio as gr
import tempfile
from pydub import AudioSegment  # Used to convert uploaded audio to WAV

# Load environment variables from a .env file
load_dotenv()
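
# Dependency note (an assumption based on the imports above, not from the original
# project docs): this script expects google-generativeai, SpeechRecognition, gTTS,
# python-dotenv, gradio, and pydub to be installed; pydub additionally relies on
# ffmpeg being available for audio conversion.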

def speak_and_save(text):
    """Use gTTS to speak the given text and save it as an MP3 file."""
    tts = gTTS(text)
    # Reserve a temporary file path, then write the synthesized speech to it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        audio_path = fp.name
    tts.save(audio_path)
    return audio_path

def recognize_speech_from_audio(audio_file):
    """Capture and recognize speech from the given audio file."""
    recognizer = sr.Recognizer()
    # Debug print to confirm which audio file is being processed
    print(f"Processing audio file: {audio_file}")
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        command = recognizer.recognize_google(audio)
        print(f"You said: {command}")
        return command
    except sr.UnknownValueError:
        print("Could not understand the audio.")
        return None
    except sr.RequestError:
        print("Error reaching the speech recognition service.")
        return None

def process_command(command):
    """Generate a response based on the voice command using the AI model."""
    if not command:
        return None
    response = model.generate_content([command])
    reply = response.text.strip()
    print(f"AI Response: {reply}")
    return reply

# Convert any audio file to WAV so it is compatible with speech_recognition
def convert_to_wav(audio_path):
    """Convert any audio file to WAV format."""
    audio = AudioSegment.from_file(audio_path)
    # NamedTemporaryFile is used instead of the deprecated tempfile.mktemp
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        wav_path = fp.name
    audio.export(wav_path, format="wav")
    return wav_path

# Main code: Generative AI setup
api_key = os.getenv("MY_API_KEY")
if api_key is None:
    raise ValueError("API key not found in environment variables")

# Configure the AI model
genai.configure(api_key=api_key)

generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-8b-exp-0827",
    generation_config=generation_config,
)

wake_word = "hello"

def assistant(audio):
    """Handle one Gradio request: transcribe the audio, check the wake word, and reply."""
    # Log the incoming audio file path
    print(f"Audio file received: {audio}")
    # Make sure a valid file path was provided before processing
    if not audio or not os.path.exists(audio):
        print(f"Audio file does not exist or was not provided: {audio}")
        return "No audio provided.", None
    # Convert to WAV before passing the file to speech_recognition
    audio_wav = convert_to_wav(audio)
    # Transcribe the speech from the audio
    command = recognize_speech_from_audio(audio_wav)
    if command and wake_word in command.lower():
        response_text = process_command(command)
        audio_response = speak_and_save(response_text)
        return response_text, audio_response
    return "Wake word not detected.", None

# Gradio interface
gr.Interface(
    fn=assistant,                      # Function called for each request
    inputs=gr.Audio(type="filepath"),  # Microphone/file input, passed as a file path
    outputs=[
        gr.Textbox(),
        gr.Audio(type="filepath", label="Response Audio"),
    ],
    title="Sema Voice Assistant",
).launch(share=True)
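
# Usage note (an assumption; the original Space does not document this explicitly):
# run the script directly (e.g. `python app.py`) with MY_API_KEY set in the
# environment or in a .env file, then open the local or shared Gradio URL.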