nehulagrawal commited on
Commit
43a4e2c
·
verified ·
1 Parent(s): 0778def

Upload 3 files

Browse files
Files changed (3) hide show
  1. Voice_Assistant.py +99 -0
  2. readme.md +77 -0
  3. requirements.txt +12 -0
Voice_Assistant.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import speech_recognition as sr
2
+ from transformers import pipeline
3
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
4
+
5
+ from langchain.llms import HuggingFacePipeline
6
+ from langchain.chains import RetrievalQA
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
+ from langchain.vectorstores import FAISS
9
+ from langchain.document_loaders import UnstructuredFileLoader
10
+
11
+ import os
12
+ import torch
13
+ import pyttsx3
14
+
15
+ import soundfile as sf
16
+ from playsound import playsound
17
+ from TTS.api import TTS
18
+ from langchain.llms import Ollama
19
+
20
+
21
# Imported next to its only use: the knowledge-base indexing below.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the knowledge-base PDF that backs retrieval (RAG).
loader = UnstructuredFileLoader("Foduu_KnowledgeBase.pdf")
documents = loader.load()

# Split the loaded text into overlapping chunks before indexing. Without this
# step the index is built from whatever the loader returns unsplit, so the
# retriever cannot hand the "stuff" chain a focused passage.
# NOTE(review): chunk_size/chunk_overlap are common starting values, not tuned
# for this knowledge base - adjust after inspecting retrieval quality.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

# Open-source embedding model used to vectorize the chunks for FAISS.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)

# Ollama model reached over HTTP on localhost; it answers questions using the
# passages returned by the FAISS retriever ("stuff" chain type).
ollama = Ollama(base_url='http://localhost:11434',model="llama3")
qa = RetrievalQA.from_chain_type(llm=ollama, chain_type="stuff", retriever=vectorstore.as_retriever())

# Speech recognition setup (shared recognizer used by listen()).
r = sr.Recognizer()

# Initialize TTS with a model (shared engine used by speak()).
tts = TTS(model_name="tts_models/en/ljspeech/glow-tts")
38
+
39
+ ## Using Mozilla TTS (TTS)
40
def speak(text):
    """
    Convert text to speech, play the audio, and then delete the audio file.

    The speech is synthesized by the module-level ``tts`` engine into
    ``output.wav`` in the current working directory and played back with
    ``playsound``.

    Args:
        text: The text to speak. ``None`` or blank text is ignored.

    Returns:
        None. Errors raised during synthesis, playback or cleanup are printed
        instead of propagated, so a TTS failure never stops the assistant.
    """
    # Nothing to say: skip the TTS model call entirely.
    if text is None or not str(text).strip():
        return

    output_file = "output.wav"
    try:
        try:
            # Generate speech
            tts.tts_to_file(text=text, file_path=output_file)

            # Play the speech
            playsound(output_file)
        finally:
            # Remove the file even when synthesis or playback failed, so a
            # stale output.wav is never left behind.
            if os.path.exists(output_file):
                os.remove(output_file)
        print(f"Speech played and file {output_file} removed.")
    except Exception as e:
        # Best-effort boundary: report the problem and keep running.
        print(f"Error: {e}")
56
+
57
+
58
def listen():
    """
    Capture audio from the microphone and transcribe it to text.

    Recording uses the module-level recognizer ``r``; transcription goes
    through its ``recognize_google`` method.

    Returns:
        The recognized text, or ``None`` when the audio could not be
        understood (a spoken notice is also given in that case) or the
        recognition request failed.
    """
    with sr.Microphone() as mic:
        print("Listening...")
        captured = r.listen(mic)

    try:
        transcript = r.recognize_google(captured)
    except sr.UnknownValueError:
        print("Could not understand audio")
        speak('could not understand audio')
        return None
    except sr.RequestError as err:
        print(f"Could not request results from Google Speech Recognition service; {err}")
        return None
    else:
        print(f"You said: {transcript}")
        return transcript
76
+
77
+
78
def process_audio(text):
    """
    Answer a transcribed query with the retrieval QA chain and speak the reply.

    Args:
        text: The transcribed user query, or ``None`` when nothing was
            recognized. ``None`` and blank strings are ignored.

    Returns:
        None. The answer is printed and spoken; if anything fails while
        answering, the error is printed and a spoken apology is given.
    """
    # Skip missing or blank transcripts so no LLM call is made for them.
    if text is None or not str(text).strip():
        return

    try:
        response = qa.run(text)
        print(response)
        speak(response)
    except Exception as e:
        # Boundary handler: keep the assistant loop alive on any failure.
        print(f"An error occurred: {e}")
        speak("Sorry, I'm having trouble processing that right now.")
86
+ speak("Sorry, I'm having trouble processing that right now.")
87
+
88
+
89
def main():
    """
    Main loop for the voice assistant.

    Repeatedly listens for a spoken query and hands the transcript to
    ``process_audio``. The loop runs until the user interrupts it with
    Ctrl+C, which ends the program cleanly instead of with a traceback.
    """
    try:
        while True:
            text = listen()
            process_audio(text)
    except KeyboardInterrupt:
        # Ctrl+C is the only way out of the loop; treat it as a normal exit.
        print("\nExiting voice assistant.")


if __name__ == "__main__":
    main()
readme.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Voice Assistant with RAG and Speech Recognition
2
+
3
+ This project implements a voice assistant that uses Retrieval-Augmented Generation (RAG) and speech recognition to provide responses to user queries. The assistant can listen to voice input, process it, and respond with synthesized speech based on the knowledge base you provide.
4
+
5
+ ## Features
6
+
7
+ - Speech recognition using Google Speech Recognition
8
+ - Text-to-Speech (TTS) using Mozilla TTS
9
+ - RAG-based question answering using Langchain and FAISS
10
+ - Integration with Ollama for language model processing
11
+
12
+ ## Prerequisites
13
+
14
+ Before running this project, make sure you have the following dependencies installed:
15
+
16
+ - Python 3.8.1+ (Python 3.10 is used in the Conda environment below)
17
+ - PyTorch
18
+ - Transformers
19
+ - SpeechRecognition
20
+ - pyttsx3
21
+ - soundfile
22
+ - playsound
23
+ - TTS
24
+ - Langchain
25
+ - FAISS
26
+ - Ollama
27
+
28
+ Create a Conda environment:
29
+ ```
30
+ conda create -n VoiceAI python==3.10
31
+ conda activate VoiceAI
32
+ ```
33
+ You can install most of these dependencies using pip (or install the pinned versions with `pip install -r requirements.txt`):
34
+ ```
35
+ pip install torch transformers speechrecognition pyttsx3 soundfile playsound TTS langchain faiss-cpu
36
+ ```
37
+
38
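+ For Ollama, follow the installation instructions on the official website (https://ollama.com), then pull the model used by this project with `ollama pull llama3` (model page: https://ollama.com/library/llama3).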
39
+
40
+ ## Setup
41
+
42
+ 1. Clone this repository to your local machine.
43
+ 2. Install the required dependencies as mentioned above.
44
+ 3. Make sure you have the knowledge base PDF in the same directory as the script. The script loads `Foduu_KnowledgeBase.pdf` by default; this file will be used to create the knowledge base for the RAG system.
45
+ 4. Ensure that Ollama is running on `http://localhost:11434` with the `llama3` model loaded.
46
+
47
+ ## Usage
48
+
49
+ To run the voice assistant, execute the following command in your terminal:
50
+
51
+ ```
52
+ python Voice_Assistant.py
53
+ ```
54
+
55
+ The assistant will start listening for your voice input. Speak clearly into your microphone to ask questions or give commands. The assistant will process your input and respond with synthesized speech.
56
+
57
+ ## How It Works
58
+
59
+ 1. The script loads the knowledge base from `Foduu_KnowledgeBase.pdf` and creates a FAISS vector store using sentence embeddings.
60
+ 2. It sets up a Retrieval QA chain using Ollama as the language model and the FAISS vector store as the retriever.
61
+ 3. The main loop continuously listens for voice input using the computer's microphone.
62
+ 4. When speech is detected, it's converted to text using Google's Speech Recognition service.
63
+ 5. The text query is then processed by the RAG system to generate a response.
64
+ 6. The response is converted to speech using Mozilla TTS and played back to the user.
65
+
66
+ ## Customization
67
+
68
+ - To use a different knowledge base, replace `Foduu_KnowledgeBase.pdf` with your own PDF file and update the filename passed to `UnstructuredFileLoader` in the script.
69
+ - You can experiment with different embedding models by changing the `model_name` in the `HuggingFaceEmbeddings` initialization.
70
+ - To use a different Ollama model, update the `model` parameter in the `Ollama` initialization.
71
+ - Try to use other TTS frameworks - MeloTTS, coquiTTS, Mars5TTS.
72
+
73
+ ## Troubleshooting
74
+
75
+ - If you encounter issues with speech recognition, ensure your microphone is properly connected and configured.
76
+ - For TTS issues, make sure you have the necessary audio drivers installed on your system.
77
+ - If the RAG system is not working as expected, check that your knowledge base PDF is properly formatted and contains relevant information.
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SpeechRecognition==3.8.1
2
+ transformers==4.29.2
3
+ torch==1.13.1
4
+ pyttsx3==2.90
5
+ soundfile==0.10.3.post1
6
+ playsound==1.2.2
7
+ TTS==0.8.0
8
+ langchain==0.0.184
9
+ faiss-cpu==1.7.3
10
+ Unstructured==0.6.6
11
+ sentence-transformers==2.2.2
12
+ pydantic==1.10.8