Spaces:
Build error
Build error
XThomasBU
commited on
Commit
·
a2ac5f7
1
Parent(s):
e5cd1d3
more modularization for vectorestore and retriever
Browse files- code/modules/chat/llm_tutor.py +2 -11
- code/modules/config/config.yml +1 -1
- code/modules/retriever/__init__.py +2 -0
- code/modules/retriever/colbert_retriever.py +10 -0
- code/modules/retriever/retriever.py +24 -0
- code/modules/vectorstore/store_manager.py +20 -25
- code/modules/vectorstore/vectorstore.py +50 -0
code/modules/chat/llm_tutor.py
CHANGED
|
@@ -10,7 +10,7 @@ from modules.chat.helpers import get_prompt
|
|
| 10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
| 11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
| 12 |
|
| 13 |
-
from modules.retriever import
|
| 14 |
|
| 15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
| 16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
|
@@ -159,16 +159,7 @@ class LLMTutor:
|
|
| 159 |
# Retrieval QA Chain
|
| 160 |
def retrieval_qa_chain(self, llm, prompt, db):
|
| 161 |
|
| 162 |
-
|
| 163 |
-
retriever = FaissRetriever().return_retriever(db, self.config)
|
| 164 |
-
|
| 165 |
-
elif self.config["vectorstore"]["db_option"] == "Chroma":
|
| 166 |
-
retriever = ChromaRetriever().return_retriever(db, self.config)
|
| 167 |
-
|
| 168 |
-
elif self.config["vectorstore"]["db_option"] == "RAGatouille":
|
| 169 |
-
retriever = db.as_langchain_retriever(
|
| 170 |
-
k=self.config["vectorstore"]["search_top_k"]
|
| 171 |
-
)
|
| 172 |
|
| 173 |
if self.config["llm_params"]["use_history"]:
|
| 174 |
memory = ConversationBufferWindowMemory(
|
|
|
|
| 10 |
from modules.chat.chat_model_loader import ChatModelLoader
|
| 11 |
from modules.vectorstore.store_manager import VectorStoreManager
|
| 12 |
|
| 13 |
+
from modules.retriever import Retriever
|
| 14 |
|
| 15 |
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
| 16 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
|
|
|
| 159 |
# Retrieval QA Chain
|
| 160 |
def retrieval_qa_chain(self, llm, prompt, db):
|
| 161 |
|
| 162 |
+
retriever = Retriever(self.config)._return_retriever(db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
if self.config["llm_params"]["use_history"]:
|
| 165 |
memory = ConversationBufferWindowMemory(
|
code/modules/config/config.yml
CHANGED
|
@@ -7,7 +7,7 @@ vectorstore:
|
|
| 7 |
data_path: '../storage/data' # str
|
| 8 |
url_file_path: '../storage/data/urls.txt' # str
|
| 9 |
expand_urls: False # bool
|
| 10 |
-
db_option : '
|
| 11 |
db_path : '../vectorstores' # str
|
| 12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
|
| 13 |
search_top_k : 3 # int
|
|
|
|
| 7 |
data_path: '../storage/data' # str
|
| 8 |
url_file_path: '../storage/data/urls.txt' # str
|
| 9 |
expand_urls: False # bool
|
| 10 |
+
db_option : 'Chroma' # str [FAISS, Chroma, RAGatouille]
|
| 11 |
db_path : '../vectorstores' # str
|
| 12 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
|
| 13 |
search_top_k : 3 # int
|
code/modules/retriever/__init__.py
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
from .faiss_retriever import FaissRetriever
|
| 2 |
from .chroma_retriever import ChromaRetriever
|
|
|
|
|
|
|
|
|
| 1 |
from .faiss_retriever import FaissRetriever
|
| 2 |
from .chroma_retriever import ChromaRetriever
|
| 3 |
+
from .colbert_retriever import ColbertRetriever
|
| 4 |
+
from .retriever import Retriever
|
code/modules/retriever/colbert_retriever.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import BaseRetriever
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class ColbertRetriever(BaseRetriever):
|
| 5 |
+
def __init__(self):
|
| 6 |
+
pass
|
| 7 |
+
|
| 8 |
+
def return_retriever(self, db, config):
|
| 9 |
+
retriever = db.as_retriever()
|
| 10 |
+
return retriever
|
code/modules/retriever/retriever.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from modules.retriever.faiss_retriever import FaissRetriever
|
| 2 |
+
from modules.retriever.chroma_retriever import ChromaRetriever
|
| 3 |
+
from modules.retriever.colbert_retriever import ColbertRetriever
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Retriever:
|
| 7 |
+
def __init__(self, config):
|
| 8 |
+
self.config = config
|
| 9 |
+
self._create_retriever()
|
| 10 |
+
|
| 11 |
+
def _create_retriever(self):
|
| 12 |
+
if self.config["vectorstore"]["db_option"] == "FAISS":
|
| 13 |
+
self.retriever = FaissRetriever()
|
| 14 |
+
elif self.config["vectorstore"]["db_option"] == "Chroma":
|
| 15 |
+
self.retriever = ChromaRetriever()
|
| 16 |
+
elif self.config["vectorstore"]["db_option"] == "RAGatouille":
|
| 17 |
+
self.retriever = ColbertRetriever()
|
| 18 |
+
else:
|
| 19 |
+
raise ValueError(
|
| 20 |
+
"Invalid db_option: {}".format(self.config["vectorstore"]["db_option"])
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
def _return_retriever(self, db):
|
| 24 |
+
return self.retriever.return_retriever(db, self.config)
|
code/modules/vectorstore/store_manager.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
from modules.vectorstore.
|
| 2 |
-
from modules.vectorstore.chroma import ChromaVectorStore
|
| 3 |
-
from modules.vectorstore.colbert import ColbertVectorStore
|
| 4 |
from modules.vectorstore.helpers import *
|
| 5 |
from modules.dataloader.webpage_crawler import WebpageCrawler
|
| 6 |
from modules.dataloader.data_loader import DataLoader
|
|
@@ -15,7 +13,6 @@ import asyncio
|
|
| 15 |
class VectorStoreManager:
|
| 16 |
def __init__(self, config, logger=None):
|
| 17 |
self.config = config
|
| 18 |
-
self.db_option = config["vectorstore"]["db_option"]
|
| 19 |
self.document_names = None
|
| 20 |
|
| 21 |
# Set up logging to both console and a file
|
|
@@ -47,9 +44,12 @@ class VectorStoreManager:
|
|
| 47 |
|
| 48 |
self.webpage_crawler = WebpageCrawler()
|
| 49 |
|
|
|
|
|
|
|
| 50 |
self.logger.info("VectorDB instance instantiated")
|
| 51 |
|
| 52 |
def load_files(self):
|
|
|
|
| 53 |
files = os.listdir(self.config["vectorstore"]["data_path"])
|
| 54 |
files = [
|
| 55 |
os.path.join(self.config["vectorstore"]["data_path"], file)
|
|
@@ -71,6 +71,7 @@ class VectorStoreManager:
|
|
| 71 |
return files, urls
|
| 72 |
|
| 73 |
def create_embedding_model(self):
|
|
|
|
| 74 |
self.logger.info("Creating embedding function")
|
| 75 |
embedding_model_loader = EmbeddingModelLoader(self.config)
|
| 76 |
embedding_model = embedding_model_loader.load_embedding_model()
|
|
@@ -83,22 +84,23 @@ class VectorStoreManager:
|
|
| 83 |
documents: list,
|
| 84 |
document_metadata: list,
|
| 85 |
):
|
| 86 |
-
if self.db_option in ["FAISS", "Chroma"]:
|
| 87 |
self.embedding_model = self.create_embedding_model()
|
| 88 |
|
| 89 |
self.logger.info("Initializing vector_db")
|
| 90 |
-
self.logger.info(
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
self.
|
| 99 |
-
|
| 100 |
|
| 101 |
def create_database(self):
|
|
|
|
| 102 |
start_time = time.time() # Start time for creating database
|
| 103 |
data_loader = DataLoader(self.config, self.logger)
|
| 104 |
self.logger.info("Loading data")
|
|
@@ -126,18 +128,11 @@ class VectorStoreManager:
|
|
| 126 |
)
|
| 127 |
|
| 128 |
def load_database(self):
|
|
|
|
| 129 |
start_time = time.time() # Start time for loading database
|
| 130 |
-
if self.db_option in ["FAISS", "Chroma"]:
|
| 131 |
self.embedding_model = self.create_embedding_model()
|
| 132 |
-
|
| 133 |
-
self.vector_db = FaissVectorStore(self.config)
|
| 134 |
-
self.loaded_vector_db = self.vector_db.load_database(self.embedding_model)
|
| 135 |
-
elif self.db_option == "Chroma":
|
| 136 |
-
self.vector_db = ChromaVectorStore(self.config)
|
| 137 |
-
self.loaded_vector_db = self.vector_db.load_database(self.embedding_model)
|
| 138 |
-
elif self.db_option == "RAGatouille":
|
| 139 |
-
self.vector_db = ColbertVectorStore(self.config)
|
| 140 |
-
self.loaded_vector_db = self.vector_db.load_database()
|
| 141 |
end_time = time.time() # End time for loading database
|
| 142 |
self.logger.info(
|
| 143 |
f"Time taken to load database: {end_time - start_time} seconds"
|
|
|
|
| 1 |
+
from modules.vectorstore.vectorstore import VectorStore
|
|
|
|
|
|
|
| 2 |
from modules.vectorstore.helpers import *
|
| 3 |
from modules.dataloader.webpage_crawler import WebpageCrawler
|
| 4 |
from modules.dataloader.data_loader import DataLoader
|
|
|
|
| 13 |
class VectorStoreManager:
|
| 14 |
def __init__(self, config, logger=None):
|
| 15 |
self.config = config
|
|
|
|
| 16 |
self.document_names = None
|
| 17 |
|
| 18 |
# Set up logging to both console and a file
|
|
|
|
| 44 |
|
| 45 |
self.webpage_crawler = WebpageCrawler()
|
| 46 |
|
| 47 |
+
self.vector_db = VectorStore(self.config)
|
| 48 |
+
|
| 49 |
self.logger.info("VectorDB instance instantiated")
|
| 50 |
|
| 51 |
def load_files(self):
|
| 52 |
+
|
| 53 |
files = os.listdir(self.config["vectorstore"]["data_path"])
|
| 54 |
files = [
|
| 55 |
os.path.join(self.config["vectorstore"]["data_path"], file)
|
|
|
|
| 71 |
return files, urls
|
| 72 |
|
| 73 |
def create_embedding_model(self):
|
| 74 |
+
|
| 75 |
self.logger.info("Creating embedding function")
|
| 76 |
embedding_model_loader = EmbeddingModelLoader(self.config)
|
| 77 |
embedding_model = embedding_model_loader.load_embedding_model()
|
|
|
|
| 84 |
documents: list,
|
| 85 |
document_metadata: list,
|
| 86 |
):
|
| 87 |
+
if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
|
| 88 |
self.embedding_model = self.create_embedding_model()
|
| 89 |
|
| 90 |
self.logger.info("Initializing vector_db")
|
| 91 |
+
self.logger.info(
|
| 92 |
+
"\tUsing {} as db_option".format(self.config["vectorstore"]["db_option"])
|
| 93 |
+
)
|
| 94 |
+
self.vector_db._create_database(
|
| 95 |
+
document_chunks,
|
| 96 |
+
document_names,
|
| 97 |
+
documents,
|
| 98 |
+
document_metadata,
|
| 99 |
+
self.embedding_model,
|
| 100 |
+
)
|
| 101 |
|
| 102 |
def create_database(self):
|
| 103 |
+
|
| 104 |
start_time = time.time() # Start time for creating database
|
| 105 |
data_loader = DataLoader(self.config, self.logger)
|
| 106 |
self.logger.info("Loading data")
|
|
|
|
| 128 |
)
|
| 129 |
|
| 130 |
def load_database(self):
|
| 131 |
+
|
| 132 |
start_time = time.time() # Start time for loading database
|
| 133 |
+
if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma"]:
|
| 134 |
self.embedding_model = self.create_embedding_model()
|
| 135 |
+
self.loaded_vector_db = self.vector_db._load_database(self.embedding_model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
end_time = time.time() # End time for loading database
|
| 137 |
self.logger.info(
|
| 138 |
f"Time taken to load database: {end_time - start_time} seconds"
|
code/modules/vectorstore/vectorstore.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from modules.vectorstore.faiss import FaissVectorStore
|
| 2 |
+
from modules.vectorstore.chroma import ChromaVectorStore
|
| 3 |
+
from modules.vectorstore.colbert import ColbertVectorStore
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class VectorStore:
|
| 7 |
+
def __init__(self, config):
|
| 8 |
+
self.config = config
|
| 9 |
+
self.vectorstore = None
|
| 10 |
+
|
| 11 |
+
def _create_database(
|
| 12 |
+
self,
|
| 13 |
+
document_chunks,
|
| 14 |
+
document_names,
|
| 15 |
+
documents,
|
| 16 |
+
document_metadata,
|
| 17 |
+
embedding_model,
|
| 18 |
+
):
|
| 19 |
+
if self.config["vectorstore"]["db_option"] == "FAISS":
|
| 20 |
+
self.vectorstore = FaissVectorStore(self.config)
|
| 21 |
+
self.vectorstore.create_database(document_chunks, embedding_model)
|
| 22 |
+
elif self.config["vectorstore"]["db_option"] == "Chroma":
|
| 23 |
+
self.vectorstore = ChromaVectorStore(self.config)
|
| 24 |
+
self.vectorstore.create_database(document_chunks, embedding_model)
|
| 25 |
+
elif self.config["vectorstore"]["db_option"] == "RAGatouille":
|
| 26 |
+
self.vectorstore = ColbertVectorStore(self.config)
|
| 27 |
+
self.vectorstore.create_database(
|
| 28 |
+
documents, document_names, document_metadata
|
| 29 |
+
)
|
| 30 |
+
else:
|
| 31 |
+
raise ValueError(
|
| 32 |
+
"Invalid db_option: {}".format(self.config["vectorstore"]["db_option"])
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
def _load_database(self, embedding_model):
|
| 36 |
+
if self.config["vectorstore"]["db_option"] == "FAISS":
|
| 37 |
+
self.vectorstore = FaissVectorStore(self.config)
|
| 38 |
+
return self.vectorstore.load_database(embedding_model)
|
| 39 |
+
elif self.config["vectorstore"]["db_option"] == "Chroma":
|
| 40 |
+
self.vectorstore = ChromaVectorStore(self.config)
|
| 41 |
+
return self.vectorstore.load_database(embedding_model)
|
| 42 |
+
elif self.config["vectorstore"]["db_option"] == "RAGatouille":
|
| 43 |
+
self.vectorstore = ColbertVectorStore(self.config)
|
| 44 |
+
return self.vectorstore.load_database()
|
| 45 |
+
|
| 46 |
+
def _as_retriever(self):
|
| 47 |
+
return self.vectorstore.as_retriever()
|
| 48 |
+
|
| 49 |
+
def _get_vectorstore(self):
|
| 50 |
+
return self.vectorstore
|