from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from src.logger import logger


class RetrievalModule:
    def __init__(self, embedding_model="all-MiniLM-L6-v2", persist_dir="./chroma_db"):
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.vector_store = None
        self.persist_dir = persist_dir  # Persistent storage

    def build_vector_store(self, texts):
        """Build Chroma vector store with better logging."""
        if not texts:
            logger.warning("No texts provided. Skipping vector store creation.")
            return
        self.vector_store = Chroma.from_texts(
            texts, self.embeddings, persist_directory=self.persist_dir
        )
        self.vector_store.persist()
        logger.info("Chroma vector store successfully built.")

    def retrieve_relevant(self, query, k=2):
        """Fetch top-k relevant documents, logging warnings if store is empty."""
        if not self.vector_store:
            logger.warning("Vector store is empty. Run `build_vector_store` first.")
            return []
        top_docs = self.vector_store.similarity_search(query, k=k)
        retrieved = [doc.page_content for doc in top_docs] if top_docs else []
        logger.info(f"Retrieved {len(retrieved)} relevant papers for query: '{query}'.")
        return retrieved
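

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the original module):
    # build the store from a couple of hypothetical sample texts, then query it.
    retriever = RetrievalModule()
    sample_texts = [
        "Transformers use self-attention to model long-range dependencies.",
        "Retrieval-augmented generation grounds model answers in external documents.",
    ]
    retriever.build_vector_store(sample_texts)
    results = retriever.retrieve_relevant("How does retrieval-augmented generation work?", k=1)
    for text in results:
        print(text)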