from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

from src.logger import logger


class RetrievalModule:
    def __init__(self, embedding_model="all-MiniLM-L6-v2", persist_dir="./chroma_db"):
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.vector_store = None
        self.persist_dir = persist_dir  # Directory where the Chroma index is persisted

    def build_vector_store(self, texts):
        """Build the Chroma vector store from raw texts and persist it to disk."""
        if not texts:
            logger.warning("No texts provided. Skipping vector store creation.")
            return
        self.vector_store = Chroma.from_texts(
            texts, self.embeddings, persist_directory=self.persist_dir
        )
        self.vector_store.persist()
        logger.info("Chroma vector store successfully built.")

    def retrieve_relevant(self, query, k=2):
        """Fetch the top-k relevant documents, warning if the store is empty."""
        if not self.vector_store:
            logger.warning("Vector store is empty. Run `build_vector_store` first.")
            return []
        top_docs = self.vector_store.similarity_search(query, k=k)
        retrieved = [doc.page_content for doc in top_docs] if top_docs else []
        logger.info(f"Retrieved {len(retrieved)} relevant papers for query: '{query}'.")
        return retrieved
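

# Minimal usage sketch, not part of the original module: it assumes `src/logger.py`
# exposes a configured `logger` and that the sentence-transformers and chromadb
# packages are installed. The sample texts and query below are illustrative only.
if __name__ == "__main__":
    retriever = RetrievalModule()

    # Index a couple of toy documents into the persistent Chroma store.
    sample_texts = [
        "Retrieval-augmented generation grounds LLM answers in external documents.",
        "Chroma is a lightweight vector database with a LangChain integration.",
    ]
    retriever.build_vector_store(sample_texts)

    # Query the store and print the top match.
    results = retriever.retrieve_relevant("How does RAG ground LLM answers?", k=1)
    for doc_text in results:
        print(doc_text)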