# Page-scrape residue (not part of the program): "Spaces: Sleeping"
import chromadb | |
from sentence_transformers import SentenceTransformer | |
import os | |
# --- Configuration ---
MODEL_NAME = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer
COLLECTION_NAME = "aura_mind_knowledge"  # ChromaDB collection that holds the documents
KNOWLEDGE_BASE_DIR = "knowledge_base_data"  # directory scanned for .txt documents

# --- Shared ChromaDB client, embedding model and collection ---
# All three objects are created once, at import time, and reused by the
# functions in this module.
client = chromadb.PersistentClient(path="chroma_db")
model = SentenceTransformer(MODEL_NAME)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
def embed_and_store_documents() -> None:
    """
    Load the knowledge base into ChromaDB, embedding each document once.

    Every regular ``.txt`` file directly inside ``KNOWLEDGE_BASE_DIR`` is read
    as UTF-8, embedded with the module-level ``model`` and added to the
    module-level ``collection``, using the file name as the document id.
    If the collection already contains at least one entry, the function
    returns without reading anything.

    Raises:
        FileNotFoundError: If ``KNOWLEDGE_BASE_DIR`` does not exist.
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    if collection.count() > 0:
        print("β Knowledge base is already loaded into ChromaDB.")
        return

    print("Embedding and storing documents in ChromaDB...")
    documents = []
    ids = []
    # os.listdir() returns names in arbitrary order; sorting keeps the
    # insertion order reproducible from run to run.
    for filename in sorted(os.listdir(KNOWLEDGE_BASE_DIR)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(KNOWLEDGE_BASE_DIR, filename)
        # A directory whose name happens to end in ".txt" cannot be opened
        # as a document, so only regular files are considered.
        if not os.path.isfile(path):
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which differs between hosts.
        with open(path, "r", encoding="utf-8") as f:
            documents.append(f.read())
        ids.append(filename)

    if documents:
        # All documents are encoded in a single call; .tolist() converts the
        # result into plain Python lists before it is passed to ChromaDB.
        embeddings = model.encode(documents).tolist()
        collection.add(
            embeddings=embeddings,
            documents=documents,
            ids=ids,
        )
        print(f"β Successfully stored {len(documents)} documents in ChromaDB.")
def search_documents(query: str, n_results: int = 1) -> list:
    """
    Return the stored documents that best match a query.

    Args:
        query: The search text. An empty or whitespace-only query matches
            nothing.
        n_results: The maximum number of documents to return. Values below 1
            yield an empty list.

    Returns:
        The matching document texts in the order ChromaDB returns them, or
        an empty list when there is nothing to search for or ChromaDB
        reports no documents.
    """
    # Bail out before paying for an embedding when the request cannot
    # produce a meaningful result.
    if not query or not query.strip() or n_results < 1:
        return []

    query_embedding = model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
    )
    # "documents" holds one result list per query embedding; exactly one
    # embedding was sent, so only the first entry is relevant.
    matches = results["documents"]
    return matches[0] if matches else []