"""Facade over the local knowledge base plus a small retriever adapter.

The indexing, search and PDF ingestion work is delegated to the sibling
modules imported below; this module only wires them together.
"""

import os

from sentence_transformers import SentenceTransformer

from database import init_db, check_if_indexed, delete_database_and_index, get_db_connection
from create_index import create_initial_index as build_secure_index
from search import search as secure_search
from ingest_document import ingest_pdf

# Use a CLIP model that can handle both text and images
MODEL_NAME = 'clip-ViT-B-32'


class KnowledgeBase:
    """Entry point for building, extending and querying the knowledge base.

    Constructing an instance loads the embedding model named by
    ``MODEL_NAME``, initializes the database and, when no index is reported
    by ``check_if_indexed()``, indexes the bundled default documents.
    """

    def __init__(self):
        # NOTE(review): ``self.model`` is not read anywhere in this file;
        # presumably it is kept for external callers -- confirm before removing.
        self.model = SentenceTransformer(MODEL_NAME)

        # The database must exist before the index check below can run.
        init_db()

        if check_if_indexed():
            return

        print("Local knowledge base not found. Building initial knowledge base...")
        self._build_initial_knowledge_base()

    def _build_initial_knowledge_base(self):
        """Index the default text files shipped in ``knowledge_base_data``.

        Each default file is read as UTF-8 from the ``knowledge_base_data``
        directory next to this module. Missing files are reported and
        skipped. If at least one file was read, the ``{filename: text}``
        mapping is handed to ``build_secure_index``; otherwise a notice is
        printed and nothing is indexed.
        """
        data_dir = os.path.join(os.path.dirname(__file__), "knowledge_base_data")
        seed_files = (
            "healthy_maize_remedy.txt",
            "maize_phosphorus_deficiency_remedy.txt",
            "comic_relief.txt",
        )

        loaded = {}
        for name in seed_files:
            file_path = os.path.join(data_dir, name)
            try:
                with open(file_path, 'r', encoding='utf-8') as handle:
                    text = handle.read()
            except FileNotFoundError:
                # Best effort: a missing default file must not abort the build.
                print(f"Warning: Knowledge base file not found, skipping: {file_path}")
                continue
            loaded[name] = text

        if not loaded:
            print("No initial knowledge base documents found to index.")
            return

        build_secure_index(loaded)

    def create_initial_index(self, documents_dict):
        """Pass ``documents_dict`` straight through to ``build_secure_index``."""
        build_secure_index(documents_dict)

    def rebuild_from_default_files(self):
        """Re-run the default-file indexing step.

        This only calls ``_build_initial_knowledge_base``; it does not itself
        remove any existing database or index.
        """
        self._build_initial_knowledge_base()

    def ingest_pdf(self, file_path, file_name):
        """Forward a PDF to the module-level ``ingest_pdf`` helper.

        The helper's return value, if any, is not propagated.
        """
        # Inside a method body this name resolves to the imported function,
        # not to this method.
        ingest_pdf(file_path, file_name)

    def search(self, query, k=1):
        """Return whatever ``secure_search(query, k)`` returns.

        ``k`` is forwarded unchanged; presumably it is the number of results
        to fetch -- verify against the ``search`` module.
        """
        return secure_search(query, k)


def get_retriever():
    """Create a fresh ``KnowledgeBase`` and wrap it in a retriever object.

    The returned object exposes ``get_relevant_documents(query)``, which
    yields LangChain ``Document`` instances for the text results of a search.
    """

    class Retriever:
        """Thin adapter exposing a knowledge base through a retriever method."""

        def __init__(self, kb):
            self.kb = kb

        def get_relevant_documents(self, query):
            """Search with the default ``k`` and return text hits as Documents.

            Each result is indexed by the keys ``'type'`` and ``'content'``;
            only results whose type equals ``'text'`` are converted, so any
            other result kinds (e.g. images) are dropped.
            """
            results = self.kb.search(query)

            # Imported lazily, after the search call, so langchain is only
            # required once documents are actually being produced.
            from langchain.schema import Document

            text_documents = []
            for hit in results:
                if hit['type'] == 'text':
                    text_documents.append(Document(page_content=hit['content']))
            return text_documents

    return Retriever(KnowledgeBase())