import os

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

from database import init_db, get_db_connection, INDEX_FILE, DB_FILE, delete_database_and_index
from security import encrypt_data

# Use a CLIP model that can handle both text and images
MODEL_NAME = 'clip-ViT-B-32'


def create_initial_index(documents_dict):
    """
    Creates an initial encrypted, persistent index from a dictionary of text documents.
    This will delete any existing database to ensure a clean start.

    Args:
        documents_dict: Mapping of document name -> full document text. Each
            document is stored as a single text chunk (page number 1) and
            embedded as one vector.

    Returns:
        None. Rows are written to the ``documents`` and ``chunks`` tables and,
        when at least one document was supplied, a FAISS index is written to
        ``INDEX_FILE``.
    """
    print("Performing a clean rebuild of the knowledge base...")
    delete_database_and_index()
    init_db()

    # Load the model before opening the connection so that a model-loading
    # failure cannot leave an open database handle behind.
    model = SentenceTransformer(MODEL_NAME)

    all_chunks = []
    all_embeddings = []

    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        for name, content in documents_dict.items():
            # Add document to documents table
            cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
            doc_id = cursor.lastrowid

            # For initial docs, we treat the whole content as one chunk.
            # Only the encrypted bytes are persisted; the plaintext is used
            # solely to compute the embedding below.
            chunk_text = content
            all_chunks.append(
                (doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1)
            )

            # Create text embedding. Appended in the same order as the chunk
            # rows, so vector i corresponds to the i-th inserted chunk.
            text_embedding = model.encode([chunk_text])
            all_embeddings.append(text_embedding)

        # Batch insert chunks
        cursor.executemany(
            "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
            all_chunks
        )
        conn.commit()
    finally:
        # Always release the connection, even if an insert or encode fails.
        conn.close()

    if not all_embeddings:
        print("No content to index.")
        return

    # Create and save the FAISS index
    embeddings_np = np.vstack(all_embeddings).astype('float32')
    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)
    faiss.write_index(index, INDEX_FILE)

    print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
    print(f"Database: {DB_FILE}, FAISS Index: {INDEX_FILE}")


def _load_documents(file_paths):
    """Read each file into a ``{file_path: text}`` dict, skipping missing files."""
    documents = {}
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                # The file path doubles as the document name stored in the DB.
                documents[file_path] = f.read()
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {file_path}")
    return documents


if __name__ == '__main__':
    document_files = [
        "healthy_maize_remedy.txt",
        "maize_phosphorus_deficiency_remedy.txt",
        "comic_relief.txt",
    ]

    # create_initial_index expects a name -> content mapping (it iterates
    # with .items()), so the contents must be collected in a dict, not a list.
    documents_content = _load_documents(document_files)

    create_initial_index(documents_content)