import functools
import io
import os

import faiss
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer

from database import get_db_connection, INDEX_FILE
from security import encrypt_data

MODEL_NAME = 'clip-ViT-B-32'

# Shared INSERT used for both text and image chunks.
_CHUNK_INSERT_SQL = (
    "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) "
    "VALUES (?, ?, ?, ?)"
)


@functools.lru_cache(maxsize=1)
def _get_model(model_name):
    """Load the embedding model once and reuse it for later ingestions."""
    return SentenceTransformer(model_name)


def ingest_pdf(file_path, file_name):
    """Parse a PDF, encrypt its text and images, and index their embeddings.

    For every page, the page text (when non-blank) and each embedded image
    are encrypted with ``encrypt_data`` and stored as rows in the ``chunks``
    table; an embedding of each is appended to the FAISS index persisted at
    ``INDEX_FILE``.

    The database commit and the index file update are kept together: the
    updated index is first written to a temporary file, the database is
    committed, and only then is the temporary file moved over ``INDEX_FILE``.
    If anything fails before the commit, the pending rows are rolled back and
    the existing index file is left untouched.

    Args:
        file_path: Location of the PDF to open.
        file_name: Name stored in the ``documents`` table for this PDF.

    Raises:
        Exception: Any error from parsing, encryption, embedding, the
            database, or FAISS is re-raised after the rollback.
    """
    print(f"Starting ingestion for: {file_name}")

    model = _get_model(MODEL_NAME)
    tmp_index_file = f"{INDEX_FILE}.tmp"
    new_embeddings = []

    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        # Add document to documents table, or get its ID if it exists.
        # NOTE(review): for an existing document the chunks below are
        # appended again rather than replaced -- confirm this is intended.
        try:
            cursor.execute(
                "INSERT INTO documents (name) VALUES (?)", (file_name,)
            )
            doc_id = cursor.lastrowid
        except conn.IntegrityError:
            print("Document already exists in DB. Skipping doc table insert.")
            doc_id = cursor.execute(
                "SELECT id FROM documents WHERE name=?", (file_name,)
            ).fetchone()['id']

        # Load existing FAISS index or create a new one.
        if os.path.exists(INDEX_FILE):
            index = faiss.read_index(INDEX_FILE)
        else:
            # Probe the model with a dummy input to learn the embedding width.
            dimension = model.encode(["test"]).shape[1]
            index = faiss.IndexFlatL2(dimension)

        # The context manager closes the PDF even if a page fails to process.
        with fitz.open(file_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                # 1. Process Text
                text = page.get_text()
                if text.strip():
                    encrypted_text = encrypt_data(text.encode('utf-8'))
                    cursor.execute(
                        _CHUNK_INSERT_SQL,
                        (doc_id, 'text', encrypted_text, page_num),
                    )
                    # Encoding a one-item list yields a (1, dim) array.
                    new_embeddings.append(model.encode([text]))

                # 2. Process Images
                for img in page.get_images(full=True):
                    xref = img[0]
                    image_bytes = doc.extract_image(xref)["image"]
                    encrypted_image = encrypt_data(image_bytes)
                    cursor.execute(
                        _CHUNK_INSERT_SQL,
                        (doc_id, 'image', encrypted_image, page_num),
                    )
                    pil_image = Image.open(io.BytesIO(image_bytes))
                    image_embedding = model.encode(pil_image)
                    # Reshape to a single row so it stacks with text rows.
                    new_embeddings.append(image_embedding.reshape(1, -1))

        if new_embeddings:
            # Stage the updated index in a temporary file so a failed or
            # partial write can never corrupt the live index.
            embeddings_np = np.vstack(new_embeddings).astype('float32')
            index.add(embeddings_np)
            faiss.write_index(index, tmp_index_file)

        conn.commit()
    except Exception:
        # Drop the pending rows and the staged index so the database and
        # the live index stay in step with each other.
        conn.rollback()
        if os.path.exists(tmp_index_file):
            os.remove(tmp_index_file)
        raise
    finally:
        conn.close()

    if new_embeddings:
        # Publish the staged index only after the rows are committed.
        os.replace(tmp_index_file, INDEX_FILE)
        print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks to the knowledge base.")
    else:
        print(f"No new content found to ingest in {file_name}.")