File size: 2,987 Bytes
2cbbef6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import faiss
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
from PIL import Image
import io
import numpy as np
import os

from database import get_db_connection, INDEX_FILE
from security import encrypt_data

# Name of the SentenceTransformer model loaded by ingest_pdf; the same model
# instance is used there to embed both page text and extracted images.
MODEL_NAME = 'clip-ViT-B-32'

def ingest_pdf(file_path, file_name):
    """Parse a PDF, encrypt its content (text + images), and index it.

    For every page, the page text (if non-blank) and every embedded image
    are encrypted with ``encrypt_data`` and stored as rows of the ``chunks``
    table; an embedding of each stored chunk is appended to the FAISS index
    persisted at ``INDEX_FILE``.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.
        file_name: Name under which the document is recorded in the
            ``documents`` table. If a row with this name already exists,
            its id is reused and the new chunks are attached to it.

    Raises:
        Any exception from the database, PyMuPDF, PIL, the embedding model
        or FAISS is propagated. If it happens before the commit, the
        pending database changes are rolled back; the database connection
        and the PDF handle are closed in every case.
    """
    print(f"Starting ingestion for: {file_name}")
    model = SentenceTransformer(MODEL_NAME)
    new_embeddings = []

    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        # Add document to documents table, or get its ID if it exists
        try:
            cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
            doc_id = cursor.lastrowid
        except conn.IntegrityError:
            print("Document already exists in DB. Skipping doc table insert.")
            doc_id = cursor.execute(
                "SELECT id FROM documents WHERE name=?", (file_name,)
            ).fetchone()['id']

        doc = fitz.open(file_path)
        try:
            # Load existing FAISS index or create a new one
            if os.path.exists(INDEX_FILE):
                index = faiss.read_index(INDEX_FILE)
            else:
                # Get dimension from the model if index is new
                dimension = model.encode(["test"]).shape[1]
                index = faiss.IndexFlatL2(dimension)

            # Pages are stored 1-based in the chunks table.
            for page_num, page in enumerate(doc, start=1):
                # 1. Process Text
                text = page.get_text()
                if text.strip():
                    encrypted_text = encrypt_data(text.encode('utf-8'))
                    cursor.execute(
                        "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
                        (doc_id, 'text', encrypted_text, page_num)
                    )
                    # Encoding a one-element list yields a (1, dim) array.
                    new_embeddings.append(model.encode([text]))

                # 2. Process Images
                for img in page.get_images(full=True):
                    xref = img[0]
                    image_bytes = doc.extract_image(xref)["image"]

                    encrypted_image = encrypt_data(image_bytes)
                    cursor.execute(
                        "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
                        (doc_id, 'image', encrypted_image, page_num)
                    )
                    # Close the PIL handle as soon as the embedding is computed.
                    with Image.open(io.BytesIO(image_bytes)) as pil_image:
                        image_embedding = model.encode(pil_image)
                    # Reshape to a single row so it stacks with the text embeddings.
                    new_embeddings.append(image_embedding.reshape(1, -1))
        finally:
            # Release the PDF handle even when a page fails to process.
            doc.close()

        conn.commit()
    except Exception:
        # Do not leave a half-ingested document pending on the connection.
        conn.rollback()
        raise
    finally:
        conn.close()

    # NOTE(review): the chunk rows are committed before the index file is
    # written, so a failure below leaves rows without matching vectors.
    if new_embeddings:
        # Add new embeddings to the FAISS index
        embeddings_np = np.vstack(new_embeddings).astype('float32')
        index.add(embeddings_np)
        faiss.write_index(index, INDEX_FILE)
        print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks to the knowledge base.")
    else:
        print(f"No new content found to ingest in {file_name}.")