Commit 2cbbef6 (parent 8ff7567): symphony

Files changed:
  app.py              +1   -5
  create_index.py     +75  -0
  database.py         +57  -0
  ingest_document.py  +78  -0
  knowledge_base.py   +45  -197
  search.py           +52  -0
  security.py         +37  -0
app.py
CHANGED
@@ -356,11 +356,7 @@ def create_kb_management_ui():
     def rebuild_kb():
         yield "Rebuilding knowledge base..."
         try:
-            docs = {
-                "Healthy Maize Plant": "For a Healthy Maize Plant, ensure proper watering and sunlight. No special remedy is needed. Continue good farming practices.",
-                "Maize Phosphorus Deficiency": "Phosphorus deficiency in maize is characterized by stunted growth and purplish discoloration of leaves. To remedy this, apply a phosphorus-rich fertilizer like DAP (Di-Ammonium Phosphate) or bone meal to the soil. Follow package instructions for application rates."
-            }
-            KB.create_initial_index(docs)
+            KB.create_initial_index()  # Call without arguments, as it now rebuilds from files
             yield "Knowledge base rebuilt successfully."
         except Exception as e:
             yield f"Error rebuilding knowledge base: {e}"
create_index.py
ADDED
@@ -0,0 +1,75 @@
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+import os
+
+from database import init_db, get_db_connection, INDEX_FILE, DB_FILE, delete_database_and_index
+from security import encrypt_data
+
+# Use a CLIP model that can handle both text and images
+MODEL_NAME = 'clip-ViT-B-32'
+
+def create_initial_index(documents_dict):
+    """
+    Creates an initial encrypted, persistent index from a dictionary of text documents.
+    This will delete any existing database to ensure a clean start.
+    """
+    print("Performing a clean rebuild of the knowledge base...")
+    delete_database_and_index()
+    init_db()
+
+    conn = get_db_connection()
+    cursor = conn.cursor()
+    model = SentenceTransformer(MODEL_NAME)
+
+    all_chunks = []
+    all_embeddings = []
+
+    for name, content in documents_dict.items():
+        # Add document to documents table
+        cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
+        doc_id = cursor.lastrowid
+
+        # For initial docs, we treat the whole content as one chunk
+        chunk_text = content
+        all_chunks.append((doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1))
+
+        # Create text embedding
+        text_embedding = model.encode([chunk_text])
+        all_embeddings.append(text_embedding)
+
+    # Batch insert chunks
+    cursor.executemany(
+        "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+        all_chunks
+    )
+    conn.commit()
+    conn.close()
+
+    if not all_embeddings:
+        print("No content to index.")
+        return
+
+    # Create and save the FAISS index
+    embeddings_np = np.vstack(all_embeddings).astype('float32')
+    dimension = embeddings_np.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(embeddings_np)
+    faiss.write_index(index, INDEX_FILE)
+
+    print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
+    print(f"Database: {DB_FILE}, FAISS Index: {INDEX_FILE}")
+
+
+if __name__ == '__main__':
+    document_files = ["healthy_maize_remedy.txt", "maize_phosphorus_deficiency_remedy.txt", "comic_relief.txt"]
+    # Keyed by file name, since create_initial_index expects a dict of name -> content
+    documents_content = {}
+    for file_path in document_files:
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                documents_content[file_path] = f.read()
+        except FileNotFoundError:
+            print(f"Warning: File not found, skipping: {file_path}")
+
+    create_initial_index(documents_content)
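For context, calling this module directly looks like the sketch below; the document names and text are illustrative stand-ins, not repo content:

    from create_index import create_initial_index

    # Each key becomes a row in 'documents'; each value is stored as one
    # encrypted text chunk plus one FAISS vector.
    docs = {
        "healthy_maize_remedy.txt": "For a healthy maize plant, continue good farming practices.",
        "maize_phosphorus_deficiency_remedy.txt": "Apply a phosphorus-rich fertilizer such as DAP.",
    }
    create_initial_index(docs)  # wipes any existing DB/index, then rebuilds both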
database.py
ADDED
@@ -0,0 +1,57 @@
+import sqlite3
+import os
+
+DB_FILE = "auramind_local.db"
+INDEX_FILE = "auramind_faiss.index"
+
+def init_db():
+    """
+    Initializes a more robust database schema for multimodal data.
+    - 'documents' table tracks the source files.
+    - 'chunks' table stores the individual encrypted text/image chunks.
+    """
+    conn = sqlite3.connect(DB_FILE)
+    cursor = conn.cursor()
+
+    # Table to track the source documents (e.g., 'healthy_maize.txt', 'user_guide.pdf')
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS documents (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL UNIQUE
+        )
+    ''')
+
+    # Table to store each chunk of content (text or image)
+    # The faiss_id will correspond to the row number in the FAISS index
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS chunks (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            doc_id INTEGER,
+            content_type TEXT NOT NULL, -- 'text' or 'image'
+            encrypted_content BLOB NOT NULL,
+            page_num INTEGER,
+            FOREIGN KEY (doc_id) REFERENCES documents (id)
+        )
+    ''')
+    conn.commit()
+    conn.close()
+
+def get_db_connection():
+    """Establishes a connection to the database."""
+    conn = sqlite3.connect(DB_FILE)
+    conn.row_factory = sqlite3.Row
+    return conn
+
+def check_if_indexed():
+    """Checks if the initial database and index file exist."""
+    # A basic check. A more robust check might query the db for content.
+    return os.path.exists(DB_FILE) and os.path.exists(INDEX_FILE)
+
+def delete_database_and_index():
+    """Deletes existing db and index files for a clean rebuild."""
+    if os.path.exists(DB_FILE):
+        os.remove(DB_FILE)
+        print(f"Removed old database: {DB_FILE}")
+    if os.path.exists(INDEX_FILE):
+        os.remove(INDEX_FILE)
+        print(f"Removed old index: {INDEX_FILE}")
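For reference, the intended call pattern is a sketch like this (the file names come from the module constants above):

    from database import init_db, get_db_connection, check_if_indexed

    init_db()                  # creates auramind_local.db with both tables if missing
    print(check_if_indexed())  # True only once the DB and the FAISS index both exist

    conn = get_db_connection()
    n_chunks = conn.execute("SELECT COUNT(*) AS n FROM chunks").fetchone()["n"]
    conn.close()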
ingest_document.py
ADDED
@@ -0,0 +1,78 @@
+import faiss
+from sentence_transformers import SentenceTransformer
+import fitz  # PyMuPDF
+from PIL import Image
+import io
+import numpy as np
+import os
+
+from database import get_db_connection, INDEX_FILE
+from security import encrypt_data
+
+MODEL_NAME = 'clip-ViT-B-32'
+
+def ingest_pdf(file_path, file_name):
+    """Parses a PDF, encrypts its content (text + images), and adds it to the database and FAISS index."""
+    print(f"Starting ingestion for: {file_name}")
+    model = SentenceTransformer(MODEL_NAME)
+    conn = get_db_connection()
+    cursor = conn.cursor()
+
+    # Add document to documents table, or get its ID if it exists
+    try:
+        cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
+        doc_id = cursor.lastrowid
+    except conn.IntegrityError:
+        print("Document already exists in DB. Skipping doc table insert.")
+        doc_id = cursor.execute("SELECT id FROM documents WHERE name=?", (file_name,)).fetchone()['id']
+
+    doc = fitz.open(file_path)
+    new_embeddings = []
+
+    # Load existing FAISS index or create a new one
+    if os.path.exists(INDEX_FILE):
+        index = faiss.read_index(INDEX_FILE)
+    else:
+        # Get dimension from the model if index is new
+        dimension = model.encode(["test"]).shape[1]
+        index = faiss.IndexFlatL2(dimension)
+
+    for page_num, page in enumerate(doc):
+        # 1. Process Text
+        text = page.get_text()
+        if text.strip():
+            encrypted_text = encrypt_data(text.encode('utf-8'))
+            cursor.execute(
+                "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+                (doc_id, 'text', encrypted_text, page_num + 1)
+            )
+            text_embedding = model.encode([text])
+            new_embeddings.append(text_embedding)
+
+        # 2. Process Images
+        image_list = page.get_images(full=True)
+        for img_index, img in enumerate(image_list):
+            xref = img[0]
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+
+            encrypted_image = encrypt_data(image_bytes)
+            cursor.execute(
+                "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+                (doc_id, 'image', encrypted_image, page_num + 1)
+            )
+            pil_image = Image.open(io.BytesIO(image_bytes))
+            image_embedding = model.encode(pil_image)
+            new_embeddings.append(image_embedding.reshape(1, -1))
+
+    conn.commit()
+    conn.close()
+
+    if new_embeddings:
+        # Add new embeddings to the FAISS index
+        embeddings_np = np.vstack(new_embeddings).astype('float32')
+        index.add(embeddings_np)
+        faiss.write_index(index, INDEX_FILE)
+        print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks to the knowledge base.")
+    else:
+        print(f"No new content found to ingest in {file_name}.")
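Ingestion is one call per PDF; a sketch, with a hypothetical path:

    from ingest_document import ingest_pdf

    # Appends this PDF's text and image chunks to the encrypted store and the index.
    ingest_pdf("/tmp/maize_field_guide.pdf", "maize_field_guide.pdf")

Note that chunks are written to SQLite and FAISS in the same order, which is what keeps the faiss_id-to-row mapping used by search.py valid.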
knowledge_base.py
CHANGED
@@ -1,219 +1,67 @@
 import os
-import sqlite3
-import faiss
-import numpy as np
 from sentence_transformers import SentenceTransformer
-import fitz  # PyMuPDF
-from PIL import Image
-import io
-from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
-from cryptography.hazmat.primitives import padding
-from cryptography.hazmat.backends import default_backend
-import config
 
+from database import init_db, check_if_indexed, delete_database_and_index, get_db_connection
+from create_index import create_initial_index as build_secure_index
+from search import search as secure_search
+from ingest_document import ingest_pdf
+
+# Use a CLIP model that can handle both text and images
+MODEL_NAME = 'clip-ViT-B-32'
 
...
-    padder = padding.PKCS7(algorithms.AES.block_size).padder()
-    padded_data = padder.update(data) + padder.finalize()
-    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
-    encryptor = cipher.encryptor()
-    encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
-    return iv + encrypted_data
-
-def decrypt_data(encrypted_data_with_iv: bytes) -> bytes:
-    iv = encrypted_data_with_iv[:16]
-    encrypted_data = encrypted_data_with_iv[16:]
-    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
-    decryptor = cipher.decryptor()
-    padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
-    unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
-    data = unpadder.update(padded_data) + unpadder.finalize()
-    return data
-
-# --- KnowledgeBase Class ---
 class KnowledgeBase:
+    def __init__(self):
+        self.model = SentenceTransformer(MODEL_NAME)
+        # Ensure the database is initialized
+        init_db()
+        # Check if the index exists; if not, build it from initial data
+        if not check_if_indexed():
+            print("Local knowledge base not found. Building initial knowledge base...")
+            self._build_initial_knowledge_base()
+
+    def _build_initial_knowledge_base(self):
+        document_files = [
+            "/Users/surfiniaburger/Desktop/glow/aura-mind-glow/knowledge_base_data/healthy_maize_remedy.txt",
+            "/Users/surfiniaburger/Desktop/glow/aura-mind-glow/knowledge_base_data/maize_phosphorus_deficiency_remedy.txt",
+            "/Users/surfiniaburger/Desktop/glow/aura-mind-glow/knowledge_base_data/comic_relief.txt"
+        ]
+        documents_content = {}
+        for file_path in document_files:
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    # Use the base name of the file as the document name
+                    documents_content[os.path.basename(file_path)] = f.read()
+            except FileNotFoundError:
+                print(f"Warning: Knowledge base file not found, skipping: {file_path}")
+
+        if documents_content:
+            build_secure_index(documents_content)
+        else:
+            print("No initial knowledge base documents found to index.")
 
...
-    def get_db_connection(self):
-        conn = sqlite3.connect(self.db_file)
-        conn.row_factory = sqlite3.Row
-        return conn
-
-    def delete_database_and_index(self):
-        if os.path.exists(self.db_file):
-            os.remove(self.db_file)
-            print(f"Removed old database: {self.db_file}")
-        if os.path.exists(self.index_file):
-            os.remove(self.index_file)
-            print(f"Removed old index: {self.index_file}")
 
-    def create_initial_index(self, documents_dict):
-        self.init_db()
-
-        conn = self.get_db_connection()
-        cursor = conn.cursor()
-
-        all_chunks = []
-        all_embeddings = []
-
-        for name, content in documents_dict.items():
-            cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
-            doc_id = cursor.lastrowid
-            chunk_text = content
-            all_chunks.append((doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1))
-            text_embedding = self.model.encode([chunk_text])
-            all_embeddings.append(text_embedding)
-
-        cursor.executemany(
-            "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-            all_chunks
-        )
-        conn.commit()
-        conn.close()
-
-        if not all_embeddings:
-            print("No content to index.")
-            return
-
-        embeddings_np = np.vstack(all_embeddings).astype('float32')
-        dimension = embeddings_np.shape[1]
-        index = faiss.IndexFlatL2(dimension)
-        index.add(embeddings_np)
-        faiss.write_index(index, self.index_file)
-        print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
+    def create_initial_index(self, documents_dict=None):
+        # This method now delegates to the external build_secure_index.
+        # documents_dict defaults to None so app.py's no-argument call
+        # rebuilds from the bundled knowledge base files.
+        if documents_dict is None:
+            self._build_initial_knowledge_base()
+        else:
+            build_secure_index(documents_dict)
 
     def ingest_pdf(self, file_path, file_name):
-        conn = self.get_db_connection()
-        cursor = conn.cursor()
-
-        try:
-            cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
-            doc_id = cursor.lastrowid
-        except conn.IntegrityError:
-            print("Document already exists in DB. Skipping doc table insert.")
-            doc_id = cursor.execute("SELECT id FROM documents WHERE name=?", (file_name,)).fetchone()['id']
-
-        doc = fitz.open(file_path)
-        new_embeddings = []
-
-        if os.path.exists(self.index_file):
-            index = faiss.read_index(self.index_file)
-        else:
-            dimension = self.model.encode(["test"]).shape[1]
-            index = faiss.IndexFlatL2(dimension)
-
-        for page_num, page in enumerate(doc):
-            text = page.get_text()
-            if text.strip():
-                encrypted_text = encrypt_data(text.encode('utf-8'))
-                cursor.execute(
-                    "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-                    (doc_id, 'text', encrypted_text, page_num + 1)
-                )
-                text_embedding = self.model.encode([text])
-                new_embeddings.append(text_embedding)
-
-            image_list = page.get_images(full=True)
-            for img_index, img in enumerate(image_list):
-                xref = img[0]
-                base_image = doc.extract_image(xref)
-                image_bytes = base_image["image"]
-                encrypted_image = encrypt_data(image_bytes)
-                cursor.execute(
-                    "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-                    (doc_id, 'image', encrypted_image, page_num + 1)
-                )
-                pil_image = Image.open(io.BytesIO(image_bytes))
-                image_embedding = self.model.encode(pil_image)
-                new_embeddings.append(image_embedding.reshape(1, -1))
-
-        conn.commit()
-        conn.close()
-
-        if new_embeddings:
-            embeddings_np = np.vstack(new_embeddings).astype('float32')
-            index.add(embeddings_np)
-            faiss.write_index(index, self.index_file)
-            print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks.")
-        else:
-            print(f"No new content found to ingest in {file_name}.")
+        # This method now directly calls the external ingest_pdf
+        ingest_pdf(file_path, file_name)
 
     def search(self, query, k=1):
-        index = faiss.read_index(self.index_file)
-        query_embedding = self.model.encode([query]).astype('float32')
-        distances, indices = index.search(query_embedding, k)
-
-        results = []
-        conn = self.get_db_connection()
-        for i, faiss_id in enumerate(indices[0]):
-            if faiss_id != -1:
-                sql_id = int(faiss_id) + 1
-                chunk_record = conn.execute('SELECT * FROM chunks WHERE id = ?', (sql_id,)).fetchone()
-                if chunk_record:
-                    content_type = chunk_record['content_type']
-                    decrypted_content_bytes = decrypt_data(chunk_record['encrypted_content'])
-                    if content_type == 'text':
-                        content = decrypted_content_bytes.decode('utf-8')
-                    elif content_type == 'image':
-                        content = Image.open(io.BytesIO(decrypted_content_bytes))
-                    results.append({
-                        'distance': distances[0][i],
-                        'content': content,
-                        'type': content_type,
-                        'page': chunk_record['page_num']
-                    })
-        conn.close()
-        return results
+        # This method now directly calls the external secure_search
+        return secure_search(query, k)
 
 def get_retriever():
     kb = KnowledgeBase()
-    # This is a placeholder to maintain compatibility with the existing code.
-    # The actual search will be done using kb.search()
     class Retriever:
         def __init__(self, kb):
             self.kb = kb
         def get_relevant_documents(self, query):
             results = self.kb.search(query)
-            # Langchain retrievers expect a list of Document objects.
-            # We will return the content of the documents for now.
             from langchain.schema import Document
+            # Ensure that only text content is passed to Document.
+            # Image results would need different handling, or be filtered out,
+            # if the Langchain Document doesn't support them directly.
+            text_documents = [Document(page_content=r['content']) for r in results if r['type'] == 'text']
+            return text_documents
 
     return Retriever(kb)
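KnowledgeBase is now a thin facade over the new modules; a minimal usage sketch (the query strings are illustrative):

    from knowledge_base import KnowledgeBase, get_retriever

    kb = KnowledgeBase()  # builds the initial index on first run
    hits = kb.search("purplish discoloration on maize leaves", k=2)

    retriever = get_retriever()
    docs = retriever.get_relevant_documents("phosphorus deficiency remedy")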
search.py
ADDED
@@ -0,0 +1,52 @@
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from PIL import Image
+import io
+
+from database import get_db_connection, INDEX_FILE, check_if_indexed
+from security import decrypt_data
+
+MODEL_NAME = 'clip-ViT-B-32'
+
+def search(query, k=1):
+    """
+    Searches the multimodal FAISS index. The query can be text, and the result can be text or an image.
+    """
+    if not check_if_indexed():
+        return []
+
+    model = SentenceTransformer(MODEL_NAME)
+    index = faiss.read_index(INDEX_FILE)
+
+    # Create an embedding for the text query
+    query_embedding = model.encode([query]).astype('float32')
+    distances, indices = index.search(query_embedding, k)
+
+    results = []
+    conn = get_db_connection()
+    for i, faiss_id in enumerate(indices[0]):
+        if faiss_id != -1:
+            # The faiss_id is the row number, which corresponds to the chunk's primary key 'id'
+            sql_id = int(faiss_id) + 1
+
+            chunk_record = conn.execute('SELECT * FROM chunks WHERE id = ?', (sql_id,)).fetchone()
+
+            if chunk_record:
+                content_type = chunk_record['content_type']
+                decrypted_content_bytes = decrypt_data(chunk_record['encrypted_content'])
+
+                # Prepare content based on its type
+                if content_type == 'text':
+                    content = decrypted_content_bytes.decode('utf-8')
+                elif content_type == 'image':
+                    content = Image.open(io.BytesIO(decrypted_content_bytes))
+
+                results.append({
+                    'distance': distances[0][i],
+                    'content': content,
+                    'type': content_type,
+                    'page': chunk_record['page_num']
+                })
+    conn.close()
+    return results
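Consuming the mixed-type results looks like this sketch (the query text is illustrative):

    from search import search

    for hit in search("stunted growth and purplish leaves", k=3):
        if hit['type'] == 'text':
            print(hit['page'], hit['distance'], hit['content'][:80])
        else:
            # 'image' hits come back as PIL.Image.Image objects
            print(hit['page'], hit['distance'], f"image {hit['content'].size}")

One caveat: the sql_id = int(faiss_id) + 1 mapping assumes chunk rows and FAISS vectors were added in lockstep and never deleted; if that invariant breaks, results will point at the wrong chunks.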
security.py
ADDED
@@ -0,0 +1,37 @@
+import os
+from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+from cryptography.hazmat.primitives import padding
+from cryptography.hazmat.backends import default_backend
+import base64
+
+# In a real mobile app, this key would be securely managed by
+# the Android Keystore or iOS Keychain. For this skeleton, we'll
+# use an environment variable for demonstration.
+SECRET_KEY = os.environ.get("AURA_MIND_SECRET_KEY", "a_default_secret_key_32_bytes_!!").encode()
+
+if len(SECRET_KEY) != 32:
+    raise ValueError("SECRET_KEY must be 32 bytes long for AES-256.")
+
+def encrypt_data(data: bytes) -> bytes:
+    """Encrypts data using AES-CBC."""
+    iv = os.urandom(16)
+    padder = padding.PKCS7(algorithms.AES.block_size).padder()
+    padded_data = padder.update(data) + padder.finalize()
+
+    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
+    encryptor = cipher.encryptor()
+    encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
+    return iv + encrypted_data
+
+def decrypt_data(encrypted_data_with_iv: bytes) -> bytes:
+    """Decrypts data using AES-CBC."""
+    iv = encrypted_data_with_iv[:16]
+    encrypted_data = encrypted_data_with_iv[16:]
+
+    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
+    decryptor = cipher.decryptor()
+    padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
+
+    unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
+    data = unpadder.update(padded_data) + unpadder.finalize()
+    return data
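A round-trip sketch; the key below is a throwaway 32-byte example and must be set before the module is imported, since SECRET_KEY is read at import time:

    import os
    os.environ["AURA_MIND_SECRET_KEY"] = "0123456789abcdef0123456789abcdef"  # demo only

    from security import encrypt_data, decrypt_data

    token = encrypt_data(b"field notes")          # a fresh random IV is prepended
    assert decrypt_data(token) == b"field notes"
    assert encrypt_data(b"field notes") != token  # different IV, different ciphertext

Because each call draws a new random IV, identical plaintexts encrypt to different byte strings, which is why the IV travels with the ciphertext.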