surfiniaburger committed on
Commit 2cbbef6 · 1 Parent(s): 8ff7567
Files changed (7)
  1. app.py +1 -5
  2. create_index.py +75 -0
  3. database.py +57 -0
  4. ingest_document.py +78 -0
  5. knowledge_base.py +45 -197
  6. search.py +52 -0
  7. security.py +37 -0
app.py CHANGED
@@ -356,11 +356,7 @@ def create_kb_management_ui():
     def rebuild_kb():
         yield "Rebuilding knowledge base..."
         try:
-            docs = {
-                "Healthy Maize Plant": "For a Healthy Maize Plant, ensure proper watering and sunlight. No special remedy is needed. Continue good farming practices.",
-                "Maize Phosphorus Deficiency": "Phosphorus deficiency in maize is characterized by stunted growth and purplish discoloration of leaves. To remedy this, apply a phosphorus-rich fertilizer like DAP (Di-Ammonium Phosphate) or bone meal to the soil. Follow package instructions for application rates."
-            }
-            KB.create_initial_index(docs)
+            KB.create_initial_index()  # Call without arguments, as it now rebuilds from files
             yield "Knowledge base rebuilt successfully."
         except Exception as e:
             yield f"Error rebuilding knowledge base: {e}"
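
Since rebuild_kb is a generator, a Gradio front end can stream each yielded status line to the UI. A minimal wiring sketch, assuming rebuild_kb is in scope and using hypothetical component names (status_box, rebuild_btn):

import gradio as gr

with gr.Blocks() as demo:
    status_box = gr.Textbox(label="Status")
    rebuild_btn = gr.Button("Rebuild Knowledge Base")
    # Gradio streams each value the generator yields into the textbox
    rebuild_btn.click(fn=rebuild_kb, outputs=status_box)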
create_index.py ADDED
@@ -0,0 +1,75 @@
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+import os
+
+from database import init_db, get_db_connection, INDEX_FILE, DB_FILE, delete_database_and_index
+from security import encrypt_data
+
+# Use a CLIP model that can handle both text and images
+MODEL_NAME = 'clip-ViT-B-32'
+
+def create_initial_index(documents_dict):
+    """
+    Creates an initial encrypted, persistent index from a dictionary of text documents.
+    This will delete any existing database to ensure a clean start.
+    """
+    print("Performing a clean rebuild of the knowledge base...")
+    delete_database_and_index()
+    init_db()
+
+    conn = get_db_connection()
+    cursor = conn.cursor()
+    model = SentenceTransformer(MODEL_NAME)
+
+    all_chunks = []
+    all_embeddings = []
+
+    for name, content in documents_dict.items():
+        # Add document to documents table
+        cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
+        doc_id = cursor.lastrowid
+
+        # For initial docs, we treat the whole content as one chunk
+        chunk_text = content
+        all_chunks.append((doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1))
+
+        # Create text embedding
+        text_embedding = model.encode([chunk_text])
+        all_embeddings.append(text_embedding)
+
+    # Batch insert chunks
+    cursor.executemany(
+        "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+        all_chunks
+    )
+    conn.commit()
+    conn.close()
+
+    if not all_embeddings:
+        print("No content to index.")
+        return
+
+    # Create and save the FAISS index
+    embeddings_np = np.vstack(all_embeddings).astype('float32')
+    dimension = embeddings_np.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(embeddings_np)
+    faiss.write_index(index, INDEX_FILE)
+
+    print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
+    print(f"Database: {DB_FILE}, FAISS Index: {INDEX_FILE}")
+
+
+if __name__ == '__main__':
+    document_files = ["healthy_maize_remedy.txt", "maize_phosphorus_deficiency_remedy.txt", "comic_relief.txt"]
+    # create_initial_index expects a dict of {name: content}, so build one
+    # keyed by file name rather than a bare list
+    documents_content = {}
+    for file_path in document_files:
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                documents_content[os.path.basename(file_path)] = f.read()
+        except FileNotFoundError:
+            print(f"Warning: File not found, skipping: {file_path}")
+
+    create_initial_index(documents_content)
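
For reference, create_initial_index iterates documents_dict.items(), so it expects a mapping of document names to their full text, one chunk per document. A minimal sketch, with content abbreviated from the remedies previously hardcoded in app.py:

from create_index import create_initial_index

docs = {
    "healthy_maize_remedy.txt": "For a Healthy Maize Plant, ensure proper watering and sunlight...",
    "maize_phosphorus_deficiency_remedy.txt": "Phosphorus deficiency in maize is characterized by stunted growth...",
}
create_initial_index(docs)  # deletes any old DB/index, then writes auramind_local.db and auramind_faiss.index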
database.py ADDED
@@ -0,0 +1,57 @@
+import sqlite3
+import os
+
+DB_FILE = "auramind_local.db"
+INDEX_FILE = "auramind_faiss.index"
+
+def init_db():
+    """
+    Initializes a more robust database schema for multimodal data.
+    - 'documents' table tracks the source files.
+    - 'chunks' table stores the individual encrypted text/image chunks.
+    """
+    conn = sqlite3.connect(DB_FILE)
+    cursor = conn.cursor()
+
+    # Table to track the source documents (e.g., 'healthy_maize.txt', 'user_guide.pdf')
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS documents (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL UNIQUE
+        )
+    ''')
+
+    # Table to store each chunk of content (text or image)
+    # The faiss_id will correspond to the row number in the FAISS index
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS chunks (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            doc_id INTEGER,
+            content_type TEXT NOT NULL, -- 'text' or 'image'
+            encrypted_content BLOB NOT NULL,
+            page_num INTEGER,
+            FOREIGN KEY (doc_id) REFERENCES documents (id)
+        )
+    ''')
+    conn.commit()
+    conn.close()
+
+def get_db_connection():
+    """Establishes a connection to the database."""
+    conn = sqlite3.connect(DB_FILE)
+    conn.row_factory = sqlite3.Row
+    return conn
+
+def check_if_indexed():
+    """Checks if the initial database and index file exist."""
+    # A basic check. A more robust check might query the db for content.
+    return os.path.exists(DB_FILE) and os.path.exists(INDEX_FILE)
+
+def delete_database_and_index():
+    """Deletes existing db and index files for a clean rebuild."""
+    if os.path.exists(DB_FILE):
+        os.remove(DB_FILE)
+        print(f"Removed old database: {DB_FILE}")
+    if os.path.exists(INDEX_FILE):
+        os.remove(INDEX_FILE)
+        print(f"Removed old index: {INDEX_FILE}")
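
check_if_indexed only tests file existence, and its comment notes a more robust check might query the db for content. A sketch of that stricter variant (check_if_indexed_strict is a hypothetical name, not part of this commit):

import os
import sqlite3

from database import DB_FILE, INDEX_FILE

def check_if_indexed_strict():
    """True only if both files exist and at least one chunk row is stored."""
    if not (os.path.exists(DB_FILE) and os.path.exists(INDEX_FILE)):
        return False
    conn = sqlite3.connect(DB_FILE)
    try:
        (count,) = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()
    finally:
        conn.close()
    return count > 0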
ingest_document.py ADDED
@@ -0,0 +1,78 @@
+import faiss
+from sentence_transformers import SentenceTransformer
+import fitz  # PyMuPDF
+from PIL import Image
+import io
+import numpy as np
+import os
+
+from database import get_db_connection, INDEX_FILE
+from security import encrypt_data
+
+MODEL_NAME = 'clip-ViT-B-32'
+
+def ingest_pdf(file_path, file_name):
+    """Parses a PDF, encrypts its content (text+images), and adds it to the database and FAISS index."""
+    print(f"Starting ingestion for: {file_name}")
+    model = SentenceTransformer(MODEL_NAME)
+    conn = get_db_connection()
+    cursor = conn.cursor()
+
+    # Add document to documents table, or get its ID if it exists
+    try:
+        cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
+        doc_id = cursor.lastrowid
+    except conn.IntegrityError:
+        print("Document already exists in DB. Skipping doc table insert.")
+        doc_id = cursor.execute("SELECT id FROM documents WHERE name=?", (file_name,)).fetchone()['id']
+
+    doc = fitz.open(file_path)
+    new_embeddings = []
+
+    # Load existing FAISS index or create a new one
+    if os.path.exists(INDEX_FILE):
+        index = faiss.read_index(INDEX_FILE)
+    else:
+        # Get dimension from the model if index is new
+        dimension = model.encode(["test"]).shape[1]
+        index = faiss.IndexFlatL2(dimension)
+
+    for page_num, page in enumerate(doc):
+        # 1. Process Text
+        text = page.get_text()
+        if text.strip():
+            encrypted_text = encrypt_data(text.encode('utf-8'))
+            cursor.execute(
+                "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+                (doc_id, 'text', encrypted_text, page_num + 1)
+            )
+            text_embedding = model.encode([text])
+            new_embeddings.append(text_embedding)
+
+        # 2. Process Images
+        image_list = page.get_images(full=True)
+        for img_index, img in enumerate(image_list):
+            xref = img[0]
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+
+            encrypted_image = encrypt_data(image_bytes)
+            cursor.execute(
+                "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+                (doc_id, 'image', encrypted_image, page_num + 1)
+            )
+            pil_image = Image.open(io.BytesIO(image_bytes))
+            image_embedding = model.encode(pil_image)
+            new_embeddings.append(image_embedding.reshape(1, -1))
+
+    conn.commit()
+    conn.close()
+
+    if new_embeddings:
+        # Add new embeddings to the FAISS index
+        embeddings_np = np.vstack(new_embeddings).astype('float32')
+        index.add(embeddings_np)
+        faiss.write_index(index, INDEX_FILE)
+        print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks to the knowledge base.")
+    else:
+        print(f"No new content found to ingest in {file_name}.")
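
Because the CLIP model embeds text and images into a shared vector space, page text and extracted figures can live in the same flat index. A minimal usage sketch (the PDF path and display name here are hypothetical):

from ingest_document import ingest_pdf

# Encrypts each page's text and images into SQLite and appends
# their embeddings to the shared FAISS index
ingest_pdf("uploads/maize_field_guide.pdf", "maize_field_guide.pdf")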
knowledge_base.py CHANGED
@@ -1,219 +1,67 @@
 import os
-import sqlite3
-import faiss
-import numpy as np
 from sentence_transformers import SentenceTransformer
-import fitz  # PyMuPDF
-from PIL import Image
-import io
-from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
-from cryptography.hazmat.primitives import padding
-from cryptography.hazmat.backends import default_backend
-import config
 
-# --- Security ---
-SECRET_KEY = os.environ.get("AURA_MIND_SECRET_KEY", "a_default_secret_key_32_bytes_!!").encode()
-if len(SECRET_KEY) != 32:
-    raise ValueError("SECRET_KEY must be 32 bytes long for AES-256.")
+from database import init_db, check_if_indexed, delete_database_and_index, get_db_connection
+from create_index import create_initial_index as build_secure_index
+from search import search as secure_search
+from ingest_document import ingest_pdf
 
-def encrypt_data(data: bytes) -> bytes:
-    iv = os.urandom(16)
-    padder = padding.PKCS7(algorithms.AES.block_size).padder()
-    padded_data = padder.update(data) + padder.finalize()
-    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
-    encryptor = cipher.encryptor()
-    encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
-    return iv + encrypted_data
+# Use a CLIP model that can handle both text and images
+MODEL_NAME = 'clip-ViT-B-32'
 
-def decrypt_data(encrypted_data_with_iv: bytes) -> bytes:
-    iv = encrypted_data_with_iv[:16]
-    encrypted_data = encrypted_data_with_iv[16:]
-    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
-    decryptor = cipher.decryptor()
-    padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
-    unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
-    data = unpadder.update(padded_data) + unpadder.finalize()
-    return data
-
-# --- KnowledgeBase Class ---
 class KnowledgeBase:
-    def __init__(self, db_file="auramind_local.db", index_file="auramind_faiss.index", model_name='clip-ViT-B-32'):
-        self.db_file = db_file
-        self.index_file = index_file
-        self.model = SentenceTransformer(model_name)
-        self.init_db()
-
-    def init_db(self):
-        conn = sqlite3.connect(self.db_file)
-        cursor = conn.cursor()
-        cursor.execute('''
-            CREATE TABLE IF NOT EXISTS documents (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                name TEXT NOT NULL UNIQUE
-            )
-        ''')
-        cursor.execute('''
-            CREATE TABLE IF NOT EXISTS chunks (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                doc_id INTEGER,
-                content_type TEXT NOT NULL, -- 'text' or 'image'
-                encrypted_content BLOB NOT NULL,
-                page_num INTEGER,
-                FOREIGN KEY (doc_id) REFERENCES documents (id)
-            )
-        ''')
-        conn.commit()
-        conn.close()
-
-    def get_db_connection(self):
-        conn = sqlite3.connect(self.db_file)
-        conn.row_factory = sqlite3.Row
-        return conn
-
-    def delete_database_and_index(self):
-        if os.path.exists(self.db_file):
-            os.remove(self.db_file)
-            print(f"Removed old database: {self.db_file}")
-        if os.path.exists(self.index_file):
-            os.remove(self.index_file)
-            print(f"Removed old index: {self.index_file}")
-
-    def create_initial_index(self, documents_dict):
-        print("Performing a clean rebuild of the knowledge base...")
-        self.delete_database_and_index()
-        self.init_db()
-
-        conn = self.get_db_connection()
-        cursor = conn.cursor()
-
-        all_chunks = []
-        all_embeddings = []
-
-        for name, content in documents_dict.items():
-            cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
-            doc_id = cursor.lastrowid
-            chunk_text = content
-            all_chunks.append((doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1))
-            text_embedding = self.model.encode([chunk_text])
-            all_embeddings.append(text_embedding)
-
-        cursor.executemany(
-            "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-            all_chunks
-        )
-        conn.commit()
-        conn.close()
-
-        if not all_embeddings:
-            print("No content to index.")
-            return
-
-        embeddings_np = np.vstack(all_embeddings).astype('float32')
-        dimension = embeddings_np.shape[1]
-        index = faiss.IndexFlatL2(dimension)
-        index.add(embeddings_np)
-        faiss.write_index(index, self.index_file)
-        print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
-
-    def ingest_pdf(self, file_path, file_name):
-        print(f"Starting ingestion for: {file_name}")
-        conn = self.get_db_connection()
-        cursor = conn.cursor()
-
-        try:
-            cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
-            doc_id = cursor.lastrowid
-        except conn.IntegrityError:
-            print("Document already exists in DB. Skipping doc table insert.")
-            doc_id = cursor.execute("SELECT id FROM documents WHERE name=?", (file_name,)).fetchone()['id']
-
-        doc = fitz.open(file_path)
-        new_embeddings = []
-
-        if os.path.exists(self.index_file):
-            index = faiss.read_index(self.index_file)
-        else:
-            dimension = self.model.encode(["test"]).shape[1]
-            index = faiss.IndexFlatL2(dimension)
-
-        for page_num, page in enumerate(doc):
-            text = page.get_text()
-            if text.strip():
-                encrypted_text = encrypt_data(text.encode('utf-8'))
-                cursor.execute(
-                    "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-                    (doc_id, 'text', encrypted_text, page_num + 1)
-                )
-                text_embedding = self.model.encode([text])
-                new_embeddings.append(text_embedding)
-
-            image_list = page.get_images(full=True)
-            for img_index, img in enumerate(image_list):
-                xref = img[0]
-                base_image = doc.extract_image(xref)
-                image_bytes = base_image["image"]
-                encrypted_image = encrypt_data(image_bytes)
-                cursor.execute(
-                    "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-                    (doc_id, 'image', encrypted_image, page_num + 1)
-                )
-                pil_image = Image.open(io.BytesIO(image_bytes))
-                image_embedding = self.model.encode(pil_image)
-                new_embeddings.append(image_embedding.reshape(1, -1))
-
-        conn.commit()
-        conn.close()
-
-        if new_embeddings:
-            embeddings_np = np.vstack(new_embeddings).astype('float32')
-            index.add(embeddings_np)
-            faiss.write_index(index, self.index_file)
-            print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks.")
-        else:
-            print(f"No new content found to ingest in {file_name}.")
-
-    def search(self, query, k=1):
-        if not os.path.exists(self.index_file):
-            return []
-
-        index = faiss.read_index(self.index_file)
-        query_embedding = self.model.encode([query]).astype('float32')
-        distances, indices = index.search(query_embedding, k)
-
-        results = []
-        conn = self.get_db_connection()
-        for i, faiss_id in enumerate(indices[0]):
-            if faiss_id != -1:
-                sql_id = int(faiss_id) + 1
-                chunk_record = conn.execute('SELECT * FROM chunks WHERE id = ?', (sql_id,)).fetchone()
-                if chunk_record:
-                    content_type = chunk_record['content_type']
-                    decrypted_content_bytes = decrypt_data(chunk_record['encrypted_content'])
-                    if content_type == 'text':
-                        content = decrypted_content_bytes.decode('utf-8')
-                    elif content_type == 'image':
-                        content = Image.open(io.BytesIO(decrypted_content_bytes))
-                    results.append({
-                        'distance': distances[0][i],
-                        'content': content,
-                        'type': content_type,
-                        'page': chunk_record['page_num']
-                    })
-        conn.close()
-        return results
+    def __init__(self):
+        self.model = SentenceTransformer(MODEL_NAME)
+        # Ensure the database is initialized
+        init_db()
+        # Check if the index exists; if not, build it from initial data
+        if not check_if_indexed():
+            print("Local knowledge base not found. Building initial knowledge base...")
+            self._build_initial_knowledge_base()
+
+    def _build_initial_knowledge_base(self):
+        # Resolve the bundled files relative to this module rather than a
+        # hardcoded developer path, so the app also works when deployed
+        base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "knowledge_base_data")
+        document_files = [
+            os.path.join(base_dir, "healthy_maize_remedy.txt"),
+            os.path.join(base_dir, "maize_phosphorus_deficiency_remedy.txt"),
+            os.path.join(base_dir, "comic_relief.txt"),
+        ]
+        documents_content = {}
+        for file_path in document_files:
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    # Use the base name of the file as the document name
+                    documents_content[os.path.basename(file_path)] = f.read()
+            except FileNotFoundError:
+                print(f"Warning: Knowledge base file not found, skipping: {file_path}")
+
+        if documents_content:
+            build_secure_index(documents_content)
+        else:
+            print("No initial knowledge base documents found to index.")
+
+    def create_initial_index(self, documents_dict=None):
+        # Delegates to the external build_secure_index; called without
+        # arguments it rebuilds from the on-disk files, matching the
+        # no-argument call in app.py
+        if documents_dict is None:
+            self._build_initial_knowledge_base()
+        else:
+            build_secure_index(documents_dict)
+
+    def ingest_pdf(self, file_path, file_name):
+        # This method now directly calls the external ingest_pdf
+        ingest_pdf(file_path, file_name)
+
+    def search(self, query, k=1):
+        # This method now directly calls the external secure_search
+        return secure_search(query, k)
 
 def get_retriever():
     kb = KnowledgeBase()
-    # This is a placeholder to maintain compatibility with the existing code.
-    # The actual search will be done using kb.search()
     class Retriever:
         def __init__(self, kb):
             self.kb = kb
         def get_relevant_documents(self, query):
             results = self.kb.search(query)
-            # Langchain retrievers expect a list of Document objects.
-            # We will return the content of the documents for now.
             from langchain.schema import Document
-            return [Document(page_content=r['content']) if r['type'] == 'text' else r['content'] for r in results]
+            # Ensure that only text content is passed to Document; image
+            # results may need different handling, or filtering out, if the
+            # Langchain Document doesn't support them directly.
+            text_documents = [Document(page_content=r['content']) for r in results if r['type'] == 'text']
+            return text_documents
 
     return Retriever(kb)
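
With KnowledgeBase reduced to a thin facade over the new modules, call sites stay unchanged. A short usage sketch, assuming the knowledge_base_data files are present:

from knowledge_base import KnowledgeBase, get_retriever

kb = KnowledgeBase()                      # builds the initial index on first run
hits = kb.search("purple leaves on maize", k=2)

retriever = get_retriever()               # langchain-style, text results only
docs = retriever.get_relevant_documents("phosphorus deficiency remedy")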
search.py ADDED
@@ -0,0 +1,52 @@
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from PIL import Image
+import io
+
+from database import get_db_connection, INDEX_FILE, check_if_indexed
+from security import decrypt_data
+
+MODEL_NAME = 'clip-ViT-B-32'
+
+def search(query, k=1):
+    """
+    Searches the multimodal FAISS index. The query can be text, and the result can be text or an image.
+    """
+    if not check_if_indexed():
+        return []
+
+    model = SentenceTransformer(MODEL_NAME)
+    index = faiss.read_index(INDEX_FILE)
+
+    # Create an embedding for the text query
+    query_embedding = model.encode([query]).astype('float32')
+    distances, indices = index.search(query_embedding, k)
+
+    results = []
+    conn = get_db_connection()
+    for i, faiss_id in enumerate(indices[0]):
+        if faiss_id != -1:
+            # The faiss_id is the row number, which corresponds to the chunk's primary key 'id'
+            sql_id = int(faiss_id) + 1
+
+            chunk_record = conn.execute('SELECT * FROM chunks WHERE id = ?', (sql_id,)).fetchone()
+
+            if chunk_record:
+                content_type = chunk_record['content_type']
+                decrypted_content_bytes = decrypt_data(chunk_record['encrypted_content'])
+
+                # Prepare content based on its type
+                if content_type == 'text':
+                    content = decrypted_content_bytes.decode('utf-8')
+                elif content_type == 'image':
+                    content = Image.open(io.BytesIO(decrypted_content_bytes))
+
+                results.append({
+                    'distance': distances[0][i],
+                    'content': content,
+                    'type': content_type,
+                    'page': chunk_record['page_num']
+                })
+    conn.close()
+    return results
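
Each result's 'content' is either a decoded string or a PIL image, so callers should branch on 'type'. A minimal sketch:

from search import search

for hit in search("stunted growth and purplish leaves", k=3):
    if hit['type'] == 'text':
        print(f"page {hit['page']} (distance {hit['distance']:.3f}): {hit['content'][:80]}")
    else:  # 'image' results come back as PIL.Image objects
        hit['content'].save(f"hit_page_{hit['page']}.png")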
security.py ADDED
@@ -0,0 +1,37 @@
+import os
+from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+from cryptography.hazmat.primitives import padding
+from cryptography.hazmat.backends import default_backend
+import base64
+
+# In a real mobile app, this key would be securely managed by
+# the Android Keystore or iOS Keychain. For this skeleton, we'll
+# use an environment variable for demonstration.
+SECRET_KEY = os.environ.get("AURA_MIND_SECRET_KEY", "a_default_secret_key_32_bytes_!!").encode()
+
+if len(SECRET_KEY) != 32:
+    raise ValueError("SECRET_KEY must be 32 bytes long for AES-256.")
+
+def encrypt_data(data: bytes) -> bytes:
+    """Encrypts data using AES-CBC."""
+    iv = os.urandom(16)
+    padder = padding.PKCS7(algorithms.AES.block_size).padder()
+    padded_data = padder.update(data) + padder.finalize()
+
+    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
+    encryptor = cipher.encryptor()
+    encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
+    return iv + encrypted_data
+
+def decrypt_data(encrypted_data_with_iv: bytes) -> bytes:
+    """Decrypts data using AES-CBC."""
+    iv = encrypted_data_with_iv[:16]
+    encrypted_data = encrypted_data_with_iv[16:]
+
+    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
+    decryptor = cipher.decryptor()
+    padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
+
+    unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
+    data = unpadder.update(padded_data) + unpadder.finalize()
+    return data
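
A round-trip sketch of the AES-256-CBC helpers, assuming AURA_MIND_SECRET_KEY is either unset (falling back to the hardcoded default) or exported as a 32-byte value:

from security import encrypt_data, decrypt_data

token = encrypt_data("apply DAP fertilizer".encode("utf-8"))
assert decrypt_data(token) == b"apply DAP fertilizer"
# A fresh random IV is prepended on every call, so equal plaintexts
# produce different ciphertexts
assert token != encrypt_data(b"apply DAP fertilizer")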