Commit 2cbbef6 (parent 8ff7567): symphony

Files changed:
  app.py              +1   -5
  create_index.py     +75  -0
  database.py         +57  -0
  ingest_document.py  +78  -0
  knowledge_base.py   +45  -197
  search.py           +52  -0
  security.py         +37  -0
app.py
CHANGED
@@ -356,11 +356,7 @@ def create_kb_management_ui():
     def rebuild_kb():
         yield "Rebuilding knowledge base..."
         try:
-            docs = {
-                "Healthy Maize Plant": "For a Healthy Maize Plant, ensure proper watering and sunlight. No special remedy is needed. Continue good farming practices.",
-                "Maize Phosphorus Deficiency": "Phosphorus deficiency in maize is characterized by stunted growth and purplish discoloration of leaves. To remedy this, apply a phosphorus-rich fertilizer like DAP (Di-Ammonium Phosphate) or bone meal to the soil. Follow package instructions for application rates."
-            }
-            KB.create_initial_index(docs)
+            KB.create_initial_index()  # Call without arguments, as it now rebuilds from files
             yield "Knowledge base rebuilt successfully."
         except Exception as e:
             yield f"Error rebuilding knowledge base: {e}"
create_index.py
ADDED
@@ -0,0 +1,75 @@
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+import os
+
+from database import init_db, get_db_connection, INDEX_FILE, DB_FILE, delete_database_and_index
+from security import encrypt_data
+
+# Use a CLIP model that can handle both text and images
+MODEL_NAME = 'clip-ViT-B-32'
+
+def create_initial_index(documents_dict):
+    """
+    Creates an initial encrypted, persistent index from a dictionary of text documents.
+    This will delete any existing database to ensure a clean start.
+    """
+    print("Performing a clean rebuild of the knowledge base...")
+    delete_database_and_index()
+    init_db()
+
+    conn = get_db_connection()
+    cursor = conn.cursor()
+    model = SentenceTransformer(MODEL_NAME)
+
+    all_chunks = []
+    all_embeddings = []
+
+    for name, content in documents_dict.items():
+        # Add document to documents table
+        cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
+        doc_id = cursor.lastrowid
+
+        # For initial docs, we treat the whole content as one chunk
+        chunk_text = content
+        all_chunks.append((doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1))
+
+        # Create text embedding
+        text_embedding = model.encode([chunk_text])
+        all_embeddings.append(text_embedding)
+
+    # Batch insert chunks
+    cursor.executemany(
+        "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+        all_chunks
+    )
+    conn.commit()
+    conn.close()
+
+    if not all_embeddings:
+        print("No content to index.")
+        return
+
+    # Create and save the FAISS index
+    embeddings_np = np.vstack(all_embeddings).astype('float32')
+    dimension = embeddings_np.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(embeddings_np)
+    faiss.write_index(index, INDEX_FILE)
+
+    print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
+    print(f"Database: {DB_FILE}, FAISS Index: {INDEX_FILE}")
+
+
+if __name__ == '__main__':
+    document_files = ["healthy_maize_remedy.txt", "maize_phosphorus_deficiency_remedy.txt", "comic_relief.txt"]
+    # Keyed by file name, since create_initial_index expects a dict of name -> content
+    documents_content = {}
+    for file_path in document_files:
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                documents_content[file_path] = f.read()
+        except FileNotFoundError:
+            print(f"Warning: File not found, skipping: {file_path}")
+
+    create_initial_index(documents_content)
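For context, calling this module directly looks like the sketch below; the document names and text are illustrative stand-ins, not repo content:

    from create_index import create_initial_index

    # Each key becomes a row in 'documents'; each value is stored as one
    # encrypted text chunk plus one FAISS vector.
    docs = {
        "healthy_maize_remedy.txt": "For a healthy maize plant, continue good farming practices.",
        "maize_phosphorus_deficiency_remedy.txt": "Apply a phosphorus-rich fertilizer such as DAP.",
    }
    create_initial_index(docs)  # wipes any existing DB/index, then rebuilds both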
database.py
ADDED
@@ -0,0 +1,57 @@
+import sqlite3
+import os
+
+DB_FILE = "auramind_local.db"
+INDEX_FILE = "auramind_faiss.index"
+
+def init_db():
+    """
+    Initializes a more robust database schema for multimodal data.
+    - 'documents' table tracks the source files.
+    - 'chunks' table stores the individual encrypted text/image chunks.
+    """
+    conn = sqlite3.connect(DB_FILE)
+    cursor = conn.cursor()
+
+    # Table to track the source documents (e.g., 'healthy_maize.txt', 'user_guide.pdf')
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS documents (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL UNIQUE
+        )
+    ''')
+
+    # Table to store each chunk of content (text or image)
+    # The faiss_id will correspond to the row number in the FAISS index
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS chunks (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            doc_id INTEGER,
+            content_type TEXT NOT NULL, -- 'text' or 'image'
+            encrypted_content BLOB NOT NULL,
+            page_num INTEGER,
+            FOREIGN KEY (doc_id) REFERENCES documents (id)
+        )
+    ''')
+    conn.commit()
+    conn.close()
+
+def get_db_connection():
+    """Establishes a connection to the database."""
+    conn = sqlite3.connect(DB_FILE)
+    conn.row_factory = sqlite3.Row
+    return conn
+
+def check_if_indexed():
+    """Checks if the initial database and index file exist."""
+    # A basic check. A more robust check might query the db for content.
+    return os.path.exists(DB_FILE) and os.path.exists(INDEX_FILE)
+
+def delete_database_and_index():
+    """Deletes existing db and index files for a clean rebuild."""
+    if os.path.exists(DB_FILE):
+        os.remove(DB_FILE)
+        print(f"Removed old database: {DB_FILE}")
+    if os.path.exists(INDEX_FILE):
+        os.remove(INDEX_FILE)
+        print(f"Removed old index: {INDEX_FILE}")
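For reference, the intended call pattern is a sketch like this (the file names come from the module constants above):

    from database import init_db, get_db_connection, check_if_indexed

    init_db()                  # creates auramind_local.db with both tables if missing
    print(check_if_indexed())  # True only once the DB and the FAISS index both exist

    conn = get_db_connection()
    n_chunks = conn.execute("SELECT COUNT(*) AS n FROM chunks").fetchone()["n"]
    conn.close()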
ingest_document.py
ADDED
@@ -0,0 +1,78 @@
+import faiss
+from sentence_transformers import SentenceTransformer
+import fitz  # PyMuPDF
+from PIL import Image
+import io
+import numpy as np
+import os
+
+from database import get_db_connection, INDEX_FILE
+from security import encrypt_data
+
+MODEL_NAME = 'clip-ViT-B-32'
+
+def ingest_pdf(file_path, file_name):
+    """Parses a PDF, encrypts its content (text + images), and adds it to the database and FAISS index."""
+    print(f"Starting ingestion for: {file_name}")
+    model = SentenceTransformer(MODEL_NAME)
+    conn = get_db_connection()
+    cursor = conn.cursor()
+
+    # Add document to documents table, or get its ID if it exists
+    try:
+        cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
+        doc_id = cursor.lastrowid
+    except conn.IntegrityError:
+        print("Document already exists in DB. Skipping doc table insert.")
+        doc_id = cursor.execute("SELECT id FROM documents WHERE name=?", (file_name,)).fetchone()['id']
+
+    doc = fitz.open(file_path)
+    new_embeddings = []
+
+    # Load existing FAISS index or create a new one
+    if os.path.exists(INDEX_FILE):
+        index = faiss.read_index(INDEX_FILE)
+    else:
+        # Get dimension from the model if index is new
+        dimension = model.encode(["test"]).shape[1]
+        index = faiss.IndexFlatL2(dimension)
+
+    for page_num, page in enumerate(doc):
+        # 1. Process Text
+        text = page.get_text()
+        if text.strip():
+            encrypted_text = encrypt_data(text.encode('utf-8'))
+            cursor.execute(
+                "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+                (doc_id, 'text', encrypted_text, page_num + 1)
+            )
+            text_embedding = model.encode([text])
+            new_embeddings.append(text_embedding)
+
+        # 2. Process Images
+        image_list = page.get_images(full=True)
+        for img_index, img in enumerate(image_list):
+            xref = img[0]
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+
+            encrypted_image = encrypt_data(image_bytes)
+            cursor.execute(
+                "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
+                (doc_id, 'image', encrypted_image, page_num + 1)
+            )
+            pil_image = Image.open(io.BytesIO(image_bytes))
+            image_embedding = model.encode(pil_image)
+            new_embeddings.append(image_embedding.reshape(1, -1))
+
+    conn.commit()
+    conn.close()
+
+    if new_embeddings:
+        # Add new embeddings to the FAISS index
+        embeddings_np = np.vstack(new_embeddings).astype('float32')
+        index.add(embeddings_np)
+        faiss.write_index(index, INDEX_FILE)
+        print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks to the knowledge base.")
+    else:
+        print(f"No new content found to ingest in {file_name}.")
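Ingestion is one call per PDF; a sketch, with a hypothetical path:

    from ingest_document import ingest_pdf

    # Appends this PDF's text and image chunks to the encrypted store and the index.
    ingest_pdf("/tmp/maize_field_guide.pdf", "maize_field_guide.pdf")

Note that chunks are written to SQLite and FAISS in the same order, which is what keeps the faiss_id-to-row mapping used by search.py valid.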
knowledge_base.py
CHANGED
@@ -1,219 +1,67 @@
 import os
-import sqlite3
-import faiss
-import numpy as np
 from sentence_transformers import SentenceTransformer
-import fitz  # PyMuPDF
-from PIL import Image
-import io
-from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
-from cryptography.hazmat.primitives import padding
-from cryptography.hazmat.backends import default_backend
-import config
 
+from database import init_db, check_if_indexed, delete_database_and_index, get_db_connection
+from create_index import create_initial_index as build_secure_index
+from search import search as secure_search
+from ingest_document import ingest_pdf
+
+# Use a CLIP model that can handle both text and images
+MODEL_NAME = 'clip-ViT-B-32'
 
...
-    padder = padding.PKCS7(algorithms.AES.block_size).padder()
-    padded_data = padder.update(data) + padder.finalize()
-    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
-    encryptor = cipher.encryptor()
-    encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
-    return iv + encrypted_data
-
-def decrypt_data(encrypted_data_with_iv: bytes) -> bytes:
-    iv = encrypted_data_with_iv[:16]
-    encrypted_data = encrypted_data_with_iv[16:]
-    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
-    decryptor = cipher.decryptor()
-    padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
-    unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
-    data = unpadder.update(padded_data) + unpadder.finalize()
-    return data
-
-# --- KnowledgeBase Class ---
 class KnowledgeBase:
+    def __init__(self):
+        self.model = SentenceTransformer(MODEL_NAME)
+        # Ensure the database is initialized
+        init_db()
+        # Check if the index exists; if not, build it from initial data
+        if not check_if_indexed():
+            print("Local knowledge base not found. Building initial knowledge base...")
+            self._build_initial_knowledge_base()
+
+    def _build_initial_knowledge_base(self):
+        document_files = [
+            "/Users/surfiniaburger/Desktop/glow/aura-mind-glow/knowledge_base_data/healthy_maize_remedy.txt",
+            "/Users/surfiniaburger/Desktop/glow/aura-mind-glow/knowledge_base_data/maize_phosphorus_deficiency_remedy.txt",
+            "/Users/surfiniaburger/Desktop/glow/aura-mind-glow/knowledge_base_data/comic_relief.txt"
+        ]
+        documents_content = {}
+        for file_path in document_files:
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    # Use the base name of the file as the document name
+                    documents_content[os.path.basename(file_path)] = f.read()
+            except FileNotFoundError:
+                print(f"Warning: Knowledge base file not found, skipping: {file_path}")
+
+        if documents_content:
+            build_secure_index(documents_content)
+        else:
+            print("No initial knowledge base documents found to index.")
 
...
-    def get_db_connection(self):
-        conn = sqlite3.connect(self.db_file)
-        conn.row_factory = sqlite3.Row
-        return conn
-
-    def delete_database_and_index(self):
-        if os.path.exists(self.db_file):
-            os.remove(self.db_file)
-            print(f"Removed old database: {self.db_file}")
-        if os.path.exists(self.index_file):
-            os.remove(self.index_file)
-            print(f"Removed old index: {self.index_file}")
 
-    def create_initial_index(self, documents_dict):
-        self.init_db()
-
-        conn = self.get_db_connection()
-        cursor = conn.cursor()
-
-        all_chunks = []
-        all_embeddings = []
-
-        for name, content in documents_dict.items():
-            cursor.execute("INSERT INTO documents (name) VALUES (?)", (name,))
-            doc_id = cursor.lastrowid
-            chunk_text = content
-            all_chunks.append((doc_id, 'text', encrypt_data(chunk_text.encode('utf-8')), 1))
-            text_embedding = self.model.encode([chunk_text])
-            all_embeddings.append(text_embedding)
-
-        cursor.executemany(
-            "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-            all_chunks
-        )
-        conn.commit()
-        conn.close()
-
-        if not all_embeddings:
-            print("No content to index.")
-            return
-
-        embeddings_np = np.vstack(all_embeddings).astype('float32')
-        dimension = embeddings_np.shape[1]
-        index = faiss.IndexFlatL2(dimension)
-        index.add(embeddings_np)
-        faiss.write_index(index, self.index_file)
-        print(f"Initial encrypted index created with {len(all_chunks)} chunks.")
+    def create_initial_index(self, documents_dict=None):
+        # This method now delegates to the external build_secure_index.
+        # documents_dict defaults to None so app.py's no-argument call
+        # rebuilds from the bundled knowledge base files.
+        if documents_dict is None:
+            self._build_initial_knowledge_base()
+        else:
+            build_secure_index(documents_dict)
 
     def ingest_pdf(self, file_path, file_name):
-        conn = self.get_db_connection()
-        cursor = conn.cursor()
-
-        try:
-            cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
-            doc_id = cursor.lastrowid
-        except conn.IntegrityError:
-            print("Document already exists in DB. Skipping doc table insert.")
-            doc_id = cursor.execute("SELECT id FROM documents WHERE name=?", (file_name,)).fetchone()['id']
-
-        doc = fitz.open(file_path)
-        new_embeddings = []
-
-        if os.path.exists(self.index_file):
-            index = faiss.read_index(self.index_file)
-        else:
-            dimension = self.model.encode(["test"]).shape[1]
-            index = faiss.IndexFlatL2(dimension)
-
-        for page_num, page in enumerate(doc):
-            text = page.get_text()
-            if text.strip():
-                encrypted_text = encrypt_data(text.encode('utf-8'))
-                cursor.execute(
-                    "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-                    (doc_id, 'text', encrypted_text, page_num + 1)
-                )
-                text_embedding = self.model.encode([text])
-                new_embeddings.append(text_embedding)
-
-            image_list = page.get_images(full=True)
-            for img_index, img in enumerate(image_list):
-                xref = img[0]
-                base_image = doc.extract_image(xref)
-                image_bytes = base_image["image"]
-                encrypted_image = encrypt_data(image_bytes)
-                cursor.execute(
-                    "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) VALUES (?, ?, ?, ?)",
-                    (doc_id, 'image', encrypted_image, page_num + 1)
-                )
-                pil_image = Image.open(io.BytesIO(image_bytes))
-                image_embedding = self.model.encode(pil_image)
-                new_embeddings.append(image_embedding.reshape(1, -1))
-
-        conn.commit()
-        conn.close()
-
-        if new_embeddings:
-            embeddings_np = np.vstack(new_embeddings).astype('float32')
-            index.add(embeddings_np)
-            faiss.write_index(index, self.index_file)
-            print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks.")
-        else:
-            print(f"No new content found to ingest in {file_name}.")
+        # This method now directly calls the external ingest_pdf
+        ingest_pdf(file_path, file_name)
 
     def search(self, query, k=1):
-        index = faiss.read_index(self.index_file)
-        query_embedding = self.model.encode([query]).astype('float32')
-        distances, indices = index.search(query_embedding, k)
-
-        results = []
-        conn = self.get_db_connection()
-        for i, faiss_id in enumerate(indices[0]):
-            if faiss_id != -1:
-                sql_id = int(faiss_id) + 1
-                chunk_record = conn.execute('SELECT * FROM chunks WHERE id = ?', (sql_id,)).fetchone()
-                if chunk_record:
-                    content_type = chunk_record['content_type']
-                    decrypted_content_bytes = decrypt_data(chunk_record['encrypted_content'])
-                    if content_type == 'text':
-                        content = decrypted_content_bytes.decode('utf-8')
-                    elif content_type == 'image':
-                        content = Image.open(io.BytesIO(decrypted_content_bytes))
-                    results.append({
-                        'distance': distances[0][i],
-                        'content': content,
-                        'type': content_type,
-                        'page': chunk_record['page_num']
-                    })
-        conn.close()
-        return results
+        # This method now directly calls the external secure_search
+        return secure_search(query, k)
 
 def get_retriever():
     kb = KnowledgeBase()
-    # This is a placeholder to maintain compatibility with the existing code.
-    # The actual search will be done using kb.search()
     class Retriever:
         def __init__(self, kb):
             self.kb = kb
         def get_relevant_documents(self, query):
             results = self.kb.search(query)
-            # Langchain retrievers expect a list of Document objects.
-            # We will return the content of the documents for now.
             from langchain.schema import Document
+            # Ensure that only text content is passed to Document.
+            # Image results would need different handling, or be filtered out,
+            # if the Langchain Document doesn't support them directly.
+            text_documents = [Document(page_content=r['content']) for r in results if r['type'] == 'text']
+            return text_documents
 
     return Retriever(kb)
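KnowledgeBase is now a thin facade over the new modules; a minimal usage sketch (the query strings are illustrative):

    from knowledge_base import KnowledgeBase, get_retriever

    kb = KnowledgeBase()  # builds the initial index on first run
    hits = kb.search("purplish discoloration on maize leaves", k=2)

    retriever = get_retriever()
    docs = retriever.get_relevant_documents("phosphorus deficiency remedy")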
search.py
ADDED
@@ -0,0 +1,52 @@
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from PIL import Image
+import io
+
+from database import get_db_connection, INDEX_FILE, check_if_indexed
+from security import decrypt_data
+
+MODEL_NAME = 'clip-ViT-B-32'
+
+def search(query, k=1):
+    """
+    Searches the multimodal FAISS index. The query can be text, and the result can be text or an image.
+    """
+    if not check_if_indexed():
+        return []
+
+    model = SentenceTransformer(MODEL_NAME)
+    index = faiss.read_index(INDEX_FILE)
+
+    # Create an embedding for the text query
+    query_embedding = model.encode([query]).astype('float32')
+    distances, indices = index.search(query_embedding, k)
+
+    results = []
+    conn = get_db_connection()
+    for i, faiss_id in enumerate(indices[0]):
+        if faiss_id != -1:
+            # The faiss_id is the row number, which corresponds to the chunk's primary key 'id'
+            sql_id = int(faiss_id) + 1
+
+            chunk_record = conn.execute('SELECT * FROM chunks WHERE id = ?', (sql_id,)).fetchone()
+
+            if chunk_record:
+                content_type = chunk_record['content_type']
+                decrypted_content_bytes = decrypt_data(chunk_record['encrypted_content'])
+
+                # Prepare content based on its type
+                if content_type == 'text':
+                    content = decrypted_content_bytes.decode('utf-8')
+                elif content_type == 'image':
+                    content = Image.open(io.BytesIO(decrypted_content_bytes))
+
+                results.append({
+                    'distance': distances[0][i],
+                    'content': content,
+                    'type': content_type,
+                    'page': chunk_record['page_num']
+                })
+    conn.close()
+    return results
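Consuming the mixed-type results looks like this sketch (the query text is illustrative):

    from search import search

    for hit in search("stunted growth and purplish leaves", k=3):
        if hit['type'] == 'text':
            print(hit['page'], hit['distance'], hit['content'][:80])
        else:
            # 'image' hits come back as PIL.Image.Image objects
            print(hit['page'], hit['distance'], f"image {hit['content'].size}")

One caveat: the sql_id = int(faiss_id) + 1 mapping assumes chunk rows and FAISS vectors were added in lockstep and never deleted; if that invariant breaks, results will point at the wrong chunks.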
security.py
ADDED
@@ -0,0 +1,37 @@
+import os
+from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+from cryptography.hazmat.primitives import padding
+from cryptography.hazmat.backends import default_backend
+import base64
+
+# In a real mobile app, this key would be securely managed by
+# the Android Keystore or iOS Keychain. For this skeleton, we'll
+# use an environment variable for demonstration.
+SECRET_KEY = os.environ.get("AURA_MIND_SECRET_KEY", "a_default_secret_key_32_bytes_!!").encode()
+
+if len(SECRET_KEY) != 32:
+    raise ValueError("SECRET_KEY must be 32 bytes long for AES-256.")
+
+def encrypt_data(data: bytes) -> bytes:
+    """Encrypts data using AES-CBC."""
+    iv = os.urandom(16)
+    padder = padding.PKCS7(algorithms.AES.block_size).padder()
+    padded_data = padder.update(data) + padder.finalize()
+
+    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
+    encryptor = cipher.encryptor()
+    encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
+    return iv + encrypted_data
+
+def decrypt_data(encrypted_data_with_iv: bytes) -> bytes:
+    """Decrypts data using AES-CBC."""
+    iv = encrypted_data_with_iv[:16]
+    encrypted_data = encrypted_data_with_iv[16:]
+
+    cipher = Cipher(algorithms.AES(SECRET_KEY), modes.CBC(iv), backend=default_backend())
+    decryptor = cipher.decryptor()
+    padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
+
+    unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
+    data = unpadder.update(padded_data) + unpadder.finalize()
+    return data
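A round-trip sketch; the key below is a throwaway 32-byte example and must be set before the module is imported, since SECRET_KEY is read at import time:

    import os
    os.environ["AURA_MIND_SECRET_KEY"] = "0123456789abcdef0123456789abcdef"  # demo only

    from security import encrypt_data, decrypt_data

    token = encrypt_data(b"field notes")          # a fresh random IV is prepended
    assert decrypt_data(token) == b"field notes"
    assert encrypt_data(b"field notes") != token  # different IV, different ciphertext

Because each call draws a new random IV, identical plaintexts encrypt to different byte strings, which is why the IV travels with the ciphertext.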