legaltechgc committed · verified
Commit 42c8348 · Parent: 3bafdc0

Update app.py: remove the SQLite persistence layer; documents and embeddings are now kept in memory only (FAISS index plus a module-level list).

Files changed (1): app.py (+2 -58)
app.py CHANGED
@@ -1,5 +1,4 @@
 import streamlit as st
-import sqlite3
 import faiss
 import numpy as np
 from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
@@ -9,63 +8,20 @@ import PyMuPDF
 import requests
 from bs4 import BeautifulSoup
 from langdetect import detect
-import os
 
 # Initialize models and pipeline
 qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased")
 embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
 
-# FAISS index setup (in-memory for this example)
+# FAISS index setup (in-memory)
 dimension = 512  # Size of the embeddings
 index = faiss.IndexFlatL2(dimension)
+documents = []
 
 # Initialize translation model for on-the-fly translation
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 
-# SQLite Database Setup
-DB_PATH = "knowledge_base.db"
-
-def init_db():
-    """ Initialize the database and tables if they don't exist. """
-    conn = sqlite3.connect(DB_PATH)
-    c = conn.cursor()
-    c.execute('''
-        CREATE TABLE IF NOT EXISTS documents (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            content TEXT NOT NULL,
-            language TEXT,
-            embedding BLOB NOT NULL
-        )
-    ''')
-    conn.commit()
-    conn.close()
-
-def store_document(content, language, embedding):
-    """ Store document content, language, and embedding in the SQLite database. """
-    conn = sqlite3.connect(DB_PATH)
-    c = conn.cursor()
-    c.execute("INSERT INTO documents (content, language, embedding) VALUES (?, ?, ?)",
-              (content, language, embedding.tobytes()))
-    conn.commit()
-    conn.close()
-
-def load_documents():
-    """ Load all documents and embeddings from the SQLite database. """
-    conn = sqlite3.connect(DB_PATH)
-    c = conn.cursor()
-    c.execute("SELECT content, language, embedding FROM documents")
-    rows = c.fetchall()
-    conn.close()
-
-    documents = []
-    embeddings = []
-    for content, language, embedding_blob in rows:
-        documents.append(content)
-        embeddings.append(np.frombuffer(embedding_blob, dtype=np.float32))
-
-    return documents, np.array(embeddings)
-
 def translate_text(text, src_lang, tgt_lang):
     """ Translate text using the M2M100 model. """
     tokenizer.src_lang = src_lang
@@ -73,12 +29,6 @@ def translate_text(text, src_lang, tgt_lang):
     generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
     return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
-# Initialize database and FAISS index
-init_db()
-documents, embeddings = load_documents()
-if len(embeddings) > 0:
-    index.add(embeddings)
-
 # Sidebar for navigation
 st.sidebar.title("Navigation")
 page = st.sidebar.radio("Go to", ["Upload Knowledge", "Q&A"])
@@ -111,9 +61,6 @@ if page == "Upload Knowledge":
         # Generate embeddings
         embedding = embedding_model.encode([text])[0]
 
-        # Store the document and embedding in the database
-        store_document(text, detected_lang, embedding)
-
         # Add the embedding to FAISS index
         index.add(np.array([embedding], dtype=np.float32))
         documents.append(text)
@@ -130,9 +77,6 @@ if page == "Upload Knowledge":
         # Generate embedding
         embedding = embedding_model.encode([text])[0]
 
-        # Store the document and embedding in the database
-        store_document(text, detected_lang, embedding)
-
         # Add the embedding to FAISS index
         index.add(np.array([embedding], dtype=np.float32))
         documents.append(text)
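With the SQLite layer removed, the corpus lives only in process memory. One caveat: Streamlit re-executes the whole script on every widget interaction, so module-level state such as `index` and `documents` is rebuilt on each rerun, and anything uploaded is lost as soon as the user clicks something else. A minimal sketch of one way to keep the change persistence-free but rerun-safe, using `st.session_state` (not part of this commit):

```python
# Sketch (not in this commit): keep the FAISS index and document list in
# st.session_state so they survive Streamlit's script reruns.
import faiss
import streamlit as st

dimension = 512  # must match the embedding model's output size

if "index" not in st.session_state:
    st.session_state.index = faiss.IndexFlatL2(dimension)
    st.session_state.documents = []

index = st.session_state.index
documents = st.session_state.documents
```

For what it's worth, `distiluse-base-multilingual-cased-v1` does produce 512-dimensional embeddings, so `dimension = 512` stays consistent with the model.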
 
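One untouched context line is worth flagging: the file does `import PyMuPDF`, but PyMuPDF's import name is `fitz` (recent releases also accept `import pymupdf`), so that line raises ModuleNotFoundError as written. A minimal extraction sketch, assuming the upload page reads PDFs roughly like this:

```python
# Sketch (assumption about the upload page): PDF text extraction with
# PyMuPDF, whose actual import name is `fitz`.
import fitz  # PyMuPDF

def extract_pdf_text(path: str) -> str:
    """Concatenate the plain text of every page in the PDF."""
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)
```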
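The kept `translate_text` helper takes ISO 639-1 codes, which is what `langdetect`'s `detect` returns and what M2M100's `get_lang_id` resolves. A hypothetical call:

```python
# Hypothetical usage: translate a detected-Spanish clause into English
# before feeding it to the QA pipeline.
text = "¿Qué es una cláusula de indemnización?"
src = detect(text)                      # -> "es"
print(translate_text(text, src, "en"))  # e.g. "What is an indemnity clause?"
```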
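The Q&A page itself falls outside the hunks shown above; a minimal sketch of how retrieval against the `IndexFlatL2` typically pairs with the QA pipeline, with the query string hypothetical and all other names taken from the code above:

```python
# Sketch (assumption: the Q&A page does something along these lines).
import numpy as np

query = "What is the notice period?"  # hypothetical user question
query_vec = np.asarray(embedding_model.encode([query]), dtype=np.float32)

k = 3  # number of nearest documents to retrieve
distances, ids = index.search(query_vec, k)

# FAISS pads with -1 when the index holds fewer than k vectors.
context = " ".join(documents[i] for i in ids[0] if i != -1)
result = qa_pipeline(question=query, context=context)
print(result["answer"], result["score"])
```

Separately: `distilbert-base-uncased` is a base checkpoint, not a QA fine-tune, so the pipeline will load but answer quality will be poor; a SQuAD-tuned variant such as `distilbert-base-uncased-distilled-squad` is the usual drop-in.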
 
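The PDF and URL branches now end in the same three embed-and-add lines; a small helper (hypothetical, not in this commit) would keep the two paths from drifting apart:

```python
# Hypothetical refactor: one helper shared by the PDF and URL upload paths.
def add_to_index(text: str) -> None:
    """Embed `text` and register it in the in-memory FAISS index."""
    embedding = embedding_model.encode([text])[0]
    index.add(np.array([embedding], dtype=np.float32))  # FAISS expects float32, shape (n, d)
    documents.append(text)
```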
 