legaltechgc committed · verified
Commit 42c8348 · Parent: 3bafdc0

Update app.py: remove the SQLite persistence layer; documents and embeddings are now kept in memory only (FAISS index plus a module-level list).

Files changed (1): app.py (+2 -58)
app.py CHANGED
@@ -1,5 +1,4 @@
 import streamlit as st
-import sqlite3
 import faiss
 import numpy as np
 from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
@@ -9,63 +8,20 @@ import PyMuPDF
 import requests
 from bs4 import BeautifulSoup
 from langdetect import detect
-import os
 
 # Initialize models and pipeline
 qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased")
 embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
 
-# FAISS index setup (in-memory for this example)
+# FAISS index setup (in-memory)
 dimension = 512  # Size of the embeddings
 index = faiss.IndexFlatL2(dimension)
+documents = []
 
 # Initialize translation model for on-the-fly translation
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 
-# SQLite Database Setup
-DB_PATH = "knowledge_base.db"
-
-def init_db():
-    """ Initialize the database and tables if they don't exist. """
-    conn = sqlite3.connect(DB_PATH)
-    c = conn.cursor()
-    c.execute('''
-        CREATE TABLE IF NOT EXISTS documents (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            content TEXT NOT NULL,
-            language TEXT,
-            embedding BLOB NOT NULL
-        )
-    ''')
-    conn.commit()
-    conn.close()
-
-def store_document(content, language, embedding):
-    """ Store document content, language, and embedding in the SQLite database. """
-    conn = sqlite3.connect(DB_PATH)
-    c = conn.cursor()
-    c.execute("INSERT INTO documents (content, language, embedding) VALUES (?, ?, ?)",
-              (content, language, embedding.tobytes()))
-    conn.commit()
-    conn.close()
-
-def load_documents():
-    """ Load all documents and embeddings from the SQLite database. """
-    conn = sqlite3.connect(DB_PATH)
-    c = conn.cursor()
-    c.execute("SELECT content, language, embedding FROM documents")
-    rows = c.fetchall()
-    conn.close()
-
-    documents = []
-    embeddings = []
-    for content, language, embedding_blob in rows:
-        documents.append(content)
-        embeddings.append(np.frombuffer(embedding_blob, dtype=np.float32))
-
-    return documents, np.array(embeddings)
-
 def translate_text(text, src_lang, tgt_lang):
     """ Translate text using the M2M100 model. """
     tokenizer.src_lang = src_lang
@@ -73,12 +29,6 @@ def translate_text(text, src_lang, tgt_lang):
     generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
     return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
-# Initialize database and FAISS index
-init_db()
-documents, embeddings = load_documents()
-if len(embeddings) > 0:
-    index.add(embeddings)
-
 # Sidebar for navigation
 st.sidebar.title("Navigation")
 page = st.sidebar.radio("Go to", ["Upload Knowledge", "Q&A"])
@@ -111,9 +61,6 @@ if page == "Upload Knowledge":
         # Generate embeddings
         embedding = embedding_model.encode([text])[0]
 
-        # Store the document and embedding in the database
-        store_document(text, detected_lang, embedding)
-
         # Add the embedding to FAISS index
         index.add(np.array([embedding], dtype=np.float32))
         documents.append(text)
@@ -130,9 +77,6 @@ if page == "Upload Knowledge":
         # Generate embedding
         embedding = embedding_model.encode([text])[0]
 
-        # Store the document and embedding in the database
-        store_document(text, detected_lang, embedding)
-
         # Add the embedding to FAISS index
         index.add(np.array([embedding], dtype=np.float32))
         documents.append(text)
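With the SQLite layer removed, the corpus lives only in process memory. One caveat: Streamlit re-executes the whole script on every widget interaction, so module-level state such as `index` and `documents` is rebuilt on each rerun, and anything uploaded is lost as soon as the user clicks something else. A minimal sketch of one way to keep the change persistence-free but rerun-safe, using `st.session_state` (not part of this commit):

```python
# Sketch (not in this commit): keep the FAISS index and document list in
# st.session_state so they survive Streamlit's script reruns.
import faiss
import streamlit as st

dimension = 512  # must match the embedding model's output size

if "index" not in st.session_state:
    st.session_state.index = faiss.IndexFlatL2(dimension)
    st.session_state.documents = []

index = st.session_state.index
documents = st.session_state.documents
```

For what it's worth, `distiluse-base-multilingual-cased-v1` does produce 512-dimensional embeddings, so `dimension = 512` stays consistent with the model.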
 
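One untouched context line is worth flagging: the file does `import PyMuPDF`, but PyMuPDF's import name is `fitz` (recent releases also accept `import pymupdf`), so that line raises ModuleNotFoundError as written. A minimal extraction sketch, assuming the upload page reads PDFs roughly like this:

```python
# Sketch (assumption about the upload page): PDF text extraction with
# PyMuPDF, whose actual import name is `fitz`.
import fitz  # PyMuPDF

def extract_pdf_text(path: str) -> str:
    """Concatenate the plain text of every page in the PDF."""
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)
```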
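The kept `translate_text` helper takes ISO 639-1 codes, which is what `langdetect`'s `detect` returns and what M2M100's `get_lang_id` resolves. A hypothetical call:

```python
# Hypothetical usage: translate a detected-Spanish clause into English
# before feeding it to the QA pipeline.
text = "¿Qué es una cláusula de indemnización?"
src = detect(text)                      # -> "es"
print(translate_text(text, src, "en"))  # e.g. "What is an indemnity clause?"
```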
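The Q&A page itself falls outside the hunks shown above; a minimal sketch of how retrieval against the `IndexFlatL2` typically pairs with the QA pipeline, with the query string hypothetical and all other names taken from the code above:

```python
# Sketch (assumption: the Q&A page does something along these lines).
import numpy as np

query = "What is the notice period?"  # hypothetical user question
query_vec = np.asarray(embedding_model.encode([query]), dtype=np.float32)

k = 3  # number of nearest documents to retrieve
distances, ids = index.search(query_vec, k)

# FAISS pads with -1 when the index holds fewer than k vectors.
context = " ".join(documents[i] for i in ids[0] if i != -1)
result = qa_pipeline(question=query, context=context)
print(result["answer"], result["score"])
```

Separately: `distilbert-base-uncased` is a base checkpoint, not a QA fine-tune, so the pipeline will load but answer quality will be poor; a SQuAD-tuned variant such as `distilbert-base-uncased-distilled-squad` is the usual drop-in.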
 
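The PDF and URL branches now end in the same three embed-and-add lines; a small helper (hypothetical, not in this commit) would keep the two paths from drifting apart:

```python
# Hypothetical refactor: one helper shared by the PDF and URL upload paths.
def add_to_index(text: str) -> None:
    """Embed `text` and register it in the in-memory FAISS index."""
    embedding = embedding_model.encode([text])[0]
    index.add(np.array([embedding], dtype=np.float32))  # FAISS expects float32, shape (n, d)
    documents.append(text)
```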
 