legaltechgc committed
Commit 051a2c5 · verified · 1 Parent(s): 5bb71dd

Update app.py

Files changed (1):
  1. app.py +81 -13
app.py CHANGED
@@ -1,33 +1,84 @@
 import streamlit as st
+import sqlite3
+import faiss
+import numpy as np
 from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
-import PyMuPDF # for PDF handling
+from sentence_transformers import SentenceTransformer
 from docx import Document
+import PyMuPDF
 import requests
 from bs4 import BeautifulSoup
-import faiss
-import numpy as np
-from sentence_transformers import SentenceTransformer
 from langdetect import detect
+import os
 
 # Initialize models and pipeline
 qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased")
 embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
 
 # FAISS index setup (in-memory for this example)
-index = faiss.IndexFlatL2(512)
-doc_store = []
+dimension = 512 # Size of the embeddings
+index = faiss.IndexFlatL2(dimension)
 
 # Initialize translation model for on-the-fly translation
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 
-# Function to translate text using the M2M100 model
+# SQLite Database Setup
+DB_PATH = "knowledge_base.db"
+
+def init_db():
+    """ Initialize the database and tables if they don't exist. """
+    conn = sqlite3.connect(DB_PATH)
+    c = conn.cursor()
+    c.execute('''
+        CREATE TABLE IF NOT EXISTS documents (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            content TEXT NOT NULL,
+            language TEXT,
+            embedding BLOB NOT NULL
+        )
+    ''')
+    conn.commit()
+    conn.close()
+
+def store_document(content, language, embedding):
+    """ Store document content, language, and embedding in the SQLite database. """
+    conn = sqlite3.connect(DB_PATH)
+    c = conn.cursor()
+    c.execute("INSERT INTO documents (content, language, embedding) VALUES (?, ?, ?)",
+              (content, language, embedding.tobytes()))
+    conn.commit()
+    conn.close()
+
+def load_documents():
+    """ Load all documents and embeddings from the SQLite database. """
+    conn = sqlite3.connect(DB_PATH)
+    c = conn.cursor()
+    c.execute("SELECT content, language, embedding FROM documents")
+    rows = c.fetchall()
+    conn.close()
+
+    documents = []
+    embeddings = []
+    for content, language, embedding_blob in rows:
+        documents.append(content)
+        embeddings.append(np.frombuffer(embedding_blob, dtype=np.float32))
+
+    return documents, np.array(embeddings)
+
 def translate_text(text, src_lang, tgt_lang):
+    """ Translate text using the M2M100 model. """
     tokenizer.src_lang = src_lang
     encoded = tokenizer(text, return_tensors="pt")
     generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
     return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
+# Initialize database and FAISS index
+init_db()
+documents, embeddings = load_documents()
+if len(embeddings) > 0:
+    index.add(embeddings)
+
 # Sidebar for navigation
 st.sidebar.title("Navigation")
 page = st.sidebar.radio("Go to", ["Upload Knowledge", "Q&A"])
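
The persistence scheme above round-trips embeddings through a BLOB column: tobytes() on write, np.frombuffer() on read. The dtype must match on both sides; SentenceTransformer.encode returns float32 by default, which is what load_documents() assumes. A minimal check of that round-trip, independent of the app:

import numpy as np

# Stand-in for a real embedding; matches dimension = 512 above.
vec = np.random.rand(512).astype(np.float32)
blob = vec.tobytes()                              # the value INSERTed into the BLOB column
restored = np.frombuffer(blob, dtype=np.float32)  # what load_documents() rebuilds
assert np.array_equal(vec, restored)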
@@ -57,6 +108,15 @@ if page == "Upload Knowledge":
         detected_lang = detect(text)
         st.write(f"Detected language: {detected_lang}")
 
+        # Generate embeddings
+        embedding = embedding_model.encode([text])[0]
+
+        # Store the document and embedding in the database
+        store_document(text, detected_lang, embedding)
+
+        # Add the embedding to FAISS index
+        index.add(np.array([embedding], dtype=np.float32))
+        documents.append(text)
         texts.append(text)
 
     # Process URL
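
Why the upload path wraps a single embedding before index.add(): IndexFlatL2 takes a 2-D float32 array of shape (n, d), with d fixed at the 512 the index was created with (distiluse-base-multilingual-cased-v1 does emit 512-dimensional vectors). A minimal sketch with a random stand-in vector:

import faiss
import numpy as np

dimension = 512
index = faiss.IndexFlatL2(dimension)
embedding = np.random.rand(dimension).astype(np.float32)  # stand-in for encode([text])[0]
index.add(np.array([embedding], dtype=np.float32))        # shape (1, 512), as in the diff
print(index.ntotal)  # 1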
@@ -64,12 +124,20 @@ if page == "Upload Knowledge":
         response = requests.get(url)
         soup = BeautifulSoup(response.text, 'html.parser')
         text = soup.get_text()
+        detected_lang = detect(text)
+        st.write(f"Detected language: {detected_lang}")
+
+        # Generate embedding
+        embedding = embedding_model.encode([text])[0]
+
+        # Store the document and embedding in the database
+        store_document(text, detected_lang, embedding)
+
+        # Add the embedding to FAISS index
+        index.add(np.array([embedding], dtype=np.float32))
+        documents.append(text)
         texts.append(text)
 
-    # Create embeddings and store in FAISS
-    embeddings = embedding_model.encode(texts)
-    index.add(embeddings)
-    doc_store.extend(texts)
     st.write("Data processed and added to knowledge base!")
 
     # Provide a summary of the uploaded content
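
The URL branch now mirrors the file branch. One caveat with the scraping step itself: plain soup.get_text() also returns the contents of script and style tags plus raw page whitespace. A hedged variant of the same fetch; the timeout, status check, and tag stripping are suggestions, not part of the committed code:

import requests
from bs4 import BeautifulSoup

response = requests.get("https://example.com", timeout=10)  # hypothetical URL; timeout is an assumption
response.raise_for_status()                                 # fail fast on HTTP errors
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup(["script", "style"]):
    tag.decompose()                                         # get_text() would otherwise include their contents
text = soup.get_text(separator=" ", strip=True)             # normalizes the raw whitespace of plain get_text()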
@@ -91,8 +159,8 @@ elif page == "Q&A":
         user_query = translate_text(user_query, detected_query_lang, "en")
 
     query_embedding = embedding_model.encode([user_query])
-    D, I = index.search(query_embedding, k=5) # Retrieve top 5 documents
-    context = " ".join([doc_store[i] for i in I[0]])
+    D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5) # Retrieve top 5 documents
+    context = " ".join([documents[i] for i in I[0]])
 
     # Pass translated query and context to the QA pipeline
     result = qa_pipeline(question=user_query, context=context)
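
A self-contained toy run of the retrieval step (the document texts and the question are made up): search() returns L2 distances D and row indices I, each shaped (n_queries, k), and the indices line up with the documents list because rows were added in the same order. Note that FAISS pads I with -1 when the index holds fewer than k vectors, which the list comprehension in the commit does not guard against:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
documents = ["The notice period is 30 days.", "Payment is due within 14 days."]  # made-up contents
index = faiss.IndexFlatL2(512)
index.add(np.array(embedding_model.encode(documents), dtype=np.float32))

query_embedding = embedding_model.encode(["How long is the notice period?"])  # hypothetical question
D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5)
for dist, idx in zip(D[0], I[0]):
    if idx == -1:
        continue  # only 2 vectors stored, so k=5 returns padded slots
    print(f"{dist:.3f}  {documents[idx]}")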
 