legaltechgc committed on
Commit
5bb71dd
·
verified ·
1 Parent(s): 316e36c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -0
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import faiss
# The PyMuPDF distribution is imported as ``fitz``; there is no importable
# module literally named ``PyMuPDF``. The alias keeps the name this script uses.
import fitz as PyMuPDF  # for PDF handling
import numpy as np
import requests
import streamlit as st
from bs4 import BeautifulSoup
from docx import Document
from langdetect import LangDetectException, detect
from sentence_transformers import SentenceTransformer
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, pipeline
11
+
12
# Initialize models and pipeline.
#
# Streamlit re-executes this script from the top on every widget interaction.
# Objects created by plain module-level statements are therefore rebuilt on
# each rerun, which both reloads the models and discards anything previously
# added to the index. st.cache_resource keeps a single instance per server
# process and hands the same object back on every rerun.


@st.cache_resource
def _load_qa_pipeline():
    """Return the extractive question-answering pipeline (created once)."""
    # The plain "distilbert-base-uncased" checkpoint carries no trained QA
    # head; the SQuAD-distilled checkpoint is the variant fine-tuned for
    # extractive question answering.
    return pipeline(
        "question-answering",
        model="distilbert-base-uncased-distilled-squad",
    )


@st.cache_resource
def _load_embedding_model():
    """Return the multilingual sentence-embedding model (created once)."""
    return SentenceTransformer('distiluse-base-multilingual-cased-v1')


@st.cache_resource
def _load_knowledge_base(dimension):
    """Return the shared ``(faiss_index, documents)`` pair (created once).

    Both objects are mutated in place by the upload page. Because the cached
    pair is returned by reference, those mutations survive reruns.
    NOTE(review): the cache is process-wide, so every browser session shares
    one knowledge base -- confirm that is the intended scope.
    """
    return faiss.IndexFlatL2(dimension), []


@st.cache_resource
def _load_translation_model():
    """Return the M2M100 ``(tokenizer, model)`` pair (created once)."""
    return (
        M2M100Tokenizer.from_pretrained("facebook/m2m100_418M"),
        M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M"),
    )


qa_pipeline = _load_qa_pipeline()
embedding_model = _load_embedding_model()

# FAISS index setup (in-memory). The vector width is taken from the embedding
# model so the index cannot drift out of sync if the model is swapped.
index, doc_store = _load_knowledge_base(
    embedding_model.get_sentence_embedding_dimension()
)

# Initialize translation model for on-the-fly translation
tokenizer, model = _load_translation_model()
23
+
24
def _base_language(code):
    """Reduce a language tag such as ``"zh-cn"`` or ``"pt_BR"`` to ``"zh"`` / ``"pt"``."""
    return code.lower().replace("_", "-").split("-")[0]


# Function to translate text using the M2M100 model
def translate_text(text, src_lang, tgt_lang):
    """Translate *text* from ``src_lang`` to ``tgt_lang`` with the M2M100 model.

    Args:
        text: The text to translate.
        src_lang: Language code of *text*. Region-qualified codes (for
            example the ``"zh-cn"`` / ``"zh-tw"`` values produced by
            langdetect) are reduced to their base language before use.
        tgt_lang: Language code to translate into; normalised the same way.

    Returns:
        The translated string. Blank input, or a request whose source and
        target resolve to the same language, is returned unchanged without
        invoking the model.
    """
    src_code = _base_language(src_lang)
    tgt_code = _base_language(tgt_lang)

    # Nothing to translate: avoid a pointless (and slow) generation pass.
    if not text.strip() or src_code == tgt_code:
        return text

    tokenizer.src_lang = src_code
    encoded = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded,
        # Forces the decoder to start with the target-language token.
        forced_bos_token_id=tokenizer.get_lang_id(tgt_code),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
30
+
31
# Sidebar navigation: the label chosen here selects which page is rendered.
sidebar = st.sidebar
sidebar.title("Navigation")
page_choices = ["Upload Knowledge", "Q&A"]
page = sidebar.radio("Go to", page_choices)
34
+
35
# MIME types reported by the uploader for the two accepted file kinds.
_PDF_MIME = "application/pdf"
_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
_URL_TIMEOUT_SECONDS = 15  # never let a slow site hang the app indefinitely
_TOP_K = 5  # maximum number of documents used as QA context


def _extract_upload_text(upload):
    """Return the plain text of an uploaded PDF or DOCX, or None if unsupported."""
    if upload.type == _PDF_MIME:
        # Uploads live in memory, not on disk, so pass the raw bytes as a
        # stream instead of handing the upload object over as a file name.
        with PyMuPDF.open(stream=upload.getvalue(), filetype="pdf") as pdf_file:
            return "".join(pdf_page.get_text() for pdf_page in pdf_file.pages())
    if upload.type == _DOCX_MIME:
        return " ".join(para.text for para in Document(upload).paragraphs)
    return None


def _fetch_url_text(page_url):
    """Download *page_url* and return the text content of its HTML."""
    # NOTE(review): the URL is user-supplied and fetched server-side; restrict
    # allowed hosts if this app is exposed beyond trusted users.
    response = requests.get(page_url, timeout=_URL_TIMEOUT_SECONDS)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser').get_text()


# Page 1: Knowledge Upload
if page == "Upload Knowledge":
    st.title("Upload Knowledge Base")
    uploaded_files = st.file_uploader(
        "Upload your files (DOCX, PDF)",
        type=["pdf", "docx"],
        accept_multiple_files=True,
    )
    url = st.text_input("Or enter a website URL to scrape")

    if uploaded_files or url:
        st.write("Processing your data...")
        texts = []

        # Process uploaded files
        for upload in uploaded_files or []:
            try:
                text = _extract_upload_text(upload)
            except Exception as exc:
                # UI boundary: one unreadable file must not abort the batch.
                st.error(f"Could not read {upload.name}: {exc}")
                continue

            if text is None:
                st.warning(f"Skipping {upload.name}: unsupported file type {upload.type}")
                continue
            if not text.strip():
                # e.g. a scanned PDF with no text layer
                st.warning(f"Skipping {upload.name}: no extractable text found")
                continue

            # Language detection (raises when the text has no usable features)
            try:
                detected_lang = detect(text)
            except LangDetectException:
                detected_lang = "unknown"
            st.write(f"Detected language: {detected_lang}")

            texts.append(text)

        # Process URL
        if url:
            try:
                text = _fetch_url_text(url)
            except requests.RequestException as exc:
                st.error(f"Could not fetch {url}: {exc}")
            else:
                if text.strip():
                    texts.append(text)
                else:
                    st.warning(f"No text found at {url}")

        # The script reruns on every interaction, so the same upload is seen
        # repeatedly; only index documents that are not already stored.
        known_texts = set(doc_store)
        new_texts = []
        for text in texts:
            if text not in known_texts:
                known_texts.add(text)
                new_texts.append(text)

        # Create embeddings and store in FAISS
        if new_texts:
            # FAISS expects a 2-D float32 array.
            embeddings = np.asarray(embedding_model.encode(new_texts), dtype="float32")
            index.add(embeddings)
            doc_store.extend(new_texts)
            st.write("Data processed and added to knowledge base!")
        elif texts:
            st.write("These documents are already in the knowledge base.")

        # Provide a summary of the uploaded content
        for i, text in enumerate(texts):
            st.write(f"Summary of Document {i+1}:")
            st.write(text[:500] + "...")  # Display first 500 characters as a summary

# Page 2: Q&A Interface
elif page == "Q&A":
    st.title("Ask the Knowledge Base")
    user_query = st.text_input("Enter your query:")

    if user_query:
        if index.ntotal == 0 or not doc_store:
            st.warning("The knowledge base is empty. Upload documents first.")
        else:
            try:
                detected_query_lang = detect(user_query)
            except LangDetectException:
                # Undetectable input (digits, symbols): use the query as typed.
                detected_query_lang = "en"

            # Translate the query if it's in a different language than the knowledge base
            if detected_query_lang != "en":
                st.write(f"Translating query from {detected_query_lang} to English")
                user_query = translate_text(user_query, detected_query_lang, "en")

            query_embedding = np.asarray(
                embedding_model.encode([user_query]), dtype="float32"
            )
            # Never request more neighbours than exist: FAISS pads missing
            # results with -1, which would otherwise index the last document.
            top_k = min(_TOP_K, index.ntotal)
            _distances, neighbours = index.search(query_embedding, top_k)
            context = " ".join(
                doc_store[i] for i in neighbours[0] if 0 <= i < len(doc_store)
            )

            # Pass translated query and context to the QA pipeline
            result = qa_pipeline(question=user_query, context=context)
            st.write(f"Answer: {result['answer']}")