legaltechgc committed on
Commit
e62b6a6
·
verified ·
1 Parent(s): 90c05bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -45
app.py CHANGED
@@ -7,7 +7,7 @@ from docx import Document
7
  import PyMuPDF
8
  import requests
9
  from bs4 import BeautifulSoup
10
- from langdetect import detect
11
 
12
  # Initialize models and pipeline
13
  qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased")
@@ -45,42 +45,60 @@ if page == "Upload Knowledge":
45
 
46
  # Process uploaded files
47
  for file in uploaded_files:
48
- if file.type == "application/pdf":
49
- with PyMuPDF.open(file) as pdf_file:
 
50
  text = ""
51
  for page in pdf_file.pages():
52
  text += page.get_text()
53
- elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
54
- doc = Document(file)
55
- text = " ".join([para.text for para in doc.paragraphs])
56
-
57
- # Language detection
58
- detected_lang = detect(text)
59
- st.write(f"Detected language: {detected_lang}")
60
-
61
- # Generate embeddings
62
- embedding = embedding_model.encode([text])[0]
63
-
64
- # Add the embedding to FAISS index
65
- index.add(np.array([embedding], dtype=np.float32))
66
- documents.append(text)
67
- texts.append(text)
 
 
 
 
 
 
 
 
 
68
 
69
  # Process URL
70
  if url:
71
- response = requests.get(url)
72
- soup = BeautifulSoup(response.text, 'html.parser')
73
- text = soup.get_text()
74
- detected_lang = detect(text)
75
- st.write(f"Detected language: {detected_lang}")
76
-
77
- # Generate embedding
78
- embedding = embedding_model.encode([text])[0]
79
-
80
- # Add the embedding to FAISS index
81
- index.add(np.array([embedding], dtype=np.float32))
82
- documents.append(text)
83
- texts.append(text)
 
 
 
 
 
 
 
 
84
 
85
  st.write("Data processed and added to knowledge base!")
86
 
@@ -95,17 +113,22 @@ elif page == "Q&A":
95
  user_query = st.text_input("Enter your query:")
96
 
97
  if user_query:
98
- detected_query_lang = detect(user_query)
99
-
100
- # Translate the query if it's in a different language than the knowledge base
101
- if detected_query_lang != "en":
102
- st.write(f"Translating query from {detected_query_lang} to English")
103
- user_query = translate_text(user_query, detected_query_lang, "en")
104
-
105
- query_embedding = embedding_model.encode([user_query])
106
- D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5) # Retrieve top 5 documents
107
- context = " ".join([documents[i] for i in I[0]])
108
-
109
- # Pass translated query and context to the QA pipeline
110
- result = qa_pipeline(question=user_query, context=context)
111
- st.write(f"Answer: {result['answer']}")
 
 
 
 
 
 
7
  import PyMuPDF
8
  import requests
9
  from bs4 import BeautifulSoup
10
+ from langdetect import detect, LangDetectException
11
 
12
  # Initialize models and pipeline
13
  qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased")
 
45
 
46
  # Process uploaded files
47
  for file in uploaded_files:
48
+ try:
49
+ if file.type == "application/pdf":
50
+ pdf_file = PyMuPDF.open(stream=file.read())
51
  text = ""
52
  for page in pdf_file.pages():
53
  text += page.get_text()
54
+ elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
55
+ doc = Document(file)
56
+ text = " ".join([para.text for para in doc.paragraphs])
57
+ else:
58
+ st.error(f"Unsupported file type: {file.type}")
59
+ continue
60
+
61
+ # Language detection
62
+ try:
63
+ detected_lang = detect(text)
64
+ st.write(f"Detected language: {detected_lang}")
65
+ except LangDetectException:
66
+ st.error("Could not detect the language of the text.")
67
+ continue
68
+
69
+ # Generate embeddings
70
+ embedding = embedding_model.encode([text])[0]
71
+
72
+ # Add the embedding to FAISS index
73
+ index.add(np.array([embedding], dtype=np.float32))
74
+ documents.append(text)
75
+ texts.append(text)
76
+ except Exception as e:
77
+ st.error(f"Error processing file: {e}")
78
 
79
  # Process URL
80
  if url:
81
+ try:
82
+ response = requests.get(url)
83
+ soup = BeautifulSoup(response.text, 'html.parser')
84
+ text = soup.get_text()
85
+
86
+ try:
87
+ detected_lang = detect(text)
88
+ st.write(f"Detected language: {detected_lang}")
89
+ except LangDetectException:
90
+ st.error("Could not detect the language of the webpage.")
91
+ continue
92
+
93
+ # Generate embedding
94
+ embedding = embedding_model.encode([text])[0]
95
+
96
+ # Add the embedding to FAISS index
97
+ index.add(np.array([embedding], dtype=np.float32))
98
+ documents.append(text)
99
+ texts.append(text)
100
+ except Exception as e:
101
+ st.error(f"Error processing URL: {e}")
102
 
103
  st.write("Data processed and added to knowledge base!")
104
 
 
113
  user_query = st.text_input("Enter your query:")
114
 
115
  if user_query:
116
+ try:
117
+ detected_query_lang = detect(user_query)
118
+
119
+ # Translate the query if it's in a different language than the knowledge base
120
+ if detected_query_lang != "en":
121
+ st.write(f"Translating query from {detected_query_lang} to English")
122
+ user_query = translate_text(user_query, detected_query_lang, "en")
123
+
124
+ query_embedding = embedding_model.encode([user_query])
125
+ D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5) # Retrieve top 5 documents
126
+ context = " ".join([documents[i] for i in I[0]])
127
+
128
+ # Pass translated query and context to the QA pipeline
129
+ result = qa_pipeline(question=user_query, context=context)
130
+ st.write(f"Answer: {result['answer']}")
131
+ except LangDetectException:
132
+ st.error("Could not detect the language of the query.")
133
+ except Exception as e:
134
+ st.error(f"Error during Q&A processing: {e}")