Spaces:

opllegaltech
/

weknow

Sleeping

App Files Files Community

legaltechgc committed on Sep 22, 2024

Commit

e62b6a6

verified ·

1 Parent(s): 90c05bf

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -45

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from docx import Document
 import PyMuPDF
 import requests
 from bs4 import BeautifulSoup
-from langdetect import detect
 # Initialize models and pipeline
 qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased")
@@ -45,42 +45,60 @@ if page == "Upload Knowledge":
         # Process uploaded files
         for file in uploaded_files:
-            if file.type == "application/pdf":
-                with PyMuPDF.open(file) as pdf_file:
                     text = ""
                     for page in pdf_file.pages():
                         text += page.get_text()
-            elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-                doc = Document(file)
-                text = " ".join([para.text for para in doc.paragraphs])
-            # Language detection
-            detected_lang = detect(text)
-            st.write(f"Detected language: {detected_lang}")
-            # Generate embeddings
-            embedding = embedding_model.encode([text])[0]
-            # Add the embedding to FAISS index
-            index.add(np.array([embedding], dtype=np.float32))
-            documents.append(text)
-            texts.append(text)
         # Process URL
         if url:
-            response = requests.get(url)
-            soup = BeautifulSoup(response.text, 'html.parser')
-            text = soup.get_text()
-            detected_lang = detect(text)
-            st.write(f"Detected language: {detected_lang}")
-            # Generate embedding
-            embedding = embedding_model.encode([text])[0]
-            # Add the embedding to FAISS index
-            index.add(np.array([embedding], dtype=np.float32))
-            documents.append(text)
-            texts.append(text)
         st.write("Data processed and added to knowledge base!")
@@ -95,17 +113,22 @@ elif page == "Q&A":
     user_query = st.text_input("Enter your query:")
     if user_query:
-        detected_query_lang = detect(user_query)
-        # Translate the query if it's in a different language than the knowledge base
-        if detected_query_lang != "en":
-            st.write(f"Translating query from {detected_query_lang} to English")
-            user_query = translate_text(user_query, detected_query_lang, "en")
-        query_embedding = embedding_model.encode([user_query])
-        D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5)  # Retrieve top 5 documents
-        context = " ".join([documents[i] for i in I[0]])
-        # Pass translated query and context to the QA pipeline
-        result = qa_pipeline(question=user_query, context=context)
-        st.write(f"Answer: {result['answer']}")

 import PyMuPDF
 import requests
 from bs4 import BeautifulSoup
+from langdetect import detect, LangDetectException
 # Initialize models and pipeline
 qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased")
         # Process uploaded files
         for file in uploaded_files:
+            try:
+                if file.type == "application/pdf":
+                    pdf_file = PyMuPDF.open(stream=file.read())
                     text = ""
                     for page in pdf_file.pages():
                         text += page.get_text()
+                elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                    doc = Document(file)
+                    text = " ".join([para.text for para in doc.paragraphs])
+                else:
+                    st.error(f"Unsupported file type: {file.type}")
+                    continue
+                # Language detection
+                try:
+                    detected_lang = detect(text)
+                    st.write(f"Detected language: {detected_lang}")
+                except LangDetectException:
+                    st.error("Could not detect the language of the text.")
+                    continue
+                # Generate embeddings
+                embedding = embedding_model.encode([text])[0]
+                # Add the embedding to FAISS index
+                index.add(np.array([embedding], dtype=np.float32))
+                documents.append(text)
+                texts.append(text)
+            except Exception as e:
+                st.error(f"Error processing file: {e}")
         # Process URL
         if url:
+            try:
+                response = requests.get(url)
+                soup = BeautifulSoup(response.text, 'html.parser')
+                text = soup.get_text()
+                try:
+                    detected_lang = detect(text)
+                    st.write(f"Detected language: {detected_lang}")
+                except LangDetectException:
+                    st.error("Could not detect the language of the webpage.")
+                    continue
+                # Generate embedding
+                embedding = embedding_model.encode([text])[0]
+                # Add the embedding to FAISS index
+                index.add(np.array([embedding], dtype=np.float32))
+                documents.append(text)
+                texts.append(text)
+            except Exception as e:
+                st.error(f"Error processing URL: {e}")
         st.write("Data processed and added to knowledge base!")
     user_query = st.text_input("Enter your query:")
     if user_query:
+        try:
+            detected_query_lang = detect(user_query)
+            # Translate the query if it's in a different language than the knowledge base
+            if detected_query_lang != "en":
+                st.write(f"Translating query from {detected_query_lang} to English")
+                user_query = translate_text(user_query, detected_query_lang, "en")
+            query_embedding = embedding_model.encode([user_query])
+            D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5)  # Retrieve top 5 documents
+            context = " ".join([documents[i] for i in I[0]])
+            # Pass translated query and context to the QA pipeline
+            result = qa_pipeline(question=user_query, context=context)
+            st.write(f"Answer: {result['answer']}")
+        except LangDetectException:
+            st.error("Could not detect the language of the query.")
+        except Exception as e:
+            st.error(f"Error during Q&A processing: {e}")