Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,7 @@ import numpy as np
|
|
4 |
from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
from docx import Document
|
7 |
-
import PyMuPDF
|
8 |
import requests
|
9 |
from bs4 import BeautifulSoup
|
10 |
from langdetect import detect, LangDetectException
|
@@ -47,10 +47,10 @@ if page == "Upload Knowledge":
|
|
47 |
for file in uploaded_files:
|
48 |
try:
|
49 |
if file.type == "application/pdf":
|
50 |
-
|
51 |
text = ""
|
52 |
-
for page in
|
53 |
-
text += page.
|
54 |
elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
55 |
doc = Document(file)
|
56 |
text = " ".join([para.text for para in doc.paragraphs])
|
@@ -88,7 +88,6 @@ if page == "Upload Knowledge":
|
|
88 |
st.write(f"Detected language: {detected_lang}")
|
89 |
except LangDetectException:
|
90 |
st.error("Could not detect the language of the webpage.")
|
91 |
-
# Skip further processing of this URL
|
92 |
url = None # Set URL to None or skip to prevent further processing
|
93 |
|
94 |
if url: # Continue only if URL processing is valid
|
|
|
4 |
from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
from docx import Document
|
7 |
+
import PyPDF2 # Use PyPDF2 instead of PyMuPDF
|
8 |
import requests
|
9 |
from bs4 import BeautifulSoup
|
10 |
from langdetect import detect, LangDetectException
|
|
|
47 |
for file in uploaded_files:
|
48 |
try:
|
49 |
if file.type == "application/pdf":
|
50 |
+
pdf_reader = PyPDF2.PdfReader(file) # Use PyPDF2 for PDF reading
|
51 |
text = ""
|
52 |
+
for page in pdf_reader.pages:
|
53 |
+
text += page.extract_text()
|
54 |
elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
55 |
doc = Document(file)
|
56 |
text = " ".join([para.text for para in doc.paragraphs])
|
|
|
88 |
st.write(f"Detected language: {detected_lang}")
|
89 |
except LangDetectException:
|
90 |
st.error("Could not detect the language of the webpage.")
|
|
|
91 |
url = None # Set URL to None or skip to prevent further processing
|
92 |
|
93 |
if url: # Continue only if URL processing is valid
|