legaltechgc committed on
Commit
cfdb962
·
verified ·
1 Parent(s): 90a61b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -5
app.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
4
  from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
5
  from sentence_transformers import SentenceTransformer
6
  from docx import Document
7
- import PyMuPDF
8
  import requests
9
  from bs4 import BeautifulSoup
10
  from langdetect import detect, LangDetectException
@@ -47,10 +47,10 @@ if page == "Upload Knowledge":
47
  for file in uploaded_files:
48
  try:
49
  if file.type == "application/pdf":
50
- pdf_file = PyMuPDF.open(stream=file.read())
51
  text = ""
52
- for page in pdf_file.pages():
53
- text += page.get_text()
54
  elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
55
  doc = Document(file)
56
  text = " ".join([para.text for para in doc.paragraphs])
@@ -88,7 +88,6 @@ if page == "Upload Knowledge":
88
  st.write(f"Detected language: {detected_lang}")
89
  except LangDetectException:
90
  st.error("Could not detect the language of the webpage.")
91
- # Skip further processing of this URL
92
  url = None # Set URL to None or skip to prevent further processing
93
 
94
  if url: # Continue only if URL processing is valid
 
4
  from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
5
  from sentence_transformers import SentenceTransformer
6
  from docx import Document
7
+ import PyPDF2 # Use PyPDF2 instead of PyMuPDF
8
  import requests
9
  from bs4 import BeautifulSoup
10
  from langdetect import detect, LangDetectException
 
47
  for file in uploaded_files:
48
  try:
49
  if file.type == "application/pdf":
50
+ pdf_reader = PyPDF2.PdfReader(file) # Use PyPDF2 for PDF reading
51
  text = ""
52
+ for page in pdf_reader.pages:
53
+ text += page.extract_text()
54
  elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
55
  doc = Document(file)
56
  text = " ".join([para.text for para in doc.paragraphs])
 
88
  st.write(f"Detected language: {detected_lang}")
89
  except LangDetectException:
90
  st.error("Could not detect the language of the webpage.")
 
91
  url = None # Set URL to None or skip to prevent further processing
92
 
93
  if url: # Continue only if URL processing is valid