gradio-LegalNER / src /read_file.py
arosyihuddin's picture
add files
a450bc7
raw
history blame
476 Bytes
import PyPDF2
from clean_text import *
import requests
def read_pdf(file_pdf):
    """Extract the text of every page of a PDF file, cleaned and concatenated.

    Each page's extracted text is passed through ``clean_text`` and the
    results are joined in page order with no separator.

    Args:
        file_pdf: Path of the PDF file; it is opened in binary read mode.

    Returns:
        The concatenated page text with leading and trailing whitespace
        stripped. ``None`` is returned (after printing the error) when a
        ``requests.exceptions.RequestException`` is caught.

    Raises:
        Any other exception raised while opening, parsing or cleaning the
        file propagates to the caller unchanged.
    """
    try:
        # The context manager guarantees the handle is closed even when
        # parsing or text extraction raises part-way through the document.
        with open(file_pdf, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Pages must be consumed while the file is still open, so the
            # join happens inside the ``with`` block.
            pdf_text = ''.join(
                clean_text(page.extract_text()) for page in pdf_reader.pages
            )
        return pdf_text.strip()
    except requests.exceptions.RequestException as e:
        # NOTE(review): nothing in this function visibly performs an HTTP
        # request, so this handler looks unreachable. It is kept so the set
        # of exceptions that propagate to callers stays exactly as before;
        # confirm whether OSError / PDF parse errors were meant instead.
        print("Error:", e)