pdf-reader / app.py
Omnibus's picture
Update app.py
73a3313
raw
history blame
2.08 kB
import gradio as gr
from bs4 import BeautifulSoup as bs
from pypdf import PdfReader
from pathlib import Path
import os
import sys
#import html5lib
#import copy
import requests
#from IPython.display import IFrame
def scrape(instring):
    """Download the PDF at ``instring`` and return an HTML update embedding it.

    The response body is streamed into ``data.pdf`` in the working
    directory, the saved file is opened with ``PdfReader``, and an
    ``<embed>`` tag pointing at the saved file is returned.

    Args:
        instring: URL of the PDF to fetch.

    Returns:
        The result of ``gr.HTML.update``: the ``<embed>`` markup on success,
        or a short error paragraph when the request fails or the server
        answers with a status other than 200.
    """
    # NOTE(review): the target path is fixed, so overlapping calls write to
    # the same file.
    out = Path("./data.pdf")
    try:
        # The context manager releases the streamed connection on every
        # path, including the early return for a non-200 status.
        with requests.get(instring, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(response.status_code)
                # Stop here so a stale or missing data.pdf from an earlier
                # call is never parsed and displayed as this URL's result.
                return gr.HTML.update(
                    f'''<p>Download failed: HTTP {response.status_code}</p>'''
                )
            with open(out, "wb") as f:
                # Write in chunks so the whole body is never held in memory.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.RequestException as err:
        # Covers malformed/empty URLs, connection errors and timeouts.
        print(err)
        return gr.HTML.update(
            '''<p>Download failed: the URL could not be fetched.</p>'''
        )
    print(out)
    # Open the saved file with the PDF parser before embedding it.
    # NOTE(review): presumably this raises for a body that is not a PDF —
    # confirm against pypdf's behaviour.
    PdfReader(out)
    # NOTE(review): the relative src is resolved by the browser against the
    # app URL — confirm that Gradio actually serves data.pdf at that path.
    return gr.HTML.update(f'''<embed src={out} type="application/pdf" width="100%" height="500px" />''')
def scrape1(instring):
    """Return an HTML update whose ``<object>`` element points at ``instring``.

    No request is made here; the URL is only placed in the ``data``
    attribute of the returned markup.

    Args:
        instring: URL of the PDF to display.

    Returns:
        The result of ``gr.HTML.update`` carrying the ``<object>`` markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # instring is presumably user-supplied (the sibling handler is wired to
    # a Textbox), so escape it and quote the attribute to keep the value
    # from breaking out of the tag.
    safe_url = escape(f'{instring}', quote=True)
    return gr.HTML.update(f'''<object data="{safe_url}" type="application/pdf" width="100%" height="500px">''')
def scrape0(instring):
    """Stream the resource at ``instring`` into ``metadata.pdf`` and embed it.

    The body is fetched once and written to ``metadata.pdf`` in the working
    directory in ``chunk_size``-byte pieces; an ``<object>`` tag pointing at
    that file is returned.

    Args:
        instring: URL of the PDF to fetch.

    Returns:
        The result of ``gr.HTML.update``: the ``<object>`` markup on success,
        or a short error paragraph when the request fails or the server
        answers with a status other than 200.
    """
    chunk_size = 2000
    url = f'{instring}'
    try:
        # A single streamed request; the context manager closes the
        # connection on every exit path.
        with requests.get(url, stream=True, timeout=30) as r:
            if r.status_code != 200:
                print(r.status_code)
                # Do not save an error page under a .pdf name.
                return gr.HTML.update(
                    f'''<p>Download failed: HTTP {r.status_code}</p>'''
                )
            with open('metadata.pdf', 'wb') as fd:
                for chunk in r.iter_content(chunk_size):
                    fd.write(chunk)
    except requests.RequestException as err:
        # Covers malformed/empty URLs, connection errors and timeouts.
        print(err)
        return gr.HTML.update(
            '''<p>Download failed: the URL could not be fetched.</p>'''
        )
    return gr.HTML.update(f'''<object data=metadata.pdf type="application/pdf" width="100%" height="500px">''')
# Assemble the UI: a textbox, a button, and an HTML pane for the result.
with gr.Blocks() as app:
    inp = gr.Textbox()
    go_btn = gr.Button()
    outp = gr.HTML()
    # A click passes the textbox value to scrape and shows what it returns
    # in the HTML pane.
    go_btn.click(fn=scrape, inputs=inp, outputs=outp)

# Turn on the request queue with concurrency_count=10, then start the app.
app.queue(concurrency_count=10)
app.launch()