# pdf-reader / app.py
# Last change: "Update app.py" (commit a923971) by Omnibus; file size 2.41 kB.
import gradio as gr
from bs4 import BeautifulSoup as bs
from pypdf import PdfReader
from pathlib import Path
import os
import sys
#import html5lib
#import copy
import requests
#from IPython.display import IFrame
def scrape(instring):
    """Return an HTML update showing a fixed sample PDF in Google's document viewer.

    Args:
        instring: Value passed in by the caller. It is not used; the sample
            PDF URL is hard-coded in the markup below.

    Returns:
        The result of ``gr.HTML.update`` carrying the viewer markup.
    """
    # The markup is kept flush-left so the string content stays exactly as is.
    viewer_markup = '''
<div style="text-align:center">
<h4>Pdf viewer testing</h4>
<iframe src="https://docs.google.com/viewer?url=http://www.pdf995.com/samples/pdf.pdf&embedded=true" frameborder="0" height="500px" width="100%"></iframe>
</div>'''
    return gr.HTML.update(viewer_markup)
def scrape00(instring):
    """Download the PDF at ``instring`` to ``data.pdf`` and return an ``<embed>`` for it.

    Args:
        instring: URL of the PDF to fetch.

    Returns:
        A ``gr.HTML`` update. On HTTP 200 it embeds ``data.pdf``. On any other
        status it carries a short failure message instead, so a missing or
        stale ``data.pdf`` left over from an earlier call is never shown.

    Raises:
        requests.RequestException: If the request itself fails.
        Other exceptions may propagate from ``PdfReader`` when the downloaded
        file cannot be read as a PDF.
    """
    out = Path("./data.pdf")

    # The context manager releases the streamed connection on every path,
    # including the early return for a failed download.
    with requests.get(instring, stream=True) as response:
        if response.status_code != 200:
            print(response.status_code)
            return gr.HTML.update(
                f"<p>Could not download the PDF (HTTP status {response.status_code}).</p>"
            )
        # Write in chunks so the whole file is never held in memory at once.
        with open(out, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print(out)

    # Open the download with pypdf so a non-PDF payload fails here rather than
    # being embedded. The page count and first-page text that used to be
    # extracted were never used, so they are no longer computed.
    PdfReader(out)

    # NOTE(review): ``src`` is a path relative to the server's working
    # directory; confirm the browser can actually fetch it through Gradio.
    return gr.HTML.update(f'''<embed src={out} type="application/pdf" width="100%" height="500px" />''')
def scrape1(instring):
    """Return an ``<object>`` element that points the browser at the PDF URL.

    Args:
        instring: URL of the PDF to display. It comes from user input, so it
            is HTML-escaped before being placed in the markup.

    Returns:
        A ``gr.HTML`` update containing an ``<object>`` whose ``data``
        attribute is the escaped URL.
    """
    # Local import keeps this block self-contained; only this function needs it.
    import html

    # The URL is no longer downloaded and parsed with BeautifulSoup here: the
    # prettified result was never used, and the browser loads the URL itself.

    # Escape and quote the attribute value so characters such as spaces,
    # quotes or angle brackets in the URL cannot break out of the attribute.
    safe_url = html.escape(f'{instring}', quote=True)

    # <object> requires an explicit end tag.
    return gr.HTML.update(
        f'''<object data="{safe_url}" type="application/pdf" width="100%" height="500px"></object>'''
    )
def scrape0(instring):
    """Stream the file at ``instring`` into ``metadata.pdf`` and return an ``<object>`` for it.

    Args:
        instring: URL of the file to download.

    Returns:
        A ``gr.HTML`` update containing an ``<object>`` that references
        ``metadata.pdf``.

    Raises:
        requests.RequestException: If the request itself fails.

    The HTTP status is not checked: whatever body the server returns is
    written to ``metadata.pdf`` as-is.
    """
    chunk_size = 2000
    url = f'{instring}'

    # One streamed request only. The second, non-streamed download that fed an
    # unused BeautifulSoup object has been removed, and the context manager
    # ensures the connection is released.
    with requests.get(url, stream=True) as r:
        with open('metadata.pdf', 'wb') as fd:
            for chunk in r.iter_content(chunk_size):
                fd.write(chunk)

    # The former ``r.content`` read after the stream was consumed always
    # failed and printed "No Divs"; its result was unused, so it is gone.

    # NOTE(review): ``data`` is a path relative to the server's working
    # directory; confirm the browser can actually fetch it through Gradio.
    # <object> requires an explicit end tag.
    return gr.HTML.update('''<object data=metadata.pdf type="application/pdf" width="100%" height="500px"></object>''')
# UI: a textbox for input, a button, and an HTML area for the result.
with gr.Blocks() as app:
    inp = gr.Textbox()
    go_btn = gr.Button()
    outp = gr.HTML()
    # Clicking the button passes the textbox value to scrape() and renders
    # whatever it returns in the HTML component.
    go_btn.click(fn=scrape, inputs=inp, outputs=outp)

# Enable the request queue with 10 concurrent workers, then start the server.
app.queue(concurrency_count=10)
app.launch()