# Spaces: Running
import html
import os
import sys
from pathlib import Path

import gradio as gr
import requests
from bs4 import BeautifulSoup as bs
from pypdf import PdfReader

#import html5lib
#import copy
#from IPython.display import IFrame
def scrape(instring):
    """Return a fixed demo PDF viewer as a ``gr.HTML`` update.

    Args:
        instring: Accepted so the function can be used as an event handler
            with one input; its value is not used. The viewer always shows
            the same hard-coded sample PDF.

    Returns:
        The result of ``gr.HTML.update`` carrying the viewer markup.
    """
    # Assemble the markup line by line; the leading empty entry keeps the
    # newline that precedes the opening <div>.
    viewer_markup = "\n".join([
        "",
        '<div style="text-align:center">',
        "<h4>Pdf viewer testing</h4>",
        '<iframe src="https://docs.google.com/viewer?url=http://www.pdf995.com/samples/pdf.pdf&embedded=true" frameborder="0" height="500px" width="100%"></iframe>',
        "</div>",
    ])
    return gr.HTML.update(viewer_markup)
def scrape00(instring):
    """Download the PDF at ``instring`` to ``data.pdf`` and embed it.

    Args:
        instring: URL of the PDF to download.

    Returns:
        A ``gr.HTML`` update. On success it holds an ``<embed>`` element whose
        ``src`` is the saved file; if the request fails or the server does
        not answer with status 200 it holds a short error message instead.

    Raises:
        Whatever ``PdfReader`` raises when the downloaded bytes cannot be
        parsed as a PDF.
    """
    out = Path("./data.pdf")
    try:
        # The whole body is read at once below, so streaming buys nothing;
        # the timeout keeps a stalled server from blocking the handler.
        response = requests.get(instring, timeout=30)
    except requests.RequestException as err:
        # Covers unreachable hosts as well as empty or malformed URLs.
        print(err)
        return gr.HTML.update(f"<p>Download failed: {html.escape(str(err))}</p>")

    if response.status_code != 200:
        print(response.status_code)
        # Stop here: continuing would parse and show whatever data.pdf was
        # left on disk by an earlier call (or fail because none exists).
        return gr.HTML.update(
            f"<p>Download failed with HTTP status {response.status_code}</p>"
        )

    out.write_bytes(response.content)
    print(out)

    # Open the saved file with pypdf so that a download that is not a
    # parseable PDF fails here instead of yielding a broken viewer.
    PdfReader(str(out))

    # NOTE(review): `src` is a server-side relative path; confirm the
    # front end can actually resolve it.
    return gr.HTML.update(f'''<embed src={out} type="application/pdf" width="100%" height="500px" />''')
def scrape1(instring):
    """Embed the PDF at ``instring`` with an ``<object>`` element.

    The returned markup only references the URL, so nothing is downloaded
    or parsed on the server side.

    Args:
        instring: URL of the PDF to display. Treated as untrusted text.

    Returns:
        A ``gr.HTML`` update containing the ``<object>`` markup.
    """
    # Escape the value and quote the attribute so a crafted input cannot
    # terminate `data` early and inject its own attributes or tags.
    safe_url = html.escape(str(instring), quote=True)
    return gr.HTML.update(
        f'<object data="{safe_url}" type="application/pdf" width="100%" height="500px"></object>'
    )
def scrape0(instring):
    """Stream the resource at ``instring`` into ``metadata.pdf`` and embed it.

    Args:
        instring: URL of the PDF to download.

    Returns:
        A ``gr.HTML`` update. On success it holds an ``<object>`` element
        whose ``data`` is ``metadata.pdf``; if the request fails or the
        server returns an error status it holds a short error message.
    """
    chunk_size = 2000
    url = f'{instring}'
    try:
        # One streamed request; the context manager releases the connection
        # even if writing the file fails part-way.
        with requests.get(url, stream=True, timeout=30) as r:
            # Check the status before opening the file so an error page
            # never overwrites a previously saved document.
            r.raise_for_status()
            with open('metadata.pdf', 'wb') as fd:
                for chunk in r.iter_content(chunk_size):
                    fd.write(chunk)
    except requests.RequestException as err:
        # Covers connection problems, bad URLs and HTTP error statuses.
        print(err)
        return gr.HTML.update(f'<p>Download failed: {html.escape(str(err))}</p>')

    # NOTE(review): `data` is a server-side relative path; confirm the
    # front end can actually resolve it.
    return gr.HTML.update('''<object data=metadata.pdf type="application/pdf" width="100%" height="500px"></object>''')
# UI layout: a text box, a button, and an HTML pane for the handler's output.
with gr.Blocks() as app:
    inp = gr.Textbox()
    go_btn = gr.Button()
    outp = gr.HTML()
    # A click sends the text box value to `scrape` and writes the returned
    # update into the HTML pane.
    go_btn.click(fn=scrape, inputs=inp, outputs=outp)

# Turn on the request queue, then start the server.
app.queue(concurrency_count=10)
app.launch()