import gradio as gr
from bs4 import BeautifulSoup as bs
from pypdf import PdfReader
from pathlib import Path
import requests
def scrape(instring):
    # Hardcoded demo: render a sample PDF through the Google Docs viewer.
    # Note: instring is ignored here; a parameterized sketch follows below.
    html_src = '''
    <div style="text-align:center">
    <h4>Pdf viewer testing</h4>
    <iframe src="https://docs.google.com/viewer?url=http://www.pdf995.com/samples/pdf.pdf&embedded=true" frameborder="0" height="500px" width="100%"></iframe>
    </div>'''
    return gr.HTML.update(html_src)
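# Hedged sketch (an assumption, not in the original app): the same Google Docs
# viewer, but driven by the user-supplied URL instead of the hardcoded sample.
# quote() escapes the URL so it survives being nested inside the viewer's
# query string. The function name scrape_viewer is hypothetical.
from urllib.parse import quote

def scrape_viewer(instring):
    src = f"https://docs.google.com/viewer?url={quote(instring, safe='')}&embedded=true"
    return gr.HTML.update(
        f'<iframe src="{src}" frameborder="0" height="500px" width="100%"></iframe>'
    )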
def scrape00(instring):
    # Download the PDF to disk, extract some text with pypdf, then embed it.
    response = requests.get(instring, stream=True)
    if response.status_code != 200:
        return gr.HTML.update(f"<p>Download failed: HTTP {response.status_code}</p>")
    with open("data.pdf", "wb") as f:
        f.write(response.content)
    out = Path("./data.pdf")
    print(out)
    reader = PdfReader("data.pdf")
    number_of_pages = len(reader.pages)
    text = reader.pages[0].extract_text()
    print(f"{number_of_pages} pages; first page begins: {text[:80]!r}")
    # Caveat: browsers cannot fetch a server-local path like ./data.pdf,
    # so this <embed> usually renders blank (see the base64 sketch below).
    return gr.HTML.update(f'<embed src="{out}" type="application/pdf" width="100%" height="500px" />')
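# Hedged alternative sketch (not part of the original app): since the browser
# cannot read the server-local file, the PDF bytes can instead be inlined as a
# base64 data URI. The function name scrape00_b64 is hypothetical.
import base64

def scrape00_b64(instring):
    response = requests.get(instring, timeout=30)
    response.raise_for_status()
    # Encode the raw PDF bytes so the browser can render them without file access.
    b64 = base64.b64encode(response.content).decode("utf-8")
    data_uri = f"data:application/pdf;base64,{b64}"
    return gr.HTML.update(f'<embed src="{data_uri}" type="application/pdf" width="100%" height="500px" />')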
def scrape1(instring):
    # Set the url to perform the GET request.
    URL = f'{instring}'
    page = requests.get(URL)
    # Load the page content and make a soup object, using the html parser.
    soup = bs(page.content, "html.parser")
    out = str(soup.prettify())  # parsed source; unused by the return value below
    # Embed the remote URL directly; the data attribute is quoted and the tag closed.
    return gr.HTML.update(f'<object data="{instring}" type="application/pdf" width="100%" height="500px"></object>')
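# Hedged companion sketch: scrape1 prettifies the fetched page but never shows
# it. If the intent was to display the HTML source itself, one option (an
# assumption, not the original behavior) is to escape it into a <pre> block.
import html

def show_source(instring):
    soup = bs(requests.get(instring, timeout=30).text, "html.parser")
    return gr.HTML.update(f"<pre>{html.escape(soup.prettify())}</pre>")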
def scrape0(instring):
    # Stream the PDF to disk in chunks, then point an <object> at the saved file.
    chunk_size = 2000
    url = f'{instring}'
    r = requests.get(url, stream=True)
    with open('metadata.pdf', 'wb') as fd:
        for chunk in r.iter_content(chunk_size):
            fd.write(chunk)
    # Caveat: as with scrape00, a bare local filename in the data attribute is
    # not reachable from the browser unless the file is actually served.
    return gr.HTML.update('<object data="metadata.pdf" type="application/pdf" width="100%" height="500px"></object>')
with gr.Blocks() as app:
    inp = gr.Textbox()
    go_btn = gr.Button()
    outp = gr.HTML()
    go_btn.click(scrape, inp, outp)

app.queue(concurrency_count=10).launch()
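# Hedged migration note (an assumption, not from the original Space): Gradio 4.x
# removed component .update() methods and queue(concurrency_count=...). A rough
# equivalent of the wiring above would return plain HTML strings from the
# handlers and launch with:
#
#     app.queue(default_concurrency_limit=10).launch()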