Spaces:

Omnibus
/

pdf-reader

Sleeping

App Files Files

xet

Community

pdf-reader / app.py

Omnibus

Update app.py

73a3313 about 2 years ago

raw

history blame

2.08 kB

	import gradio as gr
	from bs4 import BeautifulSoup as bs
	from pypdf import PdfReader
	from pathlib import Path
	import os
	import sys
	#import html5lib
	#import copy
	import requests
	#from IPython.display import IFrame

	def scrape(instring):
	    """Download the PDF at ``instring`` and return an HTML update embedding it.

	    The response body is streamed to ``data.pdf`` in the working directory,
	    the saved file is opened with ``PdfReader``, and an ``<embed>`` element
	    pointing at the saved file is returned.

	    Args:
	        instring: URL of the PDF to fetch.

	    Returns:
	        A ``gr.HTML.update`` carrying the ``<embed>`` markup, or a short
	        error paragraph when the download did not succeed.
	    """
	    out = Path("./data.pdf")

	    try:
	        # Stream the body in chunks so a large PDF is never held in memory
	        # at once; the timeout keeps an unresponsive host from blocking
	        # this worker indefinitely.
	        with requests.get(instring, stream=True, timeout=30) as response:
	            if response.status_code != 200:
	                print(response.status_code)
	                # Stop here: data.pdf is either missing or left over from
	                # an earlier request, so embedding it would be wrong.
	                return gr.HTML.update(
	                    f"<p>Download failed: HTTP {response.status_code}</p>"
	                )
	            with open(out, "wb") as f:
	                for chunk in response.iter_content(chunk_size=8192):
	                    f.write(chunk)
	    except requests.RequestException as err:
	        # Covers malformed/empty URLs, connection errors and timeouts.
	        print(err)
	        return gr.HTML.update("<p>Download failed: could not fetch the URL.</p>")

	    print(out)
	    # Opening the file with pypdf is kept from the original flow; presumably
	    # it raises for content that is not a readable PDF — TODO confirm.
	    PdfReader(str(out))
	    return gr.HTML.update(f'''<embed src={out} type="application/pdf" width="100%" height="500px" />''')

	def scrape1(instring):
	    """Return an HTML update that embeds the document at ``instring``.

	    The URL is placed in the ``data`` attribute of an ``<object>`` element,
	    so the browser loads the document itself; nothing is fetched here.

	    Args:
	        instring: URL of the PDF to display.

	    Returns:
	        A ``gr.HTML.update`` carrying the ``<object>`` markup.
	    """
	    # Local import: the file's import block does not provide ``html``.
	    import html

	    # instring is user-typed text: escape it and quote the attribute so it
	    # cannot break out of the tag and inject markup.
	    url = html.escape(f'{instring}', quote=True)
	    return gr.HTML.update(f'''<object data="{url}" type="application/pdf" width="100%" height="500px"></object>''')
	def scrape0(instring):
	    """Stream the resource at ``instring`` into ``metadata.pdf`` and embed it.

	    Args:
	        instring: URL of the PDF to fetch.

	    Returns:
	        A ``gr.HTML.update`` carrying an ``<object>`` element that points at
	        ``metadata.pdf``, or a short error paragraph when the download did
	        not succeed.
	    """
	    chunk_size = 2000
	    url = f'{instring}'

	    try:
	        # One streamed request is enough; the timeout keeps an unresponsive
	        # host from blocking this worker indefinitely.
	        with requests.get(url, stream=True, timeout=30) as r:
	            if r.status_code != 200:
	                print(r.status_code)
	                # Avoid saving an error page under a .pdf name.
	                return gr.HTML.update(
	                    f"<p>Download failed: HTTP {r.status_code}</p>"
	                )
	            with open('metadata.pdf', 'wb') as fd:
	                for chunk in r.iter_content(chunk_size):
	                    fd.write(chunk)
	    except requests.RequestException as err:
	        # Covers malformed/empty URLs, connection errors and timeouts.
	        print(err)
	        return gr.HTML.update("<p>Download failed: could not fetch the URL.</p>")

	    return gr.HTML.update(f'''<object data=metadata.pdf type="application/pdf" width="100%" height="500px"></object>''')

	# UI layout: a textbox for the PDF URL, a button to trigger the fetch, and
	# an HTML pane that receives the markup returned by scrape.
	with gr.Blocks() as app:
	    inp = gr.Textbox()
	    go_btn = gr.Button()
	    outp = gr.HTML()
	    # On click, pass the textbox value to scrape and render its result.
	    go_btn.click(fn=scrape, inputs=inp, outputs=outp)

	# NOTE(review): the original indentation was lost; the launch call is
	# placed after the Blocks context here — confirm against the real file.
	app.queue(concurrency_count=10)
	app.launch()