File size: 2,413 Bytes
3919e25
a648a91
4aeb6eb
68b2584
73a3313
 
4aeb6eb
3919e25
 
a648a91
3919e25
a923971
3919e25
a923971
 
 
 
 
 
 
 
 
e2788a6
 
 
 
 
 
 
 
 
d9106f2
68b2584
e2788a6
f309ab6
 
 
68b2584
f05905d
 
3b77cd2
 
 
 
 
 
 
 
 
 
 
3ff2217
3b77cd2
3919e25
13951ed
aaca73e
13951ed
7c1d83e
1a8e0e1
a648a91
1a8e0e1
d1bb951
13951ed
 
 
3919e25
b9c90b4
13951ed
3919e25
 
 
 
13951ed
8f70505
d1bb951
8f70505
502b110
3919e25
 
563ca5d
ba73e05
3919e25
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from bs4 import BeautifulSoup as bs 
from pypdf import PdfReader
from pathlib import Path
import os 
import sys
#import html5lib
#import copy
import requests 
#from IPython.display import IFrame


def scrape(instring):
    """Show a fixed sample PDF inside the Google Docs viewer.

    ``instring`` is accepted for interface parity with the other scrape*
    handlers but is ignored; the viewer URL is hard-coded.  Returns a
    ``gr.HTML`` update carrying the viewer markup.
    """
    # Return the markup directly; the intermediate variable and the
    # redundant f-string wrapper added nothing.
    return gr.HTML.update('''
    <div style="text-align:center">
    <h4>Pdf viewer testing</h4>
    <iframe src="https://docs.google.com/viewer?url=http://www.pdf995.com/samples/pdf.pdf&embedded=true" frameborder="0" height="500px" width="100%"></iframe>
    </div>''')


def scrape00(instring):
    """Download the PDF at *instring* to ``data.pdf`` and embed it.

    Streams the download to disk in chunks, sanity-checks that the file
    parses as a PDF, then returns a ``gr.HTML`` update embedding it.
    On a failed download an error message is returned instead of falling
    through to read a missing/stale ``data.pdf``.
    """
    response = requests.get(instring, stream=True)

    if response.status_code != 200:
        # Bail out early: the original only printed the status code and
        # then tried to open data.pdf anyway, crashing with an unrelated
        # error (or silently showing a stale file).
        print(response.status_code)
        return gr.HTML.update(
            f'<p>Download failed: HTTP {response.status_code}</p>'
        )

    with open("data.pdf", "wb") as f:
        # stream=True was requested, so actually stream in chunks instead
        # of buffering the entire body via response.content.
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    out = Path("./data.pdf")
    print(out)
    # Parse once to confirm the download really is a valid PDF; the
    # original also extracted page text but never used it.
    PdfReader("data.pdf")
    # Quote the src attribute — the original emitted src=data.pdf unquoted.
    return gr.HTML.update(
        f'''<embed src="{out}" type="application/pdf" width="100%" height="500px" />'''
    )

def scrape1(instring):
    """Embed the PDF at *instring* client-side via an ``<object>`` element.

    Returns a ``gr.HTML`` update.  The URL is referenced directly by the
    browser, so no server-side download is required.
    """
    # NOTE: the original fetched the page and built/prettified a
    # BeautifulSoup tree here, then discarded both results — pure dead
    # work, so the request has been removed.
    # Quote the data attribute and close the tag (both missing before).
    return gr.HTML.update(
        f'''<object data="{instring}" type="application/pdf" width="100%" height="500px"></object>'''
    )
def scrape0(instring):
    """Download the PDF at *instring* to ``metadata.pdf`` and embed it.

    Streams the response to disk in chunks and returns a ``gr.HTML``
    update referencing the saved file.
    """
    chunk_size = 2000
    # Single streamed request.  The original issued a SECOND full request
    # for the same URL just to build a BeautifulSoup tree it never used.
    r = requests.get(instring, stream=True)

    with open('metadata.pdf', 'wb') as fd:
        for chunk in r.iter_content(chunk_size):
            fd.write(chunk)

    # Removed the try/except around r.content: after iter_content() has
    # consumed the stream, accessing r.content raises StreamConsumedError,
    # which the broad except swallowed with a misleading "No Divs" print —
    # dead code either way since the value was never used.

    # Quote the attribute and close the <object> tag (both missing before).
    return gr.HTML.update(
        '''<object data="metadata.pdf" type="application/pdf" width="100%" height="500px"></object>'''
    )

# Assemble the UI: a URL textbox, a trigger button, and an HTML pane that
# the `scrape` handler fills with the PDF-viewer markup.
with gr.Blocks() as app:
    url_box = gr.Textbox()
    fetch_btn = gr.Button()
    html_out = gr.HTML()
    fetch_btn.click(scrape, url_box, html_out)
app.queue(concurrency_count=10).launch()