File size: 3,103 Bytes
2ec811e
 
65aa3f9
859e738
 
2ec811e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
859e738
 
65aa3f9
 
 
 
 
 
5f90c3e
 
65aa3f9
 
2ec811e
 
5f90c3e
 
2ec811e
 
5f90c3e
069d597
2ec811e
6ac5140
 
2ec811e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65aa3f9
2ec811e
 
 
 
 
 
 
 
 
 
 
 
 
65aa3f9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import streamlit as st
import subprocess
import os
import path
import sys

#=======================================================================================================================#

def clear_submit():
    """Reset the ``submit`` entry of Streamlit's session state to ``False``.

    Takes no arguments and returns ``None``; its only effect is the
    session-state write.
    """
    st.session_state.submit = False

# Seed the 'clicked' flag only when it is absent, so an existing value is left untouched.
if not ('clicked' in st.session_state):
    st.session_state['clicked'] = False

def click_button():
    """Set the ``clicked`` entry of Streamlit's session state to ``True``.

    Takes no arguments and returns ``None``.
    """
    st.session_state['clicked'] = True

# Page-level settings: title, icon, wide layout and automatic sidebar state.
st.set_page_config(
    page_title='OCR App',
    page_icon=':pencil:',
    layout='wide',
    initial_sidebar_state='auto',
)

#=======================================================================================================================#

#--------------------------Sidebar--------------------------#

# Bind the flag up front so the name exists even when no file has been
# uploaded; otherwise it is only created inside the `if uploaded_file:` branch
# and any read outside that guard would raise NameError.
extract_text = False

with st.sidebar:
    # Section heading for the upload controls
    st.title('Load document')

    # PDF-only uploader; clear_submit is registered as its on_change callback
    uploaded_file = st.file_uploader(
        "Upload file",
        type=["pdf"],
        help="Only PDF files are supported",
        on_change=clear_submit,
    )

    # The extract button is only offered once a file is present
    if uploaded_file:
        st.markdown('---')
        st.title('Extract text from PDF')
        extract_text = st.button('Extract', help='Extract text from the document')
    
#=======================================================================================================================#

#--------------------------Main Page--------------------------#

# Folder (relative to the current working directory) that holds the uploaded
# PDFs and the files nougat writes.
FILES_DIR = 'files'


@st.cache_resource(show_spinner=False)
def load_model(input_path, output_path):
    """Run the ``nougat`` command-line tool on one PDF.

    Despite its name this does not return a model: it executes
    ``nougat <input_path> -o <output_path>`` and returns ``None``.

    Args:
        input_path: Path of the PDF to convert.
        output_path: Directory handed to nougat's ``-o`` option.

    Raises:
        FileNotFoundError: If the ``nougat`` executable cannot be started.
        subprocess.CalledProcessError: If nougat exits with a non-zero status.
    """
    # NOTE(review): the cache is keyed on the two path arguments, so uploading
    # a different PDF under the same file name will presumably not re-run
    # nougat -- confirm this is acceptable.
    # check=True turns a failed run into an exception instead of a silent
    # None result that would be cached as if the conversion had succeeded.
    subprocess.run(['nougat', input_path, '-o', output_path], check=True)


if uploaded_file:

    # Make the parent of this script's directory importable; skip it when it
    # is already listed so reruns do not keep growing sys.path.
    script_path = path.Path(__file__).abspath()
    project_root = script_path.parent.parent
    if project_root not in sys.path:
        sys.path.append(project_root)

    os.makedirs(FILES_DIR, exist_ok=True)

    # Keep only the last path component of the browser-supplied name so the
    # upload cannot be written outside FILES_DIR.
    safe_name = os.path.basename(uploaded_file.name.replace('\\', '/'))
    # splitext strips just the final extension, whatever its letter case
    # ('.pdf' or '.PDF'), unlike a blanket str.replace('.pdf', ...).
    stem = os.path.splitext(safe_name)[0]

    input_path = os.path.join(FILES_DIR, safe_name)
    output_path = FILES_DIR
    output_file = f'{stem}.mmd'
    mmd_path = os.path.join(FILES_DIR, output_file)

    # Persist the upload to disk so the external tool can read it
    with open(input_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    if extract_text:
        try:
            with st.spinner('Extracting text...'):
                load_model(input_path, output_path)
            # Decode explicitly instead of relying on the platform default
            # encoding (assumes nougat writes UTF-8 -- TODO confirm).
            with open(mmd_path, 'r', encoding='utf-8') as f:
                mmd = f.read()
        except (OSError, subprocess.CalledProcessError) as err:
            # Covers a missing nougat executable, a non-zero exit status and
            # a missing/unreadable output file.
            st.error(f'Text extraction failed: {err}')
        else:
            # Keep the text across reruns and remember which upload produced it
            st.session_state["mmd"] = mmd
            st.session_state["mmd_source"] = uploaded_file.name

    # Only show a result that belongs to the file currently uploaded, so text
    # extracted from an earlier upload is not presented as this file's output.
    has_result = (
        "mmd" in st.session_state
        and st.session_state.get("mmd_source") == uploaded_file.name
    )
    if has_result:
        st.write(st.session_state["mmd"])

        with st.sidebar:
            st.success('Text extracted successfully!')
            st.markdown('---')
            st.title('Download file')
            st.download_button(
                label='Download',
                data=st.session_state["mmd"],
                file_name=f'{stem}.md',
                mime='text/markdown',
            )