import streamlit as st |
import subprocess |
import os |
def clear_submit():
    """Reset the ``"submit"`` flag in Streamlit's session state to ``False``.

    Registered as the ``on_change`` callback of the file uploader, so it
    runs whenever the uploaded file changes.
    """
    session = st.session_state
    session["submit"] = False
# Seed the 'clicked' flag the first time the script runs in a session;
# on later reruns the stored value is left untouched.
if 'clicked' not in st.session_state:
    st.session_state['clicked'] = False
def click_button():
    """Set the ``clicked`` flag in Streamlit's session state to ``True``."""
    st.session_state['clicked'] = True
# Page-level configuration for the OCR app.
st.set_page_config(
    page_title='OCR App',
    page_icon=':pencil:',
    layout='wide',
    initial_sidebar_state='auto',
)

# Sidebar: the PDF upload control and, once a file is present, the button
# that triggers text extraction.
with st.sidebar:
    st.title('Load document')
    uploaded_file = st.file_uploader(
        "Upload file",
        type=["pdf"],
        help="Only PDF files are supported",
        on_change=clear_submit,
    )
    # NOTE: `extract_text` is only bound when a file has been uploaded, so
    # readers of it must sit behind the same `uploaded_file` check.
    if uploaded_file:
        st.markdown('---')
        st.title('Extract text from PDF')
        extract_text = st.button(
            'Extract',
            help='Extract text from the document',
        )
# Extraction pipeline: everything below needs an uploaded PDF.
if uploaded_file:
    # Working directory shared by the saved PDF and nougat's output.
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('files', exist_ok=True)

    # Keep only the final path component so an upload name containing
    # directory parts cannot point outside the working directory.
    pdf_name = os.path.basename(uploaded_file.name)
    input_path = f'./files/{pdf_name}'
    # Swap only the real extension: str.replace('.pdf', '.mmd') would rewrite
    # every '.pdf' inside the name and would miss an upper-case '.PDF'.
    # nougat is expected to name its output '<stem>.mmd' in the output dir.
    output_file = os.path.splitext(pdf_name)[0] + '.mmd'
    output_path = './files/'
    mmd_path = os.path.join('files', output_file)

    @st.cache_resource(show_spinner=False)
    def load_model(input_path, output_path):
        """Run the ``nougat`` CLI on *input_path*, writing into *output_path*.

        The call is cached by Streamlit on its arguments, so repeating it
        with the same paths does not launch nougat again.
        NOTE(review): a different PDF uploaded under the same name therefore
        reuses the earlier run -- confirm whether that is acceptable.

        Args:
            input_path: Path of the PDF file to convert.
            output_path: Directory that receives nougat's output.

        Raises:
            FileNotFoundError: The ``nougat`` executable could not be found.
            subprocess.CalledProcessError: nougat exited with a non-zero
                status. Raising keeps the failed run from being cached as
                if it had succeeded.
        """
        subprocess.run(['nougat', input_path, '-o', output_path], check=True)

    if extract_text:
        with st.spinner('Extracting text...'):
            try:
                # Persist the upload first; nougat reads its input from disk.
                with open(input_path, 'wb') as pdf_file:
                    pdf_file.write(uploaded_file.getvalue())
                load_model(input_path, output_path)
                # Explicit encoding so the read does not depend on the
                # platform's default locale.
                with open(mmd_path, 'r', encoding='utf-8') as f:
                    mmd = f.read()
            except (OSError, subprocess.CalledProcessError) as err:
                # Report the failure in the UI instead of crashing the script.
                st.error(f'Text extraction failed: {err}')
            else:
                st.session_state["mmd"] = mmd

    # Show the result only once text has been stored. An explicit key check
    # replaces the former bare ``except: pass``, which hid every error.
    if "mmd" in st.session_state:
        st.write(st.session_state["mmd"])
        with st.sidebar:
            st.success('Text extracted successfully!')
            st.markdown('---')
            st.title('Download file')
            download_output = st.download_button(
                label='Download',
                data=st.session_state["mmd"],
                # Same extension-only swap as above ('.mmd' -> '.md').
                file_name=os.path.splitext(output_file)[0] + '.md',
                mime='text/markdown',
            )