# ocr-app / app.py
# bllin001's picture
# Update app.py
# 069d597 verified
import streamlit as st
import subprocess
import os
import path
import sys
#=======================================================================================================================#
def clear_submit() -> None:
    """Set the ``"submit"`` entry of Streamlit's session state to ``False``.

    Passed as the ``on_change`` callback of the file uploader in the sidebar.
    """
    session = st.session_state
    session["submit"] = False
# Seed the "clicked" flag only when the session does not hold one yet.
if 'clicked' not in st.session_state:
    st.session_state['clicked'] = False
def click_button() -> None:
    """Set the ``clicked`` flag in Streamlit's session state to ``True``."""
    session = st.session_state
    session.clicked = True
# Page-level settings: tab title and icon, wide layout, automatic sidebar state.
st.set_page_config(
    page_title='OCR App',
    page_icon=':pencil:',
    layout='wide',
    initial_sidebar_state='auto',
)
#=======================================================================================================================#
#--------------------------Sidebar--------------------------#
with st.sidebar:
    # Sidebar heading for the upload section.
    st.title('Load document')

    # PDF-only uploader; changing the selection runs clear_submit.
    uploaded_file = st.file_uploader(
        "Upload file",
        type=["pdf"],
        help="Only PDF files are supported",
        on_change=clear_submit,
    )

    # Bind the flag up front so it is always defined, even when no file is
    # uploaded and the Extract button below is never created.
    extract_text = False

    # The Extract button is only offered once a file has been uploaded.
    if uploaded_file:
        st.markdown('---')
        st.title('Extract text from PDF')
        extract_text = st.button('Extract', help='Extract text from the document')
#=======================================================================================================================#
#--------------------------Main Page--------------------------#
if uploaded_file:
    # Make the parent of this script's directory importable. Skip the append
    # when the entry is already present so sys.path does not accumulate
    # duplicates.
    project_root = str(path.Path(__file__).abspath().parent.parent)
    if project_root not in sys.path:
        sys.path.append(project_root)

    # Working folder for the uploaded PDF and the generated markup.
    os.makedirs('files', exist_ok=True)

    # Keep only the final path component of the client-supplied name so the
    # upload cannot be written outside ./files.
    pdf_name = os.path.basename(uploaded_file.name)
    input_path = f'./files/{pdf_name}'

    # Swap only the final extension (whatever its case) for ".mmd".
    # NOTE(review): assumes nougat names its output after the input's stem,
    # as the previous code did - confirm against the nougat CLI.
    output_file = os.path.splitext(pdf_name)[0] + '.mmd'
    output_path = './files/'
    mmd_path = os.path.join('files', output_file)

    # Persist the upload to disk so the external tool can read it.
    with open(input_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    @st.cache_resource(show_spinner=False)
    def load_model(input_path, output_path):
        """Run the ``nougat`` command-line tool on a PDF.

        Args:
            input_path: Path of the PDF file passed to ``nougat``.
            output_path: Directory passed to ``nougat`` via ``-o``.

        Raises:
            subprocess.CalledProcessError: ``nougat`` exited with a non-zero
                status.
            OSError: The ``nougat`` executable could not be started.
        """
        # NOTE(review): the call is cached on (input_path, output_path), so a
        # different PDF uploaded under the same name reuses the earlier run -
        # confirm whether that is intended.
        subprocess.run(['nougat', input_path, '-o', output_path], check=True)

    if extract_text:
        try:
            with st.spinner('Extracting text...'):
                load_model(input_path, output_path)
            with open(mmd_path, 'r', encoding='utf-8') as f:
                # Keep the extracted text in the session state for later reruns.
                st.session_state["mmd"] = f.read()
        except (OSError, subprocess.CalledProcessError) as exc:
            # Drop any earlier result so a failure is not followed by a
            # success banner for stale text.
            st.session_state.pop("mmd", None)
            st.error(f'Text extraction failed: {exc}')

    # Show the result and the download button only once text is available.
    if "mmd" in st.session_state:
        st.write(st.session_state["mmd"])
        with st.sidebar:
            st.success('Text extracted successfully!')
            st.markdown('---')
            st.title('Download file')
            download_output = st.download_button(
                label='Download',
                data=st.session_state["mmd"],
                file_name=os.path.splitext(output_file)[0] + '.md',
                mime='text/markdown',
            )