|
import streamlit as st |
|
import subprocess |
|
import os |
|
import path |
|
import sys |
|
|
|
|
|
|
|
def clear_submit():
    """Reset per-document state when the file uploader's selection changes.

    Used as the ``on_change`` callback of the PDF uploader. Besides resetting
    the ``"submit"`` flag, it discards any previously extracted text so that
    output belonging to an earlier document is not displayed (or offered for
    download) next to a newly uploaded file.
    """
    st.session_state["submit"] = False
    # The extracted markdown is stored under "mmd"; it belongs to the previous
    # upload and must not outlive it.
    if "mmd" in st.session_state:
        del st.session_state["mmd"]
|
|
|
# Seed the "clicked" flag the first time the script runs in a session so that
# later reads of it never hit a missing key.
if "clicked" not in st.session_state:
    st.session_state["clicked"] = False
|
|
|
def click_button():
    """Record a click by setting the ``clicked`` session-state flag to True."""
    st.session_state["clicked"] = True
|
|
|
# Page-wide settings: browser tab title/icon, wide layout, default sidebar state.
st.set_page_config(
    page_title='OCR App',
    page_icon=':pencil:',
    layout='wide',
    initial_sidebar_state='auto',
)
|
|
|
|
|
|
|
|
|
|
|
# Default the flag so code further down can read it even when no file has
# been uploaded (the button that sets it is only rendered after an upload).
extract_text = False

with st.sidebar:
    st.title('Load document')

    # Changing the selection triggers clear_submit via on_change.
    uploaded_file = st.file_uploader(
        "Upload file",
        type=["pdf"],
        help="Only PDF files are supported",
        on_change=clear_submit,
    )

    if uploaded_file:
        st.markdown('---')
        st.title('Extract text from PDF')
        # True only on the rerun triggered by pressing the button.
        extract_text = st.button('Extract', help='Extract text from the document')
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource(show_spinner=False)
def load_model(input_path, output_path):
    """Run the ``nougat`` command-line tool on a PDF.

    Despite its name, this function does not return a model: it invokes
    ``nougat <input_path> -o <output_path>`` and returns ``None``.

    Args:
        input_path: Path of the PDF file to process.
        output_path: Directory passed to nougat's ``-o`` option.

    Raises:
        subprocess.CalledProcessError: If nougat exits with a non-zero status.
            Raising (instead of returning silently) also keeps a failed run
            from being stored by ``st.cache_resource``, so a retry re-runs it.
        OSError: If the ``nougat`` executable cannot be started.

    NOTE(review): the cache key is the argument pair only, so uploading a
    different PDF under the same file name reuses the earlier result without
    re-running nougat. Confirm whether that is acceptable.
    """
    subprocess.run(['nougat', input_path, '-o', output_path], check=True)


if uploaded_file:
    # Make the directory two levels above this script importable. The script
    # is re-executed on every interaction, so avoid appending duplicates.
    script_path = path.Path(__file__).abspath()
    project_root = str(script_path.parent.parent)
    if project_root not in sys.path:
        sys.path.append(project_root)

    os.makedirs('files', exist_ok=True)

    # The file name comes from the client; keep only its final component so
    # it cannot point outside the working directory.
    safe_name = os.path.basename(uploaded_file.name.replace('\\', '/'))
    # Swap only the real extension (case-insensitively, last one only).
    stem = os.path.splitext(safe_name)[0]

    input_path = f'./files/{safe_name}'
    output_file = f'{stem}.mmd'
    output_path = './files/'
    mmd_path = os.path.join('files', output_file)

    # Persist the upload so the external tool can read it from disk.
    with open(input_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    if extract_text:
        try:
            with st.spinner('Extracting text...'):
                load_model(input_path, output_path)

            with open(mmd_path, 'r', encoding='utf-8') as f:
                st.session_state["mmd"] = f.read()
        except (subprocess.CalledProcessError, OSError) as exc:
            # Surface the failure instead of silently showing nothing.
            st.error(f'Text extraction failed: {exc}')

    # Show the result whenever extracted text is available in this session,
    # including on reruns that were not triggered by the Extract button.
    if "mmd" in st.session_state:
        st.write(st.session_state["mmd"])

        with st.sidebar:
            st.success('Text extracted successfully!')
            st.markdown('---')
            st.title('Download file')
            download_output = st.download_button(
                label='Download',
                data=st.session_state["mmd"],
                file_name=f'{stem}.md',
                mime='text/markdown',
            )