Spaces:

bllin001
/

ocr-app

Sleeping

App Files Files Community

ocr-app / app.py

bllin001

Update app.py

069d597 verified 9 months ago

raw

history blame contribute delete

3.1 kB

	import streamlit as st
	import subprocess
	import os
	import path
	import sys

	#=======================================================================================================================#

	def clear_submit():
	    """Reset the ``submit`` flag stored in the Streamlit session state.

	    Passed as the ``on_change`` callback of the PDF uploader.
	    """
	    st.session_state.submit = False

	# Seed the "clicked" flag when the session state does not hold it yet.
	if "clicked" not in st.session_state:
	    st.session_state["clicked"] = False

	def click_button():
	    """Set the ``clicked`` flag in the Streamlit session state to True."""
	    st.session_state["clicked"] = True

	# Page-wide settings: browser tab title/icon, wide layout, default sidebar.
	st.set_page_config(
	    page_title='OCR App',
	    page_icon=':pencil:',
	    layout='wide',
	    initial_sidebar_state='auto',
	)

	#=======================================================================================================================#

	#--------------------------Sidebar--------------------------#

	with st.sidebar:
	    # Heading of the upload section
	    st.title('Load document')

	    # PDF-only uploader; choosing a different file calls clear_submit
	    uploader_options = {
	        "type": ["pdf"],
	        "help": "Only PDF files are supported",
	        "on_change": clear_submit,
	    }
	    uploaded_file = st.file_uploader("Upload file", **uploader_options)

	    # The extraction controls appear only once a file has been uploaded
	    if uploaded_file:
	        st.markdown('---')
	        st.title('Extract text from PDF')
	        extract_text = st.button(
	            'Extract',
	            help='Extract text from the document',
	        )

	#=======================================================================================================================#

	#--------------------------Main Page--------------------------#

	if uploaded_file:

	    # Make the folder above the app directory importable. Streamlit re-runs
	    # this script on every interaction, so add the entry only once instead
	    # of growing sys.path on each rerun.
	    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	    if project_root not in sys.path:
	        sys.path.append(project_root)

	    # Working folder for the uploaded PDF and the generated markup
	    os.makedirs('files', exist_ok=True)

	    # The file name is supplied by the browser: keep only its last path
	    # component so it cannot point outside the working folder.
	    safe_name = os.path.basename(uploaded_file.name.replace('\\', '/'))
	    if safe_name in ('', '.', '..'):
	        safe_name = 'upload.pdf'
	    stem = os.path.splitext(safe_name)[0]

	    input_path = f'./files/{safe_name}'
	    output_path = './files/'

	    # Build the output name from the stem. The previous str.replace rewrote
	    # every '.pdf' inside the name and did nothing for an upper-case '.PDF'.
	    # NOTE(review): assumes nougat names its output "<input stem>.mmd" --
	    # confirm against the nougat CLI documentation.
	    output_file = f'{stem}.mmd'

	    # Save the upload to disk so the nougat CLI can read it
	    pdf_bytes = uploaded_file.getvalue()
	    with open(input_path, 'wb') as f:
	        f.write(pdf_bytes)

	    # Identifies the current upload so that text extracted from an earlier
	    # file is not shown (or offered for download) under the new file's name.
	    upload_key = (safe_name, len(pdf_bytes))

	    @st.cache_resource(show_spinner=False)
	    def load_model(input_path, output_path, content=None):
	        """Run the nougat CLI on a PDF and return the generated markup.

	        Args:
	            input_path: Path of the PDF to convert.
	            output_path: Directory passed to nougat's ``-o`` option.
	            content: Optional bytes of the PDF. Not read here; it only takes
	                part in the cache key so that a different document saved
	                under the same name is converted again.

	        Returns:
	            The text of the ``.mmd`` file read back from ``output_path``.

	        Raises:
	            subprocess.CalledProcessError: nougat exited with a non-zero
	                status.
	            OSError: nougat could not be started, or the expected output
	                file is missing.
	        """
	        result_name = os.path.splitext(os.path.basename(input_path))[0] + '.mmd'
	        result_path = os.path.join(output_path, result_name)
	        # Drop any earlier output so a stale file is never read back.
	        if os.path.exists(result_path):
	            os.remove(result_path)
	        subprocess.run(['nougat', input_path, '-o', output_path], check=True)
	        with open(result_path, 'r', encoding='utf-8') as f:
	            return f.read()

	    if extract_text:
	        with st.spinner('Extracting text...'):
	            try:
	                mmd = load_model(input_path, output_path, pdf_bytes)
	            except (OSError, subprocess.CalledProcessError) as err:
	                # Forget any earlier result and report the failure instead
	                # of letting an unhandled traceback replace the page.
	                st.session_state.pop("mmd", None)
	                st.session_state.pop("mmd_source", None)
	                st.error(f'Text extraction failed: {err}')
	            else:
	                # Keep the text in the session state so it survives reruns
	                st.session_state["mmd"] = mmd
	                st.session_state["mmd_source"] = upload_key

	    # Explicit check instead of a bare "except: pass": show the result only
	    # when one exists and it belongs to the file currently uploaded.
	    if "mmd" in st.session_state and st.session_state.get("mmd_source") == upload_key:
	        st.write(st.session_state["mmd"])

	        with st.sidebar:
	            st.success('Text extracted successfully!')
	            st.markdown('---')
	            st.title('Download file')
	            download_output = st.download_button(
	                label='Download',
	                data=st.session_state["mmd"],
	                file_name=f'{stem}.md',
	                mime='text/markdown',
	            )