Spaces:

deepsh2207
/

TextExtractor

Sleeping

App Files Files Community

TextExtractor / app.py

deepsh2207

bug fixed

d06a6f3 9 months ago

raw

history blame contribute delete

5.54 kB

	# import cv2
	# import matplotlib.pyplot as plt
	import numpy as np
	import streamlit as st
	import torch
	import json
	import base64

	from doctr.io import DocumentFile
	from doctr.utils.visualization import visualize_page

	from backend.pytorch import DET_ARCHS, RECO_ARCHS, load_predictor #forward_image

	# Device passed to load_predictor: the first CUDA GPU when one is available,
	# otherwise the CPU.
	if torch.cuda.is_available():
	    forward_device = torch.device("cuda:0")
	else:
	    forward_device = torch.device("cpu")


	def _page_text(page_export):
	    """Flatten an exported page dict into plain text.

	    Reads ``page_export["blocks"][*]["lines"][*]["words"][*]["value"]``.
	    Every word is followed by a single space and every line is terminated
	    by a newline.

	    NOTE(review): the flattened original did not show whether the newline
	    was emitted per line or per block; per line is assumed here - confirm.
	    """
	    return "".join(
	        "".join(f"{word['value']} " for word in line["words"]) + "\n"
	        for block in page_export["blocks"]
	        for line in block["lines"]
	    )


	def main(det_archs, reco_archs):
	    """Build the Streamlit layout and run OCR on the selected page.

	    Args:
	        det_archs: options offered in the "Text detection model" select box.
	        reco_archs: options offered in the "Text recognition model" select box.

	    The page shows three columns (input page, OCR output, page
	    reconstitution). When "Analyze page" is pressed with a document
	    uploaded, a predictor is loaded, run on the selected page, and the
	    extracted text plus the JSON export are rendered below the columns.
	    """
	    # Wide mode
	    st.set_page_config(layout="wide")

	    st.markdown("Used Github Actions to automatically build the app on any updates on this [github repo link](https://github.com/deepanshu2207/imgtotxt_using_DocTR)")
	    st.caption("Made with ❤️ by Deepanshu. Credits to 🤗 Spaces for Hosting this.")

	    # Designing the interface
	    st.title("Document Text Extraction")
	    # For newline
	    st.write("\n")
	    # Instructions
	    st.markdown("Hint: click on the top-right corner of an image to enlarge it!")
	    # Set the columns
	    cols = st.columns((1, 1, 1))
	    cols[0].subheader("Input page")
	    cols[1].subheader("OCR output")
	    cols[2].subheader("Page reconstitution")

	    # Sidebar: file selection
	    st.sidebar.title("Document selection")
	    uploaded_file = st.sidebar.file_uploader("Upload files", type=["pdf", "png", "jpeg", "jpg"])
	    if uploaded_file is not None:
	        file_bytes = uploaded_file.read()
	        # Compare the extension case-insensitively so that e.g. "SCAN.PDF"
	        # is parsed as a PDF instead of being handed to the image loader.
	        if uploaded_file.name.lower().endswith(".pdf"):
	            doc = DocumentFile.from_pdf(file_bytes)
	        else:
	            doc = DocumentFile.from_images(file_bytes)
	        # The select box is 1-based for the user; convert back to a 0-based index.
	        page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1
	        page = doc[page_idx]
	        cols[0].image(page)

	    # Sidebar: model selection
	    st.sidebar.title("Model selection")
	    det_arch = st.sidebar.selectbox("Text detection model", det_archs)
	    reco_arch = st.sidebar.selectbox("Text recognition model", reco_archs)

	    if st.sidebar.button("Analyze page"):
	        if uploaded_file is None:
	            st.sidebar.write("Please upload a document")

	        else:
	            with st.spinner("Loading model..."):
	                # Predictor options are fixed defaults, not sidebar controls.
	                assume_straight_pages, straighten_pages, bin_thresh = True, False, 0.3

	                predictor = load_predictor(
	                    det_arch, reco_arch, assume_straight_pages, straighten_pages, bin_thresh, forward_device
	                )

	            with st.spinner("Analyzing..."):
	                out = predictor([page])
	                result_page = out.pages[0]
	                # Export once and reuse it for the plot, the text and the JSON view.
	                page_export = result_page.export()

	                # Plot OCR output
	                fig = visualize_page(page_export, result_page.page, interactive=False, add_labels=False)
	                cols[1].pyplot(fig)

	                # Page reconstitution under input page
	                if assume_straight_pages or straighten_pages:
	                    img = result_page.synthesize()
	                    cols[2].image(img, clamp=True)

	                print('out', out)
	                print('\n')
	                print('page_export', page_export)
	                print('\n')

	                all_text = _page_text(page_export)

	                print('all_text', all_text)
	                print('\n')

	                # Display Text
	                st.markdown("\n### Here is your text:")
	                st.write(all_text)

	                # Display JSON: offered as a base64 data-URI link plus a
	                # collapsed JSON viewer.
	                st.markdown("\n### Here is your document structure in JSON format:")
	                encoded_data = base64.b64encode(json.dumps(page_export).encode("utf-8")).decode("utf-8")
	                download_link = f"data:file/txt;base64,{encoded_data}"
	                st.markdown(f"[Download JSON]( {download_link} )", unsafe_allow_html=True)
	                st.json(page_export, expanded=False)

	                st.success('Done!')
	                st.balloons()


	if __name__ == "__main__":
	    # Launch the app with the architecture lists imported from backend.pytorch.
	    main(det_archs=DET_ARCHS, reco_archs=RECO_ARCHS)