Spaces:

shimer56
/

Extract_PDF

Sleeping

App Files Files Community

Extract_PDF / test_extraction /sample.py

shimer56

Upload folder using huggingface_hub

d2cb17f verified 5 months ago

raw

history blame

3.89 kB

	# import gradio as gr
	# import fitz # PyMuPDF
	# from PIL import Image
	# from io import BytesIO
	# import pandas as pd
	# import os


	# def extract_images_and_tables(pdf_file):

	# pdf_path = "temp.pdf"
	# with open(pdf_path, "wb") as f:
	# f.write(pdf_file)


	# pdf_document = fitz.open(pdf_path)


	# images = []
	# for page_index in range(len(pdf_document)):
	# for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
	# xref = img[0]
	# base_image = pdf_document.extract_image(xref)
	# image_bytes = base_image["image"]
	# image = Image.open(BytesIO(image_bytes))
	# images.append(image)


	# tables = []
	# for page_num in range(len(pdf_document)):
	# page = pdf_document.load_page(page_num)
	# text = page.get_text("text")

	# lines = [line.strip() for line in text.split("\n") if line.strip()]

	# if any("," in line for line in lines):

	# rows = [line.split(",") for line in lines]

	# tables.extend(rows)


	# table_content = ""
	# if tables:
	# max_columns = max(len(row) for row in tables)
	# tables = [row + [""] * (max_columns - len(row)) for row in tables]
	# df = pd.DataFrame(tables[1:], columns=tables[0])
	# table_content = df.to_csv(index=False)


	# pdf_document.close()

	# # Remove the temporary PDF file
	# os.remove(pdf_path)

	# return images, table_content



	# interface = gr.Interface(
	# fn=extract_images_and_tables,
	# inputs=gr.File(type="binary"),
	# outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
	# title="PDF Image and Table Extractor",
	# description="Upload a PDF to extract images and tables."
	# )


	# interface.launch(share=True)
	import gradio as gr
	import fitz # PyMuPDF
	from PIL import Image
	from io import BytesIO
	import pandas as pd
	import os


	def extract_images_and_tables(pdf_file, model_option):
	    """Extract embedded images and comma-separated text rows from a PDF.

	    Args:
	        pdf_file: Raw bytes of the uploaded PDF. ``None`` or empty bytes
	            (nothing uploaded) yields empty results instead of an error.
	        model_option: Selected model label. Accepted so the signature
	            matches the UI callback; it is not used by the extraction.

	    Returns:
	        A ``(images, table_content)`` tuple. ``images`` is a list of PIL
	        images in page order; embedded images that Pillow cannot decode
	        are skipped. ``table_content`` is CSV text built from the lines of
	        every page that contains at least one comma, with the first
	        collected row used as the header; it is ``""`` when no page
	        qualifies.
	    """
	    # Nothing uploaded: return empty outputs rather than failing later.
	    if not pdf_file:
	        return [], ""

	    images = []
	    tables = []

	    # Open the PDF straight from memory. Writing every upload to a shared
	    # "temp.pdf" let concurrent requests overwrite or delete each other's
	    # file, and left the file behind whenever extraction raised.
	    pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
	    try:
	        for page_index in range(len(pdf_document)):
	            for img in pdf_document.get_page_images(page_index):
	                xref = img[0]
	                base_image = pdf_document.extract_image(xref)
	                try:
	                    image = Image.open(BytesIO(base_image["image"]))
	                except OSError:
	                    # Pillow's UnidentifiedImageError subclasses OSError;
	                    # one undecodable image should not abort the whole run.
	                    continue
	                images.append(image)

	            text = pdf_document.load_page(page_index).get_text("text")
	            lines = [line.strip() for line in text.split("\n") if line.strip()]

	            # A page counts as tabular as soon as any line has a comma;
	            # then every non-empty line of that page becomes a row.
	            if any("," in line for line in lines):
	                tables.extend(line.split(",") for line in lines)
	    finally:
	        # Release the document even if extraction fails part-way through.
	        pdf_document.close()

	    if not tables:
	        return images, ""

	    # Pad ragged rows so every row has as many cells as the widest one.
	    max_columns = max(len(row) for row in tables)
	    tables = [row + [""] * (max_columns - len(row)) for row in tables]
	    df = pd.DataFrame(tables[1:], columns=tables[0])
	    return images, df.to_csv(index=False)


	def handle_model_selection(pdf_file, model_option):
	    """Forward the uploaded PDF and the chosen model to the extractor.

	    Args:
	        pdf_file: Uploaded PDF payload, passed through unchanged.
	        model_option: Value of the model dropdown, passed through unchanged.

	    Returns:
	        Whatever ``extract_images_and_tables`` returns for these arguments.
	    """
	    extraction_result = extract_images_and_tables(pdf_file, model_option)
	    return extraction_result


	# Input widgets: the PDF upload (type="binary") and the model picker.
	_pdf_upload = gr.File(type="binary", label="Upload PDF")
	_model_picker = gr.Dropdown(
	    label="Select Model",
	    choices=["Model 1", "Model 2", "Model 3"],
	    value="Model 1",
	)

	# Output widgets, in the order the callback returns its values:
	# extracted images first, then the extracted table text.
	_image_gallery = gr.Gallery(label="Extracted Images")
	_table_text = gr.Textbox(label="Extracted Tables")

	interface = gr.Interface(
	    fn=handle_model_selection,
	    inputs=[_pdf_upload, _model_picker],
	    outputs=[_image_gallery, _table_text],
	    title="PDF Image and Table Extractor",
	    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
	)

	# Start the app as soon as this file runs; share=True asks Gradio for a public link.
	interface.launch(share=True)