shimer56's picture
Upload folder using huggingface_hub
d2cb17f verified
raw
history blame contribute delete
No virus
3.89 kB
# import gradio as gr
# import fitz # PyMuPDF
# from PIL import Image
# from io import BytesIO
# import pandas as pd
# import os
# def extract_images_and_tables(pdf_file):
# pdf_path = "temp.pdf"
# with open(pdf_path, "wb") as f:
# f.write(pdf_file)
# pdf_document = fitz.open(pdf_path)
# images = []
# for page_index in range(len(pdf_document)):
# for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
# xref = img[0]
# base_image = pdf_document.extract_image(xref)
# image_bytes = base_image["image"]
# image = Image.open(BytesIO(image_bytes))
# images.append(image)
# tables = []
# for page_num in range(len(pdf_document)):
# page = pdf_document.load_page(page_num)
# text = page.get_text("text")
# lines = [line.strip() for line in text.split("\n") if line.strip()]
# if any("," in line for line in lines):
# rows = [line.split(",") for line in lines]
# tables.extend(rows)
# table_content = ""
# if tables:
# max_columns = max(len(row) for row in tables)
# tables = [row + [""] * (max_columns - len(row)) for row in tables]
# df = pd.DataFrame(tables[1:], columns=tables[0])
# table_content = df.to_csv(index=False)
# pdf_document.close()
# # Remove the temporary PDF file
# os.remove(pdf_path)
# return images, table_content
# interface = gr.Interface(
# fn=extract_images_and_tables,
# inputs=gr.File(type="binary"),
# outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
# title="PDF Image and Table Extractor",
# description="Upload a PDF to extract images and tables."
# )
# interface.launch(share=True)
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
from io import BytesIO
import pandas as pd
import os
def extract_images_and_tables(pdf_file, model_option):
    """Extract embedded images and comma-separated text rows from a PDF.

    Args:
        pdf_file: Raw bytes of the PDF document. ``None`` or empty input
            produces an empty result instead of an error.
        model_option: Model label chosen in the UI. It is not used by the
            extraction logic; it is accepted so the signature matches the
            two inputs passed by the caller.

    Returns:
        A ``(images, table_content)`` tuple. ``images`` is a list of PIL
        images in page order. ``table_content`` is CSV text built from the
        text lines of every page that contains at least one comma, with the
        first collected row used as the header; it is ``""`` when no such
        page exists.
    """
    # Nothing uploaded (or an empty upload): there is nothing to parse.
    if not pdf_file:
        return [], ""

    images = []
    tables = []

    # Open the document straight from memory. Writing to a fixed "temp.pdf"
    # lets concurrent requests overwrite each other's file and leaves the
    # file behind when parsing fails. The context manager closes the
    # document even if extraction raises.
    with fitz.open(stream=pdf_file, filetype="pdf") as pdf_document:
        for page_index in range(len(pdf_document)):
            # Each image entry is a tuple whose first item is the xref used
            # to pull the raw image bytes out of the document.
            for img in pdf_document.get_page_images(page_index):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                images.append(Image.open(BytesIO(base_image["image"])))

            text = pdf_document.load_page(page_index).get_text("text")
            lines = [line.strip() for line in text.split("\n") if line.strip()]
            # Heuristic: a page with any comma is treated as tabular, and
            # every non-blank line on that page becomes a row.
            if any("," in line for line in lines):
                tables.extend(line.split(",") for line in lines)

    if not tables:
        return images, ""

    # Pad ragged rows so every row has the same number of columns.
    max_columns = max(len(row) for row in tables)
    padded_rows = [row + [""] * (max_columns - len(row)) for row in tables]
    df = pd.DataFrame(padded_rows[1:], columns=padded_rows[0])
    return images, df.to_csv(index=False)
def handle_model_selection(pdf_file, model_option):
    """Forward the uploaded PDF and the selected model to the extractor.

    Returns whatever ``extract_images_and_tables`` returns for the same
    two arguments, unchanged.
    """
    extraction_result = extract_images_and_tables(pdf_file, model_option)
    return extraction_result
# Input widgets, listed in the positional order of the callback's parameters.
input_components = [
    gr.File(type="binary", label="Upload PDF"),
    gr.Dropdown(
        label="Select Model",
        choices=["Model 1", "Model 2", "Model 3"],
        value="Model 1",
    ),
]

# Output widgets, matching the (images, table text) pair the callback returns.
output_components = [
    gr.Gallery(label="Extracted Images"),
    gr.Textbox(label="Extracted Tables"),
]

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=input_components,
    outputs=output_components,
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

# Start the web UI; share=True asks Gradio for a public share link.
interface.launch(share=True)