Spaces:
Sleeping
Sleeping
# import gradio as gr | |
# import fitz # PyMuPDF | |
# from PIL import Image | |
# from io import BytesIO | |
# import pandas as pd | |
# import os | |
# def extract_images_and_tables(pdf_file): | |
# pdf_path = "temp.pdf" | |
# with open(pdf_path, "wb") as f: | |
# f.write(pdf_file) | |
# pdf_document = fitz.open(pdf_path) | |
# images = [] | |
# for page_index in range(len(pdf_document)): | |
# for img_index, img in enumerate(pdf_document.get_page_images(page_index)): | |
# xref = img[0] | |
# base_image = pdf_document.extract_image(xref) | |
# image_bytes = base_image["image"] | |
# image = Image.open(BytesIO(image_bytes)) | |
# images.append(image) | |
# tables = [] | |
# for page_num in range(len(pdf_document)): | |
# page = pdf_document.load_page(page_num) | |
# text = page.get_text("text") | |
# lines = [line.strip() for line in text.split("\n") if line.strip()] | |
# if any("," in line for line in lines): | |
# rows = [line.split(",") for line in lines] | |
# tables.extend(rows) | |
# table_content = "" | |
# if tables: | |
# max_columns = max(len(row) for row in tables) | |
# tables = [row + [""] * (max_columns - len(row)) for row in tables] | |
# df = pd.DataFrame(tables[1:], columns=tables[0]) | |
# table_content = df.to_csv(index=False) | |
# pdf_document.close() | |
# # Remove the temporary PDF file | |
# os.remove(pdf_path) | |
# return images, table_content | |
# interface = gr.Interface( | |
# fn=extract_images_and_tables, | |
# inputs=gr.File(type="binary"), | |
# outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")], | |
# title="PDF Image and Table Extractor", | |
# description="Upload a PDF to extract images and tables." | |
# ) | |
# interface.launch(share=True) | |
import gradio as gr | |
import fitz # PyMuPDF | |
from PIL import Image | |
from io import BytesIO | |
import pandas as pd | |
import os | |
def extract_images_and_tables(pdf_file, model_option):
    """Extract embedded images and comma-separated text rows from a PDF.

    Args:
        pdf_file: Raw bytes of the uploaded PDF. ``None`` or empty bytes
            (nothing uploaded) produce an empty result instead of an error.
        model_option: Model name selected in the UI. Accepted so the
            signature matches the interface inputs; it is not used by the
            extraction itself.

    Returns:
        A ``(images, table_content)`` tuple. ``images`` is a list of PIL
        images, one per image reference found on each page. ``table_content``
        is CSV text built from the page text, or ``""`` when no page
        contains a comma.
    """
    if not pdf_file:
        return [], ""

    # Open straight from memory. Writing to a fixed "temp.pdf" lets
    # concurrent requests overwrite each other's upload and leaves the file
    # behind whenever extraction raises.
    pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
    try:
        images = []
        tables = []
        for page in pdf_document:
            for img in pdf_document.get_page_images(page.number):
                xref = img[0]  # first item of each entry is the image xref
                image_bytes = pdf_document.extract_image(xref)["image"]
                images.append(Image.open(BytesIO(image_bytes)))

            text = page.get_text("text")
            lines = [line.strip() for line in text.split("\n") if line.strip()]
            # Heuristic: a page is treated as tabular as soon as any of its
            # lines contains a comma; every line on that page becomes a row.
            if any("," in line for line in lines):
                tables.extend(line.split(",") for line in lines)
    finally:
        # Release the document even when a page fails to parse.
        pdf_document.close()

    table_content = ""
    if tables:
        # Pad ragged rows so all rows have the same number of columns.
        max_columns = max(len(row) for row in tables)
        tables = [row + [""] * (max_columns - len(row)) for row in tables]
        # The first collected row serves as the CSV header.
        df = pd.DataFrame(tables[1:], columns=tables[0])
        table_content = df.to_csv(index=False)

    return images, table_content
def handle_model_selection(pdf_file, model_option):
    """Gradio callback: forward the upload and chosen model to the extractor.

    Returns whatever ``extract_images_and_tables`` returns for the same
    arguments.
    """
    extraction_result = extract_images_and_tables(pdf_file, model_option)
    return extraction_result
# Input widgets: the PDF upload (delivered to the callback as raw bytes)
# and the model selector.
pdf_input = gr.File(type="binary", label="Upload PDF")
model_input = gr.Dropdown(
    label="Select Model",
    choices=["Model 1", "Model 2", "Model 3"],
    value="Model 1",
)

# Output widgets: a gallery for the images and a text box for the CSV text.
gallery_output = gr.Gallery(label="Extracted Images")
tables_output = gr.Textbox(label="Extracted Tables")

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[pdf_input, model_input],
    outputs=[gallery_output, tables_output],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

# Start the web app and request a public share link.
interface.launch(share=True)