shimer56's picture
Upload folder using huggingface_hub
86ac03f verified
raw
history blame contribute delete
No virus
4.39 kB
import gradio as gr
import fitz # PyMuPDF
from PIL import Image, ImageDraw
from io import BytesIO
import pandas as pd
import os
import numpy as np
import google.generativeai as genai
import openai
import base64
import requests
import tempfile
import ast
# Read the API credentials from the environment; each is None when the
# corresponding variable is not set.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Hand the credentials to the two client libraries.
genai.configure(api_key=gemini_api_key)
openai.api_key = openai_api_key
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
from io import BytesIO
import pandas as pd
import numpy as np
import tempfile
# Define the model extraction functions
def extract_bounding_box_pymupdf(pdf_content):
    """Return the on-page bounding boxes of every image in a PDF.

    Args:
        pdf_content: Raw PDF data, handed to ``fitz.open`` as ``stream``.

    Returns:
        A list with one entry per page, in page order. Each entry is a list
        holding one ``list(rect)`` per image reported by
        ``page.get_images(full=True)``, where ``rect`` is the rectangle
        returned by ``page.get_image_bbox``.
    """
    bounding_boxes = []
    # The context manager closes the document even when a page fails to
    # parse, so the handle is not leaked on errors.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_file:
        for page in pdf_file:
            page_bbox = [
                # img[7] is the entry of the image record that the original
                # code passes to get_image_bbox to look the image up.
                list(page.get_image_bbox(img[7]))
                for img in page.get_images(full=True)
            ]
            bounding_boxes.append(page_bbox)
    return bounding_boxes
def extract_bounding_boxes_gemini(api_key, images):
    """Return placeholder bounding boxes, one single-box list per image.

    The Gemini integration is not implemented yet, so ``api_key`` is accepted
    but unused and every image gets the same dummy box.

    Args:
        api_key: Gemini API key (currently ignored).
        images: Sized collection of images; only its length is used.

    Returns:
        A list of ``len(images)`` independent lists, each containing the
        dummy box ``(0, 0, 100, 100)``.
    """
    # Placeholder for Gemini API integration.
    # Build a fresh inner list per image: ``[[box]] * n`` would repeat one
    # shared list n times, so editing one entry would change all of them.
    return [[(0, 0, 100, 100)] for _ in range(len(images))]
def extract_bounding_box_gpt(api_key, pdf_content):
    """Return placeholder bounding boxes, one single-box list per PDF page.

    The GPT-4 integration is not implemented yet, so ``api_key`` is accepted
    but unused; the PDF is opened only to count its pages.

    Args:
        api_key: OpenAI API key (currently ignored).
        pdf_content: Raw PDF data, handed to ``fitz.open`` as ``stream``.

    Returns:
        A list with one independent list per page, each containing the dummy
        box ``(0, 0, 100, 100)``.
    """
    # Placeholder for GPT-4 API integration.
    # Open the document in a context manager so it is closed again; the
    # original left the handle open after reading the page count.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_file:
        page_count = len(pdf_file)
    # A comprehension gives each page its own list instead of n references
    # to one shared list.
    return [[(0, 0, 100, 100)] for _ in range(page_count)]
def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of *pdf_file*, given as a file path or as bytes."""
    if isinstance(pdf_file, (str, os.PathLike)):
        # A path on disk (what gr.File(type="filepath") delivers).
        with open(pdf_file, "rb") as f:
            return f.read()
    if isinstance(pdf_file, (bytes, bytearray)):
        # Already-loaded PDF content.
        return bytes(pdf_file)
    raise TypeError("Unsupported input type for pdf_file.")


def _extract_embedded_images(pdf_document):
    """Return a PIL image for every image entry listed on every page."""
    images = []
    for page_index in range(len(pdf_document)):
        for img in pdf_document.get_page_images(page_index):
            xref = img[0]
            image_bytes = pdf_document.extract_image(xref)["image"]
            images.append(Image.open(BytesIO(image_bytes)))
    return images


def _extract_table_csv(pdf_document):
    """Return comma-separated page text as CSV, or "" when none is found."""
    rows = []
    for page_num in range(len(pdf_document)):
        text = pdf_document.load_page(page_num).get_text("text")
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        # A page contributes rows only if at least one of its lines has a
        # comma; in that case every non-blank line of the page becomes a row.
        if any("," in line for line in lines):
            rows.extend(line.split(",") for line in lines)
    if not rows:
        return ""
    # Pad short rows so every row has the same number of cells.
    max_columns = max(len(row) for row in rows)
    rows = [row + [""] * (max_columns - len(row)) for row in rows]
    # The first collected row serves as the header.
    return pd.DataFrame(rows[1:], columns=rows[0]).to_csv(index=False)


def _extract_bounding_boxes(model_option, pdf_bytes, images):
    """Dispatch bounding-box extraction to the backend named by *model_option*."""
    if model_option == "PyMuPDF":
        return extract_bounding_box_pymupdf(pdf_bytes)
    if model_option == "Gemini":
        # NOTE: placeholder key; the Gemini backend currently ignores it.
        return extract_bounding_boxes_gemini("your_gemini_api_key_here", images)
    if model_option == "GPT-4":
        # NOTE: placeholder key; the GPT-4 backend currently ignores it.
        return extract_bounding_box_gpt("your_gpt4_api_key_here", pdf_bytes)
    # Unknown model names yield no boxes rather than an error.
    return []


def extract_images_and_tables(pdf_file, model_option):
    """Extract images, comma-separated table text and bounding boxes from a PDF.

    Args:
        pdf_file: The PDF, either as a file path (``str`` or ``os.PathLike``)
            or as raw content (``bytes`` or ``bytearray``).
        model_option: Name of the bounding-box backend: ``"PyMuPDF"``,
            ``"Gemini"`` or ``"GPT-4"``. Any other value yields ``[]``.

    Returns:
        A tuple ``(images, table_content, bounding_boxes)``: the list of PIL
        images, the table text as a CSV string (``""`` when no page contains
        a comma), and the per-page bounding boxes from the selected backend.

    Raises:
        TypeError: If ``pdf_file`` is neither a path nor bytes.
    """
    pdf_bytes = _read_pdf_bytes(pdf_file)

    # The context manager closes the document even if extraction raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        images = _extract_embedded_images(pdf_document)
        table_content = _extract_table_csv(pdf_document)

    bounding_boxes = _extract_bounding_boxes(model_option, pdf_bytes, images)
    return images, table_content, bounding_boxes
def handle_model_selection(pdf_file, model_option):
    """Gradio callback: run the extraction for the given PDF and model choice.

    Forwards both arguments unchanged to ``extract_images_and_tables`` and
    returns its result as-is.
    """
    extraction_result = extract_images_and_tables(
        pdf_file=pdf_file,
        model_option=model_option,
    )
    return extraction_result
# Assemble the Gradio UI: a PDF upload plus a model selector as inputs, and a
# gallery, a text box and a JSON view as outputs.
input_components = [
    gr.File(type="filepath", label="Upload PDF"),
    gr.Dropdown(
        label="Select Model",
        choices=["PyMuPDF", "Gemini", "GPT-4"],
        value="PyMuPDF",
    ),
]
output_components = [
    gr.Gallery(label="Extracted Images"),
    gr.Textbox(label="Extracted Tables"),
    gr.JSON(label="Extracted Bounding Boxes"),
]

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=input_components,
    outputs=output_components,
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

# Start the app as soon as the module is executed.
interface.launch(share=True)