import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
from io import BytesIO
import pandas as pd
import os
import google.generativeai as genai
import openai

# Read API keys from the environment
gemini_api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=gemini_api_key)
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key


# Define the model extraction functions
def extract_bounding_box_pymupdf(pdf_content):
    """Return per-page lists of image bounding boxes using PyMuPDF."""
    bounding_boxes = []
    pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
    for page_index in range(len(pdf_file)):
        page_bbox = []
        page = pdf_file[page_index]
        for img in page.get_images(full=True):
            rect = page.get_image_bbox(img[7])  # img[7] is the image's reference name
            page_bbox.append(list(rect))  # [x0, y0, x1, y1]
        bounding_boxes.append(page_bbox)
    pdf_file.close()  # Close the PDF file after use
    return bounding_boxes


def extract_bounding_boxes_gemini(api_key, images):
    # Placeholder for Gemini API integration
    bounding_boxes = [[(0, 0, 100, 100)]] * len(images)  # Dummy bounding boxes
    return bounding_boxes


def extract_bounding_box_gpt(api_key, pdf_content):
    # Placeholder for GPT-4 API integration
    with fitz.open(stream=pdf_content, filetype="pdf") as doc:
        page_count = doc.page_count
    bounding_boxes = [[(0, 0, 100, 100)]] * page_count  # Dummy bounding boxes
    return bounding_boxes


def extract_images_and_tables(pdf_file, model_option):
    if isinstance(pdf_file, str):
        # Input is a file path (from gr.File(type="filepath") or local testing)
        with open(pdf_file, "rb") as f:
            pdf_bytes = f.read()
    elif isinstance(pdf_file, bytes):
        # Input is raw PDF bytes
        pdf_bytes = pdf_file
    else:
        raise TypeError("Unsupported input type for pdf_file.")

    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")

    # Extract embedded images page by page
    images = []
    for page_index in range(len(pdf_document)):
        for img in pdf_document.get_page_images(page_index):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image = Image.open(BytesIO(base_image["image"]))
            images.append(image)

    # Naive table detection: treat comma-separated text lines as table rows
    tables = []
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text = page.get_text("text")
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        if any("," in line for line in lines):
            tables.extend(line.split(",") for line in lines)

    table_content = ""
    if tables:
        # Pad rows so every row has the same number of columns
        max_columns = max(len(row) for row in tables)
        tables = [row + [""] * (max_columns - len(row)) for row in tables]
        df = pd.DataFrame(tables[1:], columns=tables[0])
        table_content = df.to_csv(index=False)

    pdf_document.close()

    if model_option == "PyMuPDF":
        bounding_boxes = extract_bounding_box_pymupdf(pdf_bytes)
    elif model_option == "Gemini":
        bounding_boxes = extract_bounding_boxes_gemini(gemini_api_key, images)
    elif model_option == "GPT-4":
        bounding_boxes = extract_bounding_box_gpt(openai_api_key, pdf_bytes)
    else:
        bounding_boxes = []

    return images, table_content, bounding_boxes


def handle_model_selection(pdf_file, model_option):
    return extract_images_and_tables(pdf_file, model_option)


# Define the Gradio interface
interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[
        gr.File(type="filepath", label="Upload PDF"),
        gr.Dropdown(
            label="Select Model",
            choices=["PyMuPDF", "Gemini", "GPT-4"],
            value="PyMuPDF",
        ),
    ],
    outputs=[
        gr.Gallery(label="Extracted Images"),
        gr.Textbox(label="Extracted Tables"),
        gr.JSON(label="Extracted Bounding Boxes"),
    ],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

interface.launch(share=True)
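
# A minimal local smoke-test sketch ("example.pdf" is an assumed sample file, not
# part of this app). Kept as a comment because interface.launch() blocks above;
# run it separately to verify extraction without the UI:
#
#     images, table_csv, boxes = extract_images_and_tables("example.pdf", "PyMuPDF")
#     print(f"{len(images)} image(s), {len(boxes)} page(s) of bounding boxes")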