# Extract_PDF / app.py
# shimer56's picture
# Upload folder using huggingface_hub
# d2cb17f verified
# raw
# history blame
# No virus
# 2.68 kB
import gradio as gr
from extract_images.services import (
extract_images_pymupdf,
extract_images_pdfplumber,
extract_images_gemini,
extract_images_gpt,
)
from extract_tables.services import (
extract_tables_pymupdf,
extract_tables_tab_transformer,
extract_tables_img2table,
extract_tables_gemini,
extract_tables_gpt,
)
from utils import clear_directory
def handle_model_selection(pdf_file, model_option):
    """Run the extraction backend chosen in the UI on an uploaded PDF.

    Args:
        pdf_file: The uploaded PDF exactly as handed over by the UI; it is
            forwarded unchanged to the extraction services.
        model_option: The label selected in the "Select Model" dropdown.

    Returns:
        A ``(images, tables)`` tuple. Either element is ``None`` when the
        selected backend does not extract that kind of content.

    Raises:
        ValueError: If ``model_option`` is not one of the supported labels.
    """
    # Each runner is a zero-argument callable so that only the selected
    # backend is invoked; images are always extracted before tables.
    runners = {
        "PyMuPDF": lambda: (
            extract_images_pymupdf(pdf_file),
            extract_tables_pymupdf(pdf_file),
        ),
        "PdfPlumber (Extracts Images only)": lambda: (
            extract_images_pdfplumber(pdf_file),
            None,
        ),
        "Table Transformer (Extracts Tables only)": lambda: (
            None,
            extract_tables_tab_transformer(pdf_file),
        ),
        "img2table (Extracts Tables only)": lambda: (
            None,
            extract_tables_img2table(pdf_file),
        ),
        "Gemini Pro": lambda: (
            extract_images_gemini("gemini-pro-vision", pdf_file),
            extract_tables_gemini("gemini-pro-vision", pdf_file),
        ),
        "Gemini Flash": lambda: (
            extract_images_gemini("gemini-1.5-flash-latest", pdf_file),
            extract_tables_gemini("gemini-1.5-flash-latest", pdf_file),
        ),
        "GPT 4 Turbo": lambda: (
            extract_images_gpt("gpt-4-turbo", pdf_file),
            extract_tables_gpt("gpt-4-turbo", pdf_file),
        ),
        "GPT 4o": lambda: (
            extract_images_gpt("gpt-4o", pdf_file),
            extract_tables_gpt("gpt-4o", pdf_file),
        ),
    }

    runner = runners.get(model_option)
    if runner is None:
        # Previously an unknown label fell through every branch and surfaced
        # as an UnboundLocalError on return; fail early with a clear message.
        supported = ", ".join(runners)
        raise ValueError(
            f"Unsupported model option: {model_option!r}. "
            f"Expected one of: {supported}"
        )

    images, tables = runner()

    # The working directories are emptied once extraction has finished and
    # before the results are returned.
    # NOTE(review): presumably the services write intermediate files there
    # and return in-memory results — confirm against the services modules.
    clear_directory("extract_tables/table_outputs")
    clear_directory("extract_images/image_outputs")
    return images, tables
# Labels offered in the dropdown; handle_model_selection selects the
# extraction backend by comparing against these exact strings.
model_choices = [
    "PdfPlumber (Extracts Images only)",
    "Table Transformer (Extracts Tables only)",
    "img2table (Extracts Tables only)",
    "PyMuPDF",
    "Gemini Pro",
    "Gemini Flash",
    "GPT 4 Turbo",
    "GPT 4o",
]

# Input widgets: the PDF upload and the backend selector.
pdf_upload = gr.File(type="binary", label="Upload PDF")
model_picker = gr.Dropdown(
    label="Select Model",
    choices=model_choices,
    value="PyMuPDF",
)

# Output widgets: one gallery per element of the handler's return tuple.
image_gallery = gr.Gallery(label="Extracted Images")
table_gallery = gr.Gallery(label="Extracted Tables")

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[pdf_upload, model_picker],
    outputs=[image_gallery, table_gallery],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

# Start the app as soon as the module runs, requesting a shareable link.
interface.launch(share=True)