Spaces:

shimer56
/

Extract_PDF

Sleeping

File size: 3,885 Bytes

d2cb17f

# import gradio as gr
# import fitz  # PyMuPDF
# from PIL import Image
# from io import BytesIO
# import pandas as pd
# import os


# def extract_images_and_tables(pdf_file):
    
#     pdf_path = "temp.pdf"
#     with open(pdf_path, "wb") as f:
#         f.write(pdf_file)

    
#     pdf_document = fitz.open(pdf_path)

    
#     images = []
#     for page_index in range(len(pdf_document)):
#         for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
#             xref = img[0]
#             base_image = pdf_document.extract_image(xref)
#             image_bytes = base_image["image"]
#             image = Image.open(BytesIO(image_bytes))
#             images.append(image)

    
#     tables = []
#     for page_num in range(len(pdf_document)):
#         page = pdf_document.load_page(page_num)
#         text = page.get_text("text")
        
#         lines = [line.strip() for line in text.split("\n") if line.strip()]
        
#         if any("," in line for line in lines):
        
#             rows = [line.split(",") for line in lines]
        
#             tables.extend(rows)

    
#     table_content = ""
#     if tables:
#         max_columns = max(len(row) for row in tables)
#         tables = [row + [""] * (max_columns - len(row)) for row in tables]
#         df = pd.DataFrame(tables[1:], columns=tables[0])
#         table_content = df.to_csv(index=False)

    
#     pdf_document.close()

#     # Remove the temporary PDF file
#     os.remove(pdf_path)

#     return images, table_content



# interface = gr.Interface(
#     fn=extract_images_and_tables,
#     inputs=gr.File(type="binary"),  
#     outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
#     title="PDF Image and Table Extractor",
#     description="Upload a PDF to extract images and tables."
# )


# interface.launch(share=True)
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
from io import BytesIO
import pandas as pd
import os


def extract_images_and_tables(pdf_file, model_option):
    """Extract embedded images and comma-separated text rows from a PDF.

    The PDF is opened straight from the uploaded bytes instead of being
    written to a fixed ``temp.pdf`` first. A shared, hard-coded path lets
    concurrent requests overwrite each other's upload and leaves the file
    behind whenever extraction fails part-way.

    Args:
        pdf_file: Raw PDF content as a bytes-like object. ``None`` or empty
            content (nothing uploaded) yields an empty result.
        model_option: Model choice coming from the UI. It is accepted so the
            signature matches the caller but is not used by the extraction.

    Returns:
        A tuple ``(images, table_content)``:
        ``images`` is a list of ``PIL.Image`` objects, one per image entry
        reported for each page, in page order.
        ``table_content`` is CSV text built from the collected rows, or an
        empty string when no rows were collected.
    """
    # Nothing uploaded: return an empty result rather than failing on I/O.
    if not pdf_file:
        return [], ""

    images = []
    tables = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_file, filetype="pdf") as pdf_document:
        for page in pdf_document:
            for img in pdf_document.get_page_images(page.number):
                xref = img[0]  # first item of the entry is the image xref
                base_image = pdf_document.extract_image(xref)
                images.append(Image.open(BytesIO(base_image["image"])))

            text = page.get_text("text")
            lines = [line.strip() for line in text.split("\n") if line.strip()]

            # Heuristic: a page is treated as tabular as soon as any of its
            # lines contains a comma; then every non-empty line becomes a row.
            if any("," in line for line in lines):
                tables.extend(line.split(",") for line in lines)

    table_content = ""
    if tables:
        # Pad short rows so all rows have the width of the widest one.
        max_columns = max(len(row) for row in tables)
        tables = [row + [""] * (max_columns - len(row)) for row in tables]
        # The first collected row is used as the header.
        df = pd.DataFrame(tables[1:], columns=tables[0])
        table_content = df.to_csv(index=False)

    return images, table_content


def handle_model_selection(pdf_file, model_option):
    """Forward the uploaded PDF and the selected model to the extractor.

    Both arguments are passed on unchanged, and the result of
    ``extract_images_and_tables`` is returned as-is.
    """
    extraction_result = extract_images_and_tables(pdf_file, model_option)
    return extraction_result


# Input widgets, listed in the order their values reach the callback.
pdf_upload = gr.File(type="binary", label="Upload PDF")
model_picker = gr.Dropdown(
    label="Select Model",
    choices=["Model 1", "Model 2", "Model 3"],
    value="Model 1",
)

# Output widgets, matching the (images, table text) pair the callback returns.
image_gallery = gr.Gallery(label="Extracted Images")
table_box = gr.Textbox(label="Extracted Tables")

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[pdf_upload, model_picker],
    outputs=[image_gallery, table_box],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

# NOTE(review): share=True presumably requests a public share link — confirm
# it is wanted in the deployment environment.
interface.launch(share=True)