Spaces:
Sleeping
Sleeping
File size: 3,885 Bytes
d2cb17f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# import gradio as gr
# import fitz # PyMuPDF
# from PIL import Image
# from io import BytesIO
# import pandas as pd
# import os
# def extract_images_and_tables(pdf_file):
# pdf_path = "temp.pdf"
# with open(pdf_path, "wb") as f:
# f.write(pdf_file)
# pdf_document = fitz.open(pdf_path)
# images = []
# for page_index in range(len(pdf_document)):
# for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
# xref = img[0]
# base_image = pdf_document.extract_image(xref)
# image_bytes = base_image["image"]
# image = Image.open(BytesIO(image_bytes))
# images.append(image)
# tables = []
# for page_num in range(len(pdf_document)):
# page = pdf_document.load_page(page_num)
# text = page.get_text("text")
# lines = [line.strip() for line in text.split("\n") if line.strip()]
# if any("," in line for line in lines):
# rows = [line.split(",") for line in lines]
# tables.extend(rows)
# table_content = ""
# if tables:
# max_columns = max(len(row) for row in tables)
# tables = [row + [""] * (max_columns - len(row)) for row in tables]
# df = pd.DataFrame(tables[1:], columns=tables[0])
# table_content = df.to_csv(index=False)
# pdf_document.close()
# # Remove the temporary PDF file
# os.remove(pdf_path)
# return images, table_content
# interface = gr.Interface(
# fn=extract_images_and_tables,
# inputs=gr.File(type="binary"),
# outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
# title="PDF Image and Table Extractor",
# description="Upload a PDF to extract images and tables."
# )
# interface.launch(share=True)
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
from io import BytesIO
import pandas as pd
import os
def extract_images_and_tables(pdf_file, model_option):
    """Extract embedded images and comma-separated text rows from a PDF.

    The PDF is opened directly from the uploaded bytes, so nothing is written
    to disk and simultaneous requests cannot overwrite each other's data.

    Args:
        pdf_file: Raw bytes of the uploaded PDF. ``None`` or empty bytes
            (nothing uploaded) yields empty results instead of an error.
        model_option: Selected model name. Accepted for interface
            compatibility; the extraction below does not use it.

    Returns:
        A tuple ``(images, table_content)`` where ``images`` is a list of
        ``PIL.Image`` objects in page order and ``table_content`` is CSV text
        (an empty string when no comma-separated lines were found).
    """
    # Nothing uploaded: return the same shapes as the "nothing found" case.
    if not pdf_file:
        return [], ""

    # Open from memory instead of a shared, fixed-name temp file on disk.
    pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
    try:
        page_count = len(pdf_document)

        # Pass 1: decode every image referenced by each page.
        images = []
        for page_index in range(page_count):
            for img in pdf_document.get_page_images(page_index):
                xref = img[0]  # first entry of the image tuple is its xref
                base_image = pdf_document.extract_image(xref)
                images.append(Image.open(BytesIO(base_image["image"])))

        # Pass 2: treat a page as tabular when any of its lines has a comma;
        # in that case every non-empty line of the page is split on commas.
        tables = []
        for page_num in range(page_count):
            page = pdf_document.load_page(page_num)
            text = page.get_text("text")
            lines = [line.strip() for line in text.split("\n") if line.strip()]
            if any("," in line for line in lines):
                tables.extend(line.split(",") for line in lines)
    finally:
        # Always release the document, even if extraction failed part-way.
        pdf_document.close()

    table_content = ""
    if tables:
        # Pad ragged rows so all rows match the widest one.
        max_columns = max(len(row) for row in tables)
        tables = [row + [""] * (max_columns - len(row)) for row in tables]
        # The first collected row serves as the CSV header.
        df = pd.DataFrame(tables[1:], columns=tables[0])
        table_content = df.to_csv(index=False)

    return images, table_content
def handle_model_selection(pdf_file, model_option):
    """Gradio callback: forward the upload and chosen model to the extractor.

    Returns whatever ``extract_images_and_tables`` returns for the same
    arguments.
    """
    extraction_result = extract_images_and_tables(pdf_file, model_option)
    return extraction_result
# Input widgets: the PDF upload (delivered to the callback as raw bytes) and
# the model selector.
pdf_input = gr.File(type="binary", label="Upload PDF")
model_input = gr.Dropdown(
    label="Select Model",
    choices=["Model 1", "Model 2", "Model 3"],
    value="Model 1",
)

# Output widgets: a gallery for the images and a text box for the CSV text.
gallery_output = gr.Gallery(label="Extracted Images")
table_output = gr.Textbox(label="Extracted Tables")

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[pdf_input, model_input],
    outputs=[gallery_output, table_output],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

# Start the web UI, requesting a public share link.
interface.launch(share=True)
|