Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitignore +4 -0
- README.md +2 -8
- app.py +84 -0
- constants.py +2 -0
- extract_images/input_docs/uploaded_pdf.pdf +0 -0
- extract_images/services.py +147 -0
- extract_tables/input_docs/uploaded_pdf.pdf +0 -0
- extract_tables/services.py +191 -0
- ocr_notebooks/OCR_Benchmarking.ipynb +0 -0
- ocr_notebooks/image_extraction.ipynb +0 -0
- ocr_notebooks/table_extraction.ipynb +0 -0
- requirements.txt +16 -0
- test_extraction/input_docs/output_gemini_flash.png +0 -0
- test_extraction/input_docs/page_0.png +0 -0
- test_extraction/input_docs/page_1.png +0 -0
- test_extraction/input_docs/page_2.png +0 -0
- test_extraction/output_files/extracted_table.csv +48 -0
- test_extraction/output_files/output_gemini_pro.png +0 -0
- test_extraction/output_files/output_initial_gpt4_4o.png +0 -0
- test_extraction/output_files/output_initial_gpt4_turbo.png +0 -0
- test_extraction/output_files/output_scaled_gpt4_4o.png +0 -0
- test_extraction/output_files/output_scaled_gpt4_turbo.png +0 -0
- test_extraction/requirements_versioned.txt +1 -0
- test_extraction/sample.py +137 -0
- test_extraction/sample1.py +58 -0
- test_extraction/sample_11.py +141 -0
- utils.py +108 -0
- visual_assets/.gitkeep +0 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*cache*
|
2 |
+
.venv/
|
3 |
+
flagged/
|
4 |
+
__pycache__/
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: green
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.36.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Extract_PDF
|
3 |
+
app_file: app.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.36.1
|
|
|
|
|
6 |
---
|
|
|
|
app.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from extract_images.services import (
|
4 |
+
extract_images_pymupdf,
|
5 |
+
extract_images_pdfplumber,
|
6 |
+
extract_images_gemini,
|
7 |
+
extract_images_gpt,
|
8 |
+
)
|
9 |
+
from extract_tables.services import (
|
10 |
+
extract_tables_pymupdf,
|
11 |
+
extract_tables_tab_transformer,
|
12 |
+
extract_tables_img2table,
|
13 |
+
extract_tables_gemini,
|
14 |
+
extract_tables_gpt,
|
15 |
+
)
|
16 |
+
from utils import clear_directory
|
17 |
+
|
18 |
+
|
19 |
+
def handle_model_selection(pdf_file, model_option):
|
20 |
+
if model_option == "PyMuPDF":
|
21 |
+
images = extract_images_pymupdf(pdf_file)
|
22 |
+
tables = extract_tables_pymupdf(pdf_file)
|
23 |
+
|
24 |
+
elif model_option == "PdfPlumber (Extracts Images only)":
|
25 |
+
images = extract_images_pdfplumber(pdf_file)
|
26 |
+
tables = None
|
27 |
+
|
28 |
+
elif model_option == "Table Transformer (Extracts Tables only)":
|
29 |
+
images = None
|
30 |
+
tables = extract_tables_tab_transformer(pdf_file)
|
31 |
+
|
32 |
+
elif model_option == "img2table (Extracts Tables only)":
|
33 |
+
images = None
|
34 |
+
tables = extract_tables_img2table(pdf_file)
|
35 |
+
|
36 |
+
elif model_option == "Gemini Pro":
|
37 |
+
images = extract_images_gemini("gemini-pro-vision", pdf_file)
|
38 |
+
tables = extract_tables_gemini("gemini-pro-vision", pdf_file)
|
39 |
+
|
40 |
+
elif model_option == "Gemini Flash":
|
41 |
+
images = extract_images_gemini("gemini-1.5-flash-latest", pdf_file)
|
42 |
+
tables = extract_tables_gemini("gemini-1.5-flash-latest", pdf_file)
|
43 |
+
|
44 |
+
elif model_option == "GPT 4 Turbo":
|
45 |
+
images = extract_images_gpt("gpt-4-turbo", pdf_file)
|
46 |
+
tables = extract_tables_gpt("gpt-4-turbo", pdf_file)
|
47 |
+
|
48 |
+
elif model_option == "GPT 4o":
|
49 |
+
images = extract_images_gpt("gpt-4o", pdf_file)
|
50 |
+
tables = extract_tables_gpt("gpt-4o", pdf_file)
|
51 |
+
|
52 |
+
clear_directory("extract_tables/table_outputs")
|
53 |
+
clear_directory("extract_images/image_outputs")
|
54 |
+
return images, tables
|
55 |
+
|
56 |
+
|
57 |
+
interface = gr.Interface(
|
58 |
+
fn=handle_model_selection,
|
59 |
+
inputs=[
|
60 |
+
gr.File(type="binary", label="Upload PDF"),
|
61 |
+
gr.Dropdown(
|
62 |
+
label="Select Model",
|
63 |
+
choices=[
|
64 |
+
"PdfPlumber (Extracts Images only)",
|
65 |
+
"Table Transformer (Extracts Tables only)",
|
66 |
+
"img2table (Extracts Tables only)",
|
67 |
+
"PyMuPDF",
|
68 |
+
"Gemini Pro",
|
69 |
+
"Gemini Flash",
|
70 |
+
"GPT 4 Turbo",
|
71 |
+
"GPT 4o",
|
72 |
+
],
|
73 |
+
value="PyMuPDF",
|
74 |
+
),
|
75 |
+
],
|
76 |
+
outputs=[
|
77 |
+
gr.Gallery(label="Extracted Images"),
|
78 |
+
gr.Gallery(label="Extracted Tables"),
|
79 |
+
],
|
80 |
+
title="PDF Image and Table Extractor",
|
81 |
+
description="Upload a PDF to extract images and tables. Choose the model for extraction.",
|
82 |
+
)
|
83 |
+
|
84 |
+
interface.launch(share=True)
|
constants.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
GEMINI_API_KEY = "AIzaSyBwk94xRhPOIkvO0E3pYhXQ7Rrk5my5IyY"
|
2 |
+
OPENAI_API_KEY = "sk-proj-YOl2xepEsNppWm3xLshlT3BlbkFJL04qQgahGxFcFGEClnQK"
|
extract_images/input_docs/uploaded_pdf.pdf
ADDED
Binary file (41.4 kB). View file
|
|
extract_images/services.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymupdf
|
2 |
+
from io import BytesIO
|
3 |
+
from PIL import Image
|
4 |
+
import pdfplumber
|
5 |
+
import ast
|
6 |
+
import google.generativeai as genai
|
7 |
+
from PIL import Image, ImageDraw
|
8 |
+
import openai
|
9 |
+
import requests
|
10 |
+
|
11 |
+
from constants import GEMINI_API_KEY, OPENAI_API_KEY
|
12 |
+
from utils import (
|
13 |
+
draw_boxes,
|
14 |
+
pdf_to_images,
|
15 |
+
parse_bboxs_gemini_flash,
|
16 |
+
convert_pdf_to_images,
|
17 |
+
encode_image_to_base64,
|
18 |
+
)
|
19 |
+
|
20 |
+
|
21 |
+
def extract_images_pymupdf(pdf_file):
|
22 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
23 |
+
with open(pdf_path, "wb") as f:
|
24 |
+
f.write(pdf_file)
|
25 |
+
|
26 |
+
doc = pymupdf.open(pdf_path)
|
27 |
+
images = []
|
28 |
+
for page_idx, page in enumerate(doc):
|
29 |
+
for img_index, img in enumerate(doc.get_page_images(page_idx)):
|
30 |
+
xref = img[0]
|
31 |
+
base_image = doc.extract_image(xref)
|
32 |
+
image_bytes = base_image["image"]
|
33 |
+
image = Image.open(BytesIO(image_bytes))
|
34 |
+
images.append(image)
|
35 |
+
return images if images != [] else None
|
36 |
+
|
37 |
+
|
38 |
+
def extract_images_pdfplumber(pdf_file):
|
39 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
40 |
+
with open(pdf_path, "wb") as f:
|
41 |
+
f.write(pdf_file)
|
42 |
+
|
43 |
+
images = []
|
44 |
+
pdf_obj = pdfplumber.open(pdf_path)
|
45 |
+
for page_idx, page in enumerate(pdf_obj.pages):
|
46 |
+
page_bbox = []
|
47 |
+
for image_idx, image in enumerate(page.images):
|
48 |
+
page_height = page.height
|
49 |
+
image_bbox = (
|
50 |
+
image["x0"],
|
51 |
+
page_height - image["y1"],
|
52 |
+
image["x1"],
|
53 |
+
page_height - image["y0"],
|
54 |
+
)
|
55 |
+
page_bbox.append(image_bbox)
|
56 |
+
cropped_page = page.crop(image_bbox)
|
57 |
+
image_obj = cropped_page.to_image(resolution=400)
|
58 |
+
image_path = (
|
59 |
+
f"extract_images/image_outputs/image-{page_idx}-{image_idx}.png"
|
60 |
+
)
|
61 |
+
image_obj.save(image_path)
|
62 |
+
image = Image.open(image_path)
|
63 |
+
images.append(image)
|
64 |
+
return images if images != [] else None
|
65 |
+
|
66 |
+
|
67 |
+
def extract_images_gemini(model, pdf_file):
|
68 |
+
genai.configure(api_key=GEMINI_API_KEY)
|
69 |
+
gemini_model = genai.GenerativeModel(model)
|
70 |
+
prompt = f"Extract the bounding boxes of all the images present in this page. Return the bounding boxes as list of lists. Do not include anyother text or symbols in the output"
|
71 |
+
|
72 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
73 |
+
with open(pdf_path, "wb") as f:
|
74 |
+
f.write(pdf_file)
|
75 |
+
|
76 |
+
images = []
|
77 |
+
pdf_images = pdf_to_images(pdf_path)
|
78 |
+
for page in pdf_images:
|
79 |
+
img = Image.open(page).convert("RGB")
|
80 |
+
response = gemini_model.generate_content([img, prompt], stream=False)
|
81 |
+
response.resolve()
|
82 |
+
print(response.text)
|
83 |
+
|
84 |
+
if model == "gemini-pro-vision":
|
85 |
+
page_bbox = ast.literal_eval(response.text)
|
86 |
+
elif model == "gemini-1.5-flash-latest":
|
87 |
+
page_bbox = parse_bboxs_gemini_flash(response.text)
|
88 |
+
|
89 |
+
image = draw_boxes(page, page_bbox)
|
90 |
+
images.append(image)
|
91 |
+
return images
|
92 |
+
|
93 |
+
|
94 |
+
def extract_images_gpt(model, pdf_file):
|
95 |
+
openai.api_key = OPENAI_API_KEY
|
96 |
+
image_media_type = "image/png"
|
97 |
+
|
98 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
99 |
+
with open(pdf_path, "wb") as f:
|
100 |
+
f.write(pdf_file)
|
101 |
+
|
102 |
+
images = convert_pdf_to_images(pdf_path)
|
103 |
+
image_paths = pdf_to_images(pdf_path)
|
104 |
+
|
105 |
+
headers = {
|
106 |
+
"Content-Type": "application/json",
|
107 |
+
"Authorization": f"Bearer {openai.api_key}",
|
108 |
+
}
|
109 |
+
|
110 |
+
extracted_images = []
|
111 |
+
for page_idx, image in enumerate(images):
|
112 |
+
base64_string = encode_image_to_base64(image)
|
113 |
+
payload = {
|
114 |
+
"model": model,
|
115 |
+
"messages": [
|
116 |
+
{
|
117 |
+
"role": "user",
|
118 |
+
"content": [
|
119 |
+
{
|
120 |
+
"type": "text",
|
121 |
+
"text": "Extract bounding boxes of all the images present in this page. Return bounding boxes as liat of lists and don't provide any other text in the response.",
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"type": "image_url",
|
125 |
+
"image_url": {
|
126 |
+
"url": f"data:image/jpeg;base64,{base64_string}"
|
127 |
+
},
|
128 |
+
},
|
129 |
+
],
|
130 |
+
}
|
131 |
+
],
|
132 |
+
}
|
133 |
+
|
134 |
+
response = requests.post(
|
135 |
+
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
136 |
+
)
|
137 |
+
response_json = response.json()
|
138 |
+
|
139 |
+
if "choices" in response_json and len(response_json["choices"]) > 0:
|
140 |
+
extracted_images.append(
|
141 |
+
draw_boxes(
|
142 |
+
image_paths[page_idx],
|
143 |
+
ast.literal_eval(response_json["choices"][0]["message"]["content"]),
|
144 |
+
)
|
145 |
+
)
|
146 |
+
|
147 |
+
return extracted_images
|
extract_tables/input_docs/uploaded_pdf.pdf
ADDED
Binary file (41.4 kB). View file
|
|
extract_tables/services.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymupdf
|
2 |
+
import fitz
|
3 |
+
import io
|
4 |
+
from io import BytesIO
|
5 |
+
from PIL import Image
|
6 |
+
from transformers import AutoImageProcessor, TableTransformerForObjectDetection
|
7 |
+
import torch
|
8 |
+
import ast
|
9 |
+
import google.generativeai as genai
|
10 |
+
import openai
|
11 |
+
import requests
|
12 |
+
|
13 |
+
|
14 |
+
from constants import GEMINI_API_KEY, OPENAI_API_KEY
|
15 |
+
from utils import (
|
16 |
+
draw_boxes,
|
17 |
+
pdf_to_images,
|
18 |
+
parse_bboxs_gemini_flash,
|
19 |
+
convert_pdf_to_images,
|
20 |
+
encode_image_to_base64,
|
21 |
+
)
|
22 |
+
|
23 |
+
|
24 |
+
def get_bounding_box_pymupdf(pdf_path):
|
25 |
+
bounding_boxes = []
|
26 |
+
pages = pymupdf.open(pdf_path)
|
27 |
+
for page_num in range(len(pages)):
|
28 |
+
page = pages[page_num]
|
29 |
+
tabs = page.find_tables()
|
30 |
+
page_tables = []
|
31 |
+
for table in range(len(tabs.tables)):
|
32 |
+
page_tables.append(list(tabs.tables[table].bbox))
|
33 |
+
bounding_boxes.append(page_tables)
|
34 |
+
return bounding_boxes
|
35 |
+
|
36 |
+
|
37 |
+
def extract_tables_pymupdf(pdf_file):
|
38 |
+
pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
|
39 |
+
with open(pdf_path, "wb") as f:
|
40 |
+
f.write(pdf_file)
|
41 |
+
|
42 |
+
bounding_boxes = get_bounding_box_pymupdf(pdf_path)
|
43 |
+
pages = fitz.open(pdf_path)
|
44 |
+
|
45 |
+
tables = []
|
46 |
+
for page_num, page_tables in enumerate(bounding_boxes, start=1):
|
47 |
+
page = pages[page_num - 1]
|
48 |
+
for table_num, bbox in enumerate(page_tables, start=1):
|
49 |
+
clip = page.get_pixmap(clip=bbox, alpha=False)
|
50 |
+
img = Image.frombytes("RGB", [clip.width, clip.height], clip.samples)
|
51 |
+
img_bytes = io.BytesIO()
|
52 |
+
img.save(img_bytes, format="PNG")
|
53 |
+
img_bytes = img_bytes.getvalue()
|
54 |
+
image = Image.open(BytesIO(img_bytes))
|
55 |
+
tables.append(image)
|
56 |
+
return tables
|
57 |
+
|
58 |
+
|
59 |
+
def extract_tables_tab_transformer(pdf_file):
|
60 |
+
image_processor = AutoImageProcessor.from_pretrained(
|
61 |
+
"microsoft/table-transformer-detection"
|
62 |
+
)
|
63 |
+
model = TableTransformerForObjectDetection.from_pretrained(
|
64 |
+
"microsoft/table-transformer-detection"
|
65 |
+
)
|
66 |
+
|
67 |
+
pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
|
68 |
+
with open(pdf_path, "wb") as f:
|
69 |
+
f.write(pdf_file)
|
70 |
+
|
71 |
+
tables = []
|
72 |
+
pdf_images = pdf_to_images(pdf_path)
|
73 |
+
for page in pdf_images:
|
74 |
+
image = Image.open(page).convert("RGB")
|
75 |
+
|
76 |
+
inputs = image_processor(images=image, return_tensors="pt")
|
77 |
+
outputs = model(**inputs)
|
78 |
+
|
79 |
+
target_sizes = torch.tensor([image.size[::-1]])
|
80 |
+
results = image_processor.post_process_object_detection(
|
81 |
+
outputs, threshold=0.9, target_sizes=target_sizes
|
82 |
+
)[0]
|
83 |
+
|
84 |
+
image = draw_boxes(page, results["boxes"].tolist())
|
85 |
+
tables.append(image)
|
86 |
+
return tables
|
87 |
+
|
88 |
+
|
89 |
+
def extract_tables_img2table(pdf_file):
|
90 |
+
from img2table.document import Image
|
91 |
+
|
92 |
+
pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
|
93 |
+
with open(pdf_path, "wb") as f:
|
94 |
+
f.write(pdf_file)
|
95 |
+
|
96 |
+
tables = []
|
97 |
+
pdf_images = pdf_to_images(pdf_path)
|
98 |
+
for image_path in pdf_images:
|
99 |
+
img = Image(src=image_path)
|
100 |
+
extracted_tables = img.extract_tables()
|
101 |
+
bbox_values = [
|
102 |
+
[table.bbox.x1, table.bbox.y1, table.bbox.x2, table.bbox.y2]
|
103 |
+
for table in extracted_tables
|
104 |
+
]
|
105 |
+
|
106 |
+
image = draw_boxes(image_path, bbox_values)
|
107 |
+
tables.append(image)
|
108 |
+
return tables
|
109 |
+
|
110 |
+
|
111 |
+
def extract_tables_gemini(model, pdf_file):
|
112 |
+
genai.configure(api_key=GEMINI_API_KEY)
|
113 |
+
gemini_model = genai.GenerativeModel(model)
|
114 |
+
prompt = f"Extract the bounding boxes of all the tables present in this image. Return the bounding boxes as list of lists. Do not include anyother text or symbols in the output"
|
115 |
+
|
116 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
117 |
+
with open(pdf_path, "wb") as f:
|
118 |
+
f.write(pdf_file)
|
119 |
+
|
120 |
+
tables = []
|
121 |
+
pdf_images = pdf_to_images(pdf_path)
|
122 |
+
for page in pdf_images:
|
123 |
+
img = Image.open(page).convert("RGB")
|
124 |
+
response = gemini_model.generate_content([img, prompt], stream=False)
|
125 |
+
response.resolve()
|
126 |
+
print(response.text)
|
127 |
+
|
128 |
+
if model == "gemini-pro-vision":
|
129 |
+
page_bbox = ast.literal_eval(response.text)
|
130 |
+
elif model == "gemini-1.5-flash-latest":
|
131 |
+
page_bbox = parse_bboxs_gemini_flash(response.text)
|
132 |
+
|
133 |
+
image = draw_boxes(page, page_bbox)
|
134 |
+
tables.append(image)
|
135 |
+
return tables
|
136 |
+
|
137 |
+
|
138 |
+
def extract_tables_gpt(model, pdf_file):
|
139 |
+
openai.api_key = OPENAI_API_KEY
|
140 |
+
image_media_type = "image/png"
|
141 |
+
|
142 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
143 |
+
with open(pdf_path, "wb") as f:
|
144 |
+
f.write(pdf_file)
|
145 |
+
|
146 |
+
images = convert_pdf_to_images(pdf_path)
|
147 |
+
image_paths = pdf_to_images(pdf_path)
|
148 |
+
|
149 |
+
headers = {
|
150 |
+
"Content-Type": "application/json",
|
151 |
+
"Authorization": f"Bearer {openai.api_key}",
|
152 |
+
}
|
153 |
+
|
154 |
+
extracted_tables = []
|
155 |
+
for page_idx, image in enumerate(images):
|
156 |
+
base64_string = encode_image_to_base64(image)
|
157 |
+
payload = {
|
158 |
+
"model": model,
|
159 |
+
"messages": [
|
160 |
+
{
|
161 |
+
"role": "user",
|
162 |
+
"content": [
|
163 |
+
{
|
164 |
+
"type": "text",
|
165 |
+
"text": "Extract bounding boxes of all the tables present in this page. Return bounding boxes as liat of lists and don't provide any other text in the response.",
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"type": "image_url",
|
169 |
+
"image_url": {
|
170 |
+
"url": f"data:image/jpeg;base64,{base64_string}"
|
171 |
+
},
|
172 |
+
},
|
173 |
+
],
|
174 |
+
}
|
175 |
+
],
|
176 |
+
}
|
177 |
+
|
178 |
+
response = requests.post(
|
179 |
+
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
180 |
+
)
|
181 |
+
response_json = response.json()
|
182 |
+
|
183 |
+
if "choices" in response_json and len(response_json["choices"]) > 0:
|
184 |
+
extracted_tables.append(
|
185 |
+
draw_boxes(
|
186 |
+
image_paths[page_idx],
|
187 |
+
ast.literal_eval(response_json["choices"][0]["message"]["content"]),
|
188 |
+
)
|
189 |
+
)
|
190 |
+
|
191 |
+
return extracted_tables
|
ocr_notebooks/OCR_Benchmarking.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ocr_notebooks/image_extraction.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ocr_notebooks/table_extraction.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
black
|
2 |
+
numpy
|
3 |
+
pytest
|
4 |
+
gradio
|
5 |
+
pdf2image
|
6 |
+
tensorflow
|
7 |
+
pytesseract
|
8 |
+
opencv-python
|
9 |
+
python-resize-image
|
10 |
+
google-generativeai
|
11 |
+
openai
|
12 |
+
pdfplumber
|
13 |
+
pymupdf
|
14 |
+
timm
|
15 |
+
transformers
|
16 |
+
img2table
|
test_extraction/input_docs/output_gemini_flash.png
ADDED
test_extraction/input_docs/page_0.png
ADDED
test_extraction/input_docs/page_1.png
ADDED
test_extraction/input_docs/page_2.png
ADDED
test_extraction/output_files/extracted_table.csv
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2-13,,,,
|
2 |
+
02,,,,
|
3 |
+
7,5(63(&,),&$7,21$1'
|
4 |
+
35(6685(/$%(/,,,,
|
5 |
+
OAI3019023,,,,
|
6 |
+
OAI3019023,,,,
|
7 |
+
The tires supplied on your new,,,,
|
8 |
+
vehicle are chosen to provide the best,,,,
|
9 |
+
performance for normal driving.,,,,
|
10 |
+
The tire label located on the driver’s,,,,
|
11 |
+
side center pillar gives the tire pressures,,,,
|
12 |
+
recommended for your vehicle.,,,,
|
13 |
+
(1*,1(180%(5,,,
|
14 |
+
OAH2088004,,,,
|
15 |
+
OAH2088004,,,,
|
16 |
+
The engine number is stamped on the,,,,
|
17 |
+
engine block as shown in the drawing.,,,,
|
18 |
+
10,,,,
|
19 |
+
SRS warning indicator in the instrument,,,,
|
20 |
+
cluster illuminates continuously, it means,,,
|
21 |
+
that there is malfunction in the system. Re,,,,
|
22 |
+
move the CRS from front passenger seat,,,,
|
23 |
+
and contact your TATA MOTORS autho,,,,
|
24 |
+
rised service center.,,,,
|
25 |
+
NOTE,,,,
|
26 |
+
The above image’s are for reference,,,,
|
27 |
+
purpose only.,,,,
|
28 |
+
SAFETY,,,,
|
29 |
+
2-6,,,,
|
30 |
+
Vehicle Information,,,,
|
31 |
+
1. Engine coolant reservoir ..............................................................................................................9-23,,,,
|
32 |
+
2. Engine oil filler cap .....................................................................................................................9-20,,,,
|
33 |
+
3. Brake/clutch* fluid reservoir,,,,
|
34 |
+
......................................................................................................9-26,,,,
|
35 |
+
4. Air cleaner ...................................................................................................................................9-30,,,,
|
36 |
+
5. Fuse box ......................................................................................................................................9-54,,,,
|
37 |
+
6. Battery .........................................................................................................................................9-37,,,,
|
38 |
+
7. Windshield washer fluid reservoir ..............................................................................................9-28,,,,
|
39 |
+
8. Radiator cap,,,,
|
40 |
+
.................................................................................................................................9-24,,,,
|
41 |
+
9. Engine oil dipstick .......................................................................................................................9-20,,,,
|
42 |
+
(1*,1(&203$570(1729(59,(:,,
|
43 |
+
The actual engine room in the vehicle may differ from the illustration.,,,,
|
44 |
+
OAI3089001,,,,
|
45 |
+
OAI3089001,,,,
|
46 |
+
,,,,
|
47 |
+
Petrol Engine (Kappa 1.2 MPI),,,,
|
48 |
+
Petrol Engine (Kappa 1.2 MPI),,,,
|
test_extraction/output_files/output_gemini_pro.png
ADDED
test_extraction/output_files/output_initial_gpt4_4o.png
ADDED
test_extraction/output_files/output_initial_gpt4_turbo.png
ADDED
test_extraction/output_files/output_scaled_gpt4_4o.png
ADDED
test_extraction/output_files/output_scaled_gpt4_turbo.png
ADDED
test_extraction/requirements_versioned.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
��
|
test_extraction/sample.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import gradio as gr
|
2 |
+
# import fitz # PyMuPDF
|
3 |
+
# from PIL import Image
|
4 |
+
# from io import BytesIO
|
5 |
+
# import pandas as pd
|
6 |
+
# import os
|
7 |
+
|
8 |
+
|
9 |
+
# def extract_images_and_tables(pdf_file):
|
10 |
+
|
11 |
+
# pdf_path = "temp.pdf"
|
12 |
+
# with open(pdf_path, "wb") as f:
|
13 |
+
# f.write(pdf_file)
|
14 |
+
|
15 |
+
|
16 |
+
# pdf_document = fitz.open(pdf_path)
|
17 |
+
|
18 |
+
|
19 |
+
# images = []
|
20 |
+
# for page_index in range(len(pdf_document)):
|
21 |
+
# for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
|
22 |
+
# xref = img[0]
|
23 |
+
# base_image = pdf_document.extract_image(xref)
|
24 |
+
# image_bytes = base_image["image"]
|
25 |
+
# image = Image.open(BytesIO(image_bytes))
|
26 |
+
# images.append(image)
|
27 |
+
|
28 |
+
|
29 |
+
# tables = []
|
30 |
+
# for page_num in range(len(pdf_document)):
|
31 |
+
# page = pdf_document.load_page(page_num)
|
32 |
+
# text = page.get_text("text")
|
33 |
+
|
34 |
+
# lines = [line.strip() for line in text.split("\n") if line.strip()]
|
35 |
+
|
36 |
+
# if any("," in line for line in lines):
|
37 |
+
|
38 |
+
# rows = [line.split(",") for line in lines]
|
39 |
+
|
40 |
+
# tables.extend(rows)
|
41 |
+
|
42 |
+
|
43 |
+
# table_content = ""
|
44 |
+
# if tables:
|
45 |
+
# max_columns = max(len(row) for row in tables)
|
46 |
+
# tables = [row + [""] * (max_columns - len(row)) for row in tables]
|
47 |
+
# df = pd.DataFrame(tables[1:], columns=tables[0])
|
48 |
+
# table_content = df.to_csv(index=False)
|
49 |
+
|
50 |
+
|
51 |
+
# pdf_document.close()
|
52 |
+
|
53 |
+
# # Remove the temporary PDF file
|
54 |
+
# os.remove(pdf_path)
|
55 |
+
|
56 |
+
# return images, table_content
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
# interface = gr.Interface(
|
61 |
+
# fn=extract_images_and_tables,
|
62 |
+
# inputs=gr.File(type="binary"),
|
63 |
+
# outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
|
64 |
+
# title="PDF Image and Table Extractor",
|
65 |
+
# description="Upload a PDF to extract images and tables."
|
66 |
+
# )
|
67 |
+
|
68 |
+
|
69 |
+
# interface.launch(share=True)
|
70 |
+
import gradio as gr
|
71 |
+
import fitz # PyMuPDF
|
72 |
+
from PIL import Image
|
73 |
+
from io import BytesIO
|
74 |
+
import pandas as pd
|
75 |
+
import os
|
76 |
+
|
77 |
+
|
78 |
+
def extract_images_and_tables(pdf_file, model_option):
|
79 |
+
pdf_path = "temp.pdf"
|
80 |
+
with open(pdf_path, "wb") as f:
|
81 |
+
f.write(pdf_file)
|
82 |
+
|
83 |
+
pdf_document = fitz.open(pdf_path)
|
84 |
+
|
85 |
+
images = []
|
86 |
+
for page_index in range(len(pdf_document)):
|
87 |
+
for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
|
88 |
+
xref = img[0]
|
89 |
+
base_image = pdf_document.extract_image(xref)
|
90 |
+
image_bytes = base_image["image"]
|
91 |
+
image = Image.open(BytesIO(image_bytes))
|
92 |
+
images.append(image)
|
93 |
+
|
94 |
+
tables = []
|
95 |
+
for page_num in range(len(pdf_document)):
|
96 |
+
page = pdf_document.load_page(page_num)
|
97 |
+
text = page.get_text("text")
|
98 |
+
|
99 |
+
lines = [line.strip() for line in text.split("\n") if line.strip()]
|
100 |
+
|
101 |
+
if any("," in line for line in lines):
|
102 |
+
|
103 |
+
rows = [line.split(",") for line in lines]
|
104 |
+
|
105 |
+
tables.extend(rows)
|
106 |
+
|
107 |
+
table_content = ""
|
108 |
+
if tables:
|
109 |
+
max_columns = max(len(row) for row in tables)
|
110 |
+
tables = [row + [""] * (max_columns - len(row)) for row in tables]
|
111 |
+
df = pd.DataFrame(tables[1:], columns=tables[0])
|
112 |
+
table_content = df.to_csv(index=False)
|
113 |
+
|
114 |
+
pdf_document.close()
|
115 |
+
|
116 |
+
os.remove(pdf_path)
|
117 |
+
|
118 |
+
return images, table_content
|
119 |
+
|
120 |
+
|
121 |
+
def handle_model_selection(pdf_file, model_option):
|
122 |
+
|
123 |
+
return extract_images_and_tables(pdf_file, model_option)
|
124 |
+
|
125 |
+
|
126 |
+
interface = gr.Interface(
|
127 |
+
fn=handle_model_selection,
|
128 |
+
inputs=[
|
129 |
+
gr.File(type="binary", label="Upload PDF"),
|
130 |
+
gr.Dropdown(label="Select Model", choices=["Model 1", "Model 2", "Model 3"], value="Model 1")
|
131 |
+
],
|
132 |
+
outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
|
133 |
+
title="PDF Image and Table Extractor",
|
134 |
+
description="Upload a PDF to extract images and tables. Choose the model for extraction."
|
135 |
+
)
|
136 |
+
|
137 |
+
interface.launch(share=True)
|
test_extraction/sample1.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from google.cloud import vision
|
3 |
+
from PIL import Image
|
4 |
+
import pandas as pd
|
5 |
+
import os
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
|
9 |
+
def extract_tables_with_google_vision(image_file):
|
10 |
+
# Initialize Google Cloud Vision client
|
11 |
+
client = vision.ImageAnnotatorClient()
|
12 |
+
|
13 |
+
# Read the image file
|
14 |
+
with BytesIO(image_file) as image_stream:
|
15 |
+
image = Image.open(image_stream)
|
16 |
+
# Convert image to bytes
|
17 |
+
img_bytes = image_stream.getvalue()
|
18 |
+
|
19 |
+
# Perform text detection on the image
|
20 |
+
image = vision.Image(content=img_bytes)
|
21 |
+
response = client.text_detection(image=image)
|
22 |
+
texts = response.text_annotations
|
23 |
+
|
24 |
+
# Extract text lines
|
25 |
+
lines = [text.description for text in texts]
|
26 |
+
|
27 |
+
# Check if lines resemble a table (e.g., have commas)
|
28 |
+
tables = []
|
29 |
+
is_table = False
|
30 |
+
table_rows = []
|
31 |
+
for line in lines:
|
32 |
+
if "," in line: # Assuming comma-separated values indicate a table
|
33 |
+
is_table = True
|
34 |
+
table_rows.append([cell.strip() for cell in line.split(",")])
|
35 |
+
else:
|
36 |
+
if is_table:
|
37 |
+
tables.extend(table_rows)
|
38 |
+
is_table = False
|
39 |
+
table_rows = []
|
40 |
+
|
41 |
+
table_content = ""
|
42 |
+
if tables:
|
43 |
+
df = pd.DataFrame(tables[1:], columns=tables[0])
|
44 |
+
table_content = df.to_csv(index=False)
|
45 |
+
|
46 |
+
return table_content
|
47 |
+
|
48 |
+
|
49 |
+
interface = gr.Interface(
|
50 |
+
fn=extract_tables_with_google_vision,
|
51 |
+
inputs=gr.Image(type="pil", label="Upload a PDF page image"),
|
52 |
+
outputs=gr.Textbox(label="Extracted Tables"),
|
53 |
+
title="PDF Table Extractor with Google Cloud Vision",
|
54 |
+
description="Upload an image of a PDF page to extract tables.",
|
55 |
+
allow_flagging=False
|
56 |
+
)
|
57 |
+
|
58 |
+
interface.launch()
|
test_extraction/sample_11.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
from PIL import Image, ImageDraw
|
4 |
+
from io import BytesIO
|
5 |
+
import pandas as pd
|
6 |
+
import os
|
7 |
+
import numpy as np
|
8 |
+
import google.generativeai as genai
|
9 |
+
import openai
|
10 |
+
import base64
|
11 |
+
import requests
|
12 |
+
import tempfile
|
13 |
+
import ast
|
14 |
+
|
15 |
+
genai.configure(api_key="AIzaSyBwk94xRhPOIkvO0E3pYhXQ7Rrk5my5IyY")
|
16 |
+
openai.api_key = "sk-proj-YOl2xepEsNppWm3xLshlT3BlbkFJL04qQgahGxFcFGEClnQK"
|
17 |
+
|
18 |
+
import gradio as gr
|
19 |
+
import fitz # PyMuPDF
|
20 |
+
from PIL import Image
|
21 |
+
from io import BytesIO
|
22 |
+
import pandas as pd
|
23 |
+
import numpy as np
|
24 |
+
import tempfile
|
25 |
+
|
26 |
+
|
27 |
+
# Define the model extraction functions
|
28 |
+
def extract_bounding_box_pymupdf(pdf_content):
    """Return per-page bounding boxes for every embedded image in a PDF.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        A list with one entry per page; each entry is a list of
        [x0, y0, x1, y1] rectangles for the images found on that page.
    """
    bounding_boxes = []
    # Context manager guarantees the document is closed even if a page
    # raises mid-loop (the original only closed on the success path) and
    # drops the unused image_list/image_index locals.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_file:
        for page in pdf_file:
            page_bbox = []
            for img in page.get_images(full=True):
                # img[7] is the image's name, which get_image_bbox accepts.
                rect = page.get_image_bbox(img[7])
                page_bbox.append(list(rect))
            bounding_boxes.append(page_bbox)
    return bounding_boxes
|
45 |
+
|
46 |
+
|
47 |
+
def extract_bounding_boxes_gemini(api_key, images):
    """Stub for Gemini-based bounding-box extraction.

    Until the real API integration lands, yields one dummy
    (0, 0, 100, 100) box list per input image; *api_key* is unused.
    """
    dummy_page = [(0, 0, 100, 100)]
    return [dummy_page for _ in images]
|
51 |
+
|
52 |
+
|
53 |
+
def extract_bounding_box_gpt(api_key, pdf_content):
    """Stub for GPT-4-based bounding-box extraction.

    Until the real API integration lands, yields one dummy
    (0, 0, 100, 100) box list per PDF page; *api_key* is unused.
    """
    # The original opened the document just to count pages and never closed
    # it; a context manager releases the handle deterministically.
    with fitz.open(stream=pdf_content, filetype="pdf") as doc:
        page_count = len(doc)
    return [[(0, 0, 100, 100)]] * page_count
|
59 |
+
|
60 |
+
|
61 |
+
def _read_pdf_bytes(pdf_file):
    """Normalize the Gradio input (filepath str or raw bytes) to PDF bytes."""
    if isinstance(pdf_file, str):
        # File path (Gradio's filepath mode, tests, local execution).
        with open(pdf_file, "rb") as f:
            return f.read()
    if isinstance(pdf_file, bytes):
        return pdf_file
    raise TypeError("Unsupported input type for pdf_file.")


def _extract_embedded_images(pdf_document):
    """Collect every embedded image in the document as a PIL Image."""
    images = []
    for page_index in range(len(pdf_document)):
        for img in pdf_document.get_page_images(page_index):
            xref = img[0]  # cross-reference number of the image object
            base_image = pdf_document.extract_image(xref)
            images.append(Image.open(BytesIO(base_image["image"])))
    return images


def _extract_tables_csv(pdf_document):
    """Heuristically pull comma-separated rows out of page text as CSV.

    Returns "" when no page contains a comma.
    """
    tables = []
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text = page.get_text("text")
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        # Crude heuristic: if any line on the page has a comma, treat every
        # line of that page as a table row.
        if any("," in line for line in lines):
            tables.extend(line.split(",") for line in lines)

    if not tables:
        return ""
    # Pad ragged rows so pandas accepts them; the first row is the header.
    max_columns = max(len(row) for row in tables)
    tables = [row + [""] * (max_columns - len(row)) for row in tables]
    df = pd.DataFrame(tables[1:], columns=tables[0])
    return df.to_csv(index=False)


def extract_images_and_tables(pdf_file, model_option):
    """Extract embedded images, a CSV table guess, and bounding boxes from a PDF.

    Args:
        pdf_file: Path to a PDF (str) or raw PDF bytes.
        model_option: "PyMuPDF", "Gemini", or "GPT-4" — selects the
            bounding-box backend; anything else yields no boxes.

    Returns:
        (images, table_content, bounding_boxes) — list of PIL Images,
        CSV text (possibly ""), and per-page bounding boxes.

    Raises:
        TypeError: if *pdf_file* is neither str nor bytes.
    """
    pdf_bytes = _read_pdf_bytes(pdf_file)
    # ``with`` closes the document even if extraction raises; the original
    # leaked the handle on any exception before its explicit close().
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        images = _extract_embedded_images(pdf_document)
        table_content = _extract_tables_csv(pdf_document)

    if model_option == "PyMuPDF":
        bounding_boxes = extract_bounding_box_pymupdf(pdf_bytes)
    elif model_option == "Gemini":
        bounding_boxes = extract_bounding_boxes_gemini(
            "your_gemini_api_key_here", images
        )
    elif model_option == "GPT-4":
        bounding_boxes = extract_bounding_box_gpt("your_gpt4_api_key_here", pdf_bytes)
    else:
        bounding_boxes = []

    return images, table_content, bounding_boxes
|
115 |
+
|
116 |
+
|
117 |
+
def handle_model_selection(pdf_file, model_option):
    """Gradio callback: delegate to the pipeline chosen in the dropdown."""
    images, table_content, bounding_boxes = extract_images_and_tables(
        pdf_file, model_option
    )
    return images, table_content, bounding_boxes
|
119 |
+
|
120 |
+
|
121 |
+
# Gradio front-end: a PDF upload plus a model selector, displaying the
# extracted images, the table CSV text, and the bounding boxes.
_pdf_input = gr.File(type="filepath", label="Upload PDF")
_model_dropdown = gr.Dropdown(
    label="Select Model",
    choices=["PyMuPDF", "Gemini", "GPT-4"],
    value="PyMuPDF",
)

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[_pdf_input, _model_dropdown],
    outputs=[
        gr.Gallery(label="Extracted Images"),
        gr.Textbox(label="Extracted Tables"),
        gr.JSON(label="Extracted Bounding Boxes"),
    ],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

interface.launch(share=True)
|
utils.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from PIL import Image, ImageDraw
|
3 |
+
from pdf2image import convert_from_path
|
4 |
+
import os
|
5 |
+
import shutil
|
6 |
+
import re
|
7 |
+
import fitz
|
8 |
+
import base64
|
9 |
+
|
10 |
+
|
11 |
+
def draw_boxes(image_path, boxes):
    """Open the image at *image_path* and outline each box in *boxes* in red.

    Returns the annotated PIL image (the file on disk is not modified).
    """
    annotated = Image.open(image_path)
    pen = ImageDraw.Draw(annotated)
    for bbox in boxes:
        pen.rectangle(bbox, outline="red", width=2)
    return annotated
|
19 |
+
|
20 |
+
|
21 |
+
def pdf_to_images(pdf_path, output_dir="extract_tables/table_outputs"):
    """Render each page of *pdf_path* to a PNG file on disk.

    Args:
        pdf_path: Path of the PDF to render.
        output_dir: Directory to write the page images into (created if
            missing). Defaults to the historical hard-coded location, so
            existing callers are unaffected.

    Returns:
        List of written image file paths, in page order.
    """
    # Creating the directory up front avoids a FileNotFoundError on save —
    # the original crashed when the output folder did not already exist.
    os.makedirs(output_dir, exist_ok=True)
    images = convert_from_path(pdf_path)

    image_paths = []
    for idx, image in enumerate(images):
        image_file_path = f"{output_dir}/pdf-image-{idx + 1}.png"
        image.save(image_file_path, format="PNG")
        image_paths.append(image_file_path)

    return image_paths
|
31 |
+
|
32 |
+
|
33 |
+
def parse_bboxs_gemini_flash(input_string):
    """Parse Gemini Flash text output into integer bounding boxes.

    Each non-empty line of *input_string* contributes one box made of all
    the unsigned integers found on that line (a digit-free line yields []).
    """
    digit_pattern = re.compile(r"\d+")
    boxes = []
    for line in input_string.strip().split("\n"):
        if not line:
            continue
        boxes.append([int(token) for token in digit_pattern.findall(line)])
    return boxes
|
37 |
+
|
38 |
+
|
39 |
+
def convert_pdf_to_images(pdf_path):
    """Rasterize every page of the PDF at *pdf_path*.

    Returns one fitz Pixmap per page, in page order; the document is
    closed before returning.
    """
    pixmaps = []
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            pixmaps.append(doc.load_page(page_number).get_pixmap())
    return pixmaps
|
47 |
+
|
48 |
+
|
49 |
+
def encode_image_to_base64(image):
    """Base64-encode an image object's byte buffer as a UTF-8 string.

    NOTE(review): this relies on ``image.tobytes()``. For a fitz Pixmap
    that call defaults to PNG-encoded bytes, but for a PIL Image it yields
    raw pixel data, which is not a valid image file — confirm which type
    callers actually pass before sending the result to an API.
    """
    raw = image.tobytes()
    return base64.b64encode(raw).decode("utf-8")
|
54 |
+
|
55 |
+
|
56 |
+
def calculate_scaling_factors(groundtruth_boxes, extracted_boxes):
    """Compute the mean x/y scale factors from ground-truth to extracted boxes.

    Args:
        groundtruth_boxes: Sequence of (xmin, ymin, xmax, ymax) boxes.
        extracted_boxes: Sequence of (xmin, ymin, xmax, ymax) boxes, paired
            one-to-one with *groundtruth_boxes*.

    Returns:
        (x_scale, y_scale): the mean of the extracted/ground-truth width and
        height ratios across all pairs.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    # A real exception instead of ``assert``: asserts are stripped under
    # ``python -O`` and must not be used for input validation.
    if len(groundtruth_boxes) != len(extracted_boxes):
        raise ValueError("Mismatch in the number of bounding boxes.")

    x_factors = []
    y_factors = []

    for gt_box, ext_box in zip(groundtruth_boxes, extracted_boxes):
        gt_xmin, gt_ymin, gt_xmax, gt_ymax = gt_box
        ext_xmin, ext_ymin, ext_xmax, ext_ymax = ext_box

        x_factors.append((ext_xmax - ext_xmin) / (gt_xmax - gt_xmin))
        y_factors.append((ext_ymax - ext_ymin) / (gt_ymax - gt_ymin))

    x_scale = np.mean(x_factors)
    y_scale = np.mean(y_factors)

    return x_scale, y_scale
|
80 |
+
|
81 |
+
|
82 |
+
def scale_bounding_boxes(extracted_boxes, scaling_factors):
    """Divide every box coordinate by the per-axis scaling factors.

    *extracted_boxes* is a list of pages, each a list of
    [xmin, ymin, xmax, ymax] boxes; *scaling_factors* is (x_scale, y_scale).
    Returns a new nested list; the input is left untouched.
    """
    x_scale = scaling_factors[0]
    y_scale = scaling_factors[1]
    return [
        [
            [box[0] / x_scale, box[1] / y_scale, box[2] / x_scale, box[3] / y_scale]
            for box in page_boxes
        ]
        for page_boxes in extracted_boxes
    ]
|
96 |
+
|
97 |
+
|
98 |
+
def clear_directory(directory_path):
    """Delete every entry inside *directory_path*, keeping the directory itself.

    A missing directory is a no-op. Individual deletion failures are printed
    and skipped so one bad entry does not abort the sweep.
    """
    if not os.path.exists(directory_path):
        return
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            # Real directories are removed recursively; files and symlinks
            # (including symlinks to directories) are unlinked. Other entry
            # types (e.g. fifos) are deliberately left alone, as before.
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                shutil.rmtree(file_path)
            elif os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")
|
visual_assets/.gitkeep
ADDED
File without changes
|