shimer56 commited on
Commit
d2cb17f
1 Parent(s): 50d37cb

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *cache*
2
+ .venv/
3
+ flagged/
4
+ __pycache__/
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Extract PDF
3
- emoji: 💻
4
- colorFrom: gray
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.36.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Extract_PDF
3
+ app_file: app.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.36.1
 
 
6
  ---
 
 
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from extract_images.services import (
4
+ extract_images_pymupdf,
5
+ extract_images_pdfplumber,
6
+ extract_images_gemini,
7
+ extract_images_gpt,
8
+ )
9
+ from extract_tables.services import (
10
+ extract_tables_pymupdf,
11
+ extract_tables_tab_transformer,
12
+ extract_tables_img2table,
13
+ extract_tables_gemini,
14
+ extract_tables_gpt,
15
+ )
16
+ from utils import clear_directory
17
+
18
+
19
def handle_model_selection(pdf_file, model_option):
    """Run image and table extraction on *pdf_file* with the selected backend.

    Args:
        pdf_file: Raw PDF bytes coming from the Gradio ``gr.File`` input.
        model_option: One of the dropdown choice strings defined in the UI.

    Returns:
        Tuple ``(images, tables)``; either element may be ``None`` when the
        chosen backend does not support that extraction type, or when the
        option is unrecognized.
    """
    # Defaults guard against an unknown option — the original raised
    # UnboundLocalError if no elif branch matched.
    images, tables = None, None

    if model_option == "PyMuPDF":
        images = extract_images_pymupdf(pdf_file)
        tables = extract_tables_pymupdf(pdf_file)

    elif model_option == "PdfPlumber (Extracts Images only)":
        images = extract_images_pdfplumber(pdf_file)

    elif model_option == "Table Transformer (Extracts Tables only)":
        tables = extract_tables_tab_transformer(pdf_file)

    elif model_option == "img2table (Extracts Tables only)":
        tables = extract_tables_img2table(pdf_file)

    elif model_option == "Gemini Pro":
        images = extract_images_gemini("gemini-pro-vision", pdf_file)
        tables = extract_tables_gemini("gemini-pro-vision", pdf_file)

    elif model_option == "Gemini Flash":
        images = extract_images_gemini("gemini-1.5-flash-latest", pdf_file)
        tables = extract_tables_gemini("gemini-1.5-flash-latest", pdf_file)

    elif model_option == "GPT 4 Turbo":
        images = extract_images_gpt("gpt-4-turbo", pdf_file)
        tables = extract_tables_gpt("gpt-4-turbo", pdf_file)

    elif model_option == "GPT 4o":
        images = extract_images_gpt("gpt-4o", pdf_file)
        tables = extract_tables_gpt("gpt-4o", pdf_file)

    # Clear intermediate page renders so a previous upload's files never
    # leak into the next request's galleries.
    clear_directory("extract_tables/table_outputs")
    clear_directory("extract_images/image_outputs")
    return images, tables
55
+
56
+
57
# Gradio UI wiring: one PDF upload plus a backend selector, rendering the
# extraction results into two image galleries.
interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[
        # type="binary" delivers raw bytes, which the extraction services
        # write to disk themselves.
        gr.File(type="binary", label="Upload PDF"),
        gr.Dropdown(
            label="Select Model",
            choices=[
                "PdfPlumber (Extracts Images only)",
                "Table Transformer (Extracts Tables only)",
                "img2table (Extracts Tables only)",
                "PyMuPDF",
                "Gemini Pro",
                "Gemini Flash",
                "GPT 4 Turbo",
                "GPT 4o",
            ],
            # Local backend as default — needs no API key to run.
            value="PyMuPDF",
        ),
    ],
    outputs=[
        gr.Gallery(label="Extracted Images"),
        gr.Gallery(label="Extracted Tables"),
    ],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

# share=True exposes a temporary public gradio.live URL in addition to the
# local server.
interface.launch(share=True)
constants.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
import os

# Secrets are read from the environment instead of being hard-coded:
# committing live API keys to version control leaks them permanently (they
# remain in git history even after removal) and forces key rotation.
# Empty-string defaults keep imports working in keyless local runs.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
extract_images/input_docs/uploaded_pdf.pdf ADDED
Binary file (41.4 kB). View file
 
extract_images/services.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf
2
+ from io import BytesIO
3
+ from PIL import Image
4
+ import pdfplumber
5
+ import ast
6
+ import google.generativeai as genai
7
+ from PIL import Image, ImageDraw
8
+ import openai
9
+ import requests
10
+
11
+ from constants import GEMINI_API_KEY, OPENAI_API_KEY
12
+ from utils import (
13
+ draw_boxes,
14
+ pdf_to_images,
15
+ parse_bboxs_gemini_flash,
16
+ convert_pdf_to_images,
17
+ encode_image_to_base64,
18
+ )
19
+
20
+
21
def extract_images_pymupdf(pdf_file):
    """Extract the images embedded in a PDF using PyMuPDF.

    Args:
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL images, or ``None`` when the PDF embeds no images.
    """
    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = []
    # Context manager closes the document even on error — the original
    # leaked the file handle.
    with pymupdf.open(pdf_path) as doc:
        for page_idx in range(len(doc)):
            for img in doc.get_page_images(page_idx):
                xref = img[0]  # first tuple entry is the image's xref id
                image_bytes = doc.extract_image(xref)["image"]
                images.append(Image.open(BytesIO(image_bytes)))
    return images or None
36
+
37
+
38
def extract_images_pdfplumber(pdf_file):
    """Crop every embedded image region out of a PDF with pdfplumber.

    Args:
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL images, or ``None`` when no images were found.
    """
    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = []
    # Context manager closes the PDF handle (original leaked it); the dead
    # per-page `page_bbox` accumulator was removed.
    with pdfplumber.open(pdf_path) as pdf_obj:
        for page_idx, page in enumerate(pdf_obj.pages):
            for image_idx, image_meta in enumerate(page.images):
                # pdfplumber image y-coordinates grow upward while crop()
                # expects top-left-origin coordinates, hence the flip
                # against the page height.
                image_bbox = (
                    image_meta["x0"],
                    page.height - image_meta["y1"],
                    image_meta["x1"],
                    page.height - image_meta["y0"],
                )
                cropped_page = page.crop(image_bbox)
                image_path = (
                    f"extract_images/image_outputs/image-{page_idx}-{image_idx}.png"
                )
                cropped_page.to_image(resolution=400).save(image_path)
                images.append(Image.open(image_path))
    return images or None
65
+
66
+
67
def extract_images_gemini(model, pdf_file):
    """Ask a Gemini vision model for image bounding boxes on each page.

    Args:
        model: Gemini model id ("gemini-pro-vision" or
            "gemini-1.5-flash-latest").
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL page images with the detected boxes drawn on.

    Raises:
        ValueError: If *model* is not one of the supported ids.
    """
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel(model)
    # Plain string — the original used an f-string with no placeholders.
    prompt = "Extract the bounding boxes of all the images present in this page. Return the bounding boxes as list of lists. Do not include anyother text or symbols in the output"

    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = []
    for page in pdf_to_images(pdf_path):
        img = Image.open(page).convert("RGB")
        response = gemini_model.generate_content([img, prompt], stream=False)
        response.resolve()

        # The two supported models answer in different formats.
        if model == "gemini-pro-vision":
            page_bbox = ast.literal_eval(response.text)
        elif model == "gemini-1.5-flash-latest":
            page_bbox = parse_bboxs_gemini_flash(response.text)
        else:
            # Original raised UnboundLocalError here; fail explicitly instead.
            raise ValueError(f"Unsupported Gemini model: {model}")

        images.append(draw_boxes(page, page_bbox))
    return images
92
+
93
+
94
def extract_images_gpt(model, pdf_file):
    """Ask an OpenAI vision model for image bounding boxes on each page.

    Args:
        model: Chat-completions model id (e.g. "gpt-4o", "gpt-4-turbo").
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL page images with detected boxes drawn on; pages whose
        API response carries no choices are silently skipped.
    """
    openai.api_key = OPENAI_API_KEY
    # Removed unused `image_media_type` local.

    pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = convert_pdf_to_images(pdf_path)  # pixmaps sent to the API
    image_paths = pdf_to_images(pdf_path)  # PNG files used for drawing

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}",
    }

    extracted_images = []
    for page_idx, image in enumerate(images):
        base64_string = encode_image_to_base64(image)
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            # "liat of lists" typo fixed so the model receives
                            # a clean instruction.
                            "text": "Extract bounding boxes of all the images present in this page. Return bounding boxes as list of lists and don't provide any other text in the response.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_string}"
                            },
                        },
                    ],
                }
            ],
        }

        # A timeout prevents a hung HTTP request from blocking the UI forever.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )
        response_json = response.json()

        if "choices" in response_json and len(response_json["choices"]) > 0:
            extracted_images.append(
                draw_boxes(
                    image_paths[page_idx],
                    ast.literal_eval(response_json["choices"][0]["message"]["content"]),
                )
            )

    return extracted_images
extract_tables/input_docs/uploaded_pdf.pdf ADDED
Binary file (41.4 kB). View file
 
extract_tables/services.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf
2
+ import fitz
3
+ import io
4
+ from io import BytesIO
5
+ from PIL import Image
6
+ from transformers import AutoImageProcessor, TableTransformerForObjectDetection
7
+ import torch
8
+ import ast
9
+ import google.generativeai as genai
10
+ import openai
11
+ import requests
12
+
13
+
14
+ from constants import GEMINI_API_KEY, OPENAI_API_KEY
15
+ from utils import (
16
+ draw_boxes,
17
+ pdf_to_images,
18
+ parse_bboxs_gemini_flash,
19
+ convert_pdf_to_images,
20
+ encode_image_to_base64,
21
+ )
22
+
23
+
24
def get_bounding_box_pymupdf(pdf_path):
    """Return, per page, the bounding boxes of tables PyMuPDF detects.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        ``list[list[list[float]]]`` — one inner list of ``[x0, y0, x1, y1]``
        boxes for each page (empty when a page has no tables).
    """
    doc = pymupdf.open(pdf_path)
    boxes_per_page = []
    for page in doc:
        detected = page.find_tables()
        boxes_per_page.append([list(tab.bbox) for tab in detected.tables])
    return boxes_per_page
35
+
36
+
37
def extract_tables_pymupdf(pdf_file):
    """Render each table PyMuPDF detects to a PIL image.

    Args:
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL images, one per detected table (may be empty).
    """
    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    bounding_boxes = get_bounding_box_pymupdf(pdf_path)

    tables = []
    # Context manager closes the document (the original leaked the handle).
    # The PNG encode/decode round-trip through BytesIO was dropped — the
    # pixmap already holds the rendered RGB pixels, and Image.frombytes
    # copies them.
    with fitz.open(pdf_path) as pages:
        for page_num, page_tables in enumerate(bounding_boxes):
            page = pages[page_num]
            for bbox in page_tables:
                clip = page.get_pixmap(clip=bbox, alpha=False)
                img = Image.frombytes("RGB", [clip.width, clip.height], clip.samples)
                tables.append(img)
    return tables
57
+
58
+
59
def extract_tables_tab_transformer(pdf_file):
    """Detect tables on each page with Microsoft's Table Transformer.

    Args:
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL page images with detected table boxes drawn on.
    """
    image_processor = AutoImageProcessor.from_pretrained(
        "microsoft/table-transformer-detection"
    )
    model = TableTransformerForObjectDetection.from_pretrained(
        "microsoft/table-transformer-detection"
    )

    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    tables = []
    for page in pdf_to_images(pdf_path):
        image = Image.open(page).convert("RGB")

        inputs = image_processor(images=image, return_tensors="pt")
        # Inference only: no_grad avoids building the autograd graph and
        # cuts memory use on large pages.
        with torch.no_grad():
            outputs = model(**inputs)

        # post_process expects (height, width); PIL's .size is (width, height).
        target_sizes = torch.tensor([image.size[::-1]])
        results = image_processor.post_process_object_detection(
            outputs, threshold=0.9, target_sizes=target_sizes
        )[0]

        tables.append(draw_boxes(page, results["boxes"].tolist()))
    return tables
87
+
88
+
89
def extract_tables_img2table(pdf_file):
    """Detect tables on each rendered page image using img2table.

    Args:
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL page images with detected table boxes drawn on.
    """
    # Imported locally because img2table's Image would shadow PIL's.
    from img2table.document import Image

    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    annotated_pages = []
    for page_path in pdf_to_images(pdf_path):
        detected = Image(src=page_path).extract_tables()
        boxes = [
            [tbl.bbox.x1, tbl.bbox.y1, tbl.bbox.x2, tbl.bbox.y2]
            for tbl in detected
        ]
        annotated_pages.append(draw_boxes(page_path, boxes))
    return annotated_pages
109
+
110
+
111
def extract_tables_gemini(model, pdf_file):
    """Ask a Gemini vision model for table bounding boxes on each page.

    Args:
        model: Gemini model id ("gemini-pro-vision" or
            "gemini-1.5-flash-latest").
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL page images with detected table boxes drawn on.

    Raises:
        ValueError: If *model* is not one of the supported ids.
    """
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel(model)
    # Plain string — the original used an f-string with no placeholders.
    prompt = "Extract the bounding boxes of all the tables present in this image. Return the bounding boxes as list of lists. Do not include anyother text or symbols in the output"

    # Copy-paste bug fixed: this module's uploads belong under
    # extract_tables/, not extract_images/ (matching the sibling functions).
    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    tables = []
    for page in pdf_to_images(pdf_path):
        img = Image.open(page).convert("RGB")
        response = gemini_model.generate_content([img, prompt], stream=False)
        response.resolve()

        # The two supported models answer in different formats.
        if model == "gemini-pro-vision":
            page_bbox = ast.literal_eval(response.text)
        elif model == "gemini-1.5-flash-latest":
            page_bbox = parse_bboxs_gemini_flash(response.text)
        else:
            # Original raised UnboundLocalError here; fail explicitly instead.
            raise ValueError(f"Unsupported Gemini model: {model}")

        tables.append(draw_boxes(page, page_bbox))
    return tables
136
+
137
+
138
def extract_tables_gpt(model, pdf_file):
    """Ask an OpenAI vision model for table bounding boxes on each page.

    Args:
        model: Chat-completions model id (e.g. "gpt-4o", "gpt-4-turbo").
        pdf_file: Raw PDF bytes.

    Returns:
        List of PIL page images with detected boxes drawn on; pages whose
        API response carries no choices are silently skipped.
    """
    openai.api_key = OPENAI_API_KEY
    # Removed unused `image_media_type` local.

    # Copy-paste bug fixed: this module's uploads belong under
    # extract_tables/, not extract_images/ (matching the sibling functions).
    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)

    images = convert_pdf_to_images(pdf_path)  # pixmaps sent to the API
    image_paths = pdf_to_images(pdf_path)  # PNG files used for drawing

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}",
    }

    extracted_tables = []
    for page_idx, image in enumerate(images):
        base64_string = encode_image_to_base64(image)
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            # "liat of lists" typo fixed so the model receives
                            # a clean instruction.
                            "text": "Extract bounding boxes of all the tables present in this page. Return bounding boxes as list of lists and don't provide any other text in the response.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_string}"
                            },
                        },
                    ],
                }
            ],
        }

        # A timeout prevents a hung HTTP request from blocking the UI forever.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )
        response_json = response.json()

        if "choices" in response_json and len(response_json["choices"]) > 0:
            extracted_tables.append(
                draw_boxes(
                    image_paths[page_idx],
                    ast.literal_eval(response_json["choices"][0]["message"]["content"]),
                )
            )

    return extracted_tables
ocr_notebooks/OCR_Benchmarking.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
ocr_notebooks/image_extraction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
ocr_notebooks/table_extraction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ black
2
+ numpy
3
+ pytest
4
+ gradio
5
+ pdf2image
6
+ tensorflow
7
+ pytesseract
8
+ opencv-python
9
+ python-resize-image
10
+ google-generativeai
11
+ openai
12
+ pdfplumber
13
+ pymupdf
14
+ timm
15
+ transformers
16
+ img2table
test_extraction/input_docs/output_gemini_flash.png ADDED
test_extraction/input_docs/page_0.png ADDED
test_extraction/input_docs/page_1.png ADDED
test_extraction/input_docs/page_2.png ADDED
test_extraction/output_files/extracted_table.csv ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2-13,,,,
2
+ 02,,,,
3
+ 7,5(63(&,),&$7,21$1'
4
+ 35(6685(/$%(/,,,,
5
+ OAI3019023,,,,
6
+ OAI3019023,,,,
7
+ The tires supplied on your new,,,,
8
+ vehicle are chosen to provide the best,,,,
9
+ performance for normal driving.,,,,
10
+ The tire label located on the driver’s,,,,
11
+ side center pillar gives the tire pressures,,,,
12
+ recommended for your vehicle.,,,,
13
+ (1*,1(180%(5,,,
14
+ OAH2088004,,,,
15
+ OAH2088004,,,,
16
+ The engine number is stamped on the,,,,
17
+ engine block as shown in the drawing.,,,,
18
+ 10,,,,
19
+ SRS warning indicator in the instrument,,,,
20
+ cluster illuminates continuously, it means,,,
21
+ that there is malfunction in the system. Re­,,,,
22
+ move the CRS from front passenger seat,,,,
23
+ and contact your TATA MOTORS autho­,,,,
24
+ rised service center.,,,,
25
+ NOTE,,,,
26
+ The above image’s are for reference,,,,
27
+ purpose only.,,,,
28
+ SAFETY,,,,
29
+ 2-6,,,,
30
+ Vehicle Information,,,,
31
+ 1. Engine coolant reservoir ..............................................................................................................9-23,,,,
32
+ 2. Engine oil filler cap .....................................................................................................................9-20,,,,
33
+ 3. Brake/clutch* fluid reservoir,,,,
34
+ ......................................................................................................9-26,,,,
35
+ 4. Air cleaner ...................................................................................................................................9-30,,,,
36
+ 5. Fuse box ......................................................................................................................................9-54,,,,
37
+ 6. Battery .........................................................................................................................................9-37,,,,
38
+ 7. Windshield washer fluid reservoir ..............................................................................................9-28,,,,
39
+ 8. Radiator cap,,,,
40
+ .................................................................................................................................9-24,,,,
41
+ 9. Engine oil dipstick .......................................................................................................................9-20,,,,
42
+ (1*,1(&203$570(1729(59,(:,,
43
+ The actual engine room in the vehicle may differ from the illustration.,,,,
44
+ OAI3089001,,,,
45
+ OAI3089001,,,,
46
+ ,,,,
47
+ „ Petrol Engine (Kappa 1.2 MPI),,,,
48
+ Petrol Engine (Kappa 1.2 MPI),,,,
test_extraction/output_files/output_gemini_pro.png ADDED
test_extraction/output_files/output_initial_gpt4_4o.png ADDED
test_extraction/output_files/output_initial_gpt4_turbo.png ADDED
test_extraction/output_files/output_scaled_gpt4_4o.png ADDED
test_extraction/output_files/output_scaled_gpt4_turbo.png ADDED
test_extraction/requirements_versioned.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ��
test_extraction/sample.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import gradio as gr
2
+ # import fitz # PyMuPDF
3
+ # from PIL import Image
4
+ # from io import BytesIO
5
+ # import pandas as pd
6
+ # import os
7
+
8
+
9
+ # def extract_images_and_tables(pdf_file):
10
+
11
+ # pdf_path = "temp.pdf"
12
+ # with open(pdf_path, "wb") as f:
13
+ # f.write(pdf_file)
14
+
15
+
16
+ # pdf_document = fitz.open(pdf_path)
17
+
18
+
19
+ # images = []
20
+ # for page_index in range(len(pdf_document)):
21
+ # for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
22
+ # xref = img[0]
23
+ # base_image = pdf_document.extract_image(xref)
24
+ # image_bytes = base_image["image"]
25
+ # image = Image.open(BytesIO(image_bytes))
26
+ # images.append(image)
27
+
28
+
29
+ # tables = []
30
+ # for page_num in range(len(pdf_document)):
31
+ # page = pdf_document.load_page(page_num)
32
+ # text = page.get_text("text")
33
+
34
+ # lines = [line.strip() for line in text.split("\n") if line.strip()]
35
+
36
+ # if any("," in line for line in lines):
37
+
38
+ # rows = [line.split(",") for line in lines]
39
+
40
+ # tables.extend(rows)
41
+
42
+
43
+ # table_content = ""
44
+ # if tables:
45
+ # max_columns = max(len(row) for row in tables)
46
+ # tables = [row + [""] * (max_columns - len(row)) for row in tables]
47
+ # df = pd.DataFrame(tables[1:], columns=tables[0])
48
+ # table_content = df.to_csv(index=False)
49
+
50
+
51
+ # pdf_document.close()
52
+
53
+ # # Remove the temporary PDF file
54
+ # os.remove(pdf_path)
55
+
56
+ # return images, table_content
57
+
58
+
59
+
60
+ # interface = gr.Interface(
61
+ # fn=extract_images_and_tables,
62
+ # inputs=gr.File(type="binary"),
63
+ # outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
64
+ # title="PDF Image and Table Extractor",
65
+ # description="Upload a PDF to extract images and tables."
66
+ # )
67
+
68
+
69
+ # interface.launch(share=True)
70
+ import gradio as gr
71
+ import fitz # PyMuPDF
72
+ from PIL import Image
73
+ from io import BytesIO
74
+ import pandas as pd
75
+ import os
76
+
77
+
78
+ def extract_images_and_tables(pdf_file, model_option):
79
+ pdf_path = "temp.pdf"
80
+ with open(pdf_path, "wb") as f:
81
+ f.write(pdf_file)
82
+
83
+ pdf_document = fitz.open(pdf_path)
84
+
85
+ images = []
86
+ for page_index in range(len(pdf_document)):
87
+ for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
88
+ xref = img[0]
89
+ base_image = pdf_document.extract_image(xref)
90
+ image_bytes = base_image["image"]
91
+ image = Image.open(BytesIO(image_bytes))
92
+ images.append(image)
93
+
94
+ tables = []
95
+ for page_num in range(len(pdf_document)):
96
+ page = pdf_document.load_page(page_num)
97
+ text = page.get_text("text")
98
+
99
+ lines = [line.strip() for line in text.split("\n") if line.strip()]
100
+
101
+ if any("," in line for line in lines):
102
+
103
+ rows = [line.split(",") for line in lines]
104
+
105
+ tables.extend(rows)
106
+
107
+ table_content = ""
108
+ if tables:
109
+ max_columns = max(len(row) for row in tables)
110
+ tables = [row + [""] * (max_columns - len(row)) for row in tables]
111
+ df = pd.DataFrame(tables[1:], columns=tables[0])
112
+ table_content = df.to_csv(index=False)
113
+
114
+ pdf_document.close()
115
+
116
+ os.remove(pdf_path)
117
+
118
+ return images, table_content
119
+
120
+
121
+ def handle_model_selection(pdf_file, model_option):
122
+
123
+ return extract_images_and_tables(pdf_file, model_option)
124
+
125
+
126
+ interface = gr.Interface(
127
+ fn=handle_model_selection,
128
+ inputs=[
129
+ gr.File(type="binary", label="Upload PDF"),
130
+ gr.Dropdown(label="Select Model", choices=["Model 1", "Model 2", "Model 3"], value="Model 1")
131
+ ],
132
+ outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
133
+ title="PDF Image and Table Extractor",
134
+ description="Upload a PDF to extract images and tables. Choose the model for extraction."
135
+ )
136
+
137
+ interface.launch(share=True)
test_extraction/sample1.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from google.cloud import vision
3
+ from PIL import Image
4
+ import pandas as pd
5
+ import os
6
+ from io import BytesIO
7
+
8
+
9
+ def extract_tables_with_google_vision(image_file):
10
+ # Initialize Google Cloud Vision client
11
+ client = vision.ImageAnnotatorClient()
12
+
13
+ # Read the image file
14
+ with BytesIO(image_file) as image_stream:
15
+ image = Image.open(image_stream)
16
+ # Convert image to bytes
17
+ img_bytes = image_stream.getvalue()
18
+
19
+ # Perform text detection on the image
20
+ image = vision.Image(content=img_bytes)
21
+ response = client.text_detection(image=image)
22
+ texts = response.text_annotations
23
+
24
+ # Extract text lines
25
+ lines = [text.description for text in texts]
26
+
27
+ # Check if lines resemble a table (e.g., have commas)
28
+ tables = []
29
+ is_table = False
30
+ table_rows = []
31
+ for line in lines:
32
+ if "," in line: # Assuming comma-separated values indicate a table
33
+ is_table = True
34
+ table_rows.append([cell.strip() for cell in line.split(",")])
35
+ else:
36
+ if is_table:
37
+ tables.extend(table_rows)
38
+ is_table = False
39
+ table_rows = []
40
+
41
+ table_content = ""
42
+ if tables:
43
+ df = pd.DataFrame(tables[1:], columns=tables[0])
44
+ table_content = df.to_csv(index=False)
45
+
46
+ return table_content
47
+
48
+
49
+ interface = gr.Interface(
50
+ fn=extract_tables_with_google_vision,
51
+ inputs=gr.Image(type="pil", label="Upload a PDF page image"),
52
+ outputs=gr.Textbox(label="Extracted Tables"),
53
+ title="PDF Table Extractor with Google Cloud Vision",
54
+ description="Upload an image of a PDF page to extract tables.",
55
+ allow_flagging=False
56
+ )
57
+
58
+ interface.launch()
test_extraction/sample_11.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ from PIL import Image, ImageDraw
4
+ from io import BytesIO
5
+ import pandas as pd
6
+ import os
7
+ import numpy as np
8
+ import google.generativeai as genai
9
+ import openai
10
+ import base64
11
+ import requests
12
+ import tempfile
13
+ import ast
14
+
15
# Read credentials from the environment — never commit live API keys to
# version control (the originals remain in git history and must be rotated).
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", ""))
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
17
+
18
+ import gradio as gr
19
+ import fitz # PyMuPDF
20
+ from PIL import Image
21
+ from io import BytesIO
22
+ import pandas as pd
23
+ import numpy as np
24
+ import tempfile
25
+
26
+
27
+ # Define the model extraction functions
28
+ def extract_bounding_box_pymupdf(pdf_content):
29
+ bounding_boxes = []
30
+ pdf_file = fitz.open(stream=pdf_content, filetype="pdf")
31
+
32
+ for page_index in range(len(pdf_file)):
33
+ page_bbox = []
34
+ page = pdf_file[page_index]
35
+ image_list = page.get_images(full=True)
36
+
37
+ for image_index, img in enumerate(page.get_images(full=True), start=1):
38
+ rect = page.get_image_bbox(img[7])
39
+ bbox = list(rect)
40
+ page_bbox.append(bbox)
41
+ bounding_boxes.append(page_bbox)
42
+
43
+ pdf_file.close() # Close the PDF file after use
44
+ return bounding_boxes
45
+
46
+
47
def extract_bounding_boxes_gemini(api_key, images):
    """Placeholder for the future Gemini API integration.

    Returns one dummy full-frame box per input image; *api_key* is
    accepted for signature compatibility but unused.
    """
    dummy_box = (0, 0, 100, 100)
    return [[dummy_box] for _ in images]
51
+
52
+
53
+ def extract_bounding_box_gpt(api_key, pdf_content):
54
+ # Placeholder for GPT-4 API integration
55
+ bounding_boxes = [[(0, 0, 100, 100)]] * len(
56
+ fitz.open(stream=pdf_content, filetype="pdf")
57
+ ) # Dummy bounding boxes
58
+ return bounding_boxes
59
+
60
+
61
+ def extract_images_and_tables(pdf_file, model_option):
62
+ if isinstance(pdf_file, str):
63
+ # If input is a file path (usually in testing or local execution)
64
+ with open(pdf_file, "rb") as f:
65
+ pdf_bytes = f.read()
66
+ elif isinstance(pdf_file, bytes):
67
+ # If input is bytes (from Gradio)
68
+ pdf_bytes = pdf_file
69
+ else:
70
+ raise TypeError("Unsupported input type for pdf_file.")
71
+
72
+ pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
73
+
74
+ images = []
75
+ for page_index in range(len(pdf_document)):
76
+ for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
77
+ xref = img[0]
78
+ base_image = pdf_document.extract_image(xref)
79
+ image_bytes = base_image["image"]
80
+ image = Image.open(BytesIO(image_bytes))
81
+ images.append(image)
82
+
83
+ tables = []
84
+ for page_num in range(len(pdf_document)):
85
+ page = pdf_document.load_page(page_num)
86
+ text = page.get_text("text")
87
+
88
+ lines = [line.strip() for line in text.split("\n") if line.strip()]
89
+
90
+ if any("," in line for line in lines):
91
+ rows = [line.split(",") for line in lines]
92
+ tables.extend(rows)
93
+
94
+ table_content = ""
95
+ if tables:
96
+ max_columns = max(len(row) for row in tables)
97
+ tables = [row + [""] * (max_columns - len(row)) for row in tables]
98
+ df = pd.DataFrame(tables[1:], columns=tables[0])
99
+ table_content = df.to_csv(index=False)
100
+
101
+ pdf_document.close()
102
+
103
+ if model_option == "PyMuPDF":
104
+ bounding_boxes = extract_bounding_box_pymupdf(pdf_bytes)
105
+ elif model_option == "Gemini":
106
+ bounding_boxes = extract_bounding_boxes_gemini(
107
+ "your_gemini_api_key_here", images
108
+ )
109
+ elif model_option == "GPT-4":
110
+ bounding_boxes = extract_bounding_box_gpt("your_gpt4_api_key_here", pdf_bytes)
111
+ else:
112
+ bounding_boxes = []
113
+
114
+ return images, table_content, bounding_boxes
115
+
116
+
117
+ def handle_model_selection(pdf_file, model_option):
118
+ return extract_images_and_tables(pdf_file, model_option)
119
+
120
+
121
+ # Define the Gradio interface
122
+ interface = gr.Interface(
123
+ fn=handle_model_selection,
124
+ inputs=[
125
+ gr.File(type="filepath", label="Upload PDF"),
126
+ gr.Dropdown(
127
+ label="Select Model",
128
+ choices=["PyMuPDF", "Gemini", "GPT-4"],
129
+ value="PyMuPDF",
130
+ ),
131
+ ],
132
+ outputs=[
133
+ gr.Gallery(label="Extracted Images"),
134
+ gr.Textbox(label="Extracted Tables"),
135
+ gr.JSON(label="Extracted Bounding Boxes"),
136
+ ],
137
+ title="PDF Image and Table Extractor",
138
+ description="Upload a PDF to extract images and tables. Choose the model for extraction.",
139
+ )
140
+
141
+ interface.launch(share=True)
utils.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from PIL import Image, ImageDraw
3
+ from pdf2image import convert_from_path
4
+ import os
5
+ import shutil
6
+ import re
7
+ import fitz
8
+ import base64
9
+
10
+
11
def draw_boxes(image_path, boxes):
    """Open *image_path* and outline every bounding box in red.

    Args:
        image_path: Path of the image file to annotate.
        boxes: Iterable of ``[x0, y0, x1, y1]`` rectangles.

    Returns:
        The annotated PIL image (drawn in place on the loaded copy).
    """
    annotated = Image.open(image_path)
    painter = ImageDraw.Draw(annotated)
    for rect in boxes:
        painter.rectangle(rect, outline="red", width=2)
    return annotated
19
+
20
+
21
def pdf_to_images(pdf_path):
    """Render each page of the PDF to a PNG file and return the paths.

    NOTE(review): pages are written under extract_tables/table_outputs even
    when this is called from the image-extraction flow — confirm intended.
    """
    image_paths = []
    for page_no, page_image in enumerate(convert_from_path(pdf_path), start=1):
        out_path = f"extract_tables/table_outputs/pdf-image-{page_no}.png"
        page_image.save(out_path, format="PNG")
        image_paths.append(out_path)
    return image_paths
31
+
32
+
33
def parse_bboxs_gemini_flash(input_string):
    """Parse bounding boxes out of Gemini Flash's plain-text reply.

    Each non-empty line is treated as one box: every integer found on the
    line becomes one coordinate (a digit-free line yields an empty box).
    """
    boxes = []
    for line in input_string.strip().split("\n"):
        if line:
            boxes.append([int(token) for token in re.findall(r"\d+", line)])
    return boxes
37
+
38
+
39
def convert_pdf_to_images(pdf_path):
    """Rasterize every page of the PDF to a PyMuPDF pixmap.

    Returns:
        List of ``fitz.Pixmap`` objects, one per page, in page order.
    """
    with fitz.open(pdf_path) as doc:
        return [doc.load_page(page_num).get_pixmap() for page_num in range(len(doc))]
47
+
48
+
49
def encode_image_to_base64(image):
    """Return the object's raw ``tobytes()`` payload as a UTF-8 base64 string."""
    return base64.b64encode(image.tobytes()).decode("utf-8")
54
+
55
+
56
def calculate_scaling_factors(groundtruth_boxes, extracted_boxes):
    """Mean width/height ratio (extracted / ground truth) over paired boxes.

    Args:
        groundtruth_boxes: Sequence of ``(xmin, ymin, xmax, ymax)`` boxes.
        extracted_boxes: Same-length sequence of boxes to compare against.

    Returns:
        ``(x_scale, y_scale)`` — average extracted/ground-truth ratios.
    """
    assert len(groundtruth_boxes) == len(
        extracted_boxes
    ), "Mismatch in the number of bounding boxes."

    x_factors = [
        (ext[2] - ext[0]) / (gt[2] - gt[0])
        for gt, ext in zip(groundtruth_boxes, extracted_boxes)
    ]
    y_factors = [
        (ext[3] - ext[1]) / (gt[3] - gt[1])
        for gt, ext in zip(groundtruth_boxes, extracted_boxes)
    ]
    return np.mean(x_factors), np.mean(y_factors)
80
+
81
+
82
def scale_bounding_boxes(extracted_boxes, scaling_factors):
    """Divide every box coordinate by the per-axis scaling factors.

    Args:
        extracted_boxes: Nested list — per page, a list of
            ``[x0, y0, x1, y1]`` boxes.
        scaling_factors: ``(x_scale, y_scale)`` divisors.

    Returns:
        Nested list of the same shape with scaled coordinates.
    """
    sx, sy = scaling_factors[0], scaling_factors[1]
    return [
        [[box[0] / sx, box[1] / sy, box[2] / sx, box[3] / sy] for box in page_boxes]
        for page_boxes in extracted_boxes
    ]
96
+
97
+
98
def clear_directory(directory_path):
    """Delete every file, symlink, and subdirectory inside *directory_path*.

    A missing directory is a no-op; individual deletion failures are logged
    and skipped rather than aborting the sweep.
    """
    if not os.path.exists(directory_path):
        return
    for entry in os.listdir(directory_path):
        path = os.path.join(directory_path, entry)
        try:
            if os.path.isfile(path) or os.path.islink(path):
                os.unlink(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
        except Exception as e:
            print(f"Failed to delete {path}. Reason: {e}")
visual_assets/.gitkeep ADDED
File without changes