import pymupdf
from PIL import Image
from transformers import AutoImageProcessor, TableTransformerForObjectDetection
import torch
import ast
import google.generativeai as genai
import openai
import requests
import os

# from constants import GEMINI_API_KEY, OPENAI_API_KEY
from utils import (
    draw_boxes,
    pdf_to_images,
    parse_bboxs_gemini_flash,
    convert_pdf_to_images,
    encode_image_to_base64,
)


def get_bounding_box_pymupdf(pdf_path):
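    """Detect tables on every page with PyMuPDF's built-in table finder.

    Returns one list of [x0, y0, x1, y1] bounding boxes per page.
    """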
    bounding_boxes = []
    pages = pymupdf.open(pdf_path)
    for page in pages:
        tabs = page.find_tables()
        bounding_boxes.append([list(table.bbox) for table in tabs.tables])
    return bounding_boxes


def extract_tables_pymupdf(pdf_file):
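    """Save the uploaded PDF bytes to disk, find table bounding boxes with
    PyMuPDF, and render each table region to a PIL image crop."""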
    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)
    bounding_boxes = get_bounding_box_pymupdf(pdf_path)
    pages = pymupdf.open(pdf_path)
    tables = []
    for page, page_tables in zip(pages, bounding_boxes):
        for bbox in page_tables:
            # Render only the table region and convert the pixmap to PIL.
            clip = page.get_pixmap(clip=bbox, alpha=False)
            img = Image.frombytes("RGB", [clip.width, clip.height], clip.samples)
            tables.append(img)
    return tables


def extract_tables_tab_transformer(pdf_file):
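    """Detect tables with Microsoft's Table Transformer (DETR-based) model
    and draw the detected boxes on each rendered page image."""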
    image_processor = AutoImageProcessor.from_pretrained(
        "microsoft/table-transformer-detection"
    )
    model = TableTransformerForObjectDetection.from_pretrained(
        "microsoft/table-transformer-detection"
    )
    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)
    tables = []
    pdf_images = pdf_to_images(pdf_path)
    for page in pdf_images:
        image = Image.open(page).convert("RGB")
        inputs = image_processor(images=image, return_tensors="pt")
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model(**inputs)
        # target_sizes expects (height, width); PIL's size is (width, height).
        target_sizes = torch.tensor([image.size[::-1]])
        results = image_processor.post_process_object_detection(
            outputs, threshold=0.9, target_sizes=target_sizes
        )[0]
        image = draw_boxes(page, results["boxes"].tolist())
        tables.append(image)
    return tables


def extract_tables_img2table(pdf_file):
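    """Detect tables on each page image with the img2table library and
    draw the detected bounding boxes."""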
    # Import locally and alias so the class does not shadow PIL.Image.
    from img2table.document import Image as Img2TableImage

    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)
    tables = []
    pdf_images = pdf_to_images(pdf_path)
    for image_path in pdf_images:
        img = Img2TableImage(src=image_path)
        extracted_tables = img.extract_tables()
        bbox_values = [
            [table.bbox.x1, table.bbox.y1, table.bbox.x2, table.bbox.y2]
            for table in extracted_tables
        ]
        image = draw_boxes(image_path, bbox_values)
        tables.append(image)
    return tables


def extract_tables_gemini(model, pdf_file):
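    """Prompt a Gemini vision model for table bounding boxes on each page
    image and draw the boxes it returns; `model` selects the Gemini variant."""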
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(model)
    prompt = (
        "Extract the bounding boxes of all the tables present in this image. "
        "Return the bounding boxes as a list of lists. "
        "Do not include any other text or symbols in the output."
    )
    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)
    tables = []
    pdf_images = pdf_to_images(pdf_path)
    for page in pdf_images:
        img = Image.open(page).convert("RGB")
        response = gemini_model.generate_content([img, prompt], stream=False)
        response.resolve()
        print(response.text)
        # The two model variants format their output differently.
        if model == "gemini-pro-vision":
            page_bbox = ast.literal_eval(response.text)
        elif model == "gemini-1.5-flash-latest":
            page_bbox = parse_bboxs_gemini_flash(response.text)
        else:
            raise ValueError(f"Unsupported Gemini model: {model}")
        image = draw_boxes(page, page_bbox)
        tables.append(image)
    return tables


def extract_tables_gpt(model, pdf_file):
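    """Prompt an OpenAI vision-capable chat model for table bounding boxes
    on each page image via the Chat Completions REST API and draw them."""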
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai.api_key = openai_api_key
    image_media_type = "image/png"
    pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
    with open(pdf_path, "wb") as f:
        f.write(pdf_file)
    images = convert_pdf_to_images(pdf_path)
    image_paths = pdf_to_images(pdf_path)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}",
    }
    extracted_tables = []
    for page_idx, image in enumerate(images):
        base64_string = encode_image_to_base64(image)
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract bounding boxes of all the tables present in this page. Return bounding boxes as a list of lists and don't provide any other text in the response.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{image_media_type};base64,{base64_string}"
                            },
                        },
                    ],
                }
            ],
        }
        response = requests.post(
            "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
        )
        response_json = response.json()
        # Only touch the message content once we know the call succeeded.
        if "choices" in response_json and len(response_json["choices"]) > 0:
            content = response_json["choices"][0]["message"]["content"]
            print(content)
            extracted_tables.append(
                draw_boxes(image_paths[page_idx], ast.literal_eval(content))
            )
    return extracted_tables
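

if __name__ == "__main__":
    # Hedged usage sketch, not part of the original module: assumes a local
    # sample.pdf and that extract_tables/input_docs/ exists (the functions
    # above write the uploaded bytes there). Any of the extract_tables_*
    # functions can be swapped in.
    with open("sample.pdf", "rb") as f:
        pdf_bytes = f.read()
    for idx, table_img in enumerate(extract_tables_pymupdf(pdf_bytes)):
        table_img.save(f"table_{idx}.png")  # each crop is a PIL.Image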