# Extract_PDF / utils.py
# shimer56's picture
# Upload folder using huggingface_hub
# 63f6421 verified
# raw
# history blame contribute delete
# No virus
# 3.54 kB
import numpy as np
from PIL import Image, ImageDraw
from pdf2image import convert_from_path
import os
import shutil
import re
import fitz
import base64
def draw_boxes(image_path, boxes):
    """Open an image and outline every given box in red.

    Args:
        image_path: Location of the image, passed straight to ``Image.open``.
        boxes: Iterable of box coordinates; each item is handed unchanged to
            ``ImageDraw.rectangle``.

    Returns:
        The opened image object with a 2-pixel red outline drawn per box.
        Nothing is written back to ``image_path``.
    """
    canvas = Image.open(image_path)
    pen = ImageDraw.Draw(canvas)
    for rect in boxes:
        pen.rectangle(rect, outline="red", width=2)
    return canvas
def pdf_to_images(
    pdf_path, output_dir="extract_tables/table_outputs", output_format="png"
):
    """Render every page of a PDF to an image file at 300 DPI.

    Files are written to ``output_dir`` as ``pdf-image-<n>.<output_format>``,
    where ``<n>`` is the 1-based page number.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.
        output_dir: Directory that receives the images; created if missing.
        output_format: File extension used for the output names.

    Returns:
        List of paths of the images that were saved successfully, in page
        order. A page whose save fails is reported on stdout and left out.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []
    # The context manager closes the document even if rendering raises,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=300)
            image_file_path = os.path.join(
                output_dir, f"pdf-image-{page_num + 1}.{output_format}"
            )
            try:
                pix.save(image_file_path)
            except Exception as e:
                # Best effort: report the failure and carry on with the
                # remaining pages instead of aborting the whole conversion.
                print(f"Error saving image {image_file_path}: {e}")
            else:
                image_paths.append(image_file_path)
    return image_paths
def parse_bboxs_gemini_flash(input_string):
    """Parse one bounding box per line from a block of text.

    Every run of decimal digits on a line becomes one integer of that line's
    box, in order of appearance. Signs and decimal points are not
    interpreted: only the digit runs themselves are read.

    Blank and whitespace-only lines are skipped, including the lone carriage
    return that a CRLF-terminated empty line leaves behind, so they no longer
    produce spurious empty boxes. A line that has text but no digits still
    yields an empty list.

    Args:
        input_string: Text with one box per line.

    Returns:
        List of integer lists, one per non-blank line.
    """
    bounding_boxes = []
    for line in input_string.strip().split("\n"):
        if not line.strip():
            continue
        bounding_boxes.append([int(token) for token in re.findall(r"\d+", line)])
    return bounding_boxes
def convert_pdf_to_images(pdf_path):
    """Render each page of a PDF to an in-memory pixmap.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        List with the result of ``get_pixmap()`` for every page, in page
        order. The document is closed before the list is handed back.
    """
    with fitz.open(pdf_path) as doc:
        return [
            doc.load_page(page_index).get_pixmap()
            for page_index in range(len(doc))
        ]
def encode_image_to_base64(image):
    """Return the base64 text of whatever ``image.tobytes()`` produces.

    Args:
        image: Any object exposing a ``tobytes()`` method that returns bytes.

    Returns:
        The base64 encoding of those bytes as a ``str``.
    """
    return base64.b64encode(image.tobytes()).decode("utf-8")
def calculate_scaling_factors(groundtruth_boxes, extracted_boxes):
    """Estimate the average x and y scale between two sets of boxes.

    Boxes are paired by position. For each pair the ratio of extracted width
    to ground-truth width (and likewise for height) is computed, and the
    ratios are averaged with ``np.mean``.

    Args:
        groundtruth_boxes: Sequence of ``(xmin, ymin, xmax, ymax)`` boxes.
        extracted_boxes: Sequence of ``(xmin, ymin, xmax, ymax)`` boxes, the
            same length as ``groundtruth_boxes``.

    Returns:
        Tuple ``(x_scale, y_scale)`` of mean extracted/ground-truth ratios.

    Raises:
        AssertionError: If the two sequences differ in length.

    Note:
        A ground-truth box with zero width or height makes the ratio a
        division by zero; that case is not handled here.
    """
    # Raised explicitly rather than via `assert` so the check still runs under
    # `python -O`; without it, zip() below would silently truncate. The
    # exception type and message are kept for existing callers.
    if len(groundtruth_boxes) != len(extracted_boxes):
        raise AssertionError("Mismatch in the number of bounding boxes.")

    x_factors = []
    y_factors = []
    for gt_box, ext_box in zip(groundtruth_boxes, extracted_boxes):
        gt_xmin, gt_ymin, gt_xmax, gt_ymax = gt_box
        ext_xmin, ext_ymin, ext_xmax, ext_ymax = ext_box
        x_factors.append((ext_xmax - ext_xmin) / (gt_xmax - gt_xmin))
        y_factors.append((ext_ymax - ext_ymin) / (gt_ymax - gt_ymin))

    return np.mean(x_factors), np.mean(y_factors)
def scale_bounding_boxes(extracted_boxes, scaling_factors):
    """Divide every box coordinate by the scale factor of its axis.

    Args:
        extracted_boxes: Per-page lists of boxes; each box is indexable with
            x coordinates at positions 0 and 2 and y coordinates at 1 and 3.
        scaling_factors: Indexable pair ``(x_factor, y_factor)``.

    Returns:
        New nested list with the same page/box layout, where each box is a
        four-element list of the divided coordinates.
    """
    # Coordinate i uses the x factor when i is 0 or 2, the y factor otherwise.
    axis_of_coordinate = (0, 1, 0, 1)
    return [
        [
            [
                box[position] / scaling_factors[axis]
                for position, axis in enumerate(axis_of_coordinate)
            ]
            for box in page_boxes
        ]
        for page_boxes in extracted_boxes
    ]
def clear_directory(directory_path):
    """Delete everything inside a directory while keeping the directory.

    Files and symbolic links are unlinked; sub-directories are removed
    recursively. A failure on one entry is printed and the remaining entries
    are still processed. Nothing happens if ``directory_path`` does not exist.

    Args:
        directory_path: Directory whose contents should be removed.
    """
    if not os.path.exists(directory_path):
        return

    for entry_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, entry_name)
        try:
            remove_as_file = os.path.isfile(file_path) or os.path.islink(file_path)
            if remove_as_file:
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")