File size: 3,540 Bytes
d2cb17f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63f6421
 
 
 
 
 
 
d2cb17f
 
63f6421
 
 
 
 
 
 
 
 
 
 
 
 
d2cb17f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
from PIL import Image, ImageDraw
from pdf2image import convert_from_path
import os
import shutil
import re
import fitz
import base64


def draw_boxes(image_path, boxes):
    """Open an image and outline each given box in red.

    Args:
        image_path: Location of the image, passed straight to ``Image.open``.
        boxes: Iterable of box coordinates; each item is handed unchanged to
            ``ImageDraw.rectangle`` (presumably ``[x0, y0, x1, y1]`` -- verify
            against the caller).

    Returns:
        The opened image object with a 2-pixel red rectangle drawn per box.
    """
    canvas = Image.open(image_path)
    pen = ImageDraw.Draw(canvas)

    for rect in boxes:
        pen.rectangle(rect, outline="red", width=2)

    return canvas


def pdf_to_images(
    pdf_path, output_dir="extract_tables/table_outputs", output_format="png"
):
    """Render every page of a PDF to an image file at 300 DPI.

    Files are written as ``pdf-image-<n>.<output_format>`` inside
    ``output_dir``, where ``<n>`` is the 1-based page number.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        output_dir: Directory that receives the images; created (including
            parents) when it does not exist yet.
        output_format: File extension used for the saved images.

    Returns:
        List of paths of the images that were saved successfully, in page
        order. Pages whose save failed are reported on stdout and skipped.

    Raises:
        FileExistsError: If ``output_dir`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []
    # The context manager closes the document even when rendering raises,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=300)

            image_file_path = os.path.join(
                output_dir, f"pdf-image-{page_num + 1}.{output_format}"
            )

            try:
                pix.save(image_file_path)
            except Exception as e:
                # Best effort: one unsavable page must not abort the others.
                print(f"Error saving image {image_file_path}: {e}")
            else:
                image_paths.append(image_file_path)

    return image_paths


def parse_bboxs_gemini_flash(input_string):
    """Extract the integers found on each non-empty line of a text block.

    The input is stripped, split on ``"\\n"``, and every non-empty line is
    turned into the list of unsigned digit runs it contains, converted to
    ``int``. A non-empty line without any digits yields an empty list.

    Args:
        input_string: Text with one bounding box per line.

    Returns:
        A list with one list of ints per non-empty line, in input order.
    """
    parsed = []
    for row in input_string.strip().split("\n"):
        if not row:
            # Blank separator lines carry no box.
            continue
        parsed.append([int(token) for token in re.findall(r"\d+", row)])
    return parsed


def convert_pdf_to_images(pdf_path):
    """Render each page of a PDF to an in-memory pixmap.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        List holding the result of ``get_pixmap()`` (default settings) for
        every page, in page order. The document is closed before returning.
    """
    with fitz.open(pdf_path) as doc:
        return [
            doc.load_page(index).get_pixmap() for index in range(len(doc))
        ]


def encode_image_to_base64(image):
    """Return the Base64 text form of ``image.tobytes()``.

    Args:
        image: Any object exposing a ``tobytes()`` method that returns bytes.

    Returns:
        The Base64 encoding of those bytes as a ``str``.
    """
    # NOTE(review): the payload is exactly what tobytes() yields; confirm that
    # consumers expect that rather than an encoded image file format.
    return base64.b64encode(image.tobytes()).decode("utf-8")


def calculate_scaling_factors(groundtruth_boxes, extracted_boxes):
    """Estimate the mean x/y size ratio between paired bounding boxes.

    Boxes are paired by position. For each pair the extracted width is
    divided by the ground-truth width (and likewise for height); the
    per-axis ratios are then averaged.

    Args:
        groundtruth_boxes: Sequence of ``(xmin, ymin, xmax, ymax)`` boxes.
        extracted_boxes: Sequence of ``(xmin, ymin, xmax, ymax)`` boxes, same
            length as ``groundtruth_boxes``.

    Returns:
        Tuple ``(x_scale, y_scale)`` of the mean width and height ratios.

    Raises:
        AssertionError: If the two sequences differ in length.
        ZeroDivisionError: If a ground-truth box has zero width or height
            (with plain Python numbers).
    """
    if len(groundtruth_boxes) != len(extracted_boxes):
        # Raised explicitly rather than via `assert` so the check is not
        # stripped under `python -O`; the exception type stays the same for
        # existing callers.
        raise AssertionError("Mismatch in the number of bounding boxes.")

    x_factors = []
    y_factors = []

    for gt_box, ext_box in zip(groundtruth_boxes, extracted_boxes):
        gt_xmin, gt_ymin, gt_xmax, gt_ymax = gt_box
        ext_xmin, ext_ymin, ext_xmax, ext_ymax = ext_box

        x_factors.append((ext_xmax - ext_xmin) / (gt_xmax - gt_xmin))
        y_factors.append((ext_ymax - ext_ymin) / (gt_ymax - gt_ymin))

    return np.mean(x_factors), np.mean(y_factors)


def scale_bounding_boxes(extracted_boxes, scaling_factors):
    """Divide box coordinates by per-axis scaling factors.

    Args:
        extracted_boxes: Iterable of pages, each an iterable of boxes whose
            first four items are read as x, y, x, y coordinates.
        scaling_factors: Indexable pair; item 0 divides the x coordinates
            (box items 0 and 2), item 1 divides the y coordinates (box items
            1 and 3).

    Returns:
        A new list of pages, each a list of four-element lists holding the
        divided coordinates; the nesting mirrors the input.
    """
    # Even coordinate positions are x values, odd positions are y values,
    # so `position % 2` selects the matching factor.
    return [
        [
            [box[position] / scaling_factors[position % 2] for position in range(4)]
            for box in page_boxes
        ]
        for page_boxes in extracted_boxes
    ]


def clear_directory(directory_path):
    """Delete everything inside a directory while keeping the directory.

    Files and symbolic links are unlinked; sub-directories are removed
    recursively. A failure on one entry is printed and does not stop the
    remaining entries from being processed. Nothing happens when
    ``directory_path`` does not exist.

    Args:
        directory_path: Directory whose contents should be removed.
    """
    if not os.path.exists(directory_path):
        return

    for entry_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, entry_name)
        try:
            # Links are unlinked rather than followed, so a link to a
            # directory never triggers a recursive delete of its target.
            is_plain_entry = os.path.isfile(file_path) or os.path.islink(file_path)
            if is_plain_entry:
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")