import json
import os
import re
from collections import defaultdict

import cv2
import gradio as gr
import numpy as np
import supervision as sv
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
from tqdm import tqdm
from ultralytics import YOLO

# pdf2image needs the poppler-utils system package; install it
# non-interactively (-y) so startup does not hang on a confirmation prompt
os.system('apt-get update && apt-get install -y poppler-utils')

# Initialize the YOLO detector and the PaddleOCR text recognizer
model_yolo = YOLO("yolov8n-box.pt")
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)
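# Note: "yolov8n-box.pt" appears to be a custom detection checkpoint shipped
# with this app, not a stock Ultralytics model name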

def process_pdf(file):
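    # Render every page of the uploaded PDF to a PIL image (requires poppler)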
    images = convert_from_path(file.name)

    # Run YOLO on one tile and convert the result to supervision Detections
    def slicer_callback(image_slice: np.ndarray) -> sv.Detections:
        result = model_yolo.predict(image_slice, conf=0.85)[0]
        return sv.Detections.from_ultralytics(result)

    # Slice each page into heavily overlapping 2000x800 tiles so small labels stay detectable
    slicer = sv.InferenceSlicer(
        callback=slicer_callback,
        slice_wh=(2000, 800),
        overlap_ratio_wh=(0.6, 0.6),
        overlap_filter_strategy=sv.OverlapFilter.NON_MAX_MERGE,
        iou_threshold=0.05,
    )
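    # NON_MAX_MERGE fuses the duplicate detections created by the 60% tile
    # overlap instead of suppressing them, so labels that straddle tile
    # boundaries survive as a single box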

    results = []
    for pil_image in images:
        # Convert the PIL page to OpenCV's BGR layout, then rotate it 90 degrees
        # clockwise into the orientation the detector expects
        opencvImage = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        opencvImage = cv2.rotate(opencvImage, cv2.ROTATE_90_CLOCKWISE)
        # Run tiled inference across the whole rotated page
        detections = slicer(opencvImage)

        # OCR each detected region with PaddleOCR and keep texts that match
        # the label pattern "<letter><digits>-<digits>", e.g. "A12-3"
        def run_ocr(detections):
            for detection in tqdm(detections):
                # Each detection yields its bounding box first, in xyxy format
                x_min, y_min, x_max, y_max = map(int, detection[0])

                # Crop the detected region from the page
                cropped_image = opencvImage[y_min:y_max, x_min:x_max]
                result = ocr.ocr(cropped_image, cls=True)[0]
                if result is not None:
                    # Try the top OCR line first, then all lines joined,
                    # since a label may be split across several OCR lines
                    first_line = result[0][1][0]
                    joined = ''.join(line[1][0] for line in result)
                    if re.match(r"([A-Z])(\d+)-(\d+)", first_line):
                        results.append(first_line)
                    elif re.match(r"([A-Z])(\d+)-(\d+)", joined):
                        results.append(joined)

        run_ocr(detections)

    # Parse each kept label "A12-3" into letter 'A', x=12, y=3 and group the
    # y values by (letter, x)
    detected_numbers = defaultdict(list)
    for result in results:
        match = re.match(r"([A-Z])(\d+)-(\d+)", result)
        if match:
            letter = match.group(1)
            x = int(match.group(2))
            y = int(match.group(3))
            detected_numbers[(letter, x)].append(y)

    # Generate the desired JSON output
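    # e.g. ["A1-2", "A1-1", "B3-5"] -> {"CB-A1": ["A1-1", "A1-2"], "CB-B3": ["B3-5"]}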
    output = {}

    for (letter, x) in sorted(detected_numbers.keys()):
        key = f"CB-{letter}{x}"
        value = [f"{letter}{x}-{i}" for i in sorted(detected_numbers[(letter, x)])]
        output[key] = value

    return json.dumps(output, indent=4)

# Create the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
    title="Extract Data from PDF",
    description="Upload a PDF file and get the detected labels as grouped JSON."
)

# Launch the Gradio app
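# (pass share=True to launch() to expose a temporary public URL when running locally)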
iface.launch()