# Spaces: Sleeping  (hosting-page status banner captured with the source; not program code)
import json
import re
from collections import defaultdict

# Third-party imports keep their original relative order.
import numpy as np
import supervision as sv
from ultralytics import YOLO
from tqdm import tqdm
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import cv2
import gradio as gr
# Load the YOLO weights once at import time so every request reuses the same model.
model_yolo = YOLO("yolov8n-box.pt")

# OCR engine: English, angle classification on, CPU only, library logging silenced.
ocr = PaddleOCR(
    lang='en',
    use_angle_cls=True,
    use_gpu=False,
    show_log=False,
)
# Label format read from the drawings: one capital letter, a number, a dash and
# a second number (e.g. "A12-3"). Compiled once and shared by all helpers.
_LABEL_PATTERN = re.compile(r"([A-Z])(\d+)-(\d+)")


def _read_label(ocr_lines):
    """Return the label text found in one region's OCR lines, or None.

    Each OCR line is indexed as ``line[1][0]`` to obtain its text. The first
    line is tried on its own; if it does not start with a label, all lines
    concatenated are tried instead.
    """
    if not ocr_lines:
        return None
    first_text = ocr_lines[0][1][0]
    if _LABEL_PATTERN.match(first_text):
        return first_text
    joined_text = "".join(line[1][0] for line in ocr_lines)
    if _LABEL_PATTERN.match(joined_text):
        return joined_text
    return None


def _ocr_labels(image: np.ndarray, detections: "sv.Detections") -> list:
    """Run OCR on every detected box of *image* and return the labels found."""
    height, width = image.shape[:2]
    labels = []
    for box in tqdm(detections.xyxy):
        x_min, y_min, x_max, y_max = (int(value) for value in box)
        # Clamp to the image: a negative index would wrap around in numpy
        # slicing and silently crop the wrong region.
        x_min, y_min = max(x_min, 0), max(y_min, 0)
        x_max, y_max = min(x_max, width), min(y_max, height)
        if x_max <= x_min or y_max <= y_min:
            # Degenerate box: nothing to read, and OCR cannot take an empty crop.
            continue
        cropped_image = image[y_min:y_max, x_min:x_max]
        ocr_result = ocr.ocr(cropped_image, cls=True)
        if not ocr_result:
            continue
        label = _read_label(ocr_result[0])
        if label is not None:
            labels.append(label)
    return labels


def _group_labels(labels) -> dict:
    """Group label strings into ``{"CB-<letter><x>": ["<letter><x>-<y>", ...]}``.

    Keys are ordered by (letter, x) and each list by y. Numbers go through
    ``int``, so leading zeros are dropped. Repeated labels are kept.
    """
    grouped = defaultdict(list)
    for label in labels:
        match = _LABEL_PATTERN.match(label)
        if match:
            letter, x, y = match.groups()
            grouped[(letter, int(x))].append(int(y))
    return {
        f"CB-{letter}{x}": [f"{letter}{x}-{y}" for y in sorted(ys)]
        for (letter, x), ys in sorted(grouped.items())
    }


def process_pdf(file):
    """Extract "<letter><x>-<y>" labels from a PDF and return them as JSON.

    Every page is rendered to an image, rotated 90 degrees clockwise, scanned
    with the YOLO model in overlapping slices, and each detected box is read
    with PaddleOCR.

    Args:
        file: The uploaded PDF. Either an object exposing the path as
            ``.name`` or the path itself. ``None`` (nothing uploaded) yields
            an empty result.

    Returns:
        A JSON string (4-space indent) mapping ``"CB-<letter><x>"`` to the
        sorted list of ``"<letter><x>-<y>"`` labels found for that group.
    """
    if file is None:
        return json.dumps({}, indent=4)

    # Accept both an upload wrapper carrying `.name` and a bare path.
    pdf_path = getattr(file, "name", file)
    images = convert_from_path(pdf_path)

    def slicer_callback(image_slice: np.ndarray) -> sv.Detections:
        """Detect boxes in one slice, keeping predictions with conf >= 0.85."""
        result = model_yolo.predict(image_slice, conf=0.85)[0]
        return sv.Detections.from_ultralytics(result)

    # Pages are processed as overlapping 2000x800 slices; detections from
    # overlapping slices are merged by the slicer.
    slicer = sv.InferenceSlicer(
        callback=slicer_callback,
        slice_wh=(2000, 800),
        overlap_ratio_wh=(0.6, 0.6),
        overlap_filter_strategy=sv.OverlapFilter.NON_MAX_MERGE,
        iou_threshold=0.05,
    )

    labels = []
    for pil_image in images:
        page = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        page = cv2.rotate(page, cv2.ROTATE_90_CLOCKWISE)
        labels.extend(_ocr_labels(page, slicer(page)))

    return json.dumps(_group_labels(labels), indent=4)
# Web UI: one PDF upload field whose result is rendered as JSON.
iface = gr.Interface(
    title="Extract Data from PDF",
    description="Upload a PDF file and get the JSON output of detected numbers.",
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
)

# Start serving the app.
iface.launch()