Spaces:
Sleeping
Sleeping
File size: 3,260 Bytes
9d496b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import numpy as np
import supervision as sv
from ultralytics import YOLO
from tqdm import tqdm
import re
from collections import defaultdict
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import json
import cv2
import gradio as gr
# Load both models once at import time; process_pdf reads them as module globals.
_YOLO_WEIGHTS = "runs/detect/train/weights/best.pt"
model_yolo = YOLO(model=_YOLO_WEIGHTS)

# PaddleOCR settings: English, angle classifier enabled, CPU only, library logging off.
_OCR_OPTIONS = dict(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)
ocr = PaddleOCR(**_OCR_OPTIONS)
# Matches OCR text that starts like "A12-3": capital letter, digits, dash, digits.
_LABEL_PATTERN = re.compile(r"([A-Z])(\d+)-(\d+)")


def _extract_label(ocr_lines):
    """Return the label text found in one OCR result, or "" when nothing matches.

    ``ocr_lines`` is the per-image entry of ``ocr.ocr(...)``; the text of each
    recognised line is read as ``line[1][0]``. The first line is tried on its
    own, then all lines concatenated (for labels the OCR split across lines).
    ``None`` and empty results both yield "".
    """
    if not ocr_lines:
        return ""
    first_line = ocr_lines[0][1][0]
    if _LABEL_PATTERN.match(first_line):
        return first_line
    joined = "".join(line[1][0] for line in ocr_lines)
    if _LABEL_PATTERN.match(joined):
        return joined
    return ""


def _group_labels(labels):
    """Group label strings by (letter, first number) into the output mapping.

    Each key is ``"CB-<letter><x>"`` and each value is the list of
    ``"<letter><x>-<y>"`` strings with ``y`` in ascending numeric order; keys
    are emitted in sorted (letter, x) order.
    """
    grouped = defaultdict(list)
    for label in labels:
        match = _LABEL_PATTERN.match(label)
        if match:
            letter, x, y = match.groups()
            grouped[(letter, int(x))].append(int(y))
    return {
        f"CB-{letter}{x}": [f"{letter}{x}-{y}" for y in sorted(ys)]
        for (letter, x), ys in sorted(grouped.items())
    }


def process_pdf(file):
    """Detect labelled regions on every page of a PDF and return them as JSON.

    Each page is rendered to an image, rotated 90 degrees clockwise, scanned
    with the YOLO model through a sliced-inference pass, and every detected box
    is cropped and read with PaddleOCR. Reads that match ``_LABEL_PATTERN`` are
    grouped by their letter and first number.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string. ``None`` (nothing uploaded)
            produces an empty result instead of raising.

    Returns:
        A JSON string (``indent=4``) mapping ``"CB-<letter><x>"`` to the sorted
        list of ``"<letter><x>-<y>"`` labels that were read.
    """
    if file is None:
        return json.dumps({}, indent=4)

    # Accept both upload objects carrying ``.name`` and bare path strings.
    pdf_path = getattr(file, "name", file)
    pages = convert_from_path(pdf_path)

    def slicer_callback(image_slice: np.ndarray) -> sv.Detections:
        """Run YOLO on one slice and convert the result to sv.Detections."""
        prediction = model_yolo.predict(image_slice, conf=0.85)[0]
        return sv.Detections.from_ultralytics(prediction)

    slicer = sv.InferenceSlicer(
        callback=slicer_callback,
        slice_wh=(2000, 800),
        overlap_ratio_wh=(0.6, 0.6),
        overlap_filter_strategy=sv.OverlapFilter.NON_MAX_MERGE,
        iou_threshold=0.05,
    )

    labels = []
    for pil_image in pages:
        page = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        page = cv2.rotate(page, cv2.ROTATE_90_CLOCKWISE)

        # Sliced inference over the whole rotated page.
        detections = slicer(page)

        for detection in tqdm(detections):
            # detection[0] is the box as (x_min, y_min, x_max, y_max). Clamp at 0
            # so a slightly negative coordinate cannot wrap around in the slice;
            # the upper bounds are clipped by NumPy slicing itself.
            x_min, y_min, x_max, y_max = (max(int(v), 0) for v in detection[0])
            crop = page[y_min:y_max, x_min:x_max]
            if crop.size == 0:
                continue

            # Read the crop with PaddleOCR; the first element is this image's result.
            ocr_pages = ocr.ocr(crop, cls=True)
            label = _extract_label(ocr_pages[0] if ocr_pages else None)
            if label:
                labels.append(label)

    return json.dumps(_group_labels(labels), indent=4)
# Build the web UI: one PDF upload wired to process_pdf, with the result shown as JSON.
pdf_upload = gr.File(label="Upload PDF")
iface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_upload,
    outputs="json",
    title="Extract Data from PDF",
    description="Upload a PDF file and get the JSON output of detected numbers.",
)

# Start serving the app.
iface.launch()
|