"""Gradio app: detect label regions on PDF pages with YOLO and read them via OCR.

Each page of the uploaded PDF is rendered, rotated, scanned in overlapping
slices by the YOLO model, and every detected region is passed to PaddleOCR.
Texts of the form ``<LETTER><number>-<number>`` are grouped and returned as
JSON.
"""

import json
import os
import re
from collections import defaultdict

# Third-party imports are kept in their original relative order.
import numpy as np
import supervision as sv
from ultralytics import YOLO
from tqdm import tqdm
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import cv2
import gradio as gr

# A label is an upper-case letter, a number, a dash and another number
# (e.g. "A12-3"). ``match`` anchors at the start only, so trailing characters
# after the label are tolerated.
LABEL_PATTERN = re.compile(r"([A-Z])(\d+)-(\d+)")

# Initialize YOLO model
model_yolo = YOLO(model="runs/detect/train/weights/best.pt")
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)


def _slicer_callback(image_slice: np.ndarray) -> sv.Detections:
    """Run the YOLO model on one image slice and wrap the result as detections."""
    result = model_yolo.predict(image_slice, conf=0.85)[0]
    return sv.Detections.from_ultralytics(result)


def _match_label(ocr_lines):
    """Return the label regex match for one crop's OCR lines, or ``None``.

    ``ocr_lines`` is the per-image entry of the PaddleOCR result; for each
    line, ``line[1][0]`` is read as the recognised text. The first line is
    tried on its own; only if it does not match is the concatenation of all
    lines tried (covers labels that OCR split across several lines).
    """
    if not ocr_lines:
        # None (nothing recognised) or an empty list: no label here.
        return None

    first_line_match = LABEL_PATTERN.match(ocr_lines[0][1][0])
    if first_line_match:
        return first_line_match

    joined_text = ''.join(line[1][0] for line in ocr_lines)
    return LABEL_PATTERN.match(joined_text)


def _read_labels(image: np.ndarray, detections) -> list:
    """OCR every detected region of ``image`` and return the label matches.

    Args:
        image: Page image as an array indexed ``[row, column]``.
        detections: Iterable of detections whose first element is the
            ``(x_min, y_min, x_max, y_max)`` bounding box.

    Returns:
        A list of ``re.Match`` objects, one per region whose text is a label.
    """
    height, width = image.shape[:2]
    matches = []

    for detection in tqdm(detections):
        x_min, y_min, x_max, y_max = (int(value) for value in detection[0])

        # Clamp to the image: a negative start would make NumPy slicing wrap
        # around from the far edge and crop the wrong region.
        x_min, y_min = max(x_min, 0), max(y_min, 0)
        x_max, y_max = min(x_max, width), min(y_max, height)
        if x_max <= x_min or y_max <= y_min:
            # Degenerate box: there is nothing to hand to the OCR engine.
            continue

        cropped_image = image[y_min:y_max, x_min:x_max]
        ocr_output = ocr.ocr(cropped_image, cls=True)
        ocr_lines = ocr_output[0] if ocr_output else None

        match = _match_label(ocr_lines)
        if match:
            matches.append(match)

    return matches


def _group_labels(matches) -> dict:
    """Group label matches by (letter, first number) into the output mapping.

    Returns a dict whose keys are ``"CB-<letter><x>"`` (in sorted
    ``(letter, x)`` order) and whose values are the labels
    ``"<letter><x>-<y>"`` sorted by ``y``. Numbers are parsed as integers, so
    leading zeros are not preserved. Repeated labels are kept as repeated
    entries.
    """
    detected_numbers = defaultdict(list)
    for match in matches:
        letter = match.group(1)
        x = int(match.group(2))
        y = int(match.group(3))
        detected_numbers[(letter, x)].append(y)

    return {
        f"CB-{letter}{x}": [f"{letter}{x}-{y}" for y in sorted(ys)]
        for (letter, x), ys in sorted(detected_numbers.items())
    }


def process_pdf(file):
    """Extract labels from every page of a PDF and return them as JSON text.

    Args:
        file: The uploaded PDF, either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` (nothing uploaded) yields an empty JSON object.

    Returns:
        A JSON string (indent 4) mapping ``"CB-<letter><x>"`` to the sorted
        list of ``"<letter><x>-<y>"`` labels found across all pages.
    """
    if file is None:
        return json.dumps({}, indent=4)

    if isinstance(file, (str, os.PathLike)):
        pdf_path = os.fspath(file)
    else:
        pdf_path = file.name
    images = convert_from_path(pdf_path)

    # Initialize the slicer
    slicer = sv.InferenceSlicer(
        callback=_slicer_callback,
        slice_wh=(2000, 800),
        overlap_ratio_wh=(0.6, 0.6),
        overlap_filter_strategy=sv.OverlapFilter.NON_MAX_MERGE,
        iou_threshold=0.05,
    )

    matches = []
    for pil_image in images:
        page_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        page_image = cv2.rotate(page_image, cv2.ROTATE_90_CLOCKWISE)

        # Sliced inference over the whole page, then OCR on each detection.
        detections = slicer(page_image)
        matches.extend(_read_labels(page_image, detections))

    # Generate the desired JSON output
    return json.dumps(_group_labels(matches), indent=4)


# Create the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
    title="Extract Data from PDF",
    description="Upload a PDF file and get the JSON output of detected numbers."
)

# Launch the Gradio app
iface.launch()