# praysimanjuntak's picture
# Update app.py
# 0577e36 verified
import numpy as np
import supervision as sv
from ultralytics import YOLO
from tqdm import tqdm
import re
from collections import defaultdict
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import json
import cv2
import gradio as gr
import os
# Install poppler at startup; pdf2image's convert_from_path relies on the
# poppler command-line tools. `-y` is required because this runs without a
# terminal: without it apt-get stops at the confirmation prompt and nothing is
# installed. `apt-get` is used for both steps since `apt` is not meant for
# scripted use.
os.system('apt-get update && apt-get install -y poppler-utils')
# Initialize YOLO model (weights are loaded from yolov8n-box.pt)
model_yolo = YOLO(model="yolov8n-box.pt")
# Initialize the OCR engine once at import time so every request reuses it:
# English, angle classifier enabled, CPU only, library logging silenced.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)
# Labels look like "A12-3": one capital letter, a number, a dash, a number.
# Compiled once here instead of on every OCR result.
_LABEL_PATTERN = re.compile(r"([A-Z])(\d+)-(\d+)")


def _extract_label(ocr_lines):
    """Return the label regex match for one crop's OCR lines, or None."""
    if not ocr_lines:
        # Covers both "no text found" shapes: None and an empty list.
        return None
    # The recognized text of each OCR line is read from line[1][0].
    texts = [line[1][0] for line in ocr_lines]
    # Prefer the first line on its own; fall back to all lines concatenated,
    # which catches a label that OCR split across several lines.
    return _LABEL_PATTERN.match(texts[0]) or _LABEL_PATTERN.match("".join(texts))


def _read_labels(image, detections):
    """OCR every detected box of `image` and return (letter, x, y) label tuples."""
    height, width = image.shape[:2]
    labels = []
    for detection in tqdm(detections):
        # detection[0] holds the box corners as (x_min, y_min, x_max, y_max).
        x_min, y_min, x_max, y_max = (int(value) for value in detection[0])
        # Clamp to the image: a negative index would silently wrap around in
        # the NumPy slice below and crop the wrong region.
        x_min, y_min = max(x_min, 0), max(y_min, 0)
        x_max, y_max = min(x_max, width), min(y_max, height)
        if x_max <= x_min or y_max <= y_min:
            continue  # degenerate box, nothing to read
        crop = image[y_min:y_max, x_min:x_max]
        ocr_pages = ocr.ocr(crop, cls=True)
        # Only the first entry of the OCR output is used (one image in).
        match = _extract_label(ocr_pages[0] if ocr_pages else None)
        if match:
            labels.append(
                (match.group(1), int(match.group(2)), int(match.group(3)))
            )
    return labels


def _group_labels(labels):
    """Group (letter, x, y) tuples into {"CB-<letter><x>": ["<letter><x>-<y>", ...]}."""
    grouped = defaultdict(list)
    for letter, x, y in labels:
        grouped[(letter, x)].append(y)
    # Keys are emitted in sorted (letter, x) order and each list in ascending y.
    return {
        f"CB-{letter}{x}": [f"{letter}{x}-{y}" for y in sorted(ys)]
        for (letter, x), ys in sorted(grouped.items())
    }


def process_pdf(file) -> str:
    """Detect labelled boxes on every page of a PDF and return them as JSON.

    Each page is rendered to an image, rotated 90 degrees clockwise, scanned
    with the YOLO model in overlapping slices, and every detected box is read
    with OCR. Text matching ``<LETTER><number>-<number>`` is grouped by its
    ``<LETTER><number>`` prefix.

    Args:
        file: The uploaded PDF, either an object exposing the path as
            ``.name`` or the path itself.

    Returns:
        A JSON string (4-space indent) mapping ``"CB-<LETTER><number>"`` to
        the sorted list of matching labels. Repeated labels are kept.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")
    # Accept both a file wrapper (path in .name) and a plain path string.
    pdf_path = getattr(file, "name", file)
    pages = convert_from_path(pdf_path)

    # Function to process each slice of the image
    def slicer_callback(image_slice: np.ndarray) -> sv.Detections:
        prediction = model_yolo.predict(image_slice, conf=0.85)[0]
        return sv.Detections.from_ultralytics(prediction)

    # Initialize the slicer
    slicer = sv.InferenceSlicer(
        callback=slicer_callback,
        slice_wh=(2000, 800),
        overlap_ratio_wh=(0.6, 0.6),
        overlap_filter_strategy=sv.OverlapFilter.NON_MAX_MERGE,
        iou_threshold=0.05,
    )

    labels = []
    for pil_image in pages:
        page_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        # NOTE(review): pages are always rotated 90 degrees clockwise before
        # detection; presumably the source drawings are stored sideways.
        page_image = cv2.rotate(page_image, cv2.ROTATE_90_CLOCKWISE)
        # Sliced inference over the whole page, then OCR of each detection.
        labels.extend(_read_labels(page_image, slicer(page_image)))

    # Generate the desired JSON output
    return json.dumps(_group_labels(labels), indent=4)
# Build the web UI: one PDF upload wired to process_pdf, result shown as JSON.
iface = gr.Interface(
    title="Extract Data from PDF",
    description="Upload a PDF file and get the JSON output of detected numbers.",
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
)
# Start serving the interface.
iface.launch()