# Source path: MinerU / magic_pdf / pre_proc / remove_bbox_overlap.py
# Hosting-page residue kept for provenance: "derful's picture";
# commit message "Upload folder using huggingface_hub"; commit 240e0a0 (verified).
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def _remove_overlap_between_bbox(bbox1, bbox2):
    """Separate two partially overlapping boxes by shrinking both of them.

    Each box is unpacked as ``(x0, y0, x1, y1)``. When ``_is_part_overlap``
    reports no partial overlap, both boxes are returned untouched. Otherwise
    the overlap rectangle is measured and the boxes are cut apart along the
    axis where the overlap is thinner: along x when the overlap is taller
    than it is wide, along y otherwise. The cut line is the floored midpoint
    of the two facing edges, and each box is pulled back 0.25 from that line.

    Returns:
        A ``(bbox1, bbox2, drop_reason)`` tuple. On success the boxes are new
        4-element lists and ``drop_reason`` is ``None``. If the cut would
        leave either box with non-positive width or height, the original
        arguments are returned together with
        ``DropReason.NEGATIVE_BBOX_AREA``.
    """
    # Guard clause: nothing to do unless the boxes partially overlap.
    if not _is_part_overlap(bbox1, bbox2):
        return bbox1, bbox2, None

    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2

    # Width and height of the intersection rectangle.
    overlap_w = min(b_x1, a_x1) - max(b_x0, a_x0)
    overlap_h = min(b_y1, a_y1) - max(b_y0, a_y0)

    if overlap_h > overlap_w:
        # Overlap is a tall sliver: separate the boxes horizontally.
        if b_x1 >= a_x1:
            # bbox2 extends further right, so cut between a's right / b's left.
            cut = (b_x0 + a_x1) // 2
            a_x1 = min(cut - 0.25, a_x1)
            b_x0 = max(cut + 0.25, b_x0)
        else:
            cut = (a_x0 + b_x1) // 2
            a_x0 = max(cut + 0.25, a_x0)
            b_x1 = min(cut - 0.25, b_x1)
    elif b_y1 >= a_y1:
        # Overlap is wide (or square) and bbox2 reaches further down.
        cut = (b_y0 + a_y1) // 2
        b_y0 = max(cut + 0.25, b_y0)
        a_y1 = min(a_y1, cut - 0.25)
    else:
        cut = (a_y0 + b_y1) // 2
        b_y1 = min(b_y1, cut - 0.25)
        a_y0 = max(cut + 0.25, a_y0)

    # Both boxes must still have strictly positive width and height.
    both_valid = a_x1 > a_x0 and a_y1 > a_y0 and b_y1 > b_y0 and b_x1 > b_x0
    if not both_valid:
        return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA

    return [a_x0, a_y0, a_x1, a_y1], [b_x0, b_y0, b_x1, b_y1], None
def _remove_overlap_between_bboxes(arr):
    """Resolve containment and partial overlaps among a list of scored boxes.

    Args:
        arr: list of dicts, each with a ``"bbox"`` entry and a ``"score"``
            entry. The ``"bbox"`` values of these dicts are rewritten in
            place whenever a box is trimmed.

    Returns:
        A ``(res, drop_reasons)`` tuple. ``res`` has the same length as
        ``arr``; ``res[i]`` is ``arr[i]`` (possibly with a trimmed
        ``"bbox"``) when that entry survived and ``None`` when it was
        dropped. ``drop_reasons`` collects the reason returned by
        ``_remove_overlap_between_bbox`` for every pair whose overlap could
        not be removed by trimming.
    """
    drop_reasons = []
    count = len(arr)
    res = [None] * count

    # Pass 1: discard every entry for which `_is_in` reports a containment
    # relation against any other entry.
    keeps = [
        not any(
            _is_in(arr[i]["bbox"], arr[j]["bbox"])
            for j in range(count)
            if j != i
        )
        for i in range(count)
    ]

    # Pass 2: admit the survivors one at a time, trimming each candidate
    # against every box already accepted into `res`.
    for idx, candidate in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(count):
            accepted = res[i]
            if accepted is None:
                continue

            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
                candidate["bbox"], accepted["bbox"]
            )
            if drop_reason is None:
                candidate["bbox"] = bbox1
                accepted["bbox"] = bbox2
                continue

            # Trimming failed: keep the higher-scoring box of the pair. On a
            # tie the already accepted box wins.
            if candidate["score"] > accepted["score"]:
                keeps[i] = False
                res[i] = None
            else:
                keeps[idx] = False
            # Record the reason itself. The previous code appended the
            # `drop_reasons` list to itself, creating a self-referential list.
            drop_reasons.append(drop_reason)

        if keeps[idx]:
            res[idx] = candidate
    return res, drop_reasons
def remove_overlap_between_bbox_for_span(spans):
    """Remove bbox overlaps among spans and return the surviving spans.

    Each span is a mapping with a ``"bbox"`` entry and an optional
    ``"score"`` entry (0.1 is used when it is absent). Surviving spans get
    their ``"bbox"`` overwritten in place with the resolved box.

    Returns:
        A ``(kept_spans, drop_reasons)`` tuple: the surviving span objects in
        their original order, and the drop reasons reported by
        ``_remove_overlap_between_bboxes``.
    """
    candidates = []
    for span in spans:
        candidates.append({"bbox": span["bbox"], "score": span.get("score", 0.1)})

    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_spans = []
    for pos, entry in enumerate(resolved):
        if entry is not None:
            # Positions in `resolved` line up one-to-one with `spans`.
            spans[pos]["bbox"] = entry["bbox"]
            kept_spans.append(spans[pos])
    return kept_spans, drop_reasons
def remove_overlap_between_bbox_for_block(all_bboxes):
    """Remove bbox overlaps among block records and return the survivors.

    Each record is a sequence whose first four items are the box coordinates
    and whose last item is used as the score. Surviving records have their
    first four items overwritten in place with the resolved box.

    Returns:
        A ``(kept_blocks, drop_reasons)`` tuple: the surviving records in
        their original order, and the drop reasons reported by
        ``_remove_overlap_between_bboxes``.
    """
    candidates = []
    for record in all_bboxes:
        candidates.append({"bbox": record[:4], "score": record[-1]})

    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_blocks = []
    for pos, entry in enumerate(resolved):
        if entry is not None:
            # Slice assignment updates the coordinates and leaves the rest
            # of the record untouched.
            all_bboxes[pos][:4] = entry["bbox"]
            kept_blocks.append(all_bboxes[pos])
    return kept_blocks, drop_reasons