import collections |
import re |
from magic_pdf.libs.commons import fitz |
def remove_special_chars(s: str) -> str:
    """Strip every character that is not an ASCII letter or digit.

    :param s: input text
    :return: ``s`` with all characters outside ``a-z``, ``A-Z`` and ``0-9`` removed
    """
    return re.sub(r"[^a-zA-Z0-9]", "", s)
def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Tell whether two rectangles have exactly the same four edges.

    :param L1, U1, R1, D1: left, upper, right and lower edge of rectangle 1
    :param L2, U2, R2, D2: left, upper, right and lower edge of rectangle 2
    :return: True when every edge of rectangle 1 equals the matching edge of rectangle 2
    """
    edges_1 = (L1, U1, R1, D1)
    edges_2 = (L2, U2, R2, D2)
    # Compare edge by edge with ``==`` (no tolerance is applied).
    return all(a == b for a, b in zip(edges_1, edges_2))
def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Tell whether rectangle 2 lies inside rectangle 1 (shared edges allowed).

    :param L1, U1, R1, D1: left, upper, right and lower edge of the outer candidate
    :param L2, U2, R2, D2: left, upper, right and lower edge of the inner candidate
    :return: True when ``L1 <= L2 <= R2 <= R1`` and ``U1 <= U2 <= D2 <= D1``
    """
    inside_horizontally = L1 <= L2 <= R2 <= R1
    inside_vertically = U1 <= U2 <= D2 <= D1
    return inside_horizontally and inside_vertically
def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Tell whether two rectangles intersect; touching edges count as overlap.

    :param L1, U1, R1, D1: left, upper, right and lower edge of rectangle 1
    :param L2, U2, R2, D2: left, upper, right and lower edge of rectangle 2
    :return: True when the horizontal and the vertical extents both intersect
    """
    horizontal_hit = max(L1, L2) <= min(R1, R2)
    vertical_hit = max(U1, U2) <= min(D1, D2)
    return horizontal_hit and vertical_hit
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """Compute how much of each rectangle is covered by their intersection.

    :param L1, U1, R1, D1: left, upper, right and lower edge of rectangle 1
    :param L2, U2, R2, D2: left, upper, right and lower edge of rectangle 2
    :return: ``(intersection / area of rect 1, intersection / area of rect 2)``;
        ``(0, 0)`` when the rectangles are disjoint or either one has zero area
    """
    inter_left = max(L1, L2)
    inter_right = min(R1, R2)
    inter_top = max(U1, U2)
    inter_bottom = min(D1, D2)
    if inter_right < inter_left or inter_bottom < inter_top:
        return 0, 0

    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    # A zero-area rectangle would make the ratio undefined (division by zero).
    if area_1 == 0 or area_2 == 0:
        return 0, 0

    inter_area = (inter_right - inter_left) * (inter_bottom - inter_top)
    return inter_area / area_1, inter_area / area_2
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """Compute how much of each 1-D segment is covered by their intersection.

    :param L1, R1: start and end of segment 1
    :param L2, R2: start and end of segment 2
    :return: ``(shared length / length of segment 1, shared length / length of segment 2)``;
        ``(0, 0)`` when the segments are disjoint or either one has zero length
    """
    shared_start = max(L1, L2)
    shared_end = min(R1, R2)
    if shared_start > shared_end:
        return 0, 0
    # A zero-length segment would make the ratio undefined (division by zero).
    if L1 == R1 or L2 == R2:
        return 0, 0

    shared_length = shared_end - shared_start
    return shared_length / (R1 - L1), shared_length / (R2 - L2)
def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
    """Tell whether a rectangle is thin enough to be treated as a line.

    A rectangle counts as a line when its width or its height is at most 3,
    or when one side is at least 30 times longer than the other.

    :param L, U, R, D: left, upper, right and lower edge of the rectangle
    :return: True for line-like rectangles, False otherwise
    """
    width = R - L
    height = D - U
    # Checked first so that the divisions below never see a zero (or negative) side.
    if width <= 3 or height <= 3:
        return True
    if width / height >= 30 or height / width >= 30:
        return True
    # Explicit False: the function is annotated ``-> bool`` and must not fall
    # through to an implicit ``None``.
    return False
def _top_left_key(box):
    """Sort key ordering boxes by upper edge first, then by left edge."""
    return (box[1], box[0])


def _normalized_box(L, U, R, D):
    """Return ``(L, U, R, D)`` reordered so that L <= R and U <= D."""
    return (min(L, R), min(U, D), max(L, R), max(U, D))


def _union_box(box1, box2):
    """Return the smallest ``(L, U, R, D)`` box covering both input boxes."""
    return (
        min(box1[0], box2[0]),
        min(box1[1], box2[1]),
        max(box1[2], box2[2]),
        max(box1[3], box2[3]),
    )


def _should_fuse(box1, box2):
    """Decide whether two figure boxes overlap enough to be fused into one box."""
    L1, U1, R1, D1 = box1
    L2, U2, R2, D2 = box2
    s1 = abs(R1 - L1) * abs(D1 - U1)
    s2 = abs(R2 - L2) * abs(D2 - U2)
    ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
    if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
        return True
    # ratio_1 > 0.2 is only possible for a non-zero-area box1, so s1 != 0 here.
    if ratio_1 > 0.2 and s2 / s1 > 5:
        return True
    # Either box contains the centre point of the other one.
    if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
        return True
    if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
        return True
    return False


def _collect_text_span_bboxes(page):
    """Return the normalized bbox of every text span on the page, top-left first."""
    span_bboxes = []
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
    )["blocks"]
    for block in blocks:
        for line in block['lines']:
            for span in line['spans']:
                span_bboxes.append(_normalized_box(*span['bbox']))
    span_bboxes.sort(key=_top_left_key)
    return span_bboxes


def _collect_image_bboxes(page, page_box, junk_img_bojids):
    """Return integer bboxes of the page's embedded images after junk/overlap filtering."""
    pageL, pageU, pageR, pageD = page_box
    img_bboxs = []
    for raw_img in page.get_images():
        img_ref = raw_img[0]
        if img_ref in junk_img_bojids:
            continue
        try:
            # First placement of the image on the page; [0][0] is its rectangle.
            rec = page.get_image_rects(img_ref, transform=True)[0][0]
            L, U, R, D = _normalized_box(int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3]))
        except Exception:
            # Best effort: an image whose rectangle cannot be resolved is skipped.
            continue
        # Must lie on the page and have a positive area.
        if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
            continue
        # Images spanning the full page width or the full page height are ignored.
        if pageL == L and R == pageR:
            continue
        if pageU == U and D == pageD:
            continue
        img_bboxs.append((L, U, R, D))

    keep = [True] * len(img_bboxs)
    for i, (L1, U1, R1, D1) in enumerate(img_bboxs):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j in range(i + 1, len(img_bboxs)):
            L2, U2, R2, D2 = img_bboxs[j]
            s2 = abs(R2 - L2) * abs(D2 - U2)
            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
            if not (ratio_1 > 0 and ratio_2 > 0):
                continue
            if ratio_1 == 1 and ratio_2 > 0.8:
                keep[i] = False
            elif ratio_1 > 0.8 and ratio_2 == 1:
                keep[j] = False
            elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
                keep[i] = False
                keep[j] = False
            elif s1 / s2 > 5 and ratio_2 > 0.5:
                keep[j] = False
            elif s2 / s1 > 5 and ratio_1 > 0.5:
                keep[i] = False
    # Filter against the flags of the FULL list.  The previous code re-bound
    # ``imgs`` first and then iterated ``range(len(imgs))`` of the shortened
    # list, which mis-filtered the bounding boxes.
    return [box for box, ok in zip(img_bboxs, keep) if ok]


def _unique_drawing_rects(page):
    """Return the integer rects of the page's vector drawings, one per distinct rect."""
    seen = set()
    rects = []
    for drawing in page.get_drawings():
        rect = tuple(drawing['rect'].irect)
        key = _normalized_box(*rect)
        if key not in seen:
            seen.add(key)
            rects.append(rect)
    return rects


def _page_exceed_scores(rects, page_box):
    """Score each drawing rect by how strongly it touches or dominates the page."""
    pageL, pageU, pageR, pageD = page_box
    scores = []
    for L, U, R, D in rects:
        score = 0
        # NOTE(review): SOURCE indentation was lost; the edge checks are taken
        # to be the ``else`` of the "well inside the page" test - confirm.
        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
            _, page_cover = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
            if page_cover >= 0.7:
                score += 4
        else:
            # One point per page edge that the rect reaches or crosses.
            if L <= pageL:
                score += 1
            if pageR <= R:
                score += 1
            if U <= pageU:
                score += 1
            if pageD <= D:
                score += 1
        scores.append(score)
    return scores


def _group_drawings_into_figures(rects, page_box, text_bboxes):
    """Cluster drawing rects into candidate figure boxes (largest rects first)."""
    pageL, pageU, pageR, pageD = page_box
    exceed = _page_exceed_scores(rects, page_box)
    # Two or more page-touching/page-dominating drawings: discard all drawings.
    if sum(1 for score in exceed if score >= 1) >= 2:
        return []

    count = len(rects)
    children = [[] for _ in range(count)]
    parents = [[] for _ in range(count)]
    overlaps = [[] for _ in range(count)]
    for i, (L1, U1, R1, D1) in enumerate(rects):
        for j, (L2, U2, R2, D2) in enumerate(rects):
            if i == j:
                continue
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                children[i].append(j)
                parents[j].append(i)
            elif check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                overlaps[i].append(j)

    def inside_accepted_box(L, U, R, D):
        # True when the rect already lies inside a previously accepted figure box.
        return any(
            aL <= L <= R <= aR and aU <= U <= D <= aD
            for aL, aU, aR, aD in figure_boxes
        )

    visited = [False] * count
    figure_boxes = []
    by_area_desc = sorted(
        range(count),
        key=lambda i: -(rects[i][2] - rects[i][0]) * (rects[i][3] - rects[i][1]),
    )
    for i in by_area_desc:
        if visited[i]:
            continue
        visited[i] = True
        L, U, R, D = rects[i]
        if check_rect_isLine(L, U, R, D):
            # Lines never seed a figure but may still be absorbed by one later.
            visited[i] = False
            continue

        element_cnt = 0
        # NOTE(review): SOURCE indentation was lost; the exceed-score branches
        # are taken to be nested under the "has no parent" test - confirm.
        if not parents[i]:
            element_cnt += len(children[i])
            if exceed[i] == 0:
                if inside_accepted_box(L, U, R, D):
                    continue
                # Breadth-first walk over the overlap graph, growing the box.
                queue = collections.deque(overlaps[i])
                while queue:
                    j = queue.popleft()
                    visited[j] = True
                    L, U, R, D = _union_box((L, U, R, D), rects[j])
                    element_cnt += 1 + len(children[j])
                    for k in overlaps[j]:
                        if not visited[k] and exceed[k] == 0:
                            visited[k] = True
                            queue.append(k)
            elif exceed[i] <= 2:
                if inside_accepted_box(L, U, R, D):
                    continue
                # Rebuild the box from the in-page children only, starting from
                # an inverted (empty) box.
                L, U, R, D = pageR, pageD, pageL, pageU
                for j in children[i]:
                    if visited[j] or exceed[j] >= 1:
                        continue
                    visited[j] = True
                    L, U, R, D = _union_box((L, U, R, D), rects[j])
                    element_cnt += 1

        if check_rect_isLine(L, U, R, D):
            continue
        if element_cnt < 3:
            continue
        contained_text = sum(
            1 for tL, tU, tR, tD in text_bboxes
            if check_rect1_contains_rect2(L, U, R, D, tL, tU, tR, tD)
        )
        # A region holding ten or more text spans is not kept as a figure.
        if contained_text >= 10:
            continue
        figure_boxes.append((L, U, R, D))
    return figure_boxes


def _fuse_adjacent_svg_boxes(svg_boxes):
    """Fuse nearby aligned SVG boxes in place; return the per-box 'absorbed' flags."""
    absorbed = [False] * len(svg_boxes)
    for i in range(len(svg_boxes)):
        for j in range(i + 1, len(svg_boxes)):
            # Re-read box i every time: it grows whenever a neighbour is fused
            # into it, and stale edges would discard that growth.
            L1, U1, R1, D1 = svg_boxes[i]
            L2, U2, R2, D2 = svg_boxes[j]
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                absorbed[j] = True
                continue
            # Same row: vertical extents mostly shared and horizontal gap < 20.
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(L2 - R1) < 20:
                    svg_boxes[i] = _union_box(svg_boxes[i], svg_boxes[j])
                    absorbed[j] = True
                continue
            # Same column: horizontal extents mostly shared and vertical gap < 20.
            # (The previous code passed R2 where R1 belongs.)
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7 and abs(U2 - D1) < 20:
                svg_boxes[i] = _union_box(svg_boxes[i], svg_boxes[j])
                absorbed[j] = True
    return absorbed


def _docxchain_only_figures(layout_json, page_box, img_bboxs, svg_bboxs):
    """Return DocXChain figure boxes that no embedded image or SVG box accounts for."""
    pageL, pageU, pageR, pageD = page_box
    LR_scaleRatio = layout_json['page_info']['width'] / (pageR - pageL)
    UD_scaleRatio = layout_json['page_info']['height'] / (pageD - pageU)

    figure_bboxs = []
    for det in layout_json['layout_dets']:
        poly = det['poly']
        # presumably ``poly`` is [x0, y0, x1, y0, x1, y1, x0, y1] - TODO confirm
        box = _normalized_box(
            poly[0] / LR_scaleRatio,
            poly[1] / UD_scaleRatio,
            poly[2] / LR_scaleRatio,
            poly[5] / UD_scaleRatio,
        )
        # presumably category_id 1 is the "figure" class - verify against the model
        if det["category_id"] == 1 and det['score'] >= 0.3:
            figure_bboxs.append(box)

    covered = [False] * len(figure_bboxs)
    # Sum of partial overlap ratios with boxes that did not match outright.
    loose_overlap = [0.0] * len(figure_bboxs)

    for i, (L1, U1, R1, D1) in enumerate(figure_bboxs):
        s1 = abs(R1 - L1) * abs(D1 - U1)

        for L2, U2, R2, D2 in img_bboxs:
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                covered[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                if s2 / s1 > 0.8:
                    covered[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.8:
                    covered[i] = True
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
                    covered[i] = True
                else:
                    loose_overlap[i] += ratio_1

        for L2, U2, R2, D2 in svg_bboxs:
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                covered[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                covered[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.7:
                    covered[i] = True
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
                    covered[i] = True
                else:
                    loose_overlap[i] += ratio_1

    return [
        box
        for box, is_covered, overlap in zip(figure_bboxs, covered, loose_overlap)
        if not is_covered and not (overlap >= 0.7)
    ]


def _merge_figure_bboxes(boxes):
    """Drop swallowed boxes, fuse overlapping ones and return ``[L, U, R, D]`` lists."""
    boxes = sorted(boxes, key=lambda b: ((b[2] - b[0]) * (b[3] - b[1]), b[0], b[1]))
    # Collapse exact duplicates first: two identical boxes "contain" each other
    # and would otherwise BOTH be removed by the pass below.
    boxes = list(dict.fromkeys(boxes))

    # Pass 1: remove boxes that are contained in, or mostly covered by, another box.
    swallowed = set()
    for i, box1 in enumerate(boxes):
        for j, box2 in enumerate(boxes):
            if i == j:
                continue
            if check_rect1_contains_rect2(*box2, *box1):
                swallowed.add(box1)
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(*box1, *box2)
                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
                    swallowed.add(box1)
    boxes = [box for box in boxes if box not in swallowed]

    # Pass 2: every fusable pair is replaced by its union.
    fused_sources = set()
    unions = []
    for i, box1 in enumerate(boxes):
        for j, box2 in enumerate(boxes):
            if i == j:
                continue
            if _should_fuse(box1, box2):
                fused_sources.add(box1)
                fused_sources.add(box2)
                unions.append(_union_box(box1, box2))
    standalone = [box for box in boxes if box not in fused_sources]
    unions = list(set(unions))

    # Pass 3: fuse the unions among themselves.
    merged = [list(box) for box in unions]
    dropped = set()
    for i in range(len(merged)):
        # A box already fused into an earlier one must not absorb others,
        # otherwise the absorbed box would vanish from the result.
        if i in dropped:
            continue
        for j in range(i + 1, len(merged)):
            if j in dropped:
                continue
            # Compare against the CURRENT box i (the previous code compared
            # against stale edges left over from the preceding loop).
            if _should_fuse(merged[i], merged[j]):
                merged[i] = list(_union_box(merged[i], merged[j]))
                dropped.add(j)

    result = [list(box) for box in standalone]
    result.extend(box for idx, box in enumerate(merged) if idx not in dropped)
    return result


def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=None):
    """Detect the figure regions of one PDF page.

    Candidate boxes come from three sources: embedded raster images, clusters
    of vector drawings, and figure detections in the DocXChain layout result
    that neither of the first two accounts for.  The candidates are then
    de-duplicated and overlapping ones are fused.

    :param page_ID: index of the page inside the PDF document; kept for
        interface compatibility, it does not influence the returned boxes
    :param page: the page as read by fitz
    :param json_from_DocXchain_obj: DocXChain layout result for this page; the
        code reads ``page_info.width``, ``page_info.height`` and, for each entry
        of ``layout_dets``, ``poly``, ``category_id`` and ``score``
    :param junk_img_bojids: identifiers of images to ignore, compared with the
        first field of every ``page.get_images()`` entry; ``None`` means none
    :return: list of ``[L, U, R, D]`` boxes, one per detected figure region
    """
    if junk_img_bojids is None:
        junk_img_bojids = []

    # The page extent is taken from a 72-dpi pixmap of the page.
    DPI = 72
    pix = page.get_pixmap(dpi=DPI)
    page_box = (0, 0, int(pix.w), int(pix.h))

    text_bboxes = _collect_text_span_bboxes(page)
    img_bboxs = _collect_image_bboxes(page, page_box, junk_img_bojids)

    svg_final_bboxs = _group_drawings_into_figures(_unique_drawing_rects(page), page_box, text_bboxes)
    absorbed = _fuse_adjacent_svg_boxes(svg_final_bboxs)

    # Surviving SVG boxes, used only to match DocXChain detections.
    # NOTE(review): each survivor is listed twice - as-is and padded by
    # eps_ERROR - exactly as before; confirm whether the raw copy is intended.
    eps_ERROR = 5
    svg_final_bboxs_2 = []
    for (L, U, R, D), is_absorbed in zip(svg_final_bboxs, absorbed):
        if is_absorbed:
            continue
        svg_final_bboxs_2.append((L, U, R, D))
        svg_final_bboxs_2.append((L - eps_ERROR * 2, U - eps_ERROR, R + eps_ERROR * 2, D + eps_ERROR))

    figure_only_from_DocXChain_bboxs = _docxchain_only_figures(
        json_from_DocXchain_obj, page_box, img_bboxs, svg_final_bboxs_2
    )

    img_bboxs.sort(key=_top_left_key)
    figure_only_from_DocXChain_bboxs.sort(key=_top_left_key)
    # NOTE(review): the result is built from ``svg_final_bboxs`` (every SVG box,
    # including absorbed ones), not from ``svg_final_bboxs_2``.  The previous
    # code additionally pruned and sorted ``svg_final_bboxs_2`` (dropping SVG
    # boxes that contain a much smaller DocXChain figure) but never used the
    # outcome, so that dead step was removed.  Confirm which list was intended.
    curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs
    return _merge_figure_bboxes(curPage_all_fig_bboxs)