File size: 30,432 Bytes
240e0a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 |
import collections # 统计库
import re
from magic_pdf.libs.commons import fitz # pyMuPDF库
#--------------------------------------- Tool Functions --------------------------------------#
def remove_special_chars(s: str) -> str:
    """Normalise text by dropping every character that is not an ASCII letter or digit.

    :param s: input text.
    :return: ``s`` with only ``a-z``, ``A-Z`` and ``0-9`` characters kept.
    """
    non_alnum = re.compile(r"[^a-zA-Z0-9]")
    return non_alnum.sub("", s)
def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Tell whether rect1 and rect2 are exactly the same rectangle.

    Each rectangle is given as left (L), upper (U), right (R) and down (D)
    coordinates; all four must be equal for the rectangles to match.
    """
    if L1 != L2:
        return False
    if U1 != U2:
        return False
    if R1 != R2:
        return False
    return D1 == D2
def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Tell whether rect1 fully contains rect2 (shared edges count as contained).

    rect2 must itself be well-formed (``L2 <= R2`` and ``U2 <= D2``) for the
    check to succeed.
    """
    inside_horizontally = L1 <= L2 and L2 <= R2 and R2 <= R1
    if not inside_horizontally:
        return False
    inside_vertically = U1 <= U2 and U2 <= D2 and D2 <= D1
    return inside_vertically
def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Tell whether rect1 and rect2 overlap.

    Rectangles that merely share one edge are also reported as overlapping.
    """
    # Horizontal and vertical extent of the candidate intersection.
    inter_left, inter_right = max(L1, L2), min(R1, R2)
    inter_up, inter_down = max(U1, U2), min(D1, D2)
    return inter_left <= inter_right and inter_up <= inter_down
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """Compute which fraction of each rectangle is covered by their overlap.

    :return: ``(overlap / area(rect1), overlap / area(rect2))``; ``(0, 0)``
        when the rectangles do not intersect or either has zero area.
    """
    inter_left, inter_right = max(L1, L2), min(R1, R2)
    inter_up, inter_down = max(U1, U2), min(D1, D2)
    # Disjoint rectangles: nothing in common.
    if inter_right < inter_left or inter_down < inter_up:
        return 0, 0
    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    # A degenerate rectangle would make the ratio undefined.
    if area_1 == 0 or area_2 == 0:
        return 0, 0
    area_common = (inter_right - inter_left) * (inter_down - inter_up)
    return area_common / area_1, area_common / area_2
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """Compute which fraction of each 1-D interval is covered by their overlap.

    :return: ``(overlap / len(line1), overlap / len(line2))``; ``(0, 0)`` when
        the intervals do not intersect or either has zero length.
    """
    inter_start, inter_end = max(L1, L2), min(R1, R2)
    # Disjoint intervals.
    if inter_start > inter_end:
        return 0, 0
    # A zero-length interval would make the ratio undefined.
    if L1 == R1 or L2 == R2:
        return 0, 0
    common = inter_end - inter_start
    return common / (R1 - L1), common / (R2 - L2)
def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
    """Tell whether a rectangle is so thin that it is effectively a line.

    A rectangle counts as a line when its width or height is at most 3, or
    when one side is at least 30 times longer than the other.

    :param L: left coordinate.
    :param U: upper coordinate.
    :param R: right coordinate.
    :param D: lower coordinate.
    :return: True for line-like rectangles, False otherwise (always a real
        bool, never None).
    """
    width = R - L
    height = D - U
    if width <= 3 or height <= 3:
        return True
    # Both sides are > 3 here, so neither division can hit zero.
    if width / height >= 30 or height / width >= 30:
        return True
    return False
def _rects_should_merge(rect_1, rect_2) -> bool:
    """Decide whether two figure boxes should be fused into a single box.

    :param rect_1: first box as ``(L, U, R, D)``.
    :param rect_2: second box as ``(L, U, R, D)``.
    :return: True when any of the overlap / size / centre rules below holds.
    """
    L1, U1, R1, D1 = rect_1
    L2, U2, R2, D2 = rect_2
    s1 = abs(R1 - L1) * abs(D1 - U1)
    s2 = abs(R2 - L2) * abs(D2 - U2)
    ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
    # rect_1 is mostly covered by rect_2, or each covers most of the other.
    if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
        return True
    # rect_2 is far larger than rect_1 and overlaps part of it.
    # (ratio_1 > 0 implies s1 != 0, so the division is safe.)
    if ratio_1 > 0.2 and s2 / s1 > 5:
        return True
    # The centre of rect_2 lies inside rect_1.
    if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
        return True
    # The centre of rect_1 lies inside rect_2.
    if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
        return True
    return False


def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=None):
    """
    Collect the bounding boxes of the figures found on one PDF page.

    Three sources are combined: raster images reported by fitz, groups of
    vector drawings reported by fitz, and "figure" regions from the DocXChain
    layout result. Overlapping / nested boxes are then de-duplicated and merged.

    :param page_ID: index of the current page within the PDF document. It is
        kept for interface compatibility; the boxes returned do not depend on it.
    :param page: the current page as read by fitz.
    :param json_from_DocXchain_obj: DocXChain layout result for this page. The
        code reads ``['page_info']['width']``, ``['page_info']['height']`` and
        ``['layout_dets']`` (entries with ``'poly'``, ``'category_id'`` and
        ``'score'``).
    :param junk_img_bojids: container of image ids (compared against the first
        item of every ``page.get_images()`` entry) that must be ignored.
        Defaults to no junk images.
    :return: list of ``[L, U, R, D]`` boxes, one per detected figure.
    """
    if junk_img_bojids is None:
        junk_img_bojids = []

    # ---------------- Page bounds, from a 72-dpi rendering ----------------#
    DPI = 72  # use this resolution
    pix = page.get_pixmap(dpi=DPI)
    pageL, pageU = 0, 0
    pageR, pageD = int(pix.w), int(pix.h)

    # ---------------- (L, U, R, D) of every text span ----------------#
    textLine_blocks = []
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
    )["blocks"]
    for block in blocks:
        for line in block['lines']:
            for span in line['spans']:
                L, U, R, D = span['bbox']
                L, R = min(L, R), max(L, R)
                U, D = min(U, D), max(U, D)
                textLine_blocks.append((L, U, R, D))
    textLine_blocks.sort(key=lambda LURD: (LURD[1], LURD[0]))

    # ---------------- Raster images ----------------#
    img_bboxs = []  # location of every accepted image
    for raw_img in page.get_images():
        # Skip images listed as junk.
        if raw_img[0] in junk_img_bojids:
            continue
        try:
            rec = page.get_image_rects(raw_img[0], transform=True)[0][0]
            L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
        except Exception:
            # Best effort: an image whose rectangle cannot be resolved
            # (e.g. no placement returned) is simply ignored.
            continue
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        # Must be a non-empty box that lies inside the page ...
        if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
            continue
        # ... and must not span the full page width or the full page height.
        if pageL == L and R == pageR:
            continue
        if pageU == U and D == pageD:
            continue
        img_bboxs.append((L, U, R, D))

    # Overlapping images indicate unreliable sizes/positions: discard them.
    imgs_ok = [True] * len(img_bboxs)
    for i, (L1, U1, R1, D1) in enumerate(img_bboxs):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j in range(i + 1, len(img_bboxs)):
            L2, U2, R2, D2 = img_bboxs[j]
            s2 = abs(R2 - L2) * abs(D2 - U2)
            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
            if not (ratio_1 > 0 and ratio_2 > 0):
                continue
            if ratio_1 == 1 and ratio_2 > 0.8:
                imgs_ok[i] = False
            elif ratio_1 > 0.8 and ratio_2 == 1:
                imgs_ok[j] = False
            elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
                imgs_ok[i] = False
                imgs_ok[j] = False
            elif s1 / s2 > 5 and ratio_2 > 0.5:
                imgs_ok[j] = False
            elif s2 / s1 > 5 and ratio_1 > 0.5:
                imgs_ok[i] = False
    # Filter against the flags of the ORIGINAL list (zip keeps them aligned).
    img_bboxs = [box for box, ok in zip(img_bboxs, imgs_ok) if ok]

    # ---------------- Vector drawings reported by fitz ----------------#
    svgs = page.get_drawings()
    # De-duplicate: several drawings may share exactly the same rect.
    svg_rect_visited = set()
    svg_rects = []  # raw integer rect (L, U, R, D) of every distinct drawing
    for drawing in svgs:
        L, U, R, D = drawing['rect'].irect
        normalized = (min(L, R), min(U, D), max(L, R), max(U, D))
        if normalized not in svg_rect_visited:
            svg_rect_visited.add(normalized)
            svg_rects.append((L, U, R, D))

    svg_count = len(svg_rects)
    svg_childs = [[] for _ in range(svg_count)]
    svg_parents = [[] for _ in range(svg_count)]
    # svg_overlaps[i] holds the indexes of drawings intersecting drawing i,
    # e.g. svg_overlaps[0] = [1, 2, 7, 9]
    svg_overlaps = [[] for _ in range(svg_count)]
    svg_visited = [False] * svg_count
    # Score telling how much a drawing touches/exceeds the page border
    # (such very large drawings are usually a background).
    svg_exceedPage = [0] * svg_count
    for i, (L, U, R, D) in enumerate(svg_rects):
        _, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
            if ratio_2 >= 0.7:
                svg_exceedPage[i] += 4
        else:
            if L <= pageL:
                svg_exceedPage[i] += 1
            if pageR <= R:
                svg_exceedPage[i] += 1
            if U <= pageU:
                svg_exceedPage[i] += 1
            if pageD <= D:
                svg_exceedPage[i] += 1

    # With two or more border-exceeding drawings the hand-written rules below
    # are not reliable, so drawings are ignored altogether.
    if sum(1 for score in svg_exceedPage if score >= 1) >= 2:
        svg_rects = []
        svg_childs = []
        svg_parents = []
        svg_overlaps = []
        svg_visited = []
        svg_exceedPage = []

    # ---------------- Build containment / overlap graph ----------------#
    for i, (L1, U1, R1, D1) in enumerate(svg_rects):
        for j, (L2, U2, R2, D2) in enumerate(svg_rects):
            if i == j:
                continue
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                # containment
                svg_childs[i].append(j)
                svg_parents[j].append(i)
            elif check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                # intersection
                svg_overlaps[i].append(j)

    # -------- Final drawing regions: outline of each connected group --------#
    eps_ERROR = 5  # margin added around a region (fitz rects may be inexact)
    svg_final_bboxs = []
    svg_final_visited = []
    # Visit drawings from the largest area to the smallest.
    area_desc_order = sorted(
        range(len(svg_rects)),
        key=lambda idx: -(svg_rects[idx][2] - svg_rects[idx][0]) * (svg_rects[idx][3] - svg_rects[idx][1]),
    )
    for i in area_desc_order:
        if svg_visited[i]:
            continue
        svg_visited[i] = True
        L, U, R, D = svg_rects[i]
        if check_rect_isLine(L, U, R, D):
            svg_visited[i] = False
            continue
        # Number of elements in the candidate region, not counting the
        # outermost drawing itself.
        cur_block_element_cnt = 0
        if len(svg_parents[i]) == 0:
            # Top-level drawing.
            cur_block_element_cnt += len(svg_childs[i])
            if svg_exceedPage[i] == 0:
                # May already lie inside a region accepted earlier.
                if any(pL <= L <= R <= pR and pU <= U <= D <= pD for pL, pU, pR, pD in svg_final_bboxs):
                    continue
                # Breadth-first search over the overlap graph.
                q = collections.deque(svg_overlaps[i])
                while q:
                    j = q.popleft()
                    svg_visited[j] = True
                    L2, U2, R2, D2 = svg_rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1
                    cur_block_element_cnt += len(svg_childs[j])
                    for k in svg_overlaps[j]:
                        if not svg_visited[k] and svg_exceedPage[k] == 0:
                            svg_visited[k] = True
                            q.append(k)
            # NOTE(review): this branch is assumed to pair with the
            # `svg_exceedPage[i] == 0` test above - confirm against the
            # canonical file, whose indentation was not available.
            elif svg_exceedPage[i] <= 2:
                # May already lie inside a region accepted earlier.
                if any(sL <= L <= R <= sR and sU <= U <= D <= sD for sL, sU, sR, sD in svg_final_bboxs):
                    continue
                # Start from an "empty" box and grow it over the children.
                L, U, R, D = pageR, pageD, pageL, pageU
                for j in svg_childs[i]:
                    if svg_visited[j]:
                        continue
                    if svg_exceedPage[j] >= 1:
                        continue
                    svg_visited[j] = True
                    L2, U2, R2, D2 = svg_rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1
        # A line-like region is not a figure.
        if check_rect_isLine(L, U, R, D):
            continue
        # Regions with fewer than 3 elements are not kept.
        if cur_block_element_cnt < 3:
            continue
        # A region enclosing many text spans is probably a mis-detection.
        contain_textLineBlock_cnt = 0
        for L2, U2, R2, D2 in textLine_blocks:
            if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2):
                contain_textLineBlock_cnt += 1
        if contain_textLineBlock_cnt >= 10:
            continue
        svg_final_bboxs.append((L, U, R, D))
        svg_final_visited.append(False)

    # Regions may still be nested or adjacent: merge them further.
    for i in range(len(svg_final_bboxs)):
        L1, U1, R1, D1 = svg_final_bboxs[i]
        for j in range(i + 1, len(svg_final_bboxs)):
            L2, U2, R2, D2 = svg_final_bboxs[j]
            # rect1 contains rect2
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_final_visited[j] = True
                continue
            # Side by side horizontally (similar vertical extent).
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(L2 - R1) >= 20:
                    continue
                # Refresh rect1 so later merges extend (not overwrite) it.
                L1, U1, R1, D1 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
                svg_final_bboxs[i] = (L1, U1, R1, D1)
                svg_final_visited[j] = True
                continue
            # Stacked vertically (similar horizontal extent).
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(U2 - D1) >= 20:
                    continue
                L1, U1, R1, D1 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
                svg_final_bboxs[i] = (L1, U1, R1, D1)
                svg_final_visited[j] = True

    svg_final_bboxs_2 = []
    for box, absorbed in zip(svg_final_bboxs, svg_final_visited):
        if absorbed:
            continue
        L, U, R, D = box
        # NOTE(review): every surviving region is stored twice, once as is
        # and once with the eps_ERROR margin. Kept as in the original.
        svg_final_bboxs_2.append((L, U, R, D))
        svg_final_bboxs_2.append((L - eps_ERROR * 2, U - eps_ERROR, R + eps_ERROR * 2, D + eps_ERROR))

    # -------- figure boxes taken from the DocXChain layout result --------#
    figure_bbox_from_DocXChain = []
    figure_from_DocXChain_visited = []  # True once matched by an image/drawing
    figure_bbox_from_DocXChain_overlappedRatio = []
    xf_json = json_from_DocXchain_obj
    width_from_json = xf_json['page_info']['width']
    height_from_json = xf_json['page_info']['height']
    LR_scaleRatio = width_from_json / (pageR - pageL)
    UD_scaleRatio = height_from_json / (pageD - pageU)
    for xf in xf_json['layout_dets']:
        # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
        L = xf['poly'][0] / LR_scaleRatio
        U = xf['poly'][1] / UD_scaleRatio
        R = xf['poly'][2] / LR_scaleRatio
        D = xf['poly'][5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        # figure
        if xf["category_id"] == 1 and xf['score'] >= 0.3:
            figure_bbox_from_DocXChain.append((L, U, R, D))
            figure_from_DocXChain_visited.append(False)
            figure_bbox_from_DocXChain_overlappedRatio.append(0.0)

    # -------- Match DocXChain figures against images and drawings --------#
    # Against images.
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for L2, U2, R2, D2 in img_bboxs:
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                # identical
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                # figure contains image
                if s2 / s1 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                # image contains figure
                if s1 / s2 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            else:
                # large partial overlap
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1

    # Against drawing regions.
    svg_final_bboxs_2_badIdxs = set()
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j, (L2, U2, R2, D2) in enumerate(svg_final_bboxs_2):
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.7:
                    figure_from_DocXChain_visited[i] = True
                else:
                    # Drawing region is far too large: prefer DocXChain.
                    svg_final_bboxs_2_badIdxs.add(j)
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
    # Drop the drawing regions judged wrong.
    svg_final_bboxs_2 = [box for idx, box in enumerate(svg_final_bboxs_2) if idx not in svg_final_bboxs_2_badIdxs]

    # A figure mostly covered by the sum of its partial overlaps is matched too.
    for i, accumulated in enumerate(figure_bbox_from_DocXChain_overlappedRatio):
        if accumulated >= 0.7:
            figure_from_DocXChain_visited[i] = True

    # Figures known to DocXChain only.
    figure_only_from_DocXChain_bboxs = [
        box for box, matched in zip(figure_bbox_from_DocXChain, figure_from_DocXChain_visited) if not matched
    ]

    img_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    svg_final_bboxs_2.sort(key=lambda LURD: (LURD[1], LURD[0]))
    figure_only_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    # NOTE(review): the original concatenates svg_final_bboxs (pre-filter list)
    # here, not svg_final_bboxs_2; kept as is - confirm which one is intended.
    # dict.fromkeys drops exact duplicates (keeping order); otherwise two
    # identical boxes would "contain" each other below and both be removed.
    curPage_all_fig_bboxs = list(dict.fromkeys(img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs))

    # ---------------- Final de-duplication ----------------#
    curPage_all_fig_bboxs.sort(key=lambda LURD: ((LURD[2] - LURD[0]) * (LURD[3] - LURD[1]), LURD[0], LURD[1]))
    # First pass: remove boxes nested in (or mostly covered by) another box.
    final_duplicate = set()
    for i, (L1, U1, R1, D1) in enumerate(curPage_all_fig_bboxs):
        for j, (L2, U2, R2, D2) in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                final_duplicate.add((L1, U1, R1, D1))
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
                    final_duplicate.add((L1, U1, R1, D1))
    curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]

    # Second pass: replace every pair of overlapping boxes by their union.
    final_duplicate = set()
    final_synthetic_bboxs = []
    for i, box_1 in enumerate(curPage_all_fig_bboxs):
        for j, box_2 in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            if _rects_should_merge(box_1, box_2):
                L1, U1, R1, D1 = box_1
                L2, U2, R2, D2 = box_2
                final_duplicate.add((L1, U1, R1, D1))
                final_duplicate.add((L2, U2, R2, D2))
                final_synthetic_bboxs.append((min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)))
    curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]
    final_synthetic_bboxs = list(set(final_synthetic_bboxs))

    # Third pass: the unions themselves may overlap; fold them pairwise.
    image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs]
    droped_img_idx = set()
    for i in range(len(image_bboxes)):
        # A box already folded into an earlier one must not absorb others,
        # otherwise the absorbed boxes would vanish from the result.
        if i in droped_img_idx:
            continue
        for j in range(i + 1, len(image_bboxes)):
            if j in droped_img_idx:
                continue
            # Always compare against the CURRENT extent of box i.
            if _rects_should_merge(image_bboxes[i], image_bboxes[j]):
                L1, U1, R1, D1 = image_bboxes[i]
                L2, U2, R2, D2 = image_bboxes[j]
                image_bboxes[i] = [min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)]
                droped_img_idx.add(j)
    new_images = [box for idx, box in enumerate(image_bboxes) if idx not in droped_img_idx]

    images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs]
    images = images1 + new_images
    return images
|