Spaces:

derful
/

MinerU

Runtime error

App Files Files Community

MinerU / magic_pdf /pre_proc /detect_images.py

derful

Upload folder using huggingface_hub

240e0a0 verified 4 months ago

raw

history blame contribute delete

30.4 kB

	import collections # 统计库
	import re
	from magic_pdf.libs.commons import fitz # pyMuPDF库


	#--------------------------------------- Tool Functions --------------------------------------#
	# 正则化，输入文本，输出只保留a-z,A-Z,0-9
	def remove_special_chars(s: str) -> str:
	pattern = r"[^a-zA-Z0-9]"
	res = re.sub(pattern, "", s)
	return res

	def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
	# 判断rect1和rect2是否一模一样
	return L1 == L2 and U1 == U2 and R1 == R2 and D1 == D2

	def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
	# 判断rect1包含了rect2
	return (L1 <= L2 <= R2 <= R1) and (U1 <= U2 <= D2 <= D1)

	def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
	# 判断rect1与rect2是否存在重叠（只有一条边重叠，也算重叠）
	return max(L1, L2) <= min(R1, R2) and max(U1, U2) <= min(D1, D2)

	def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
	# 计算两个rect，重叠面积各占2个rect面积的比例
	if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
	return 0, 0
	square_1 = (R1 - L1) * (D1 - U1)
	square_2 = (R2 - L2) * (D2 - U2)
	if square_1 == 0 or square_2 == 0:
	return 0, 0
	square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
	return square_overlap / square_1, square_overlap / square_2

	def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
	# 计算两个line，重叠区间各占2个line长度的比例
	if max(L1, L2) > min(R1, R2):
	return 0, 0
	if L1 == R1 or L2 == R2:
	return 0, 0
	overlap_line = min(R1, R2) - max(L1, L2)
	return overlap_line / (R1 - L1), overlap_line / (R2 - L2)


	# 判断rect其实是一条line
	def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
	width = R - L
	height = D - U
	if width <= 3 or height <= 3:
	return True
	if width / height >= 30 or height / width >= 30:
	return True



	def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=[]):
	"""
	:param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
	:param page :fitz读取的当前页的内容
	:param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
	:param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
	"""
	#### 通过fitz获取page信息
	## 超越边界
	DPI = 72 # use this resolution
	pix = page.get_pixmap(dpi=DPI)
	pageL = 0
	pageR = int(pix.w)
	pageU = 0
	pageD = int(pix.h)

	#----------------- 保存每一个文本块的LURD ------------------#
	textLine_blocks = []
	blocks = page.get_text(
	"dict",
	flags=fitz.TEXTFLAGS_TEXT,
	#clip=clip,
	)["blocks"]
	for i in range(len(blocks)):
	bbox = blocks[i]['bbox']
	# print(bbox)
	for tt in blocks[i]['lines']:
	# 当前line
	cur_line_bbox = None # 当前line，最右侧的section的bbox
	for xf in tt['spans']:
	L, U, R, D = xf['bbox']
	L, R = min(L, R), max(L, R)
	U, D = min(U, D), max(U, D)
	textLine_blocks.append((L, U, R, D))
	textLine_blocks.sort(key = lambda LURD: (LURD[1], LURD[0]))


	#---------------------------------------------- 保存img --------------------------------------------------#
	raw_imgs = page.get_images() # 获取所有的图片
	imgs = []
	img_names = [] # 保存图片的名字，方便在md中插入引用
	img_bboxs = [] # 保存图片的location信息。
	img_visited = [] # 记忆化，记录该图片是否在md中已经插入过了
	img_ID = 0

	## 获取、保存每张img的location信息(x1, y1, x2, y2， UL, DR坐标)
	for i in range(len(raw_imgs)):
	# 如果图片在junklist中则跳过
	if raw_imgs[i][0] in junk_img_bojids:
	continue
	else:
	try:
	tt = page.get_image_rects(raw_imgs[i][0], transform = True)

	rec = tt[0][0]
	L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])

	L, R = min(L, R), max(L, R)
	U, D = min(U, D), max(U, D)
	if not(pageL <= L < R <= pageR and pageU <= U < D <= pageD):
	continue
	if pageL == L and R == pageR:
	continue
	if pageU == U and D == pageD:
	continue
	# pix1 = page.get_Pixmap(clip=(L,U,R,D))
	new_img_name = "{}_{}.png".format(page_ID, i) # 图片name
	# pix1.save(res_dir_path + '/' + new_img_name) # 把图片存出在新建的文件夹，并命名
	img_names.append(new_img_name)
	img_bboxs.append((L, U, R, D))
	img_visited.append(False)
	imgs.append(raw_imgs[i])
	except:
	continue

	#-------- 如果img之间有重叠。说明获取的img大小有问题，位置也不一定对。就扔掉--------#
	imgs_ok = [True for _ in range(len(imgs))]
	for i in range(len(imgs)):
	L1, U1, R1, D1 = img_bboxs[i]
	for j in range(i + 1, len(imgs)):
	L2, U2, R2, D2 = img_bboxs[j]
	ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
	s1 = abs(R1 - L1) * abs(D1 - U1)
	s2 = abs(R2 - L2) * abs(D2 - U2)
	if ratio_1 > 0 and ratio_2 > 0:
	if ratio_1 == 1 and ratio_2 > 0.8:
	imgs_ok[i] = False
	elif ratio_1 > 0.8 and ratio_2 == 1:
	imgs_ok[j] = False
	elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
	imgs_ok[i] = False
	imgs_ok[j] = False
	elif s1 / s2 > 5 and ratio_2 > 0.5:
	imgs_ok[j] = False
	elif s2 / s1 > 5 and ratio_1 > 0.5:
	imgs_ok[i] = False

	imgs = [imgs[i] for i in range(len(imgs)) if imgs_ok[i] == True]
	img_names = [img_names[i] for i in range(len(imgs)) if imgs_ok[i] == True]
	img_bboxs = [img_bboxs[i] for i in range(len(imgs)) if imgs_ok[i] == True]
	img_visited = [img_visited[i] for i in range(len(imgs)) if imgs_ok[i] == True]
	#*******************************************************************************#

	#---------------------------------------- 通过fitz提取svg的信息 -----------------------------------------#
	#
	svgs = page.get_drawings()
	#------------ preprocess, check一些大框，看是否是合理的 ----------#
	## 去重。有时候会遇到rect1和rect2是完全一样的情形。
	svg_rect_visited = set()
	available_svgIdx = []
	for i in range(len(svgs)):
	L, U, R, D = svgs[i]['rect'].irect
	L, R = min(L, R), max(L, R)
	U, D = min(U, D), max(U, D)
	tt = (L, U, R, D)
	if tt not in svg_rect_visited:
	svg_rect_visited.add(tt)
	available_svgIdx.append(i)

	svgs = [svgs[i] for i in available_svgIdx] # 去重后，有效的svgs
	svg_childs = [[] for _ in range(len(svgs))]
	svg_parents = [[] for _ in range(len(svgs))]
	svg_overlaps = [[] for _ in range(len(svgs))] #svg_overlaps[i]是一个list，存的是与svg_i有重叠的svg的index。e.g., svg_overlaps[0] = [1, 2, 7, 9]
	svg_visited = [False for _ in range(len(svgs))]
	svg_exceedPage = [0 for _ in range(len(svgs))] # 是否超越边界（artbox），很大，但一般是一个svg的底。


	for i in range(len(svgs)):
	L, U, R, D = svgs[i]['rect'].irect
	ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
	if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
	if ratio_2 >= 0.7:
	svg_exceedPage[i] += 4
	else:
	if L <= pageL:
	svg_exceedPage[i] += 1
	if pageR <= R:
	svg_exceedPage[i] += 1
	if U <= pageU:
	svg_exceedPage[i] += 1
	if pageD <= D:
	svg_exceedPage[i] += 1

	#### 如果有≥2个的超边界的框，就不要手写规则判断svg了。很难写对。
	if len([x for x in svg_exceedPage if x >= 1]) >= 2:
	svgs = []
	svg_childs = []
	svg_parents = []
	svg_overlaps = []
	svg_visited = []
	svg_exceedPage = []

	#---------------------------- build graph ----------------------------#
	for i, p in enumerate(svgs):
	L1, U1, R1, D1 = svgs[i]["rect"].irect
	for j in range(len(svgs)):
	if i == j:
	continue
	L2, U2, R2, D2 = svgs[j]["rect"].irect
	## 包含
	if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
	svg_childs[i].append(j)
	svg_parents[j].append(i)
	else:
	## 交叉
	if check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
	svg_overlaps[i].append(j)

	#---------------- 确定最终的svg。连通块儿的外围 -------------------#
	eps_ERROR = 5 # 给识别出的svg，四周留白（为了防止pyMuPDF的rect不准）
	svg_ID = 0
	svg_final_names = []
	svg_final_bboxs = []
	svg_final_visited = [] # 为下面，text识别左准备。作用同img_visited

	svg_idxs = [i for i in range(len(svgs))]
	svg_idxs.sort(key = lambda i: -(svgs[i]['rect'].irect[2] - svgs[i]['rect'].irect[0]) * (svgs[i]['rect'].irect[3] - svgs[i]['rect'].irect[1])) # 按照面积，从大到小排序

	for i in svg_idxs:
	if svg_visited[i] == True:
	continue
	svg_visited[i] = True
	L, U, R, D = svgs[i]['rect'].irect
	width = R - L
	height = D - U
	if check_rect_isLine(L, U, R, D) == True:
	svg_visited[i] = False
	continue
	# if i == 4:
	# print(i, L, U, R, D)
	# print(svg_parents[i])

	cur_block_element_cnt = 0 # 当前要判定为svg的区域中，有多少elements，最外围的最大svg框除外。
	if len(svg_parents[i]) == 0:
	## 是个普通框的情形
	cur_block_element_cnt += len(svg_childs[i])
	if svg_exceedPage[i] == 0:
	## 误差。可能已经包含在某个框里面了
	neglect_flag = False
	for pL, pU, pR, pD in svg_final_bboxs:
	if pL <= L <= R <= pR and pU <= U <= D <= pD:
	neglect_flag = True
	break
	if neglect_flag == True:
	continue

	## 搜索连通域, bfs+记忆化
	q = collections.deque()
	for j in svg_overlaps[i]:
	q.append(j)
	while q:
	j = q.popleft()
	svg_visited[j] = True
	L2, U2, R2, D2 = svgs[j]['rect'].irect
	# width2 = R2 - L2
	# height2 = D2 - U2
	# if width2 <= 2 or height2 <= 2 or (height2 / width2) >= 30 or (width2 / height2) >= 30:
	# continue
	L = min(L, L2)
	R = max(R, R2)
	U = min(U, U2)
	D = max(D, D2)
	cur_block_element_cnt += 1
	cur_block_element_cnt += len(svg_childs[j])
	for k in svg_overlaps[j]:
	if svg_visited[k] == False and svg_exceedPage[k] == 0:
	svg_visited[k] = True
	q.append(k)
	elif svg_exceedPage[i] <= 2:
	## 误差。可能已经包含在某个svg_final_bbox框里面了
	neglect_flag = False
	for sL, sU, sR, sD in svg_final_bboxs:
	if sL <= L <= R <= sR and sU <= U <= D <= sD:
	neglect_flag = True
	break
	if neglect_flag == True:
	continue

	L, U, R, D = pageR, pageD, pageL, pageU
	## 所有孩子元素的最大边界
	for j in svg_childs[i]:
	if svg_visited[j] == True:
	continue
	if svg_exceedPage[j] >= 1:
	continue
	svg_visited[j] = True #### 这个位置考虑一下
	L2, U2, R2, D2 = svgs[j]['rect'].irect
	L = min(L, L2)
	R = max(R, R2)
	U = min(U, U2)
	D = max(D, D2)
	cur_block_element_cnt += 1

	# 如果是条line，就不用保存了
	if check_rect_isLine(L, U, R, D) == True:
	continue
	# 如果当前的svg，连2个elements都没有，就不用保存了
	if cur_block_element_cnt < 3:
	continue

	## 当前svg，框住了多少文本框。如果框多了，可能就是错了
	contain_textLineBlock_cnt = 0
	for L2, U2, R2, D2 in textLine_blocks:
	if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2) == True:
	contain_textLineBlock_cnt += 1
	if contain_textLineBlock_cnt >= 10:
	continue

	# L -= eps_ERROR * 2
	# U -= eps_ERROR
	# R += eps_ERROR * 2
	# D += eps_ERROR
	# # cur_svg = page.get_pixmap(matrix=fitz.Identity, dpi=None, colorspace=fitz.csRGB, clip=(U,L,R,D), alpha=False, annots=True)
	# cur_svg = page.get_pixmap(clip=(L,U,R,D))
	new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID) # 图片name
	# cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹，并命名
	svg_final_names.append(new_svg_name) # 把图片的名字存在list中，方便在md中插入引用
	svg_final_bboxs.append((L, U, R, D))
	svg_final_visited.append(False)
	svg_ID += 1

	## 识别出的svg，可能有包含，相邻的情形。需要进一步合并
	svg_idxs = [i for i in range(len(svg_final_bboxs))]
	svg_idxs.sort(key = lambda i: (svg_final_bboxs[i][1], svg_final_bboxs[i][0])) # (U, L)
	svg_final_names_2 = []
	svg_final_bboxs_2 = []
	svg_final_visited_2 = [] # 为下面，text识别左准备。作用同img_visited
	svg_ID_2 = 0
	for i in range(len(svg_final_bboxs)):
	L1, U1, R1, D1 = svg_final_bboxs[i]
	for j in range(i + 1, len(svg_final_bboxs)):
	L2, U2, R2, D2 = svg_final_bboxs[j]
	# 如果 rect1包含了rect2
	if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
	svg_final_visited[j] = True
	continue
	# 水平并列
	ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
	if ratio_1 >= 0.7 and ratio_2 >= 0.7:
	if abs(L2 - R1) >= 20:
	continue
	LL = min(L1, L2)
	UU = min(U1, U2)
	RR = max(R1, R2)
	DD = max(D1, D2)
	svg_final_bboxs[i] = (LL, UU, RR, DD)
	svg_final_visited[j] = True
	continue
	# 竖直并列
	ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R2, L2, R2)
	if ratio_1 >= 0.7 and ratio_2 >= 0.7:
	if abs(U2 - D1) >= 20:
	continue
	LL = min(L1, L2)
	UU = min(U1, U2)
	RR = max(R1, R2)
	DD = max(D1, D2)
	svg_final_bboxs[i] = (LL, UU, RR, DD)
	svg_final_visited[j] = True

	for i in range(len(svg_final_bboxs)):
	if svg_final_visited[i] == False:
	L, U, R, D = svg_final_bboxs[i]
	svg_final_bboxs_2.append((L, U, R, D))

	L -= eps_ERROR * 2
	U -= eps_ERROR
	R += eps_ERROR * 2
	D += eps_ERROR
	# cur_svg = page.get_pixmap(clip=(L,U,R,D))
	new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID_2) # 图片name
	# cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹，并命名
	svg_final_names_2.append(new_svg_name) # 把图片的名字存在list中，方便在md中插入引用
	svg_final_bboxs_2.append((L, U, R, D))
	svg_final_visited_2.append(False)
	svg_ID_2 += 1

	## svg收尾。识别为drawing，但是在上面没有拼成一张图的。
	# 有收尾才comprehensive
	# xxxx
	# xxxx
	# xxxx
	# xxxx


	#--------- 通过json_from_DocXchain来获取，figure, table, equation的bbox ---------#
	figure_bbox_from_DocXChain = []

	figure_from_DocXChain_visited = [] # 记忆化
	figure_bbox_from_DocXChain_overlappedRatio = []

	figure_only_from_DocXChain_bboxs = [] # 存储
	figure_only_from_DocXChain_names = []
	figure_only_from_DocXChain_visited = []
	figure_only_ID = 0

	xf_json = json_from_DocXchain_obj
	width_from_json = xf_json['page_info']['width']
	height_from_json = xf_json['page_info']['height']
	LR_scaleRatio = width_from_json / (pageR - pageL)
	UD_scaleRatio = height_from_json / (pageD - pageU)

	for xf in xf_json['layout_dets']:
	# {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
	L = xf['poly'][0] / LR_scaleRatio
	U = xf['poly'][1] / UD_scaleRatio
	R = xf['poly'][2] / LR_scaleRatio
	D = xf['poly'][5] / UD_scaleRatio
	# L += pageL # 有的页面，artBox偏移了。不在（0,0）
	# R += pageL
	# U += pageU
	# D += pageU
	L, R = min(L, R), max(L, R)
	U, D = min(U, D), max(U, D)
	# figure
	if xf["category_id"] == 1 and xf['score'] >= 0.3:
	figure_bbox_from_DocXChain.append((L, U, R, D))
	figure_from_DocXChain_visited.append(False)
	figure_bbox_from_DocXChain_overlappedRatio.append(0.0)

	#---------------------- 比对上面识别出来的img,svg 与DocXChain给的figure -----------------------#

	## 比对imgs
	for i, b1 in enumerate(figure_bbox_from_DocXChain):
	# print('--------- DocXChain的图片', b1)
	L1, U1, R1, D1 = b1
	for b2 in img_bboxs:
	# print('-------- igms得到的图', b2)
	L2, U2, R2, D2 = b2
	s1 = abs(R1 - L1) * abs(D1 - U1)
	s2 = abs(R2 - L2) * abs(D2 - U2)
	# 相同
	if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
	figure_from_DocXChain_visited[i] = True
	# 包含
	elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
	if s2 / s1 > 0.8:
	figure_from_DocXChain_visited[i] = True
	elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
	if s1 / s2 > 0.8:
	figure_from_DocXChain_visited[i] = True
	else:
	# 重叠了相当一部分
	# print('进入第3部分')
	ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
	if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1/s2>0.8) or (ratio_2 >= 0.8 and s2/s1>0.8):
	figure_from_DocXChain_visited[i] = True
	else:
	figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
	# print('图片的重叠率是{}'.format(ratio_1))


	## 比对svgs
	svg_final_bboxs_2_badIdxs = []
	for i, b1 in enumerate(figure_bbox_from_DocXChain):
	L1, U1, R1, D1 = b1
	for j, b2 in enumerate(svg_final_bboxs_2):
	L2, U2, R2, D2 = b2
	s1 = abs(R1 - L1) * abs(D1 - U1)
	s2 = abs(R2 - L2) * abs(D2 - U2)
	# 相同
	if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
	figure_from_DocXChain_visited[i] = True
	# 包含
	elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
	figure_from_DocXChain_visited[i] = True
	elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
	if s1 / s2 > 0.7:
	figure_from_DocXChain_visited[i] = True
	else:
	svg_final_bboxs_2_badIdxs.append(j) # svg丢弃。用DocXChain的结果。
	else:
	# 重叠了相当一部分
	ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
	if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
	figure_from_DocXChain_visited[i] = True
	else:
	figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1

	# 丢掉错误的svg
	svg_final_bboxs_2 = [svg_final_bboxs_2[i] for i in range(len(svg_final_bboxs_2)) if i not in set(svg_final_bboxs_2_badIdxs)]

	for i in range(len(figure_from_DocXChain_visited)):
	if figure_bbox_from_DocXChain_overlappedRatio[i] >= 0.7:
	figure_from_DocXChain_visited[i] = True

	# DocXChain识别出来的figure，但是没被保存的。
	for i in range(len(figure_from_DocXChain_visited)):
	if figure_from_DocXChain_visited[i] == False:
	figure_from_DocXChain_visited[i] = True
	cur_bbox = figure_bbox_from_DocXChain[i]
	# cur_figure = page.get_pixmap(clip=cur_bbox)
	new_figure_name = "figure_only_{}_{}.png".format(page_ID, figure_only_ID) # 图片name
	# cur_figure.save(res_dir_path + '/' + new_figure_name) # 把图片存出在新建的文件夹，并命名
	figure_only_from_DocXChain_names.append(new_figure_name) # 把图片的名字存在list中，方便在md中插入引用
	figure_only_from_DocXChain_bboxs.append(cur_bbox)
	figure_only_from_DocXChain_visited.append(False)
	figure_only_ID += 1

	img_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
	svg_final_bboxs_2.sort(key = lambda LURD: (LURD[1], LURD[0]))
	figure_only_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
	curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs

	#--------------------------- 最后统一去重 -----------------------------------#
	curPage_all_fig_bboxs.sort(key = lambda LURD: ( (LURD[2]-LURD[0])*(LURD[3]-LURD[1]) , LURD[0], LURD[1]) )

	#### 先考虑包含关系的小块
	final_duplicate = set()
	for i in range(len(curPage_all_fig_bboxs)):
	L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
	for j in range(len(curPage_all_fig_bboxs)):
	if i == j:
	continue
	L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
	s1 = abs(R1 - L1) * abs(D1 - U1)
	s2 = abs(R2 - L2) * abs(D2 - U2)
	if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
	final_duplicate.add((L1, U1, R1, D1))
	else:
	ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
	if ratio_1 >= 0.8 and ratio_2 <= 0.6:
	final_duplicate.add((L1, U1, R1, D1))

	curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]

	#### 再考虑重叠关系的块
	final_duplicate = set()
	final_synthetic_bboxs = []
	for i in range(len(curPage_all_fig_bboxs)):
	L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
	for j in range(len(curPage_all_fig_bboxs)):
	if i == j:
	continue
	L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
	s1 = abs(R1 - L1) * abs(D1 - U1)
	s2 = abs(R2 - L2) * abs(D2 - U2)
	ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
	union_ok = False
	if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
	union_ok = True
	if (ratio_1 > 0.2 and s2 / s1 > 5):
	union_ok = True
	if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
	union_ok = True
	if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
	union_ok = True
	if union_ok == True:
	final_duplicate.add((L1, U1, R1, D1))
	final_duplicate.add((L2, U2, R2, D2))
	L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
	final_synthetic_bboxs.append((L3, U3, R3, D3))

	# print('---------- curPage_all_fig_bboxs ---------')
	# print(curPage_all_fig_bboxs)
	curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]
	final_synthetic_bboxs = list(set(final_synthetic_bboxs))


	## 再再考虑重叠关系。极端情况下会迭代式地2进1
	new_images = []
	droped_img_idx = []
	image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs]
	for i in range(0, len(image_bboxes)):
	for j in range(i+1, len(image_bboxes)):
	if j not in droped_img_idx:
	L2, U2, R2, D2 = image_bboxes[j]
	s1 = abs(R1 - L1) * abs(D1 - U1)
	s2 = abs(R2 - L2) * abs(D2 - U2)
	ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
	union_ok = False
	if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
	union_ok = True
	if (ratio_1 > 0.2 and s2 / s1 > 5):
	union_ok = True
	if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
	union_ok = True
	if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
	union_ok = True
	if union_ok == True:
	# 合并
	image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3])
	droped_img_idx.append(j)

	for i in range(0, len(image_bboxes)):
	if i not in droped_img_idx:
	new_images.append(image_bboxes[i])


	# find_union_FLAG = True
	# while find_union_FLAG == True:
	# find_union_FLAG = False
	# final_duplicate = set()
	# tmp = []
	# for i in range(len(final_synthetic_bboxs)):
	# L1, U1, R1, D1 = final_synthetic_bboxs[i]
	# for j in range(len(final_synthetic_bboxs)):
	# if i == j:
	# continue
	# L2, U2, R2, D2 = final_synthetic_bboxs[j]
	# s1 = abs(R1 - L1) * abs(D1 - U1)
	# s2 = abs(R2 - L2) * abs(D2 - U2)
	# ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
	# union_ok = False
	# if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
	# union_ok = True
	# if (ratio_1 > 0.2 and s2 / s1 > 5):
	# union_ok = True
	# if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
	# union_ok = True
	# if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
	# union_ok = True
	# if union_ok == True:
	# find_union_FLAG = True
	# final_duplicate.add((L1, U1, R1, D1))
	# final_duplicate.add((L2, U2, R2, D2))
	# L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
	# tmp.append((L3, U3, R3, D3))
	# if find_union_FLAG == True:
	# tmp = list(set(tmp))
	# final_synthetic_bboxs = tmp[:]


	# curPage_all_fig_bboxs += final_synthetic_bboxs
	# print('--------- final synthetic')
	# print(final_synthetic_bboxs)
	#**************************************************************************#
	images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs]
	images = images1 + new_images
	return images