|
from magic_pdf.libs.commons import fitz |
|
import re |
|
|
|
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox |
|
|
|
|
|
|
|
def get_merged_line(page):
    """
    Collect the horizontal rules drawn on a page and merge broken pieces of the same rule.

    Steps:
      1. Take the integer bounding rectangle (``rect.irect``) of every vector drawing
         returned by ``page.get_drawings()``.
      2. Keep only rectangles whose height is at most 3 (treated as horizontal lines).
      3. Group lines by height: a line joins the group of the first still-ungrouped
         line (the "seed") when the combined vertical extent of the two is at most 5.
      4. Inside each group, sort left-to-right and merge pieces whose horizontal gap
         to the running segment is below 5.

    :param page: the current page as read by fitz; only ``get_drawings()`` is used
    :return: list of ``(L, U, R, D)`` tuples, one per merged horizontal segment.
             ``U``/``D`` are taken from the left-most piece of each segment.
    """
    # Steps 1 + 2: keep only the (nearly) flat rectangles.
    horizontal_lines = []
    for drawing in page.get_drawings():
        left, top, right, bottom = drawing["rect"].irect
        if abs(bottom - top) <= 3:
            horizontal_lines.append((left, top, right, bottom))

    # Step 3: group by height, always comparing against the seed of the group.
    height_groups = []
    visited = [False] * len(horizontal_lines)
    for i, seed in enumerate(horizontal_lines):
        if visited[i]:
            continue
        visited[i] = True
        _, seed_top, _, seed_bottom = seed
        group = [seed]
        for j, candidate in enumerate(horizontal_lines):
            if visited[j]:
                continue
            _, cand_top, _, cand_bottom = candidate
            ys = (seed_top, seed_bottom, cand_top, cand_bottom)
            if max(ys) - min(ys) <= 5:
                group.append(candidate)
                visited[j] = True
        height_groups.append(group)

    # Step 4: merge pieces left-to-right within each group.
    merged = []
    for group in height_groups:
        group.sort(key=lambda box: (box[0], box[2]))
        seg_left, seg_top, seg_right, seg_bottom = group[0]
        for left, top, right, bottom in group[1:]:
            if left - seg_right >= 5:
                # Gap: emit the finished segment and restart from this piece.
                # The whole running segment must be reset (not only its left
                # edge), otherwise the stale right edge leaks into every
                # following segment.
                merged.append((seg_left, seg_top, seg_right, seg_bottom))
                seg_left, seg_top, seg_right, seg_bottom = left, top, right, bottom
            else:
                seg_right = max(seg_right, right)
        merged.append((seg_left, seg_top, seg_right, seg_bottom))
    return merged
|
|
|
def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
    """
    Snap table boxes to nearby horizontal rules and optionally extend them up to a caption.

    For each table box, merged horizontal lines (see ``get_merged_line``) whose left
    and right ends both lie within 10% of the table width from the table's left/right
    edges are considered. Those lying within 10% of the table height from the top
    (resp. bottom) edge are used to move the top (resp. bottom) edge; the left/right
    edges are widened to the extreme ends of all such lines.

    When ``include_table_title`` is true, the ``scan_line_num`` text blocks closest
    above the (adjusted) top edge that horizontally overlap the table are inspected;
    if the first span of a block starts with ``Table`` or ``表`` the top edge is moved
    to that block's top.

    :param page: the current page as read by fitz
    :param table_bboxes: list whose elements are ``(L, U, R, D)``
    :param include_table_title: whether the table caption should be enclosed as well
    :param scan_line_num: how many of the nearest text blocks above the table are
                          scanned for a caption (values <= 0 scan nothing)
    :return: list of ``[L-2, U-2, R+2, D+2]`` lists, one per input box, in input order
    """
    drawings_lines = get_merged_line(page)
    fix_table_bboxes = []

    # The page's text blocks do not depend on the table being processed, so they
    # are extracted lazily and at most once per call.
    text_blocks = None
    caption_prefixes = ('Table', '表')
    scan_limit = max(scan_line_num, 0)  # a negative slice bound would scan from the end

    for table in table_bboxes:
        (L, U, R, D) = table
        width_range = (R - L) * 0.1    # tolerated horizontal deviation
        height_range = (D - U) * 0.1   # tolerated vertical deviation

        fix_table_L, fix_table_U, fix_table_R, fix_table_D = [], [], [], []
        for line in drawings_lines:
            line_L, line_U, line_R = line[0], line[1], line[2]
            spans_table_width = (
                (L - width_range) <= line_L <= (L + width_range)
                and (R - width_range) <= line_R <= (R + width_range)
            )
            if not spans_table_width:
                continue
            if (U - height_range) < line_U < (U + height_range):
                # Line close to the top edge.
                fix_table_U.append(line_U)
            elif (D - height_range) < line_U < (D + height_range):
                # Line close to the bottom edge.
                fix_table_D.append(line_U)
            else:
                continue
            fix_table_L.append(line_L)
            fix_table_R.append(line_R)

        if fix_table_U:
            U = min(fix_table_U)
        if fix_table_D:
            D = max(fix_table_D)
        if fix_table_L:
            L = min(fix_table_L)
        if fix_table_R:
            R = max(fix_table_R)

        if include_table_title:
            if text_blocks is None:
                text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]

            # Keep blocks that are not entirely left or entirely right of the
            # table and whose bottom lies above the table's top edge.
            blocks_above = [
                block for block in text_blocks
                if not ((block['bbox'][0] < L and block['bbox'][2] < L)
                        or (block['bbox'][0] > R and block['bbox'][2] > R))
                and (U - block['bbox'][3]) > 0
            ]
            # Nearest first; ties broken left-to-right. Distances are measured
            # from the top edge as it was before any caption was absorbed.
            blocks_above.sort(key=lambda blk: (U - blk['bbox'][3], blk['bbox'][0]))

            for block in blocks_above[:scan_limit]:
                block_lines = block['lines']
                if not block_lines:
                    continue
                first_spans = block_lines[0]['spans']
                if not first_spans:
                    # Nothing to inspect; skip instead of raising IndexError.
                    continue
                text = first_spans[0]['text']
                # Only move the top edge if it stays above the bottom edge.
                if text.startswith(caption_prefixes) and block['bbox'][1] < D:
                    U = block['bbox'][1]

        fix_table_bboxes.append([L - 2, U - 2, R + 2, D + 2])

    return fix_table_bboxes
|
|
|
def __check_table_title_pattern(text):
    """
    Check whether a piece of text looks like a table caption.

    The text matches when it starts (case-insensitively) with ``table``, followed by
    exactly one whitespace character and at least one digit, e.g. ``Table 3``.

    :param text: the text to test
    :return: True if any known caption pattern matches at the start of ``text``,
             otherwise False
    """
    patterns = (r'^table\s\d+',)

    # Every pattern gets a chance; returning False from inside the loop would
    # stop after the first pattern and make additional patterns dead code.
    return any(re.match(pattern, text, re.IGNORECASE) for pattern in patterns)
|
|
|
|
|
def fix_table_text_block(pymu_blocks, table_bboxes: list):
    """
    Reconcile table boxes with the text blocks they intersect.

    For every text block that overlaps a table box (tested against the table's
    edges as they were on entry):

    * a block that is not a table caption and has not yet been claimed by a table
      (no truthy ``_table`` flag) is absorbed: the table box is grown in place to
      enclose it and the block is flagged with ``_table = True``;
    * a block that is a table caption is left out of the table. Captions are merged
      by ``include_table_title``; absorbing them here could attach a caption to the
      wrong one of two consecutive tables. If such a caption partially overlaps the
      table, its own bbox is trimmed so that it no longer crosses the table edge.

    Example document: tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf

    :param pymu_blocks: text block dicts with ``bbox`` and ``lines`` (each line has
                        ``spans`` carrying ``text``); modified in place
    :param table_bboxes: list of mutable ``[L, U, R, D]`` boxes; modified in place
    :return: the same ``table_bboxes`` list
    """
    for tb in table_bboxes:
        # Snapshot of the edges on entry; ``tb`` itself may grow below.
        (L, U, R, D) = tb
        for block in pymu_blocks:
            if not _is_in_or_part_overlap((L, U, R, D), block['bbox']):
                continue

            txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])

            if __check_table_title_pattern(txt):
                # A caption that partially overlaps the table: trim the caption so
                # the two no longer overlap.
                if _is_part_overlap(tb, block['bbox']):
                    caption_bbox = list(block['bbox'])
                    # Trim only on the edge the caption actually straddles.
                    # Applying both adjustments unconditionally would always yield
                    # an inverted box (top = D + 1, bottom = U - 1), because any
                    # vertically overlapping block has bottom > U and top < D.
                    if caption_bbox[1] < U < caption_bbox[3]:
                        # Caption hangs over the top edge: keep the part above.
                        caption_bbox[3] = U - 1
                    elif caption_bbox[1] < D < caption_bbox[3]:
                        # Caption hangs over the bottom edge: keep the part below.
                        caption_bbox[1] = D + 1
                    block['bbox'] = caption_bbox
            elif block.get("_table", False) is False:
                tb[0] = min(tb[0], block['bbox'][0])
                tb[1] = min(tb[1], block['bbox'][1])
                tb[2] = max(tb[2], block['bbox'][2])
                tb[3] = max(tb[3], block['bbox'][3])
                # Placeholder so that no other table claims this block again.
                block['_table'] = True

    return table_bboxes
|
|
|
|
|
def __get_table_caption_text(text_block):
    """
    Flatten a text block into a single string and count its lines.

    The ``text`` of every span of every line is joined with single spaces, then each
    occurrence of the sequence ``"Ž . "`` is removed (presumably an extraction
    artifact -- TODO confirm).

    :param text_block: dict with a ``lines`` list; each line has a ``spans`` list
                       whose items carry ``text``
    :return: tuple ``(text, line_count)``
    """
    block_lines = text_block['lines']
    span_texts = [span['text'] for one_line in block_lines for span in one_line['spans']]
    caption = " ".join(span_texts).replace("Ž . ", '')
    return caption, len(block_lines)
|
|
|
|
|
def include_table_title(pymu_blocks, table_bboxes: list):
    """
    Extend each table box so that it also encloses the table's caption.

    For every table the search order is:

    1. Walk upwards and downwards (up to 3 steps each) to the block each walk stops
       on. If both are unclaimed captions, the one closer to the table is merged.
    2. Otherwise the block found above is merged if it is an unclaimed caption that
       is not already part of a table (no truthy ``_table`` flag).
    3. Otherwise the same test is applied to the block found below.
    4. Otherwise the nearest block to the left, then to the right, is tested
       (one step only).

    A merged caption is flagged so that no other table claims it.

    :param pymu_blocks: text block dicts (``bbox``, ``lines``); flags are set in place
    :param table_bboxes: list of mutable ``[L, U, R, D]`` boxes (must support
                         ``.copy()``); modified in place
    :return: the same ``table_bboxes`` list
    """
    for tb in table_bboxes:
        text_block_top = _scan_vertically_for_caption(
            pymu_blocks, tb, find_top_nearest_text_bbox, 1)
        text_block_bottom = _scan_vertically_for_caption(
            pymu_blocks, tb, find_bottom_nearest_text_bbox, 3)

        if (text_block_top and text_block_bottom
                and text_block_top.get("_table_caption", False) is False
                and text_block_bottom.get("_table_caption", False) is False):
            btn_text, _ = __get_table_caption_text(text_block_bottom)
            top_text, _ = __get_table_caption_text(text_block_top)
            if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text):
                # Captions on both sides: take the nearer one (top wins ties).
                btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
                top_text_distance = tb[1] - text_block_top['bbox'][3]
                nearest = text_block_bottom if btn_text_distance < top_text_distance else text_block_top
                _grow_bbox_to_cover(tb, nearest['bbox'])
                # Flag the caption that was actually merged. Flagging the bottom
                # block unconditionally would leave a merged top caption claimable
                # and block the bottom one for the table it really belongs to.
                nearest['_table_caption'] = True
                continue

        # Only one side (at most) has a usable caption: try above, then below.
        if _try_claim_caption(tb, text_block_top, "_table_caption", True):
            continue
        if _try_claim_caption(tb, text_block_bottom, "_table_caption", True):
            continue

        # Look left and right; for now only a single step in each direction.
        # NOTE(review): these branches use the "_image_caption" flag rather than
        # "_table_caption" -- kept as is; confirm whether that is intentional.
        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
        if _try_claim_caption(tb, left_text_block, "_image_caption", False):
            continue

        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
        _try_claim_caption(tb, right_text_block, "_image_caption", False)

    return table_bboxes


def _grow_bbox_to_cover(table_bbox, other_bbox):
    """Expand ``table_bbox`` in place so that it also encloses ``other_bbox``."""
    table_bbox[0] = min(table_bbox[0], other_bbox[0])
    table_bbox[1] = min(table_bbox[1], other_bbox[1])
    table_bbox[2] = max(table_bbox[2], other_bbox[2])
    table_bbox[3] = max(table_bbox[3], other_bbox[3])


def _scan_vertically_for_caption(pymu_blocks, table_bbox, find_nearest, edge, max_steps=3):
    """
    Walk away from the table in one vertical direction and return the block the walk stops on.

    Blocks with empty text, and non-caption blocks of fewer than 3 lines, are stepped
    over by moving the probe box's ``edge`` coordinate (1 = top, 3 = bottom) to that
    block's. The walk stops at a caption, at a non-empty block of 3 or more lines,
    when ``find_nearest`` returns nothing, or after ``max_steps`` steps.

    :return: the last value returned by ``find_nearest`` (falsy if nothing was found)
    """
    probe_box = table_bbox.copy()
    found = None
    steps_left = max_steps
    while steps_left > 0:
        found = find_nearest(pymu_blocks, probe_box)
        if not found:
            break
        txt, line_cnt = __get_table_caption_text(found)
        if txt.strip() and (__check_table_title_pattern(txt) or line_cnt >= 3):
            break
        # Same width, extended probe: look past the block just found.
        probe_box[edge] = found['bbox'][edge]
        steps_left -= 1
    return found


def _try_claim_caption(table_bbox, text_block, flag_key, skip_table_blocks):
    """
    Merge ``text_block`` into ``table_bbox`` if it is an unclaimed table caption.

    :param flag_key: block key that marks the caption as already claimed; set to True
                     on success
    :param skip_table_blocks: when true, blocks flagged ``_table`` are never merged
    :return: True if the caption was merged, otherwise False
    """
    if not text_block or text_block.get(flag_key, False) is not False:
        return False
    caption_text, _ = __get_table_caption_text(text_block)
    if not __check_table_title_pattern(caption_text):
        return False
    if skip_table_blocks and text_block.get("_table", False) is not False:
        return False
    _grow_bbox_to_cover(table_bbox, text_block['bbox'])
    text_block[flag_key] = True
    return True