|
from loguru import logger |
|
|
|
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \ |
|
calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio |
|
from magic_pdf.libs.drop_tag import DropTag |
|
from magic_pdf.libs.ocr_content_type import ContentType, BlockType |
|
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation |
|
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span |
|
|
|
|
|
|
|
def line_sort_spans_by_left_to_right(lines):
    '''Sort the spans of every line by ``bbox[0]`` and wrap each line in a dict.

    Args:
        lines: list of lines; each line is a list of span dicts that carry a
            four-element ``'bbox'`` (presumably ``[x0, y0, x1, y1]`` --
            confirm against the span producer).

    Returns:
        A list with one ``{"bbox": [...], "spans": line}`` dict per non-empty
        line. The bbox takes the minimum of indices 0 and 1 and the maximum of
        indices 2 and 3 over the line's spans.

    Note:
        Each line is sorted in place, and that same list object is what ends
        up under ``"spans"``.
    '''
    line_objects = []
    for line in lines:
        # A line without spans has no bounding box: min()/max() raise
        # ValueError on an empty sequence, so such a line is skipped.
        if not line:
            continue

        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),
            min(span['bbox'][1] for span in line),
            max(span['bbox'][2] for span in line),
            max(span['bbox'][3] for span in line),
        ]
        line_objects.append({
            "bbox": line_bbox,
            "spans": line,
        })
    return line_objects
|
|
|
|
|
def merge_spans_to_line(spans):
    '''Group spans into lines.

    ``spans`` is sorted in place by ``bbox[1]`` and then walked once. A span
    joins the line being built only when

    * neither it nor any span already in that line is an interline equation,
      an image or a table, and
    * ``__is_overlaps_y_exceeds_threshold`` accepts its bbox against the bbox
      of the line's last span.

    Otherwise the line being built is closed and the span starts a new one.

    Args:
        spans: list of span dicts with ``'bbox'`` and ``'type'`` entries.

    Returns:
        A list of lines, each a list of spans; ``[]`` for empty input.
    '''
    if len(spans) == 0:
        return []

    def _stands_alone(item):
        # Spans of these types never share a line with another span.
        return item['type'] in (ContentType.InterlineEquation, ContentType.Image, ContentType.Table)

    spans.sort(key=lambda item: item['bbox'][1])

    merged = []
    pending = [spans[0]]
    for candidate in spans[1:]:
        breaks_line = _stands_alone(candidate) or any(_stands_alone(member) for member in pending)
        if not breaks_line and __is_overlaps_y_exceeds_threshold(candidate['bbox'], pending[-1]['bbox']):
            pending.append(candidate)
        else:
            merged.append(pending)
            pending = [candidate]

    # ``pending`` always holds at least one span here, so it is the last line.
    merged.append(pending)
    return merged
|
|
|
|
|
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    '''Build lines layout by layout and report the spans that fit no layout.

    For every entry of ``layout_bboxes`` (in order), the spans whose
    ``calculate_overlap_area_in_bbox1_area_ratio`` against the entry's
    ``'layout_bbox'`` is above 0.6 are claimed by that layout and removed
    from ``spans``. The spans of each layout are merged into lines, and the
    spans of every line are then ordered by ``line_sort_spans_by_left_to_right``.

    Args:
        spans: list of span dicts; mutated in place (claimed spans are
            removed, leftovers get a ``'tag'`` entry).
        layout_bboxes: list of dicts with a ``'layout_bbox'`` entry.

    Returns:
        ``(lines, dropped_spans)`` where ``dropped_spans`` are the spans left
        unclaimed, each tagged with ``DropTag.NOT_IN_LAYOUT``.
    '''
    spans_per_layout = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        inside = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > 0.6
        ]
        if not inside:
            continue
        spans_per_layout.append(inside)
        # A claimed span must not be offered to the following layouts.
        for claimed in inside:
            spans.remove(claimed)

    raw_lines = []
    for group in spans_per_layout:
        raw_lines.extend(merge_spans_to_line(group))
    lines = line_sort_spans_by_left_to_right(raw_lines)

    dropped_spans = []
    for leftover in spans:
        leftover['tag'] = DropTag.NOT_IN_LAYOUT
        dropped_spans.append(leftover)

    return lines, dropped_spans
|
|
|
|
|
def merge_lines_to_block(lines):
    '''Wrap every line in a block of its own.

    No lines are combined: each resulting block holds exactly one line and
    reuses that line's ``"bbox"`` object.

    Args:
        lines: list of line dicts with a ``"bbox"`` entry.

    Returns:
        A list of ``{"bbox": ..., "lines": [line]}`` dicts, one per line.
    '''
    return [{"bbox": line["bbox"], "lines": [line]} for line in lines]
|
|
|
|
|
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    '''Order blocks by layout, then by ``block[1]`` inside each layout.

    Layouts are visited in the order given. A block belongs to a layout when
    ``calculate_overlap_area_in_bbox1_area_ratio(block[:4], layout_bbox)`` is
    above 0.8. Blocks whose ``block[7]`` equals ``BlockType.Footnote`` are
    never assigned to a layout.

    Args:
        all_bboxes: list of indexable blocks (first four items are the bbox,
            item 7 is the block type); assigned blocks are removed from this
            list in place.
        layout_bboxes: list of dicts with a ``'layout_bbox'`` entry.

    Returns:
        The assigned blocks as one flat list. Blocks that match no layout are
        not included.
    '''
    grouped = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']

        members = []
        for candidate in all_bboxes:
            if candidate[7] == BlockType.Footnote:
                continue
            if calculate_overlap_area_in_bbox1_area_ratio(candidate[:4], region) > 0.8:
                members.append(candidate)

        if members:
            grouped.append(members)
            # An assigned block must not be offered to the following layouts.
            for taken in members:
                all_bboxes.remove(taken)

    sort_blocks = []
    for members in grouped:
        members.sort(key=lambda entry: entry[1])
        sort_blocks.extend(members)
    return sort_blocks
|
|
|
|
|
def fill_spans_in_blocks(blocks, spans, radio):
    '''Place the spans into the blocks according to their position.

    Blocks are processed in order. A span goes into a block when
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)`` is
    greater than ``radio``; it is then removed from ``spans`` and is not
    offered to later blocks.

    Args:
        blocks: list of indexable blocks (items 0-3 are the bbox, item 7 is
            the block type).
        spans: list of span dicts with a ``'bbox'`` entry; mutated in place.
        radio: overlap-ratio threshold a span must exceed.

    Returns:
        ``(block_with_spans, spans)``: one ``{'type', 'bbox', 'spans'}`` dict
        per input block, plus the same ``spans`` list holding the leftovers.
    '''
    block_with_spans = []
    for block in blocks:
        kind = block[7]
        bbox = block[0:4]
        matched = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox) > radio
        ]

        # NOTE: no inline-equation height adjustment, no conversion of
        # misrecognised interline equations to inline ones and no bbox
        # de-overlapping is performed here; the spans are stored as matched.
        block_with_spans.append({
            'type': kind,
            'bbox': bbox,
            'spans': matched,
        })

        # Spans already placed in this block are withdrawn from the pool.
        for placed in matched:
            spans.remove(placed)

    return block_with_spans, spans
|
|
|
|
|
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    '''Turn the flat ``'spans'`` of every block into its final structure.

    1. Image and table blocks contain captions and footnotes, so they are
       nested blocks: the text spans of a caption or footnote have to be put
       into the matching caption/footnote sub-block of the image or table
       block.
    2. The ``'spans'`` field of every block is deleted in the process.

    Args:
        block_with_spans: list of block dicts with ``'type'`` and ``'spans'``.
        img_blocks: image block descriptions handed to ``fix_image_block``.
        table_blocks: table block descriptions handed to ``fix_table_block``.

    Returns:
        The fixed blocks in input order. Blocks whose type is not image,
        table, text, title or interline equation are left out.
    '''
    fix_blocks = []
    for entry in block_with_spans:
        kind = entry['type']

        if kind == BlockType.Image:
            fix_blocks.append(fix_image_block(entry, img_blocks))
        elif kind == BlockType.Table:
            fix_blocks.append(fix_table_block(entry, table_blocks))
        elif kind in [BlockType.Text, BlockType.Title]:
            fix_blocks.append(fix_text_block(entry))
        elif kind == BlockType.InterlineEquation:
            fix_blocks.append(fix_interline_block(entry))
        # Any other block type is dropped from the result.
    return fix_blocks
|
|
|
|
|
def fix_discarded_block(discarded_block_with_spans):
    '''Apply ``fix_text_block`` to every discarded block.

    Args:
        discarded_block_with_spans: list of block dicts with a ``'spans'``
            entry.

    Returns:
        A new list with the results of ``fix_text_block``, in input order.
    '''
    return [fix_text_block(entry) for entry in discarded_block_with_spans]
|
|
|
|
|
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    '''Build a block from the spans that lie inside ``block_bbox``.

    A span is selected when
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox)``
    is above 0.6. The selected spans are merged into lines and every line is
    ordered by ``line_sort_spans_by_left_to_right``.

    Args:
        spans: candidate span dicts; this list itself is not modified.
        block_bbox: bbox of the block being built.
        block_type: value stored under the block's ``'type'`` key.

    Returns:
        ``(block, block_spans)``: the ``{'bbox', 'type', 'lines'}`` dict and
        the list of spans that were selected for it.
    '''
    block_spans = [
        candidate for candidate in spans
        if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block_bbox) > 0.6
    ]
    ordered_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(block_spans))
    return {'bbox': block_bbox, 'type': block_type, 'lines': ordered_lines}, block_spans
|
|
|
|
|
def make_body_block(span: dict, block_bbox: list, block_type: str):
    '''Create a block that consists of one line holding one span.

    Args:
        span: the span to wrap.
        block_bbox: bbox used for both the block and its only line.
        block_type: value stored under the block's ``'type'`` key.

    Returns:
        ``{'bbox': block_bbox, 'type': block_type, 'lines': [line]}`` where
        ``line`` is ``{'bbox': block_bbox, 'spans': [span]}``.
    '''
    return {
        'bbox': block_bbox,
        'type': block_type,
        'lines': [{'bbox': block_bbox, 'spans': [span]}],
    }
|
|
|
|
|
def fix_image_block(block, img_blocks):
    '''Replace the ``'spans'`` of an image block with nested sub-blocks.

    Only the first entry of ``img_blocks`` accepted by
    ``_is_in_or_part_overlap_with_area_ratio(block['bbox'], entry['bbox'], 0.95)``
    is used. From it, in this order:

    * the first image span whose bbox equals the entry's ``'img_body_bbox'``
      becomes an ``ImageBody`` sub-block and is taken out of the spans;
    * if ``'img_caption_bbox'`` is not ``None``, an ``ImageCaption`` sub-block
      is built from the remaining spans.

    Args:
        block: block dict with ``'bbox'`` and ``'spans'``; modified in place.
        img_blocks: list of dicts with ``'bbox'``, ``'img_body_bbox'`` and
            ``'img_caption_bbox'``.

    Returns:
        The same ``block`` with a ``'blocks'`` list and without ``'spans'``.
    '''
    block['blocks'] = []
    nested = block['blocks']

    matched = None
    for candidate in img_blocks:
        if _is_in_or_part_overlap_with_area_ratio(block['bbox'], candidate['bbox'], 0.95):
            matched = candidate
            break

    if matched is not None:
        body_span = next(
            (
                item for item in block['spans']
                if item['type'] == ContentType.Image and matched['img_body_bbox'] == item['bbox']
            ),
            None,
        )
        if body_span is not None:
            nested.append(make_body_block(body_span, matched['img_body_bbox'], BlockType.ImageBody))
            # The body span must not be considered for the caption.
            block['spans'].remove(body_span)

        caption_bbox = matched['img_caption_bbox']
        if caption_bbox is not None:
            caption_block, _ = merge_spans_to_block(block['spans'], caption_bbox, BlockType.ImageCaption)
            nested.append(caption_block)

    del block['spans']
    return block
|
|
|
|
|
def fix_table_block(block, table_blocks):
    '''Replace the ``'spans'`` of a table block with nested sub-blocks.

    Only the first entry of ``table_blocks`` accepted by
    ``_is_in_or_part_overlap_with_area_ratio(block['bbox'], entry['bbox'], 0.95)``
    is used. From it, in this order:

    * the first table span whose bbox equals the entry's ``'table_body_bbox'``
      becomes a ``TableBody`` sub-block and is taken out of the spans;
    * if ``'table_caption_bbox'`` is not ``None``, a ``TableCaption`` sub-block
      is built and the spans it used are taken out of the spans;
    * if ``'table_footnote_bbox'`` is not ``None``, a ``TableFootnote``
      sub-block is built from what is left.

    Args:
        block: block dict with ``'bbox'`` and ``'spans'``; modified in place.
        table_blocks: list of dicts with ``'bbox'``, ``'table_body_bbox'``,
            ``'table_caption_bbox'`` and ``'table_footnote_bbox'``.

    Returns:
        The same ``block`` with a ``'blocks'`` list and without ``'spans'``.
    '''
    block['blocks'] = []
    nested = block['blocks']

    matched = None
    for candidate in table_blocks:
        if _is_in_or_part_overlap_with_area_ratio(block['bbox'], candidate['bbox'], 0.95):
            matched = candidate
            break

    if matched is not None:
        body_span = next(
            (
                item for item in block['spans']
                if item['type'] == ContentType.Table and matched['table_body_bbox'] == item['bbox']
            ),
            None,
        )
        if body_span is not None:
            nested.append(make_body_block(body_span, matched['table_body_bbox'], BlockType.TableBody))
            # The body span must not be considered for caption or footnote.
            block['spans'].remove(body_span)

        caption_bbox = matched['table_caption_bbox']
        if caption_bbox is not None:
            caption_block, caption_spans = merge_spans_to_block(
                block['spans'], caption_bbox, BlockType.TableCaption
            )
            nested.append(caption_block)
            # Caption spans must not be picked up again by the footnote.
            for used in caption_spans:
                block['spans'].remove(used)

        footnote_bbox = matched['table_footnote_bbox']
        if footnote_bbox is not None:
            footnote_block, _ = merge_spans_to_block(
                block['spans'], footnote_bbox, BlockType.TableFootnote
            )
            nested.append(footnote_block)

    del block['spans']
    return block
|
|
|
|
|
def fix_text_block(block):
    '''Convert the ``'spans'`` of a text block into sorted ``'lines'``.

    Every span typed ``ContentType.InterlineEquation`` is first retyped to
    ``ContentType.InlineEquation`` (the span dicts are modified in place).
    The spans are then merged into lines and every line is ordered by
    ``line_sort_spans_by_left_to_right``.

    Args:
        block: block dict with a ``'spans'`` list; modified in place.

    Returns:
        The same ``block`` with ``'lines'`` added and ``'spans'`` removed.
    '''
    text_spans = block['spans']
    for item in text_spans:
        if item['type'] == ContentType.InterlineEquation:
            item['type'] = ContentType.InlineEquation

    block['lines'] = line_sort_spans_by_left_to_right(merge_spans_to_line(text_spans))
    del block['spans']
    return block
|
|
|
|
|
def fix_interline_block(block):
    '''Convert the ``'spans'`` of an interline-equation block into ``'lines'``.

    The spans are merged into lines and every line is ordered by
    ``line_sort_spans_by_left_to_right``; span types are left untouched.

    Args:
        block: block dict with a ``'spans'`` list; modified in place.

    Returns:
        The same ``block`` with ``'lines'`` added and ``'spans'`` removed.
    '''
    grouped = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(grouped)
    del block['spans']
    return block
|
|