# File size: 8,309 Bytes | 240e0a0
from collections import Counter
from magic_pdf.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
    """Collect the footnote bounding boxes reported by the layout model for one page.

    Every entry of ``json_from_DocXchain_obj['layout_dets']`` whose
    ``category_id`` is 5 (footnote) and whose ``score`` is at least 0.43 is
    kept. Its ``poly`` coordinates are divided by the scale ratios returned by
    ``get_scale_ratio`` and normalised so that left <= right and top <= bottom.

    :param page_ID: index of the current page within the PDF document. It was
        only ever used to build debug image names and does not influence the
        returned value.
    :param page: the current page as loaded by fitz (PyMuPDF); forwarded to
        ``get_scale_ratio``.
    :param json_from_DocXchain_obj: dict holding the DocXChain layout output
        for this page; it must contain the key ``layout_dets``.
    :param md_bookname_save_path: unused; kept so existing callers keep working.
    :param debug_mode: unused; kept so existing callers keep working.
    :return: list of ``(left, top, right, bottom)`` tuples sorted by top, then
        by left.
    """
    # Category ids used by the layout model:
    #   0: title            1: figure           2: plain text
    #   3: header           4: page number      5: footnote
    #   6: footer           7: table            8: table caption
    #   9: figure caption  10: equation        11: full column
    #  12: sub column      13: embedding (inline formula)
    #  14: isolated (display formula)
    footnote_category_id = 5
    # Current footnote confidence threshold (an earlier version used 0.3).
    min_footnote_score = 0.43

    layout = json_from_DocXchain_obj
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(layout, page)

    footnote_bboxes = []
    for det in layout['layout_dets']:
        is_footnote = (
            det['category_id'] == footnote_category_id
            and det['score'] >= min_footnote_score
        )
        if not is_footnote:
            continue

        # NOTE(review): indices 0/1 and 2/5 are read as two opposite corners;
        # presumably poly is [x0, y0, x1, y1, x2, y2, x3, y3] -- confirm.
        poly = det['poly']
        x_a = poly[0] / horizontal_scale_ratio
        y_a = poly[1] / vertical_scale_ratio
        x_b = poly[2] / horizontal_scale_ratio
        y_b = poly[5] / vertical_scale_ratio

        # Normalise so the tuple is always (left, top, right, bottom).
        footnote_bboxes.append(
            (min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b))
        )

    # Order by top edge first, then by left edge.
    return sorted(footnote_bboxes, key=lambda bbox: (bbox[1], bbox[0]))
def need_remove(block):
    """Tell whether a text block should be excluded from the footnote candidates.

    A block is excluded (``True``) when either of these holds:

    * it consists of exactly one line with exactly one span, and that span's
      text is all upper case or its font name contains one of ``'SB'``,
      ``'bold'`` or ``'Bold'``;
    * any span of any line contains the word "keyword", ignoring case.

    Blocks without lines, and spans without ``text``/``font`` entries, are
    tolerated and simply do not match.

    Args:
        block (dict): text block with an optional ``lines`` list; each line
            may carry a ``spans`` list of dicts with ``text`` and ``font``.

    Returns:
        bool: ``True`` if the block should be dropped, ``False`` otherwise.
    """
    lines = block.get('lines') or []
    if not lines:
        return False

    # A lone single-span line in capitals or in a bold-looking font is
    # rescued from the footnote candidates.
    if len(lines) == 1:
        spans = lines[0].get('spans') or []
        if len(spans) == 1:
            text = spans[0].get('text', '')
            font = spans[0].get('font', '')
            if text.isupper() or any(mark in font for mark in ('SB', 'bold', 'Bold')):
                return True

    # Case-insensitive search for "keyword" in every span of the block.
    return any(
        'keyword' in span.get('text', '').lower()
        for line in lines
        for span in (line.get('spans') or [])
    )
def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
    """Pick out footnote-like text blocks using size, position and font rules.

    A block is reported as a footnote when all of the following hold:

    * ``need_remove(block)`` is false;
    * ``block['bbox'][1]`` is greater than 60% of ``page_height``;
    * the block's average line font size is smaller than the most common
      line font size among all given blocks;
    * the block has fewer than 5 lines, or its dominant font (weighted by
      character count) differs from ``main_text_font``.

    Args:
        remain_text_blocks (list): text blocks still to be classified. Each
            block is a dict with ``bbox`` and ``lines``; each line has
            ``spans`` whose dicts may carry ``size``, ``font`` and ``text``.
        page_height (float): height of the page.
        page_id (int): id of the page; only pages with ``page_id <= 2`` are
            examined.
        main_text_font: font name of the main text, compared against each
            block's dominant font.

    Returns:
        list: ``bbox`` values of the blocks that match the rules. Empty when
        the page is skipped, when there are no blocks, or when no span
        carries a ``size``.
    """
    # To keep precision high, only the first three pages are screened for now.
    if page_id > 2:
        return []
    if not remain_text_blocks:
        return []

    line_sizes = []   # average font size of every line on the page
    candidates = []   # (block, average line size, dominant font) per block

    for block in remain_text_blocks:
        block_line_sizes = []
        block_fonts = Counter()

        for line in block.get('lines', []):
            spans = line.get('spans', [])

            # A line's size is the mean of the sizes of its spans.
            span_sizes = [span['size'] for span in spans if 'size' in span]
            if span_sizes:
                line_size = sum(span_sizes) / len(span_sizes)
                line_sizes.append(line_size)
                block_line_sizes.append(line_size)

            # Weight fonts by character count, not by span count, so the
            # font covering the most text wins.
            for span in spans:
                text = span.get('text', '')
                if 'font' in span and text:
                    block_fonts[span['font']] += len(text)

        if block_line_sizes:
            block_size = sum(block_line_sizes) / len(block_line_sizes)
            # Blocks without usable font data get None, which never equals
            # a real font name.
            top_font = block_fonts.most_common(1)
            block_font = top_font[0][0] if top_font else None
            candidates.append((block, block_size, block_font))

    # Without any size information there is no main text size to compare to.
    if not line_sizes:
        return []
    main_text_size = Counter(line_sizes).most_common(1)[0][0]

    # Drop blocks that would likely be mistaken for footnotes.
    candidates = [entry for entry in candidates if not need_remove(entry[0])]

    # Strict rule set: low on the page, smaller than the main text, and
    # either short or set in a different font.
    return [
        block['bbox']
        for block, block_size, block_font in candidates
        if block['bbox'][1] > page_height * 0.6
        and block_size < main_text_size
        and (len(block['lines']) < 5 or block_font != main_text_font)
    ]