|
import os |
|
import re |
|
import numpy as np |
|
|
|
from magic_pdf.libs.nlp_utils import NLPModels |
|
|
|
from magic_pdf.para.commons import * |
|
|
|
# `sys` was previously only in scope through the star import above; import it
# explicitly so this module does not depend on what that module re-exports.
import sys

# Force UTF-8 on stdout so non-ASCII text can be printed on consoles whose
# default encoding cannot represent it.  `reconfigure` only exists on
# io.TextIOWrapper (Python 3.7+); stdout may have been replaced by a stream
# without it, in which case we leave the stream untouched instead of failing
# at import time.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
|
|
|
|
class TitleProcessor:
    """
    Detect title lines inside paragraph blocks and rank the detected titles.

    The processor works on a ``pdf_dic`` whose ``"page_*"`` entries hold
    ``"para_blocks"``; every block holds ``"lines"`` and every line holds
    ``"text"``, ``"bbox"`` and ``"spans"`` (each span with ``"text"``,
    ``"font"``, ``"size"``, ``"flags"`` and ``"bbox"``).

    Two passes are offered:

    * ``batch_process_blocks_detect_titles`` marks lines / blocks as titles or
      as author/organisation lists.
    * ``batch_process_blocks_recog_title_level`` assigns a level (1-3) to each
      title block from its font size.
    """

    # Not referenced inside this class; kept as part of the public surface.
    # The levels assigned by ``__determine_size_based_level`` are 1, 2 and 3.
    MAX_TITLE_LEVEL = 3

    # Verbose regex (compile with ``re.VERBOSE``) recognising numbered-title
    # prefixes.  Bracket classes accept both ASCII and fullwidth parentheses
    # (written as \uFF08 / \uFF09 escapes so they survive text normalisation),
    # as the per-line comments inside the pattern state.
    numbered_title_pattern = r"""
        ^                                                       # 行首
        (                                                       # 开始捕获组
            [\(\uFF08]\d+[\)\uFF09]                             # 括号内数字,支持中文和英文括号,例如:(1) 或 (1)
            |\d+[\)\uFF09]\s                                    # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2)
            |[\(\uFF08][A-Z][\)\uFF09]                          # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A)
            |[A-Z][\)\uFF09]\s                                  # 大写字母后跟右括号和空格,例如:A) 或 A)
            |[\(\uFF08][IVXLCDM]+[\)\uFF09]                     # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I)
            |[IVXLCDM]+[\)\uFF09]\s                             # 罗马数字后跟右括号和空格,例如:I) 或 I)
            |\d+(\.\d+)*\s                                      # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1
            |[一二三四五六七八九十百千]+[、\s]                  # 中文序号后跟顿号和空格,例如:一、
            |[\(\uFF08][一二三四五六七八九十百千]+[\)\uFF09]\s*  # 中文括号内中文序号后跟空格,例如:(一)
            |[A-Z]\.\d+(\.\d+)?\s                               # 大写字母后跟点和数字,例如:A.1 或 A.1.1
            |[\(\uFF08][a-z][\)\uFF09]                          # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a)
            |[a-z]\)\s                                          # 小写字母后跟右括号和空格,例如:a)
            |[A-Z]-\s                                           # 大写字母后跟短横线和空格,例如:A-
            |\w+:\s                                             # 英文序号词后跟冒号和空格,例如:First:
            |第[一二三四五六七八九十百千]+[章节部分条款]\s      # 以“第”开头的中文标题后跟空格
            |[IVXLCDM]+\.                                       # 罗马数字后跟点,例如:I.
            |\d+\.\s                                            # 单个数字后跟点和空格,例如:1.
        )                                                       # 结束捕获组
        .+                                                      # 标题的其余部分
    """

    def __init__(self, *doc_statistics) -> None:
        """
        Parameters
        ----------
        *doc_statistics
            Optional; only the first positional argument is used.  It is the
            document statistics mapping read later through ``safe_get`` (keys
            ``most_common_font_type``, ``second_most_common_font_type``,
            ``most_common_font_size``, ``second_most_common_font_size``).
        """
        # Always define the attribute: previously it was missing when no
        # argument was given and the first statistics lookup raised
        # AttributeError.  Assumes `safe_get` accepts an empty dict and then
        # returns its default -- TODO confirm against magic_pdf.para.commons.
        self.doc_statistics = doc_statistics[0] if doc_statistics else {}

        self.nlp_model = NLPModels()

    # ------------------------------------------------------------------
    # Line-level feature helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _is_bold_font_line(line):
        """
        Return True when every span of ``line`` is non-blank and has the bold
        bit (``2**4``) set in ``span["flags"]``.  A line without spans is bold.

        NOTE(review): a whitespace-only span makes the whole line non-bold;
        this mirrors the original behaviour but may not be intended.
        """
        return all(span["text"].strip() and span["flags"] & 2**4 for span in line["spans"])

    @staticmethod
    def _punctuation_ratio(line_text):
        """Return the share of characters in ``line_text`` that are neither word characters nor whitespace (0.0 for empty text)."""
        if not line_text:
            return 0.0
        return len(re.findall(r"[^\w\s]", line_text)) / len(line_text)

    @staticmethod
    def _is_punctuation_heavy(line_text):
        """
        Return True when at least 10% of the characters are punctuation.

        Lines whose stripped text starts with a dotted number such as
        ``1.2.`` are never considered punctuation heavy.
        """
        if re.match(r"\b\d+\.\d+\..*\b", line_text.strip()):
            return False
        return TitleProcessor._punctuation_ratio(line_text) >= 0.1

    @staticmethod
    def _contains_only_no_meaning_symbols(line_text):
        """Return True when at least 90% of the characters are punctuation/symbols."""
        return TitleProcessor._punctuation_ratio(line_text) >= 0.9

    @staticmethod
    def _has_mixed_font_styles(spans, strict_mode=False):
        """
        Tell whether the spans of a line use different fonts.

        Parameters
        ----------
        spans : list
            spans of the line
        strict_mode : bool
            True: lower-cased font names must be identical.
            False: fonts count as mixed only when there is more than one span
            and the font names share no common prefix at all.

        Returns
        -------
        bool
        """
        fonts = [span["font"].lower() for span in spans]
        if strict_mode:
            return len(set(fonts)) > 1
        return len(fonts) > 1 and not os.path.commonprefix(fonts)

    @staticmethod
    def _is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
        """Return True when the current font type differs from every neighbour that exists (``None`` neighbours are skipped)."""
        return all(
            curr_line_font_type != other.lower()
            for other in (prev_line_font_type, next_line_font_type)
            if other is not None
        )

    @staticmethod
    def _is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
        """Return True when the current font size exceeds 1.2x the size of every neighbour that exists (``None`` neighbours are skipped)."""
        return all(
            curr_line_font_size > other * 1.2
            for other in (prev_line_font_size, next_line_font_size)
            if other is not None
        )

    def _is_same_font_type_of_doc_avg(self, curr_line_font_type):
        """Return True when the font type equals the most or second most common font type of the document statistics."""
        most_common = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
        second_most_common = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
        return curr_line_font_type.lower() in [most_common, second_most_common]

    def _is_font_size_not_less_than_doc_avg(self, curr_line_font_size, ratio: float = 1):
        """
        Return True when ``curr_line_font_size >= reference * ratio``.

        The reference size is the smaller of the most common and second most
        common font sizes of the document statistics (each defaulting to 0).
        """
        most_common = safe_get(self.doc_statistics, "most_common_font_size", 0)
        second_most_common = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
        return curr_line_font_size >= min(most_common, second_most_common) * ratio

    @staticmethod
    def _is_sufficient_spacing_above_and_below(
        curr_line_bbox,
        prev_line_bbox,
        next_line_bbox,
        avg_char_height,
        median_font_size,
    ):
        """
        Check the vertical gaps to the neighbouring lines.

        Parameters
        ----------
        curr_line_bbox, prev_line_bbox, next_line_bbox : list
            ``[x0, y0, x1, y1]`` boxes; a falsy neighbour box means "no
            neighbour" and counts as sufficient spacing on that side.
        avg_char_height : float
            Accepted for signature compatibility; not used.
        median_font_size : float
            The gap threshold is ``1.25 * median_font_size``.

        Returns
        -------
        tuple of bool
            ``(sufficient_spacing_above, sufficient_spacing_below)``
        """
        threshold = 1.25 * median_font_size
        _, y0, _, y1 = curr_line_bbox

        above = True
        if prev_line_bbox:
            above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3]) > threshold

        below = True
        if next_line_bbox:
            below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1) > threshold

        return (above, below)

    @staticmethod
    def _is_word_list_line_by_rules(curr_line_text):
        """
        Return True when the text contains something shaped like a name: two
        capitalised Latin words, or a run of 2-6 CJK characters, followed by a
        separator or the end of the text.
        """
        # NOTE(review): the duplicated `,,` / `;;` in the lookahead suggests the
        # fullwidth comma/semicolon were lost in a text normalisation -- confirm.
        name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[,,;;\s]|$)"
        return re.search(name_list_pattern, curr_line_text) is not None

    def _is_numbered_title(self, curr_line_text):
        """Return True when the text starts with one of the numbering prefixes of ``numbered_title_pattern`` followed by more text."""
        # Compiled from the attribute on every call so a pattern replaced on the
        # instance is honoured; `re` caches compiled patterns internally.
        return re.compile(self.numbered_title_pattern, re.VERBOSE).search(curr_line_text) is not None

    @staticmethod
    def _is_end_with_ending_puncs(line_text):
        """Return True when the right-stripped text ends with a sentence-ending mark; False for empty text."""
        # NOTE(review): "?" and "!" appear twice; the second pair was presumably
        # the fullwidth forms before a text normalisation -- confirm.
        end_puncs = (".", "?", "!", "。", "?", "!", "…")
        # endswith() is safe on an empty string, unlike indexing [-1].
        return line_text.rstrip().endswith(end_puncs)

    @staticmethod
    def _is_equation(line_text):
        """Return True when the text contains a ``$...$`` span that includes ``\\overline``."""
        return re.search(r"\$.*?\\overline.*?\$", line_text) is not None

    @staticmethod
    def _is_title_by_len(text, max_length=200):
        """Return True when the stripped text is at most ``max_length`` characters long."""
        return len(text.strip()) <= max_length

    @staticmethod
    def _compute_line_font_type_and_size(line):
        """
        Return ``(font_type, font_size)`` of the widest non-whitespace span.

        The first span provides the fallback values when no span qualifies.
        Raises ``IndexError`` when the line has no spans.
        """
        spans = line["spans"]
        widest = 0
        font_size = spans[0]["size"]
        font_type = spans[0]["font"].lower()
        for span in spans:
            if span["text"].isspace():
                continue
            width = span["bbox"][2] - span["bbox"][0]
            if width > widest:
                widest = width
                font_size = span["size"]
                font_type = span["font"].lower()

        return font_type, font_size

    # ------------------------------------------------------------------
    # Title detection
    # ------------------------------------------------------------------

    def _is_potential_title(
        self,
        curr_line,
        prev_line,
        prev_line_is_title,
        next_line,
        avg_char_width,
        avg_char_height,
        median_font_size,
    ):
        """
        Decide whether a line is a title and whether it is an author/org list.

        Parameters
        ----------
        curr_line : dict
            current line
        prev_line : dict or None
            previous line in the block
        prev_line_is_title : bool
            accepted for signature compatibility; not used
        next_line : dict or None
            next line in the block
        avg_char_width : float
            average char width, forwarded to the left-alignment check
        avg_char_height : float
            average line height, forwarded to the spacing check
        median_font_size : float
            median font size, basis of the vertical spacing threshold

        Returns
        -------
        tuple of bool
            ``(is_title, is_author_or_org_list)``
        """
        curr_line_text = curr_line["text"]

        # Blank lines are never titles.  Return a pair (the caller unpacks two
        # values) and do it before touching the spans, which may be empty.
        if not curr_line_text.strip():
            return False, False

        # --- basic features of the current line and its neighbours ---------
        curr_line_bbox = curr_line["bbox"]
        curr_font_type, curr_font_size = self._compute_line_font_type_and_size(curr_line)

        if prev_line:
            prev_line_bbox = prev_line["bbox"]
            prev_font_type, prev_font_size = self._compute_line_font_type_and_size(prev_line)
        else:
            prev_line_bbox, prev_font_type, prev_font_size = None, None, None

        if next_line:
            next_line_bbox = next_line["bbox"]
            next_font_type, next_font_size = self._compute_line_font_type_and_size(next_line)
        else:
            next_line_bbox, next_font_type, next_font_size = None, None, None

        # --- aggregated features --------------------------------------------
        is_bold_font = self._is_bold_font_line(curr_line)

        is_font_size_little_less_than_doc_avg = self._is_font_size_not_less_than_doc_avg(curr_font_size, ratio=0.8)
        is_font_size_not_less_than_doc_avg = self._is_font_size_not_less_than_doc_avg(curr_font_size, ratio=1)
        is_much_larger_font_than_doc_avg = self._is_font_size_not_less_than_doc_avg(curr_font_size, ratio=1.6)

        is_not_same_font_type_of_doc_avg = not self._is_same_font_type_of_doc_avg(curr_font_type)

        is_potential_title_font = (
            is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_doc_avg
        )

        is_mix_font_styles_strict = self._has_mixed_font_styles(curr_line["spans"], strict_mode=True)
        is_punctuation_heavy = self._is_punctuation_heavy(curr_line_text)
        is_word_list_line_by_rules = self._is_word_list_line_by_rules(curr_line_text)
        is_person_or_org_list_line_by_nlp = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text) in [
            "PERSON",
            "GPE",
            "ORG",
        ]

        is_font_size_larger_than_neighbors = self._is_larger_font_size_from_neighbors(
            curr_font_size, prev_font_size, next_font_size
        )
        is_font_type_diff_from_neighbors = self._is_different_font_type_from_neighbors(
            curr_font_type, prev_font_type, next_font_type
        )
        has_sufficient_spaces_above, has_sufficient_spaces_below = self._is_sufficient_spacing_above_and_below(
            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
        )

        # --- further aggregated features -----------------------------------
        left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
        )
        font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
        # A run-in heading: mixed fonts on a left-aligned line that stands out
        # from its neighbours; such lines are excluded from most title rules.
        is_a_left_inline_title = (
            is_mix_font_styles_strict and left_aligned_from_neighbors and font_diff_from_neighbors
        )

        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
        is_title_by_check_pre_and_next_line = (
            (prev_line is not None or next_line is not None)
            and has_sufficient_spaces_above
            and has_sufficient_spaces_below
            and is_potential_title_font
        )
        is_title_by_context = (
            is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line
        )

        is_numbered_title = self._is_numbered_title(curr_line_text) and (
            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
        )

        is_not_end_with_ending_puncs = not self._is_end_with_ending_puncs(curr_line_text)
        is_not_only_no_meaning_symbols = not self._contains_only_no_meaning_symbols(curr_line_text)
        is_equation = self._is_equation(curr_line_text)
        is_title_by_len = self._is_title_by_len(curr_line_text)

        # --- decision --------------------------------------------------------
        distinct_font_and_size = is_not_same_font_type_of_doc_avg and is_font_size_not_less_than_doc_avg
        bold_large_distinct = (
            is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_doc_avg
        )

        # Typographic evidence: at least one of these must hold.
        font_rule = (
            distinct_font_and_size
            or bold_large_distinct
            or (is_much_larger_font_than_doc_avg and is_title_by_context)
            or (is_font_size_little_less_than_doc_avg and is_bold_font and is_title_by_context)
        )

        # Content evidence: at least one of these must hold.
        content_rule = (
            (
                not is_person_or_org_list_line_by_nlp
                and (is_much_larger_font_than_doc_avg or distinct_font_and_size)
            )
            or (
                not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
                and not is_a_left_inline_title
                and not is_punctuation_heavy
                and is_title_by_context
            )
            or (is_person_or_org_list_line_by_nlp and bold_large_distinct)
            or (is_numbered_title and not is_a_left_inline_title)
        )

        is_title = bool(
            is_not_end_with_ending_puncs
            and is_not_only_no_meaning_symbols
            and is_title_by_len
            and not is_equation
            and is_potential_title_font
            and font_rule
            and content_rule
        )

        # A punctuation-heavy person/org line that looks isolated but did not
        # qualify as a title is reported as an author/organisation list.
        is_author_or_org_list = bool(
            is_person_or_org_list_line_by_nlp and is_punctuation_heavy and is_title_by_context and not is_title
        )

        return is_title, is_author_or_org_list

    def _detect_block_title(self, input_block):
        """
        Run ``_is_potential_title`` on every line of a paragraph block.

        Each line receives the keys ``"is_title"`` and
        ``"is_author_or_org_list"``.  The block is modified in place and
        returned.
        """
        raw_lines = input_block["lines"]
        last_index = len(raw_lines) - 1
        prev_line_is_title_flag = False

        for i, curr_line in enumerate(raw_lines):
            prev_line = raw_lines[i - 1] if i > 0 else None
            next_line = raw_lines[i + 1] if i < last_index else None

            is_title, is_author_or_org_list = self._is_potential_title(
                curr_line,
                prev_line,
                prev_line_is_title_flag,
                next_line,
                input_block["avg_char_width"],
                input_block["avg_char_height"],
                input_block["median_font_size"],
            )

            curr_line["is_title"] = is_title if is_title else False
            prev_line_is_title_flag = bool(is_title)
            curr_line["is_author_or_org_list"] = is_author_or_org_list if is_author_or_org_list else False

        return input_block

    def batch_process_blocks_detect_titles(self, pdf_dic):
        """
        Detect titles in every ``"page_*"`` entry of ``pdf_dic``.

        Pages on which every paragraph block has exactly one line are not
        analysed; their existing ``"is_title"`` line flags (if any) are only
        counted.  Every block receives ``"is_block_title"`` and
        ``"is_block_an_author_or_org_list"`` (0 or 1); the latter can only be
        1 on ``"page_0"``.  The number of title lines is stored in
        ``pdf_dic["statistics"]["num_titles"]``.

        Parameters
        ----------
        pdf_dic : dict
            result dictionary

        Returns
        -------
        pdf_dic : dict
            the same dictionary, updated in place
        """
        num_titles = 0

        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue

            para_blocks = blocks.get("para_blocks", [])

            only_single_line_blocks = all(len(block["lines"]) == 1 for block in para_blocks)
            if only_single_line_blocks:
                para_blocks = list(para_blocks)
            else:
                para_blocks = [self._detect_block_title(block) for block in para_blocks]

            num_titles += sum(line.get("is_title", 0) for block in para_blocks for line in block["lines"])
            blocks["para_blocks"] = para_blocks

            for para_block in para_blocks:
                lines = para_block["lines"]
                # `bool(lines)` keeps an empty block from being flagged: all()
                # over no lines is vacuously True.
                all_titles = bool(lines) and all(line.get("is_title", False) for line in lines)
                para_text_len = sum(len(line["text"]) for line in lines)
                para_block["is_block_title"] = 1 if all_titles and para_text_len < 200 else 0

                all_author_or_org = bool(lines) and all(
                    line.get("is_author_or_org_list", False) for line in lines
                )
                para_block["is_block_an_author_or_org_list"] = (
                    1 if all_author_or_org and page_id == "page_0" else 0
                )

        # Create the statistics entry when the caller did not provide one.
        pdf_dic.setdefault("statistics", {})["num_titles"] = num_titles

        return pdf_dic

    def __determine_size_based_level(self, title_blocks):
        """
        Assign ``"block_title_level"`` (1, 2 or 3) to each title block.

        With ``mean`` and ``std`` taken over all ``"block_font_size"`` values
        (missing sizes read as 0): sizes ``>= mean + std`` get level 1, sizes
        ``<= mean - std`` get level 3, and the remaining sizes get level 2
        when they reach the mean of the sizes strictly inside that band, else
        level 3.

        Parameters
        ----------
        title_blocks : list
            dicts with a ``"block"`` entry holding the paragraph block

        Returns
        -------
        title_blocks : list
            the same list; the blocks are updated in place
        """
        if not title_blocks:
            return title_blocks

        sizes = [safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks]
        font_sizes = np.array(sizes)

        mean_font_size = np.mean(font_sizes)
        std_font_size = np.std(font_sizes)
        min_extreme_font_size = mean_font_size - std_font_size
        max_extreme_font_size = mean_font_size + std_font_size

        middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
        level_threshold = np.mean(middle_font_sizes) if middle_font_sizes.size > 0 else mean_font_size

        for tb, title_font_size in zip(title_blocks, sizes):
            if title_font_size >= max_extreme_font_size:
                current_level = 1
            elif title_font_size <= min_extreme_font_size:
                current_level = 3
            elif float(title_font_size) >= float(level_threshold):
                current_level = 2
            else:
                current_level = 3

            tb["block"]["block_title_level"] = current_level

        return title_blocks

    def batch_process_blocks_recog_title_level(self, pdf_dic):
        """
        Assign a title level to every block flagged ``"is_block_title"``.

        Parameters
        ----------
        pdf_dic : dict
            result dictionary, already processed by
            ``batch_process_blocks_detect_titles``

        Returns
        -------
        pdf_dic : dict
            the same dictionary; title blocks gain ``"block_title_level"``
        """
        title_blocks = [
            {"page_id": page_id, "block": block}
            for page_id, blocks in pdf_dic.items()
            if page_id.startswith("page_")
            for block in blocks.get("para_blocks", [])
            if block.get("is_block_title")
        ]

        if title_blocks:
            self.__determine_size_based_level(title_blocks)

        return pdf_dic
|
|