|
from collections import Counter |
|
import numpy as np |
|
|
|
from magic_pdf.para.commons import * |
|
|
|
|
|
# Switch stdout to UTF-8 encoding at import time.
# NOTE(review): `sys` is not imported explicitly in this file; it is presumably
# re-exported by the star import from magic_pdf.para.commons — confirm.
# stdout may have been replaced by a stream object that has no `reconfigure`
# method (for example by an embedding application), so the call is guarded
# instead of letting the module import fail with AttributeError.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
|
|
|
|
class BlockStatisticsCalculator:
    """Compute per-block layout and font statistics for the para blocks of each page."""

    def __init__(self) -> None:
        pass

    def __calc_stats_of_new_lines(self, new_lines):
        """
        Calculate the layout and font metrics of a block from its lines.

        Parameters
        ----------
        new_lines : list
            Lines of the block. Each line is a dict with the keys "bbox", "text"
            and "spans", plus an optional "dir" key. Each span is a dict with the
            keys "bbox", "font" and "size". Index 0 of a bbox is used as x0 and
            index 2 as x1.

        Returns
        -------
        X0 : float
            Median of the x0 values of the lines (0 when there are no lines).
        X1 : float
            Median of the x1 values of the lines (0 when there are no lines).
        avg_char_width : float
            Mean over lines of (line width / number of non-whitespace characters);
            lines without any non-whitespace character are ignored. 0 if no line
            qualifies.
        avg_char_height : float
            Mean of the span sizes over all lines of the block (0 when there are
            no spans).
        block_font_type : str or None
            Font of the widest span in the block (first one wins on ties), or
            None when no span has a positive width.
        avg_font_size : float or None
            Mean of the span sizes, or None when there are no spans.
        direction : tuple
            (mean of dir[0], mean of dir[1]) over the lines that carry a "dir"
            key; (0, 0) when none does.
        median_font_size : float or None
            Median of the span sizes, or None when there are no spans.
        """
        x0_values = []
        x1_values = []
        char_widths = []
        char_heights = []
        block_font_sizes = []
        block_directions = []

        # The block font is taken from the single widest span, not from a
        # frequency count.
        max_span_length = 0
        max_span_font_type = None

        for line in new_lines:
            line_bbox = line["bbox"]
            line_spans = line["spans"]

            x0_values.append(line_bbox[0])
            x1_values.append(line_bbox[2])

            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
            if num_chars > 0:
                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)

            for span in line_spans:
                span_font = span["font"]
                span_size = span["size"]
                block_font_sizes.append(span_size)
                # Accumulate over every line; previously the list was replaced
                # on each iteration so only the last line contributed.
                char_heights.append(span_size)

                span_length = span["bbox"][2] - span["bbox"][0]
                if span_length > max_span_length:
                    max_span_length = span_length
                    max_span_font_type = span_font

            if "dir" in line:
                block_directions.append(line["dir"])

        X0 = np.median(x0_values) if x0_values else 0
        X1 = np.median(x1_values) if x1_values else 0
        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0

        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None

        if block_directions:
            num_directions = len(block_directions)
            avg_dir_horizontal = sum(direction[0] for direction in block_directions) / num_directions
            avg_dir_vertical = sum(direction[1] for direction in block_directions) / num_directions
        else:
            avg_dir_horizontal = 0
            avg_dir_vertical = 0

        return (
            X0,
            X1,
            avg_char_width,
            avg_char_height,
            max_span_font_type,
            avg_font_size,
            (avg_dir_horizontal, avg_dir_vertical),
            median_font_size,
        )

    def __make_new_block(self, input_block):
        """
        Build a new block dict that carries the statistics of its lines.

        Parameters
        ----------
        input_block : dict
            Raw block with the keys "block_id", "bbox", "text" and "lines".

        Returns
        -------
        new_block : dict
            Dict holding "block_id", "bbox", "text" and "lines" from the input
            plus the computed "dir", "X0", "X1", "avg_char_width",
            "avg_char_height", "block_font_type", "block_font_size" and
            "median_font_size". Any other key of the input block is not copied.
        """
        raw_lines = input_block["lines"]
        (
            left_boundary,
            right_boundary,
            avg_char_width,
            avg_char_height,
            font_type,
            font_size,
            direction,
            median_font_size,
        ) = self.__calc_stats_of_new_lines(raw_lines)

        return {
            "block_id": input_block["block_id"],
            "bbox": input_block["bbox"],
            "text": input_block["text"],
            "dir": direction,
            "X0": left_boundary,
            "X1": right_boundary,
            "avg_char_width": avg_char_width,
            "avg_char_height": avg_char_height,
            "block_font_type": font_type,
            "block_font_size": font_size,
            "lines": raw_lines,
            "median_font_size": median_font_size,
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Replace the para blocks of every page with statistics-enriched blocks.

        Parameters
        ----------
        pdf_dic : dict
            Mapping of page ids to page dicts. Only entries whose key starts with
            "page_" are processed; the raw blocks are read from the page's
            "para_blocks" list. The original documentation pointed to
            app/pdf_toolbox/tests/preproc_2_parasplit_example.json as a schema
            example (not verified here).

        Returns
        -------
        pdf_dic : dict
            The same dict, modified in place. Every processed page gets a
            "para_blocks" list of new blocks; a page that had no "para_blocks"
            key gets an empty list.
        """
        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            blocks["para_blocks"] = [
                self.__make_new_block(input_block)
                for input_block in blocks.get("para_blocks", [])
            ]

        return pdf_dic
|
|
|
|
|
class DocStatisticsCalculator:
    """Aggregate document-level statistics over the para blocks of every page."""

    def __init__(self) -> None:
        pass

    def calc_stats_of_doc(self, pdf_dict):
        """
        Compute the statistics of the document and store them in the input dict.

        Parameters
        ----------
        pdf_dict : dict
            Mapping of page ids to page dicts. Only entries whose key starts with
            "page_" are considered; their blocks are read from the page's
            "para_blocks" list. A page without that key is counted as a page
            with no blocks. Every block must provide a "lines" list.

        Returns
        -------
        pdf_dict : dict
            The same dict, with a "statistics" entry added. It holds the page and
            block counters, the number of paras, title lines, header, footer,
            watermark and vertical-margin-note blocks, the two most common
            (font type, font size) pairs with their counts, and the average
            block text length.
        """
        # Gather the block list of every page once so that all passes below
        # treat a page without "para_blocks" in the same way.
        pages = [
            blocks.get("para_blocks", [])
            for page_id, blocks in pdf_dict.items()
            if page_id.startswith("page_")
        ]
        all_blocks = [block for page_blocks in pages for block in page_blocks]

        total_text_length = sum(len(block.get("text", "")) for block in all_blocks)
        avg_text_length = total_text_length / len(all_blocks) if all_blocks else 0

        # Blocks shorter than half the average text length are left out of the
        # font ranking.
        font_list = []
        for block in all_blocks:
            if len(block.get("text", "")) < avg_text_length * 0.5:
                continue
            block_font_type = safe_get(block, "block_font_type", "")
            block_font_size = safe_get(block, "block_font_size", 0)
            font_list.append((block_font_type, block_font_size))

        # Pad the ranking so both slots exist even with fewer than two fonts.
        no_font = (("", 0), 0)
        ranked_fonts = Counter(font_list).most_common(2) + [no_font, no_font]
        most_common_font = ranked_fonts[0]
        second_most_common_font = ranked_fonts[1]

        statistics = {
            "num_pages": 0,
            "num_blocks": 0,
            "num_paras": 0,
            "num_titles": 0,
            "num_header_blocks": 0,
            "num_footer_blocks": 0,
            "num_watermark_blocks": 0,
            "num_vertical_margin_note_blocks": 0,
            "most_common_font_type": most_common_font[0][0],
            "most_common_font_size": most_common_font[0][1],
            "number_of_most_common_font": most_common_font[1],
            "second_most_common_font_type": second_most_common_font[0][0],
            "second_most_common_font_size": second_most_common_font[0][1],
            "number_of_second_most_common_font": second_most_common_font[1],
            "avg_text_length": avg_text_length,
        }

        for page_blocks in pages:
            statistics["num_pages"] += 1
            for block_data in page_blocks:
                statistics["num_blocks"] += 1

                if "paras" in block_data:
                    statistics["num_paras"] += len(block_data["paras"])

                # Titles are flagged per line, the other markers per block.
                for line in block_data["lines"]:
                    if line.get("is_title", 0):
                        statistics["num_titles"] += 1

                if block_data.get("is_header", 0):
                    statistics["num_header_blocks"] += 1
                if block_data.get("is_footer", 0):
                    statistics["num_footer_blocks"] += 1
                if block_data.get("is_watermark", 0):
                    statistics["num_watermark_blocks"] += 1
                if block_data.get("is_vertical_margin_note", 0):
                    statistics["num_vertical_margin_note_blocks"] += 1

        pdf_dict["statistics"] = statistics

        return pdf_dict
|
|
|
|
|
|