MinerU / magic_pdf /para /stats.py
derful's picture
Upload folder using huggingface_hub
240e0a0 verified
from collections import Counter
import numpy as np
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class BlockStatisticsCalculator:
def __init__(self) -> None:
pass
def __calc_stats_of_new_lines(self, new_lines):
"""
This function calculates the paragraph metrics
Parameters
----------
combined_lines : list
combined lines
Returns
-------
X0 : float
Median of x0 values, which represents the left average boundary of the block
X1 : float
Median of x1 values, which represents the right average boundary of the block
avg_char_width : float
Average of char widths, which represents the average char width of the block
avg_char_height : float
Average of line heights, which represents the average line height of the block
"""
x0_values = []
x1_values = []
char_widths = []
char_heights = []
block_font_types = []
block_font_sizes = []
block_directions = []
if len(new_lines) > 0:
for i, line in enumerate(new_lines):
line_bbox = line["bbox"]
line_text = line["text"]
line_spans = line["spans"]
num_chars = len([ch for ch in line_text if not ch.isspace()])
x0_values.append(line_bbox[0])
x1_values.append(line_bbox[2])
if num_chars > 0:
char_width = (line_bbox[2] - line_bbox[0]) / num_chars
char_widths.append(char_width)
for span in line_spans:
block_font_types.append(span["font"])
block_font_sizes.append(span["size"])
if "dir" in line:
block_directions.append(line["dir"])
# line_font_types = [span["font"] for span in line_spans]
char_heights = [span["size"] for span in line_spans]
X0 = np.median(x0_values) if x0_values else 0
X1 = np.median(x1_values) if x1_values else 0
avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
# max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None
max_span_length = 0
max_span_font_type = None
for line in new_lines:
line_spans = line["spans"]
for span in line_spans:
span_length = span["bbox"][2] - span["bbox"][0]
if span_length > max_span_length:
max_span_length = span_length
max_span_font_type = span["font"]
max_freq_font_type = max_span_font_type
avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0
avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0
median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
return (
X0,
X1,
avg_char_width,
avg_char_height,
max_freq_font_type,
avg_font_size,
(avg_dir_horizontal, avg_dir_vertical),
median_font_size,
)
def __make_new_block(self, input_block):
new_block = {}
raw_lines = input_block["lines"]
stats = self.__calc_stats_of_new_lines(raw_lines)
block_id = input_block["block_id"]
block_bbox = input_block["bbox"]
block_text = input_block["text"]
block_lines = raw_lines
block_avg_left_boundary = stats[0]
block_avg_right_boundary = stats[1]
block_avg_char_width = stats[2]
block_avg_char_height = stats[3]
block_font_type = stats[4]
block_font_size = stats[5]
block_direction = stats[6]
block_median_font_size = stats[7]
new_block["block_id"] = block_id
new_block["bbox"] = block_bbox
new_block["text"] = block_text
new_block["dir"] = block_direction
new_block["X0"] = block_avg_left_boundary
new_block["X1"] = block_avg_right_boundary
new_block["avg_char_width"] = block_avg_char_width
new_block["avg_char_height"] = block_avg_char_height
new_block["block_font_type"] = block_font_type
new_block["block_font_size"] = block_font_size
new_block["lines"] = block_lines
new_block["median_font_size"] = block_median_font_size
return new_block
def batch_process_blocks(self, pdf_dic):
"""
This function processes the blocks in batch.
Parameters
----------
self : object
The instance of the class.
----------
blocks : list
Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json
Returns
-------
result_dict : dict
result dictionary
"""
for page_id, blocks in pdf_dic.items():
if page_id.startswith("page_"):
para_blocks = []
if "para_blocks" in blocks.keys():
input_blocks = blocks["para_blocks"]
for input_block in input_blocks:
new_block = self.__make_new_block(input_block)
para_blocks.append(new_block)
blocks["para_blocks"] = para_blocks
return pdf_dic
class DocStatisticsCalculator:
def __init__(self) -> None:
pass
def calc_stats_of_doc(self, pdf_dict):
"""
This function computes the statistics of the document
Parameters
----------
result_dict : dict
result dictionary
Returns
-------
statistics : dict
statistics of the document
"""
total_text_length = 0
total_num_blocks = 0
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
if "para_blocks" in blocks.keys():
para_blocks = blocks["para_blocks"]
for para_block in para_blocks:
total_text_length += len(para_block["text"])
total_num_blocks += 1
avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
font_list = []
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
if "para_blocks" in blocks.keys():
input_blocks = blocks["para_blocks"]
for input_block in input_blocks:
block_text_length = len(input_block.get("text", ""))
if block_text_length < avg_text_length * 0.5:
continue
block_font_type = safe_get(input_block, "block_font_type", "")
block_font_size = safe_get(input_block, "block_font_size", 0)
font_list.append((block_font_type, block_font_size))
font_counter = Counter(font_list)
most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
statistics = {
"num_pages": 0,
"num_blocks": 0,
"num_paras": 0,
"num_titles": 0,
"num_header_blocks": 0,
"num_footer_blocks": 0,
"num_watermark_blocks": 0,
"num_vertical_margin_note_blocks": 0,
"most_common_font_type": most_common_font[0][0],
"most_common_font_size": most_common_font[0][1],
"number_of_most_common_font": most_common_font[1],
"second_most_common_font_type": second_most_common_font[0][0],
"second_most_common_font_size": second_most_common_font[0][1],
"number_of_second_most_common_font": second_most_common_font[1],
"avg_text_length": avg_text_length,
}
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
blocks = pdf_dict[page_id]["para_blocks"]
statistics["num_pages"] += 1
for block_id, block_data in enumerate(blocks):
statistics["num_blocks"] += 1
if "paras" in block_data.keys():
statistics["num_paras"] += len(block_data["paras"])
for line in block_data["lines"]:
if line.get("is_title", 0):
statistics["num_titles"] += 1
if block_data.get("is_header", 0):
statistics["num_header_blocks"] += 1
if block_data.get("is_footer", 0):
statistics["num_footer_blocks"] += 1
if block_data.get("is_watermark", 0):
statistics["num_watermark_blocks"] += 1
if block_data.get("is_vertical_margin_note", 0):
statistics["num_vertical_margin_note_blocks"] += 1
pdf_dict["statistics"] = statistics
return pdf_dict