Spaces:

derful
/

MinerU

Runtime error

App Files Files Community

MinerU / magic_pdf /para /denoise.py

derful

Upload folder using huggingface_hub

240e0a0 verified 4 months ago

raw

history blame contribute delete

10.4 kB

	import math

	from collections import defaultdict
	from magic_pdf.para.commons import *

	# Force UTF-8 output so non-ASCII text extracted from PDFs can be printed.
	# `reconfigure` only exists on real text streams; stdout may be None or have
	# been replaced by an object without it, so check before calling to keep the
	# module importable in those environments.
	if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
	    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


	class HeaderFooterProcessor:
	    """Flag blocks whose bounding box repeats near the top or bottom of pages."""

	    def __init__(self) -> None:
	        pass

	    @staticmethod
	    def _iter_blocks(result_dict):
	        """Yield each block dict stored under a ``page_*`` key and a ``block_*`` sub-key."""
	        for page_id, blocks in result_dict.items():
	            if not page_id.startswith("page_"):
	                continue
	            for block_key, block in blocks.items():
	                if block_key.startswith("block_"):
	                    yield block

	    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
	        """
	        Return the bboxes that repeat most often inside the top or bottom band of the page.

	        Parameters
	        ----------
	        bboxes : list
	            Bounding boxes; index 1 is read as the top edge and index 3 as the bottom edge.
	        page_height : float
	            Height of the page.
	        position : str, optional
	            "top" selects the top band; any other value selects the bottom band. By default "top".
	        threshold : float, optional
	            Fraction of the page height that makes up the band, by default 0.25.
	        num_bboxes : int, optional
	            Maximum number of bboxes to return, by default 3.
	        min_frequency : int, optional
	            A bbox is kept only when it occurs at least this many times, by default 2.

	        Returns
	        -------
	        common_bboxes : list
	            Bboxes as tuples, most frequent first.
	        """
	        # Keep only the boxes that lie inside the requested band.
	        if position == "top":
	            filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
	        else:
	            filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]

	        # Count exact repeats; tuples are used because lists cannot be dict keys.
	        bbox_count = defaultdict(int)
	        for bbox in filtered_bboxes:
	            bbox_count[tuple(bbox)] += 1

	        # Most frequent first, dropping anything seen fewer than min_frequency times.
	        ranked = sorted(bbox_count.items(), key=lambda item: item[1], reverse=True)
	        return [bbox for bbox, count in ranked if count >= min_frequency][:num_bboxes]

	    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
	        """
	        Mark every block with ``is_header`` / ``is_footer`` flags (0 or 1), in place.

	        A block is flagged when its bbox matches, within one unit per coordinate,
	        one of the most frequently repeated bboxes in the top (header) or bottom
	        (footer) band. Detection is skipped, and no flags are written, when there
	        are no blocks or when more than half of the blocks look like single lines.

	        Parameters
	        ----------
	        result_dict : dict
	            Result dictionary; blocks are read from ``page_*`` -> ``block_*`` entries.
	        similarity_threshold : float, optional
	            Accepted for interface compatibility; not used by the detection.

	        Returns
	        -------
	        result_dict : dict
	            The same dictionary object that was passed in.
	        """

	        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
	            # True when every coordinate of bbox is within `tolerance` of some listed bbox.
	            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)

	        def is_single_line_block(block):
	            # Determine based on the width and height of the block
	            block_width = block["X1"] - block["X0"]
	            block_height = block["bbox"][3] - block["bbox"][1]

	            # If the height of the block is close to the average character height and the width is large, it is considered a single line
	            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3

	        blocks = list(self._iter_blocks(result_dict))
	        single_line_blocks = sum(1 for block in blocks if is_single_line_block(block))

	        # If there are no blocks, skip the header and footer detection
	        if not blocks:
	            print("No blocks found. Skipping header/footer detection.")
	            return result_dict

	        # If most of the blocks are single-line, skip the header and footer detection
	        if single_line_blocks / len(blocks) > 0.5:
	            return result_dict

	        all_bboxes = [block["bbox"] for block in blocks]

	        # The page height is taken as the lowest bottom edge seen in any block.
	        page_height = max(bbox[3] for bbox in all_bboxes)

	        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top")
	        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom")

	        # Only the bbox is needed here, so blocks without a "text" entry are handled too.
	        for block in blocks:
	            bbox = block["bbox"]
	            block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
	            block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

	        return result_dict


	class NonHorizontalTextProcessor:
	    """Flag repeated slanted text (watermarks) and repeated vertical text (margin notes)."""

	    def __init__(self) -> None:
	        pass

	    @staticmethod
	    def _iter_blocks(result_dict):
	        """Yield each block dict stored under a ``page_*`` key and a ``block_*`` sub-key."""
	        for page_id, page_content in result_dict.items():
	            if not page_id.startswith("page_"):
	                continue
	            for block_id, block_data in page_content.items():
	                if block_id.startswith("block_"):
	                    yield block_data

	    @staticmethod
	    def _coordinates_text(block_data):
	        """Return a hashable (bbox, text) key; the bbox is converted to a tuple because lists cannot be dict keys."""
	        return (tuple(block_data["bbox"]), block_data.get("text"))

	    def detect_non_horizontal_texts(self, result_dict):
	        """
	        This function detects watermarks and vertical margin notes in the document.

	        Blocks that carry a "dir" vector are grouped by identical (bbox, text).
	        The text angle is ``abs(degrees(atan2(dir[1], dir[0])))``:

	        * strictly between 5 and 85 degrees, the block is a watermark candidate;
	        * from 85 (inclusive) up to 105 degrees, it is a vertical margin note candidate.

	        A candidate is confirmed when the same (bbox, text) pair is counted more
	        than ``number_of_pages // 2`` times, where only ``page_*`` keys count as pages.
	        Every ``block_*`` entry then receives ``is_watermark`` and
	        ``is_vertical_margin_note`` flags (0 or 1), written in place.

	        Parameters
	        ----------
	        result_dict : dict
	            The result dictionary.

	        Returns
	        -------
	        result_dict : dict
	            The updated result dictionary (the same object that was passed in).
	        """
	        # Occurrence counts keyed by (bbox, text)
	        potential_watermarks = {}
	        potential_margin_notes = {}

	        for block_data in self._iter_blocks(result_dict):
	            if "dir" not in block_data:
	                continue

	            coordinates_text = self._coordinates_text(block_data)
	            direction = block_data["dir"]
	            angle = abs(math.degrees(math.atan2(direction[1], direction[0])))

	            if 5 < angle < 85:  # Slanted text: watermark candidate
	                potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
	            elif 85 <= angle < 105:  # Vertical text: margin note candidate
	                potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1

	        # Only real pages count towards the threshold; other top-level keys are ignored.
	        page_count = sum(1 for page_id in result_dict if page_id.startswith("page_"))
	        threshold = page_count // 2

	        watermarks = {key for key, count in potential_watermarks.items() if count > threshold}
	        margin_notes = {key for key, count in potential_margin_notes.items() if count > threshold}

	        # Tag every block, including those without a direction, so the flags always exist.
	        for block_data in self._iter_blocks(result_dict):
	            coordinates_text = self._coordinates_text(block_data)
	            block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
	            block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0

	        return result_dict


	class NoiseRemover:
	    def __init__(self) -> None:
	        pass

	    def skip_data_noises(self, result_dict):
	        """
	        Build a new dictionary that keeps only the blocks not flagged as noise.

	        A block is treated as noise when any of its overlap, header, footer,
	        watermark, vertical margin note or block title flags is truthy; a missing
	        flag counts as not set. Only ``page_*`` entries and their ``block_*``
	        sub-entries are considered, and a page left with no blocks is omitted.
	        The kept block dicts are the original objects, not copies.
	        """
	        noise_flags = (
	            "is_overlap",
	            "is_header",
	            "is_footer",
	            "is_watermark",
	            "is_vertical_margin_note",
	            "is_block_title",
	        )

	        cleaned = {}
	        for page_key, page_blocks in result_dict.items():
	            if not page_key.startswith("page_"):
	                continue

	            # Keep a block only when none of its noise flags is set.
	            kept = {
	                name: blk
	                for name, blk in page_blocks.items()
	                if name.startswith("block_") and not any(blk.get(flag, 0) for flag in noise_flags)
	            }
	            if kept:
	                cleaned[page_key] = kept

	        return cleaned