Spaces:

CanCLID
/

srt-eval

Running

srt-eval / utils.py

initial commit

a233921 about 1 month ago

1.59 kB

	import re

	def read_srt_text(file_path: str) -> str:
	    """
	    Read an SRT file and extract only the text content, ignoring timestamps.

	    Cues are separated by one or more blank (or whitespace-only) lines.
	    Within each cue, every line after the timestamp line (the first line
	    containing "-->") is treated as text. Cues that lack an index line, or
	    that are preceded by extra blank lines, are therefore still parsed
	    correctly. Cues without any text lines are skipped.

	    Args:
	        file_path (str): Path to the SRT file

	    Returns:
	        str: Text of all cues, joined by single spaces

	    Raises:
	        OSError: If the file cannot be opened
	        UnicodeDecodeError: If the file is not valid UTF-8
	    """
	    with open(file_path, "r", encoding="utf-8") as f:
	        content = f.read()

	    text_parts = []
	    # Split on runs of blank lines. Splitting on exactly two newlines would
	    # leave a leading empty line on the next cue whenever cues are separated
	    # by more than one blank line, shifting the timestamp into the text.
	    for block in re.split(r"\n\s*\n", content.strip()):
	        lines = block.split("\n")
	        # Locate the timestamp line instead of assuming it is the second one.
	        stamp_idx = next(
	            (i for i, line in enumerate(lines) if "-->" in line), None
	        )
	        if stamp_idx is None:
	            # No timestamp found: keep the positional behavior and skip the
	            # first two lines (subtitle number and timestamp slots).
	            cue_lines = lines[2:]
	        else:
	            cue_lines = lines[stamp_idx + 1:]
	        if cue_lines:
	            text_parts.append(" ".join(cue_lines))

	    return " ".join(text_parts)

	def preprocess_chinese_text(text: str, include_punctuation: bool = False) -> str:
	    """
	    Preprocess Chinese text for CER calculation.

	    ASCII letters, ASCII digits and all whitespace are always removed. When
	    include_punctuation is False, every character whose Unicode general
	    category is a punctuation category (Pc, Pd, Ps, Pe, Pi, Pf, Po) is
	    removed as well. This covers ASCII and full-width punctuation, and also
	    CJK marks such as corner brackets, curly quotes, ellipses and dashes.

	    Args:
	        text (str): Input Chinese text
	        include_punctuation (bool): Whether to include punctuation in the calculation

	    Returns:
	        str: Preprocessed text with characters separated by spaces
	    """
	    # Imported locally so this function does not depend on a module-level
	    # import beyond re.
	    import unicodedata

	    # Remove any English characters, numbers, and whitespace
	    text = re.sub(r"[a-zA-Z0-9\s]+", "", text)

	    if not include_punctuation:
	        # Filter by Unicode category rather than a hand-maintained character
	        # class, which is easy to leave incomplete for CJK punctuation.
	        text = "".join(
	            ch for ch in text if not unicodedata.category(ch).startswith("P")
	        )

	    # A str is already an iterable of characters; join them with spaces
	    return " ".join(text)