srt-eval / utils.py
laubonghaudoi's picture
initial commit
a233921
import re
def read_srt_text(file_path: str) -> str:
    """
    Read an SRT file and extract only the text content, ignoring timestamps.

    Cues are taken to be separated by one or more blank (or whitespace-only)
    lines. Within a cue, the text is everything after the ``-->`` timing
    line; a cue without a timing line falls back to skipping its first two
    lines. Cues that carry no text are skipped, so they do not add stray
    spaces to the result.

    Args:
        file_path (str): Path to the SRT file

    Returns:
        str: Text of all cues joined by single spaces ("" if there is none)
    """
    # "utf-8-sig" reads plain UTF-8 as well and drops a leading BOM if present.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        content = f.read()

    text_parts = []
    # Split on runs of blank lines: an extra empty line between cues must not
    # shift the index/timestamp lines into the text portion of the next cue.
    for block in re.split(r"\n\s*\n", content.strip()):
        lines = [line.strip() for line in block.split("\n")]

        # Anchor on the timing line instead of assuming it is always line 2.
        timing_index = next(
            (i for i, line in enumerate(lines) if "-->" in line), None
        )
        if timing_index is None:
            cue_lines = lines[2:]
        else:
            cue_lines = lines[timing_index + 1:]

        text = " ".join(line for line in cue_lines if line)
        if text:
            text_parts.append(text)

    return " ".join(text_parts)
# Punctuation removed when include_punctuation is False. Non-ASCII marks are
# written as escapes so the set cannot be silently collapsed to ASCII
# look-alikes by tools that normalise full-width characters.
_PUNCTUATION_CHARS = (
    ",.!?:;\"'()[]{}"  # ASCII
    "\u3002\u3001"  # ideographic full stop and ideographic comma
    "\uff0c\uff0e\uff01\uff1f\uff1a\uff1b"  # full-width , . ! ? : ;
    "\uff02\uff07\u201c\u201d\u2018\u2019"  # full-width and curly quotes
    "\uff08\uff09\uff3b\uff3d\uff5b\uff5d"  # full-width ( ) [ ] { }
    "\u3010\u3011\u300a\u300b\u3008\u3009"  # lenticular and angle brackets
    "\u300c\u300d\u300e\u300f"  # corner brackets
    "\u2026\u2014"  # horizontal ellipsis and em dash
)
_PUNCTUATION_TABLE = str.maketrans("", "", _PUNCTUATION_CHARS)


def preprocess_chinese_text(text: str, include_punctuation: bool = False) -> str:
    """
    Preprocess Chinese text for CER calculation.

    ASCII letters, ASCII digits and all whitespace are always removed. Unless
    ``include_punctuation`` is true, ASCII punctuation and its full-width/CJK
    counterparts are removed as well.

    Args:
        text (str): Input Chinese text
        include_punctuation (bool): Whether to include punctuation in the calculation

    Returns:
        str: Preprocessed text with characters separated by spaces
    """
    # Remove any English characters, numbers, and whitespace
    text = re.sub(r"[a-zA-Z0-9\s]+", "", text)

    if not include_punctuation:
        # Single pass deletion of every character in the punctuation table
        text = text.translate(_PUNCTUATION_TABLE)

    # A str iterates per character, so join spaces them out directly
    return " ".join(text)