|
import re |
|
|
|
def read_srt_text(file_path: str) -> str:
    """
    Read an SRT file and extract only the text content, ignoring timestamps.

    Cues are separated by one or more blank (or whitespace-only) lines.
    Within a cue, everything after the timing line (the line containing
    "-->") is treated as subtitle text. A block that has no timing line
    falls back to the positional layout "index, timing, text...", i.e. its
    first two lines are dropped.

    Args:
        file_path (str): Path to the SRT file

    Returns:
        str: Concatenated text content from the SRT file, with cue lines and
            cues joined by single spaces. Empty string if there is no text.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    text_parts = []
    # Split on any run of blank lines: real-world SRT files often carry extra
    # empty or whitespace-only lines between cues, and a strict "\n\n" split
    # would shift the cue number/timestamp into the text portion.
    for block in re.split(r"\n\s*\n", content.strip()):
        lines = block.split("\n")

        # Locate the timing line instead of assuming it is always line 2.
        timing_index = next(
            (i for i, line in enumerate(lines) if "-->" in line), None
        )
        if timing_index is None:
            # No timing line found: keep the positional convention.
            cue_lines = lines[2:]
        else:
            cue_lines = lines[timing_index + 1:]

        # Trim each line and drop empties so they cannot introduce doubled
        # spaces in the joined result.
        stripped = (line.strip() for line in cue_lines)
        text = " ".join(line for line in stripped if line)
        if text:
            text_parts.append(text)

    return " ".join(text_parts)
|
|
|
def preprocess_chinese_text(text: str, include_punctuation: bool = False) -> str:
    """
    Preprocess Chinese text for CER calculation.

    ASCII letters, ASCII digits and all whitespace are always removed. Unless
    include_punctuation is True, common ASCII, full-width and CJK punctuation
    marks are removed as well. The remaining characters are then joined with
    single spaces so that each character becomes one token.

    Args:
        text (str): Input Chinese text
        include_punctuation (bool): Whether to include punctuation in the calculation

    Returns:
        str: Preprocessed text with characters separated by spaces
            (empty string if nothing remains)
    """
    # Drop ASCII alphanumerics and whitespace; only the remaining (CJK)
    # characters take part in the comparison.
    text = re.sub(r"[a-zA-Z0-9\s]+", "", text)

    if not include_punctuation:
        # Characters are spelled with \u escapes so the set cannot be silently
        # altered by editors or tools that normalise full-width forms to ASCII.
        punctuation = (
            # ASCII marks: , . ! ? : ; " ' ( ) [ ] { }
            ",.!?:;\"'()[]{}"
            # Full-width counterparts of the ASCII marks above
            "\uff0c\uff0e\uff01\uff1f\uff1a\uff1b\uff02\uff07"
            "\uff08\uff09\uff3b\uff3d\uff5b\uff5d"
            # CJK marks: ideographic full stop and comma, lenticular
            # brackets, double angle brackets
            "\u3002\u3001\u3010\u3011\u300a\u300b"
            # Curly double and single quotation marks
            "\u201c\u201d\u2018\u2019"
        )
        # translate() deletes every listed character in one pass and avoids
        # regex escaping pitfalls inside a character class.
        text = text.translate(str.maketrans("", "", punctuation))

    # A str is already an iterable of characters; no list() needed.
    return " ".join(text)
|
|