import difflib
from dataclasses import dataclass
from html import escape
from typing import List, Tuple
from utils import preprocess_chinese_text
@dataclass
class DiffResult:
    """Outcome of aligning a reference text against a hypothesis text.

    Both display strings are the aligned tokens concatenated without a
    separator; tokens involved in an error are wrapped in ``[DEL]``,
    ``[INS]`` or ``[SUB]`` marker pairs.
    """

    # Reference side of the alignment, with inline error markers.
    reference_display: str
    # Hypothesis side of the alignment, with inline error markers.
    hypothesis_display: str
    # One (reference, hypothesis) tuple per error; "_" fills the absent side.
    error_pairs: List[Tuple[str, str]]
def visualize_differences(
    ref_text: str, hyp_text: str, include_punctuation: bool = False
) -> DiffResult:
    """
    Create a visualization of the differences between reference and hypothesis texts.

    Both texts are run through ``preprocess_chinese_text`` and split on
    whitespace; the resulting token sequences are aligned with
    ``difflib.SequenceMatcher``. Every token that takes part in an error is
    wrapped in a ``[DEL]``, ``[INS]`` or ``[SUB]`` marker pair, and the side
    that has no counterpart is shown as ``_``.

    Args:
        ref_text (str): Reference text
        hyp_text (str): Hypothesis text
        include_punctuation (bool): Whether to include punctuation (forwarded
            to ``preprocess_chinese_text``)

    Returns:
        DiffResult: Containing formatted reference and hypothesis texts with
        error highlighting, plus the list of (reference, hypothesis) error pairs
    """
    # Preprocess, then split on whitespace. Presumably the preprocessor puts
    # one character per token -- verify against utils.preprocess_chinese_text.
    ref_chars = preprocess_chinese_text(ref_text, include_punctuation).split()
    hyp_chars = preprocess_chinese_text(hyp_text, include_punctuation).split()

    # NOTE(review): SequenceMatcher's default autojunk heuristic only kicks in
    # for sequences of 200+ items; confirm whether autojunk=False is wanted
    # for long transcripts.
    matcher = difflib.SequenceMatcher(None, ref_chars, hyp_chars)

    ref_formatted: List[str] = []
    hyp_formatted: List[str] = []
    error_pairs: List[Tuple[str, str]] = []

    def mark_deleted(chars: List[str]) -> None:
        """Record tokens present in the reference but absent from the hypothesis."""
        for char in chars:
            ref_formatted.append(f"[DEL]{char}[/DEL]")
            hyp_formatted.append("[DEL]_[/DEL]")
            error_pairs.append((char, "_"))

    def mark_inserted(chars: List[str]) -> None:
        """Record tokens present in the hypothesis but absent from the reference."""
        for char in chars:
            ref_formatted.append("[INS]_[/INS]")
            hyp_formatted.append(f"[INS]{char}[/INS]")
            error_pairs.append(("_", char))

    for op, ref_start, ref_end, hyp_start, hyp_end in matcher.get_opcodes():
        ref_span = ref_chars[ref_start:ref_end]
        hyp_span = hyp_chars[hyp_start:hyp_end]

        if op == "equal":
            ref_formatted.extend(ref_span)
            hyp_formatted.extend(hyp_span)
        elif op == "delete":
            mark_deleted(ref_span)
        elif op == "insert":
            mark_inserted(hyp_span)
        elif op == "replace":
            # A "replace" opcode may cover spans of different lengths. Pair up
            # the common prefix as substitutions ...
            paired = min(len(ref_span), len(hyp_span))
            for ref_char, hyp_char in zip(ref_span, hyp_span):
                ref_formatted.append(f"[SUB]{ref_char}[/SUB]")
                hyp_formatted.append(f"[SUB]{hyp_char}[/SUB]")
                error_pairs.append((ref_char, hyp_char))
            # ... and keep the surplus instead of silently dropping it: extra
            # reference tokens are deletions, extra hypothesis tokens are
            # insertions. At most one of these two slices is non-empty.
            mark_deleted(ref_span[paired:])
            mark_inserted(hyp_span[paired:])

    return DiffResult(
        reference_display="".join(ref_formatted),
        hypothesis_display="".join(hyp_formatted),
        error_pairs=error_pairs,
    )
def generate_html_report(
    ref_text: str, hyp_text: str, metrics_no_punct: dict, metrics_with_punct: dict
) -> str:
    """
    Generate an HTML report with error visualization and metrics.

    The report has two sections, "Without Punctuation" and "With Punctuation".
    Each shows a metrics table, the marked-up reference and hypothesis texts
    from ``visualize_differences``, and a table of error pairs.

    Args:
        ref_text (str): Reference text
        hyp_text (str): Hypothesis text
        metrics_no_punct (dict): Metrics for the punctuation-free comparison.
            The keys 'wer', 'substitutions', 'deletions' and 'insertions' are
            read; the 'wer' value is reported as the CER.
        metrics_with_punct (dict): Same keys, for the comparison that keeps
            punctuation.

    Returns:
        str: The complete HTML document.

    Raises:
        KeyError: If either metrics dict lacks one of the keys listed above.
    """
    # Marker name used by visualize_differences -> CSS class of the span.
    marker_classes = (
        ("DEL", "deletion"),
        ("INS", "insertion"),
        ("SUB", "substitution"),
    )

    # Kept as a plain (non-formatted) string so the CSS braces need no doubling.
    style_block = """\
body { font-family: sans-serif; margin: 2em; }
table { border-collapse: collapse; margin: 1em 0; }
th, td { border: 1px solid #ccc; padding: 4px 10px; text-align: center; }
.text { padding: 0.5em; border: 1px solid #ddd; line-height: 1.8; }
.deletion { background-color: #ffcccc; }
.insertion { background-color: #ccffcc; }
.substitution { background-color: #ffe0a3; }"""

    def format_text_for_html(text: str) -> str:
        """Format text with HTML spans for coloring"""
        # Escape first: html.escape leaves the bracket markers intact, and the
        # span tags inserted afterwards must not be escaped themselves.
        text = escape(text)
        for marker, css_class in marker_classes:
            text = text.replace(f"[{marker}]", f'<span class="{css_class}">')
            text = text.replace(f"[/{marker}]", "</span>")
        return text

    def format_error_pairs(pairs: List[Tuple[str, str]]) -> str:
        """Format error pairs into HTML table rows"""
        return "\n".join(
            f"<tr><td>{escape(ref_char)}</td><td>{escape(hyp_char)}</td></tr>"
            for ref_char, hyp_char in pairs
        )

    def render_section(title: str, include_punctuation: bool, metrics: dict) -> str:
        """Render one report section (metrics, marked-up texts, error pairs)."""
        diff = visualize_differences(ref_text, hyp_text, include_punctuation)

        # Total is the number of whitespace-separated tokens in the
        # preprocessed reference text.
        total_chars = len(
            preprocess_chinese_text(ref_text, include_punctuation).split()
        )
        cer = metrics["wer"]
        substitutions = metrics["substitutions"]
        deletions = metrics["deletions"]
        insertions = metrics["insertions"]
        total_errors = substitutions + deletions + insertions

        return f"""<h2>{title}</h2>
<table>
<tr>
<th>Total Chars</th>
<th>CER</th>
<th>Total Errors</th>
<th>Substitutions</th>
<th>Deletions</th>
<th>Insertions</th>
</tr>
<tr>
<td>{total_chars}</td>
<td>{cer:.3f}</td>
<td>{total_errors}</td>
<td>{substitutions}</td>
<td>{deletions}</td>
<td>{insertions}</td>
</tr>
</table>
<h3>Reference Text:</h3>
<div class="text">{format_text_for_html(diff.reference_display)}</div>
<h3>Hypothesis Text:</h3>
<div class="text">{format_text_for_html(diff.hypothesis_display)}</div>
<h3>Error Pairs:</h3>
<table>
<tr><th>Reference</th><th>Hypothesis</th></tr>
{format_error_pairs(diff.error_pairs)}
</table>"""

    section_no_punct = render_section("Without Punctuation", False, metrics_no_punct)
    section_with_punct = render_section("With Punctuation", True, metrics_with_punct)

    return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>CER Analysis Report</title>
<style>
{style_block}
</style>
</head>
<body>
<h1>Character Error Rate Analysis Report</h1>
<p>
<strong>Legend:</strong>
<span class="deletion">Deletion</span>
<span class="insertion">Insertion</span>
<span class="substitution">Substitution</span>
</p>
{section_no_punct}
{section_with_punct}
</body>
</html>
"""