laubonghaudoi committed
Commit
a233921
0 Parent(s):

initial commit

Files changed (10)
  1. .gitattributes +35 -0
  2. LICENSE +21 -0
  3. README.md +35 -0
  4. app.py +63 -0
  5. bert.srt +87 -0
  6. gold.srt +63 -0
  7. main.py +178 -0
  8. requirements.txt +2 -0
  9. utils.py +50 -0
  10. visualize.py +269 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 laubonghaudoi
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,35 @@
+ ---
+ title: Srt Eval
+ emoji: 🌍
+ colorFrom: green
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 5.4.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: Visualize CER / WER for SRT subtitles
+ ---
+
+ # SRT Evaluation Tool
+
+ This Gradio app compares two SRT files and calculates Character Error Rate (CER) metrics, with and without punctuation; Word Error Rate (WER) support is planned. It provides a detailed visualization of the differences between the files.
+
+ ## Features
+
+ - Upload and compare two SRT files
+ - Calculate CER metrics (WER support planned)
+ - Visualize text differences
+ - Download visualization as PNG or PDF
+ - Example files included for testing
+
+ ## Usage
+
+ 1. Upload a reference (golden) SRT file
+ 2. Upload a target SRT file for comparison
+ 3. Click "Get CER" to see the results
+ 4. Or use "Load Example" to try with sample files
+
+ ## About
+
+ This tool is particularly useful for evaluating machine-generated subtitles against human-created references, supporting both Chinese and English text.
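The same metrics can be produced without the Gradio UI by calling the repo's functions directly. A minimal sketch, assuming the bundled `gold.srt` and `bert.srt` are in the working directory and the pinned requirements are installed:

```python
# Minimal sketch of driving the repo's own helpers directly (not part of the commit).
from main import calculate_cer_both_versions, read_srt_text
from visualize import generate_html_report

# gold.srt is the human reference, bert.srt the machine hypothesis; both ship with the repo.
metrics_no_punct, metrics_with_punct = calculate_cer_both_versions("gold.srt", "bert.srt")
print(f"CER without punctuation: {metrics_no_punct['wer']:.3f}")
print(f"CER with punctuation: {metrics_with_punct['wer']:.3f}")

# Render the same HTML report the Gradio app displays.
html = generate_html_report(
    read_srt_text("gold.srt"),
    read_srt_text("bert.srt"),
    metrics_no_punct,
    metrics_with_punct,
)
with open("report.html", "w", encoding="utf-8") as f:
    f.write(html)
```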
app.py ADDED
@@ -0,0 +1,63 @@
+ import gradio as gr
+
+ from main import calculate_cer_both_versions, read_srt_text
+ from visualize import generate_html_report
+
+
+ def process_srt_files(reference_file, hypothesis_file):
+     try:
+         # Handle both file objects and string paths
+         ref_path = reference_file.name if hasattr(
+             reference_file, 'name') else reference_file
+         hyp_path = hypothesis_file.name if hasattr(
+             hypothesis_file, 'name') else hypothesis_file
+
+         reference_text = read_srt_text(ref_path)
+         hypothesis_text = read_srt_text(hyp_path)
+
+         metrics_no_punct, metrics_with_punct = calculate_cer_both_versions(
+             ref_path, hyp_path)
+
+         html_content = generate_html_report(
+             reference_text, hypothesis_text, metrics_no_punct, metrics_with_punct)
+
+         return html_content
+     except Exception as e:
+         return f"An error occurred: {str(e)}"
+
+
+ def load_example():
+     return "gold.srt", "bert.srt"
+
+
+ with gr.Blocks() as iface:
+     gr.Markdown("# SRT File Comparison and CER Calculation")
+     gr.Markdown(
+         "## Please upload the golden reference SRT and the target SRT for calculating the CER.")
+     gr.Markdown(
+         "Note: Only CER is supported at the moment, WER will be added in a future version.")
+
+     with gr.Row():
+         ref_file = gr.File(label="Reference (Golden) SRT File")
+         hyp_file = gr.File(label="Target SRT File")
+
+     with gr.Row():
+         example_btn = gr.Button("Load Example")
+         process_btn = gr.Button("Get CER", variant="primary")
+
+     output = gr.HTML(label="Results")
+
+     process_btn.click(
+         fn=process_srt_files,
+         inputs=[ref_file, hyp_file],
+         outputs=output
+     )
+
+     example_btn.click(
+         fn=load_example,
+         inputs=None,
+         outputs=[ref_file, hyp_file]
+     )
+
+ if __name__ == "__main__":
+     iface.launch()
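One detail worth flagging in `app.py`: `process_srt_files` accepts either an uploaded file object (via its `.name` path) or a plain string path, which is why `load_example` can return the bundled file names directly. An illustrative direct call under that assumption:

```python
# Same code path the "Load Example" + "Get CER" buttons exercise, but called with
# plain path strings instead of uploaded file objects (illustrative only).
from app import process_srt_files  # importing app builds the Blocks UI without launching it

html = process_srt_files("gold.srt", "bert.srt")
print(html[:200])  # start of the generated HTML report (or an error message string)
```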
bert.srt ADDED
@@ -0,0 +1,87 @@
+ 0
+ 00:00:01,617 --> 00:00:02,760
+ 各位朋友。
+
+ 1
+ 00:00:03,755 --> 00:00:06,119
+ 喺講三國演義之前啊。
+
+ 2
+ 00:00:06,785 --> 00:00:09,040
+ 我念一手持俾大家聽下嚇。
+
+ 3
+ 00:00:10,720 --> 00:00:17,120
+ 滾滾長江東逝水浪花陶盡英雄。
+
+ 4
+ 00:00:18,082 --> 00:00:26,530
+ 是非成敗轉頭空青山依舊在幾道夕陽紅。
+
+ 5
+ 00:00:27,986 --> 00:00:33,870
+ 白發如潮江主上慣看秋月春風。
+
+ 6
+ 00:00:34,800 --> 00:00:43,829
+ 一壺濁酒喜相逢古今多少事都付笑談中,
+
+ 7
+ 00:00:45,717 --> 00:00:50,379
+ 好一個古今多少事都呼笑談中啊。
+
+ 8
+ 00:00:51,203 --> 00:00:55,940
+ 一部三國演義係講魏術吳三國嘅曆史。
+
+ 9
+ 00:00:56,943 --> 00:01:10,283
+ 由東漢靈帝中平元年即係公元一八四年黃巾起義嗰陣開始一直係寫到晉武帝太康元年即係公元二八零年啦,
+
+ 10
+ 00:01:10,698 --> 00:01:15,560
+ 吳國滅亡嗰陣爲止差唔多成個世紀咁長。
+
+ 11
+ 00:01:16,552 --> 00:01:20,250
+ 佢離而傢咧有成一千八百年噉耐嘞。
+
+ 12
+ 00:01:20,870 --> 00:01:24,960
+ 你話多少英雄豪傑啊已經化爲烏有。
+
+ 13
+ 00:01:25,938 --> 00:01:31,700
+ 但系佢哋嘅事跡就好似滾滾長江一直流傳到今日。
+
+ 14
+ 00:01:32,894 --> 00:01:40,574
+ 而家講起桃園結義三英戰呂報火燒赤壁六出岐山等等,
+
+ 15
+ 00:01:40,692 --> 00:01:42,920
+ 唉真系感慨好多啊。
+
+ 16
+ 00:01:43,895 --> 00:01:48,110
+ 咁至于繫唔係講完聽完只係得啖笑呢,
+
+ 17
+ 00:01:48,526 --> 00:01:53,929
+ 我睇亦未必古時啊一個朝代嘅聖衰興亡,
+
+ 18
+ 00:01:54,405 --> 00:01:56,483
+ 一個人物嘅成功失敗,
+
+ 19
+ 00:01:56,958 --> 00:02:01,411
+ 總係可以使我哋今人從中得到啲啓發同教益嘅,
+
+ 20
+ 00:02:01,827 --> 00:02:02,480
+ 好啦好啦,
+
+ 21
+ 00:02:03,133 --> 00:02:05,390
+ 閒話幽題言歸正傳。
gold.srt ADDED
@@ -0,0 +1,63 @@
+ 1
+ 00:00:01,491 --> 00:00:09,158
+ 各位朋友,喺講《三國演義》之前啊,我唸一首詞畀大家聽下吓。
+
+ 2
+ 00:00:10,342 --> 00:00:17,103
+ 滾滾長江東逝水,浪花淘盡英雄。
+
+ 3
+ 00:00:17,786 --> 00:00:26,490
+ 是非成敗轉頭空,青山依舊在,幾度夕陽紅。
+
+ 4
+ 00:00:27,695 --> 00:00:33,815
+ 白髮漁樵江渚上,慣看秋月春風。
+
+ 5
+ 00:00:34,373 --> 00:00:44,340
+ 一壺濁酒喜相逢,古今多少事,都付笑談中。
+
+ 6
+ 00:00:44,649 --> 00:00:50,377
+ 哈哈哈,好一個古今多少事,都付笑談中啊。
+
+ 7
+ 00:00:50,940 --> 00:00:55,857
+ 一部《三國演義》,係講魏蜀吳三國嘅歷史。
+
+ 8
+ 00:00:56,711 --> 00:01:04,396
+ 由東漢靈帝中平元年,即係公元一八四年,黃巾起義嗰陣開始。
+
+ 9
+ 00:01:04,796 --> 00:01:15,538
+ 一直寫到晉武帝太康元年,即係公元二八零年嘞,吳國滅亡嗰陣為止,差唔多成個世紀咁長。
+
+ 10
+ 00:01:16,295 --> 00:01:20,228
+ 佢哋而家呢,有成一千八百年咁耐嘞。
+
+ 11
+ 00:01:20,670 --> 00:01:24,920
+ 你話多少英雄豪傑啊,已經化為烏有。
+
+ 12
+ 00:01:25,745 --> 00:01:31,689
+ 但係佢哋嘅事蹟,就好似滾滾長江一直流傳到今日。
+
+ 13
+ 00:01:32,566 --> 00:01:42,883
+ 而家講起桃園結義、三英戰呂布、火燒赤壁、六出祁山等等,嗨真係感慨好多啊。
+
+ 14
+ 00:01:43,695 --> 00:01:49,863
+ 噉至於係唔係講完聽完,只係得啖笑呢?我睇亦未必。
+
+ 15
+ 00:01:50,296 --> 00:02:01,787
+ 古時啊,一個朝代嘅盛衰興亡,一個人物嘅成功失敗,總係可以使我哋今人從中得到啲啓發同教育嘅。
+
+ 16
+ 00:02:01,787 --> 00:02:05,403
+ 好嘞好嘞,閒話休提言歸正傳嘞。
main.py ADDED
@@ -0,0 +1,178 @@
+ import argparse
+ from typing import Dict, Tuple
+
+ from visualize import generate_html_report
+ from utils import read_srt_text, preprocess_chinese_text
+ import jiwer
+
+ # Expose read_srt_text function
+ from utils import read_srt_text
+
+ def parse_arguments():
+     parser = argparse.ArgumentParser(
+         description="Calculate Character Error Rate (CER) for Chinese SRT files",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+
+     parser.add_argument(
+         "-r",
+         "--reference",
+         required=True,
+         help="Path to the reference (golden) SRT file",
+     )
+
+     parser.add_argument(
+         "-i", "--input", required=True, help="Path to the input (hypothesis) SRT file"
+     )
+
+     parser.add_argument("-o", "--output", help="Path to save the results (optional)")
+
+     parser.add_argument("--html", help="Path to save the HTML visualization (optional)")
+
+     return parser.parse_args()
+
+ def calculate_cer_both_versions(
+     reference_path: str, hypothesis_path: str
+ ) -> Tuple[Dict, Dict]:
+     """
+     Calculate CER and related metrics between reference and hypothesis SRT files,
+     both with and without punctuation.
+
+     Args:
+         reference_path (str): Path to the reference SRT file
+         hypothesis_path (str): Path to the hypothesis SRT file
+
+     Returns:
+         Tuple[Dict, Dict]: Two dictionaries containing metrics (with and without punctuation)
+     """
+     # Read files
+     reference_text = read_srt_text(reference_path)
+     hypothesis_text = read_srt_text(hypothesis_path)
+
+     # Calculate metrics without punctuation
+     reference_chars_no_punct = preprocess_chinese_text(
+         reference_text, include_punctuation=False
+     )
+     hypothesis_chars_no_punct = preprocess_chinese_text(
+         hypothesis_text, include_punctuation=False
+     )
+
+     metrics_no_punct = jiwer.compute_measures(
+         reference_chars_no_punct, hypothesis_chars_no_punct
+     )
+
+     # Calculate metrics with punctuation
+     reference_chars_with_punct = preprocess_chinese_text(
+         reference_text, include_punctuation=True
+     )
+     hypothesis_chars_with_punct = preprocess_chinese_text(
+         hypothesis_text, include_punctuation=True
+     )
+
+     metrics_with_punct = jiwer.compute_measures(
+         reference_chars_with_punct, hypothesis_chars_with_punct
+     )
+
+     # Add character counts
+     metrics_no_punct["total_ref_chars"] = len(reference_chars_no_punct.replace(" ", ""))
+     metrics_no_punct["total_hyp_chars"] = len(
+         hypothesis_chars_no_punct.replace(" ", "")
+     )
+     metrics_with_punct["total_ref_chars"] = len(
+         reference_chars_with_punct.replace(" ", "")
+     )
+     metrics_with_punct["total_hyp_chars"] = len(
+         hypothesis_chars_with_punct.replace(" ", "")
+     )
+
+     return metrics_no_punct, metrics_with_punct
+
+
+ def format_metrics(metrics: dict, version: str) -> str:
+     """
+     Format metrics into a string.
+
+     Args:
+         metrics (dict): Dictionary of metric values
+         version (str): String indicating which version of metrics these are
+
+     Returns:
+         str: Formatted metrics string
+     """
+     output = []
+     output.append(f"\n=== {version} ===")
+     output.append(f"Character Error Rate (CER): {metrics['wer']:.3f}")
+     output.append(f"Total Reference Characters: {metrics['total_ref_chars']}")
+     output.append(f"Total Hypothesis Characters: {metrics['total_hyp_chars']}")
+
+     output.append("\nDetailed Statistics:")
+     output.append(f"Correct Characters: {metrics['hits']}")
+     output.append(f"Substitutions: {metrics['substitutions']}")
+     output.append(f"Deletions: {metrics['deletions']}")
+     output.append(f"Insertions: {metrics['insertions']}")
+
+     # Calculate and print percentage stats
+     total_errors = (
+         metrics["substitutions"] + metrics["deletions"] + metrics["insertions"]
+     )
+     total_chars = metrics["total_ref_chars"]
+
+     output.append(f"\nError Analysis:")
+     output.append(f"Total Errors: {total_errors}")
+     output.append(f"Substitution Rate: {metrics['substitutions']/total_chars:.3f}")
+     output.append(f"Deletion Rate: {metrics['deletions']/total_chars:.3f}")
+     output.append(f"Insertion Rate: {metrics['insertions']/total_chars:.3f}")
+
+     return "\n".join(output)
+
+
+
+ if __name__ == "__main__":
+     args = parse_arguments()
+
+     try:
+         # Read the original texts
+         reference_text = read_srt_text(args.reference)
+         hypothesis_text = read_srt_text(args.input)
+
+         # Calculate metrics
+         metrics_no_punct, metrics_with_punct = calculate_cer_both_versions(
+             args.reference, args.input
+         )
+
+         # Generate and save HTML report if requested
+         if args.html:
+             html_content = generate_html_report(
+                 reference_text, hypothesis_text, metrics_no_punct, metrics_with_punct
+             )
+             with open(args.html, "w", encoding="utf-8") as f:
+                 f.write(html_content)
+             print(f"\nHTML visualization has been saved to: {args.html}")
+
+         # Original metrics output
+         output_text = []
+         output_text.append(
+             format_metrics(metrics_no_punct, "Metrics Without Punctuation")
+         )
+         output_text.append(
+             format_metrics(metrics_with_punct, "Metrics With Punctuation")
+         )
+         output_text.append("\n=== Comparison ===")
+         output_text.append(f"CER without punctuation: {metrics_no_punct['wer']:.3f}")
+         output_text.append(f"CER with punctuation: {metrics_with_punct['wer']:.3f}")
+         output_text.append(
+             f"Difference: {abs(metrics_with_punct['wer'] - metrics_no_punct['wer']):.3f}"
+         )
+
+         final_output = "\n".join(output_text)
+         print(final_output)
+
+         if args.output:
+             with open(args.output, "w", encoding="utf-8") as f:
+                 f.write(final_output)
+             print(f"\nResults have been saved to: {args.output}")
+
+     except FileNotFoundError as e:
+         print(f"Error: Could not find one of the input files - {str(e)}")
+     except Exception as e:
+         print(f"Error occurred: {str(e)}")
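Because `preprocess_chinese_text` space-separates every character, jiwer's word-level measures operate per character here, so the `'wer'` key that `format_metrics` prints really is a CER. A tiny sanity check of that reading (the hypothesis string below is invented for illustration):

```python
# Sketch: with characters space-separated, jiwer's "wer" equals
# (substitutions + deletions + insertions) / reference length, i.e. a CER.
import jiwer

ref = " ".join("滾滾長江東逝水")  # 7 reference characters (a line from gold.srt)
hyp = " ".join("滾滾長江東流水")  # invented hypothesis with one substitution
m = jiwer.compute_measures(ref, hyp)

n_ref = m["substitutions"] + m["deletions"] + m["hits"]
cer = (m["substitutions"] + m["deletions"] + m["insertions"]) / n_ref
print(m["wer"], cer)  # both 0.142857..., i.e. 1 error over 7 characters
```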
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ jiwer==3.0.3
+ gradio==3.50.2
utils.py ADDED
@@ -0,0 +1,50 @@
+ import re
+
+ def read_srt_text(file_path: str) -> str:
+     """
+     Read an SRT file and extract only the text content, ignoring timestamps.
+
+     Args:
+         file_path (str): Path to the SRT file
+
+     Returns:
+         str: Concatenated text content from the SRT file
+     """
+     with open(file_path, "r", encoding="utf-8") as f:
+         content = f.read()
+
+     # Split content into subtitle blocks
+     blocks = content.strip().split("\n\n")
+
+     # Extract only the text lines (not numbers or timestamps)
+     text_lines = []
+     for block in blocks:
+         lines = block.split("\n")
+         # Skip the subtitle number and timestamp lines
+         text = " ".join(lines[2:])  # Join all lines after timestamp
+         text_lines.append(text)
+
+     return " ".join(text_lines)
+
+ def preprocess_chinese_text(text: str, include_punctuation: bool = False) -> str:
+     """
+     Preprocess Chinese text for CER calculation.
+
+     Args:
+         text (str): Input Chinese text
+         include_punctuation (bool): Whether to include punctuation in the calculation
+
+     Returns:
+         str: Preprocessed text with characters separated by spaces
+     """
+     # Remove any English characters, numbers, and extra spaces
+     text = re.sub(r"[a-zA-Z0-9\s]+", "", text)
+
+     if not include_punctuation:
+         # Remove both Chinese and English punctuation with properly escaped characters
+         text = re.sub(
+             r'[,。!?:;""' "()【】《》、,\.!?:;\"'\\(\\)\\[\\]\\{\\}]", "", text
+         )
+
+     # Convert to list of characters and join with spaces
+     return " ".join(list(text))
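To make the preprocessing concrete, an illustrative call (the input string is invented): English letters, digits, and whitespace are stripped first, punctuation is dropped unless `include_punctuation=True`, and the surviving characters are joined with spaces so jiwer scores them one by one:

```python
# Illustrative only; the input string is not taken from the repo's data.
from utils import preprocess_chinese_text

print(preprocess_chinese_text("各位朋友,ABC 123!"))
# -> 各 位 朋 友
print(preprocess_chinese_text("各位朋友,ABC 123!", include_punctuation=True))
# -> 各 位 朋 友 , !
```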
visualize.py ADDED
@@ -0,0 +1,269 @@
+ import difflib
+ from dataclasses import dataclass
+ from html import escape
+ from typing import List, Tuple
+
+ from utils import preprocess_chinese_text
+
+
+ @dataclass
+ class DiffResult:
+     reference_display: str
+     hypothesis_display: str
+     error_pairs: List[Tuple[str, str]]
+
+
+ def visualize_differences(
+     ref_text: str, hyp_text: str, include_punctuation: bool = False
+ ) -> DiffResult:
+     """
+     Create a visualization of the differences between reference and hypothesis texts.
+
+     Args:
+         ref_text (str): Reference text
+         hyp_text (str): Hypothesis text
+         include_punctuation (bool): Whether to include punctuation
+
+     Returns:
+         DiffResult: Containing formatted reference and hypothesis texts with error highlighting
+     """
+     # Preprocess texts
+     ref_processed = preprocess_chinese_text(ref_text, include_punctuation)
+     hyp_processed = preprocess_chinese_text(hyp_text, include_punctuation)
+
+     # Split into characters
+     ref_chars = ref_processed.split()
+     hyp_chars = hyp_processed.split()
+
+     # Get sequence matcher
+     matcher = difflib.SequenceMatcher(None, ref_chars, hyp_chars)
+
+     ref_formatted = []
+     hyp_formatted = []
+     error_pairs = []
+
+     for op, ref_start, ref_end, hyp_start, hyp_end in matcher.get_opcodes():
+         if op == "equal":
+             ref_formatted.extend(ref_chars[ref_start:ref_end])
+             hyp_formatted.extend(hyp_chars[hyp_start:hyp_end])
+         elif op == "delete":
+             # Deletion - character in reference but not in hypothesis
+             for char in ref_chars[ref_start:ref_end]:
+                 ref_formatted.append(f"[DEL]{char}[/DEL]")
+                 hyp_formatted.append("[DEL]_[/DEL]")
+                 error_pairs.append((char, "_"))
+         elif op == "insert":
+             # Insertion - character in hypothesis but not in reference
+             for char in hyp_chars[hyp_start:hyp_end]:
+                 ref_formatted.append("[INS]_[/INS]")
+                 hyp_formatted.append(f"[INS]{char}[/INS]")
+                 error_pairs.append(("_", char))
+         elif op == "replace":
+             # Substitution - different characters in reference and hypothesis
+             for ref_char, hyp_char in zip(
+                 ref_chars[ref_start:ref_end], hyp_chars[hyp_start:hyp_end]
+             ):
+                 ref_formatted.append(f"[SUB]{ref_char}[/SUB]")
+                 hyp_formatted.append(f"[SUB]{hyp_char}[/SUB]")
+                 error_pairs.append((ref_char, hyp_char))
+
+     return DiffResult(
+         reference_display="".join(ref_formatted),
+         hypothesis_display="".join(hyp_formatted),
+         error_pairs=error_pairs,
+     )
+
+
+ def generate_html_report(
+     ref_text: str, hyp_text: str, metrics_no_punct: dict, metrics_with_punct: dict
+ ) -> str:
+     """
+     Generate an HTML report with error visualization and metrics.
+     """
+     # Get visualizations for both versions
+     diff_no_punct = visualize_differences(ref_text, hyp_text, False)
+     diff_with_punct = visualize_differences(ref_text, hyp_text, True)
+
+     def format_text_for_html(text: str) -> str:
+         """Format text with HTML spans for coloring"""
+         text = escape(text)
+         text = text.replace("[DEL]", '<span class="deletion">')
+         text = text.replace("[/DEL]", "</span>")
+         text = text.replace("[INS]", '<span class="insertion">')
+         text = text.replace("[/INS]", "</span>")
+         text = text.replace("[SUB]", '<span class="substitution">')
+         text = text.replace("[/SUB]", "</span>")
+         return text
+
+     def format_error_pairs(pairs: List[Tuple[str, str]]) -> str:
+         """Format error pairs into HTML table rows"""
+         rows = []
+         for ref_char, hyp_char in pairs:
+             rows.append(
+                 f"<tr><td>{escape(ref_char)}</td><td>{escape(hyp_char)}</td></tr>"
+             )
+         return "\n".join(rows)
+
+     # Calculate metrics for no punctuation
+     ref_no_punct = preprocess_chinese_text(ref_text, False)
+     total_chars_no_punct = len(ref_no_punct.split())
+     # total_words_no_punct = len([w for w in ref_no_punct.split() if w.strip()])
+
+     cer_no_punct = metrics_no_punct['wer']
+     total_errors_no_punct = metrics_no_punct['substitutions'] + \
+         metrics_no_punct['deletions'] + metrics_no_punct['insertions']
+     substitutions_no_punct = metrics_no_punct['substitutions']
+     deletions_no_punct = metrics_no_punct['deletions']
+     insertions_no_punct = metrics_no_punct['insertions']
+
+     # Calculate metrics for with punctuation
+     ref_with_punct = preprocess_chinese_text(ref_text, True)
+     total_chars_punct = len(ref_with_punct.split())
+     # total_words_punct = len([w for w in ref_with_punct.split() if w.strip()])
+
+     cer_punct = metrics_with_punct['wer']
+     total_errors_punct = metrics_with_punct['substitutions'] + \
+         metrics_with_punct['deletions'] + metrics_with_punct['insertions']
+     substitutions_punct = metrics_with_punct['substitutions']
+     deletions_punct = metrics_with_punct['deletions']
+     insertions_punct = metrics_with_punct['insertions']
+
+     html_template = """
+     <!DOCTYPE html>
+     <html>
+     <head>
+         <meta charset="UTF-8">
+         <title>CER Analysis Report</title>
+         <style>
+             body {{ font-family: Arial, sans-serif; margin: 20px; }}
+             .container {{ max-width: 100%; margin: 0 auto; }}
+             .metrics {{ margin: 20px 0; padding: 10px; background: #f5f5f5; }}
+             .visualization {{ margin: 20px 0; }}
+             .deletion {{ background-color: #ffd7d7; text-decoration: line-through; }}
+             .insertion {{ background-color: #d7ffd7; }}
+             .substitution {{ background-color: #fff3d7; }}
+             .text-display {{ font-size: 16px; line-height: 1.6; white-space: pre-wrap; }}
+             table {{ border-collapse: collapse; margin: 10px 0; }}
+             th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
+             th {{ background-color: #f5f5f5; }}
+             .legend {{ margin: 20px 0; }}
+             .legend span {{ padding: 2px 5px; margin-right: 10px; }}
+             h2 {{ margin-top: 30px; }}
+             .grid-container {{ display: grid; grid-template-columns: auto auto; column-gap: 24px;}}
+             .grid-item {{ }}
+         </style>
+     </head>
+     <body>
+         <div class="container">
+             <h1>Character Error Rate Analysis Report</h1>
+
+             <div class="legend">
+                 <h3>Legend:</h3>
+                 <span class="deletion">Deletion</span>
+                 <span class="insertion">Insertion</span>
+                 <span class="substitution">Substitution</span>
+             </div>
+
+             <div class="grid-container">
+                 <div class="grid-item">
+                     <h2>Without Punctuation</h2>
+                     <table class="metrics">
+                         <thead>
+                             <tr>
+                                 <th>Total Chars</th>
+                                 <th>CER</th>
+                                 <th>Total Errors</th>
+                                 <th>Substitutions</th>
+                                 <th>Deletions</th>
+                                 <th>Insertions</th>
+                             </tr>
+                         </thead>
+                         <tbody>
+                             <tr>
+                                 <td>{total_chars_no_punct}</td>
+                                 <td>{cer_no_punct:.3f}</td>
+                                 <td>{total_errors_no_punct}</td>
+                                 <td>{substitutions_no_punct}</td>
+                                 <td>{deletions_no_punct}</td>
+                                 <td>{insertions_no_punct}</td>
+                             </tr>
+                         </tbody>
+                     </table>
+                     <div class="visualization">
+                         <h3>Reference Text:</h3>
+                         <div class="text-display">{ref_no_punct}</div>
+                         <h3>Hypothesis Text:</h3>
+                         <div class="text-display">{hyp_no_punct}</div>
+
+                         <h3>Error Pairs:</h3>
+                         <table>
+                             <tr><th>Reference</th><th>Hypothesis</th></tr>
+                             {pairs_no_punct}
+                         </table>
+                     </div>
+                 </div>
+
+                 <div class="grid-item">
+                     <h2>With Punctuation</h2>
+                     <table class="metrics">
+                         <thead>
+                             <tr>
+                                 <th>Total Chars</th>
+                                 <th>CER</th>
+                                 <th>Total Errors</th>
+                                 <th>Substitutions</th>
+                                 <th>Deletions</th>
+                                 <th>Insertions</th>
+                             </tr>
+                         </thead>
+                         <tbody>
+                             <tr>
+                                 <td>{total_chars_punct}</td>
+                                 <td>{cer_punct:.3f}</td>
+                                 <td>{total_errors_punct}</td>
+                                 <td>{substitutions_punct}</td>
+                                 <td>{deletions_punct}</td>
+                                 <td>{insertions_punct}</td>
+                             </tr>
+                         </tbody>
+                     </table>
+                     <div class="visualization">
+                         <h3>Reference Text:</h3>
+                         <div class="text-display">{ref_with_punct}</div>
+                         <h3>Hypothesis Text:</h3>
+                         <div class="text-display">{hyp_with_punct}</div>
+
+                         <h3>Error Pairs:</h3>
+                         <table>
+                             <tr><th>Reference</th><th>Hypothesis</th></tr>
+                             {pairs_with_punct}
+                         </table>
+                     </div>
+                 </div>
+             </div>
+         </div>
+     </body>
+     </html>
+     """
+
+     return html_template.format(
+         cer_no_punct=cer_no_punct,
+         total_errors_no_punct=total_errors_no_punct,
+         insertions_no_punct=insertions_no_punct,
+         deletions_no_punct=deletions_no_punct,
+         substitutions_no_punct=substitutions_no_punct,
+         cer_punct=cer_punct,
+         total_errors_punct=total_errors_punct,
+         insertions_punct=insertions_punct,
+         deletions_punct=deletions_punct,
+         substitutions_punct=substitutions_punct,
+         total_chars_no_punct=total_chars_no_punct,
+         total_chars_punct=total_chars_punct,
+         ref_no_punct=format_text_for_html(diff_no_punct.reference_display),
+         hyp_no_punct=format_text_for_html(diff_no_punct.hypothesis_display),
+         pairs_no_punct=format_error_pairs(diff_no_punct.error_pairs),
+         ref_with_punct=format_text_for_html(diff_with_punct.reference_display),
+         hyp_with_punct=format_text_for_html(
+             diff_with_punct.hypothesis_display),
+         pairs_with_punct=format_error_pairs(diff_with_punct.error_pairs),
+     )
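For orientation, a sketch of the intermediate markup that `visualize_differences` produces before `format_text_for_html` turns it into colored spans (the two short strings are invented for the example):

```python
# Illustrative only; the reference/hypothesis strings are not from the repo's data.
from visualize import visualize_differences

result = visualize_differences("青山依舊在", "青山依然在")
print(result.reference_display)   # 青山依[SUB]舊[/SUB]在
print(result.hypothesis_display)  # 青山依[SUB]然[/SUB]在
print(result.error_pairs)         # [('舊', '然')]
```

`generate_html_report` then replaces each `[SUB]…[/SUB]` pair with `<span class="substitution">…</span>` (and likewise for `[DEL]`/`[INS]`), so the errors appear highlighted side by side in the report.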