thanhtvt committed on
Commit
5def8da
1 Parent(s): eefcfb3

First checkpoint version

Browse files
.gitattributes CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ checkpoints/avg_top5_25-29.ckpt.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
36
+ checkpoints/avg_top5_25-29.ckpt.index filter=lfs diff=lfs merge=lfs -text
37
+ checkpoints/avg_top5_27-32.ckpt.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
38
+ checkpoints/avg_top5_27-32.ckpt.index filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,11 @@
1
  ---
2
  license: mit
3
  ---
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
  ---
4
+
5
+ ### How to clone this repo
6
+ ```
7
+ sudo apt-get install git-lfs
8
+ git clone https://huggingface.co/thanhtvt/uetasr-conformer_30.3m
9
+ cd uetasr-conformer_30.3m
10
+ git lfs pull
11
+ ```
checkpoints/avg_top5_25-29.ckpt.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49fc029829a19daaa12a9a0803a953b65151b242201c4fafa8935898079efb75
3
+ size 246353751
checkpoints/avg_top5_25-29.ckpt.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e767271b8c9e5d7254a5da8e23c659adf3fa080909076d0dedd0eb61e8e28be
3
+ size 105822
checkpoints/avg_top5_27-32.ckpt.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5806d3e06fac6c2fde5a70f5c5d29bf31dcf81557ef744fd67339804ef736050
3
+ size 246353751
checkpoints/avg_top5_27-32.ckpt.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:620561277420e8326531c0036fe013b9f57aa6e577fb7390236679ae15c70ca7
3
+ size 105822
config.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ root_dir: ..
2
+ ckpt_path: !ref <root_dir>/checkpoints/avg_top5_27-32.ckpt
3
+
4
+ text_encoder: !new:uetasr.featurizers.text.Subword
5
+ model_prefix: vocabs/subword_vietnamese_500_v3
6
+ data_path: data/transcript_v3.1.txt
7
+ character_coverage: 1.0
8
+ model_type: bpe # one of: word, bpe, unigram, char
9
+ num_threads: 16
10
+ unk_id: 1
11
+ pad_id: 0
12
+ eos_id: -1
13
+ unk_piece: <unk>
14
+ pad_piece: <blank>
15
+ eos_piece: </s>
16
+ vocab_size: 500
17
+
18
+ audio_encoder: !new:uetasr.featurizers.audio.LogMelSpectrogram
19
+ fs: 16000
20
+ n_fft: 512
21
+ win_length: 400
22
+ hop_length: 160
23
+ n_mels: 80
24
+ fmin: 0
25
+ fmax: 8000
26
+ htk: False
27
+
28
+ d_model: 256
29
+
30
+ encoder_model: !new:uetasr.models.encoders.Conformer
31
+ num_features: 80
32
+ window_size: 1
33
+ d_model: !ref <d_model>
34
+ input_layer: vgg2l
35
+ pos_enc_layer_type: rel_pos
36
+ dropout_rate_pos_enc: 0.2
37
+ selfattention_layer_type: rel_selfattn
38
+ attention_heads: 4
39
+ dropout_rate_att: 0.1
40
+ dropout_rate_pos_wise: 0.1
41
+ dropout_rate: 0.1
42
+ positionwise_layer_type: linear
43
+ linear_units: 1024
44
+ conv_mod_kernel: 31
45
+ num_blocks: 18
46
+ use_macaron: True
47
+ use_cnn_module: True
48
+ eps_layer_norm: 0.000000000001
49
+
50
+ decoder_model: !new:uetasr.models.decoders.RNNDecoder
51
+ vocab_size: !ref <text_encoder.vocab_size>
52
+ embedding_dim: 256
53
+ num_layers: 1
54
+ hidden_dim: !ref <d_model>
55
+ dropout_embed: 0.2
56
+ dropout_rnn: 0.1
57
+ rnn_type: LSTM
58
+
59
+ jointer_model: !new:uetasr.layers.jointer.RNNTJointer
60
+ encoder_dim: !ref <d_model>
61
+ decoder_dim: !ref <d_model>
62
+ hidden_dim: 512
63
+ output_dim: !ref <text_encoder.vocab_size>
64
+
65
+ ctc_lin: null
66
+
67
+ model: !new:uetasr.models.rnnt.RNNT
68
+ encoder: !ref <encoder_model>
69
+ decoder: !ref <decoder_model>
70
+ jointer: !ref <jointer_model>
71
+ ctc_lin: !ref <ctc_lin>
72
+ ctc_dropout: 0.1
73
+ use_cmvn: True
74
+
75
+ decoder: !new:uetasr.searchers.GreedyRNNTV2
76
+ decoder: !ref <decoder_model>
77
+ jointer: !ref <jointer_model>
78
+ text_decoder: !ref <text_encoder>
79
+ max_symbols_per_step: 10
vocabs/subword_vietnamese_500_v3.model ADDED
Binary file (245 kB). View file
 
vocabs/subword_vietnamese_500_v3.vocab ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <blank> 0
2
+ <unk> 0
3
+ ▁t -0
4
+ ng -1
5
+ ▁c -2
6
+ ▁đ -3
7
+ ▁n -4
8
+ ▁l -5
9
+ ▁th -6
10
+ ▁v -7
11
+ ▁m -8
12
+ ▁h -9
13
+ ▁ch -10
14
+ ▁b -11
15
+ nh -12
16
+ ▁k -13
17
+ ▁nh -14
18
+ ▁tr -15
19
+ ▁s -16
20
+ ▁g -17
21
+ ▁kh -18
22
+ ▁ng -19
23
+ ▁là -20
24
+ ▁p -21
25
+ ▁cá -22
26
+ ▁ph -23
27
+ ▁d -24
28
+ ông -25
29
+ ▁gi -26
30
+ iệ -27
31
+ ▁r -28
32
+ ▁q -29
33
+ ▁qu -30
34
+ ên -31
35
+ ▁và -32
36
+ ườ -33
37
+ ▁có -34
38
+ ▁x -35
39
+ ột -36
40
+ ▁củ -37
41
+ ▁của -38
42
+ iế -39
43
+ ▁cái -40
44
+ ▁một -41
45
+ ▁các -42
46
+ ới -43
47
+ ▁không -44
48
+ ượ -45
49
+ ình -46
50
+ ất -47
51
+ ▁nà -48
52
+ ai -49
53
+ ại -50
54
+ ười -51
55
+ ân -52
56
+ uy -53
57
+ ong -54
58
+ ▁thì -55
59
+ ươ -56
60
+ ôi -57
61
+ ược -58
62
+ ▁như -59
63
+ ăm -60
64
+ iề -61
65
+ ▁ngh -62
66
+ ch -63
67
+ ▁nó -64
68
+ ▁được -65
69
+ anh -66
70
+ ▁cho -67
71
+ ây -68
72
+ ▁người -69
73
+ úng -70
74
+ ▁này -71
75
+ ▁ta -72
76
+ ▁trong -73
77
+ an -74
78
+ ấy -75
79
+ ải -76
80
+ ướ -77
81
+ ▁mà -78
82
+ ạn -79
83
+ iể -80
84
+ ững -81
85
+ ần -82
86
+ ▁đó -83
87
+ em -84
88
+ ▁với -85
89
+ ▁những -86
90
+ iên -87
91
+ inh -88
92
+ ▁hai -89
93
+ ay -90
94
+ iều -91
95
+ ao -92
96
+ ▁đã -93
97
+ ▁tôi -94
98
+ ước -95
99
+ ương -96
100
+ ▁việ -97
101
+ ▁chúng -98
102
+ ▁chí -99
103
+ ▁họ -100
104
+ ▁em -101
105
+ ▁ở -102
106
+ ành -103
107
+ ▁sẽ -104
108
+ ▁để -105
109
+ ến -106
110
+ ản -107
111
+ ường -108
112
+ ức -109
113
+ ác -110
114
+ iện -111
115
+ ũng -112
116
+ ▁bạn -113
117
+ ang -114
118
+ ▁ho -115
119
+ ài -116
120
+ ực -117
121
+ ▁độ -118
122
+ ▁cũng -119
123
+ ▁ra -120
124
+ ời -121
125
+ ▁về -122
126
+ ▁phải -123
127
+ ết -124
128
+ ▁năm -125
129
+ ầu -126
130
+ ▁đến -127
131
+ ▁thể -128
132
+ ống -129
133
+ ung -130
134
+ au -131
135
+ am -132
136
+ ▁cả -133
137
+ ận -134
138
+ âu -135
139
+ ơn -136
140
+ ▁làm -137
141
+ ối -138
142
+ ▁từ -139
143
+ ồi -140
144
+ ▁mình -141
145
+ àn -142
146
+ iết -143
147
+ ật -144
148
+ ▁anh -145
149
+ ▁công -146
150
+ àng -147
151
+ ▁lại -148
152
+ ùng -149
153
+ ▁khi -150
154
+ ▁thế -151
155
+ át -152
156
+ ốn -153
157
+ ▁đi -154
158
+ ốc -155
159
+ ồng -156
160
+ on -157
161
+ iến -158
162
+ án -159
163
+ ằng -160
164
+ ▁thu -161
165
+ ▁đây -162
166
+ ▁gì -163
167
+ ăn -164
168
+ in -165
169
+ ội -166
170
+ ▁học -167
171
+ ▁vào -168
172
+ ưa -169
173
+ ▁nhiều -170
174
+ áng -171
175
+ ▁ba -172
176
+ ▁đị -173
177
+ òn -174
178
+ ▁giá -175
179
+ êu -176
180
+ ươi -177
181
+ ▁nhà -178
182
+ eo -179
183
+ ▁nước -180
184
+ ăng -181
185
+ ▁rất -182
186
+ ▁rồi -183
187
+ ▁số -184
188
+ ▁nào -185
189
+ ▁quy -186
190
+ ▁mươi -187
191
+ ▁chính -188
192
+ ▁nhưng -189
193
+ ập -190
194
+ ▁nói -191
195
+ ▁còn -192
196
+ ày -193
197
+ úc -194
198
+ ▁con -195
199
+ ▁sự -196
200
+ ▁thành -197
201
+ ộc -198
202
+ ▁trên -199
203
+ ▁đầu -200
204
+ ▁bị -201
205
+ iểm -202
206
+ ▁tiế -203
207
+ ục -204
208
+ ▁chỉ -205
209
+ ôn -206
210
+ ▁trăm -207
211
+ ậy -208
212
+ ▁động -209
213
+ ▁nam -210
214
+ ▁việc -211
215
+ ▁tại -212
216
+ ▁bả -213
217
+ ích -214
218
+ ▁ông -215
219
+ ữa -216
220
+ ▁quan -217
221
+ ▁hiện -218
222
+ ền -219
223
+ ▁việt -220
224
+ ▁sau -221
225
+ ▁theo -222
226
+ ▁mười -223
227
+ ▁lu -224
228
+ ▁do -225
229
+ ▁ngày -226
230
+ ▁dân -227
231
+ ▁nhân -228
232
+ ▁đề -229
233
+ ắc -230
234
+ ái -231
235
+ ▁y -232
236
+ ▁vậy -233
237
+ ▁mu -234
238
+ iệu -235
239
+ ▁quốc -236
240
+ ▁giờ -237
241
+ ách -238
242
+ ▁cu -239
243
+ ổi -240
244
+ ▁đồng -241
245
+ ▁cô -242
246
+ ▁hơn -243
247
+ ▁thấy -244
248
+ ▁định -245
249
+ ▁to -246
250
+ ấn -247
251
+ ▁xu -248
252
+ òng -249
253
+ ơi -250
254
+ ▁vì -251
255
+ ▁chuy -252
256
+ âm -253
257
+ ẫn -254
258
+ ặc -255
259
+ ▁tư -256
260
+ ▁trường -257
261
+ ▁nhất -258
262
+ ▁phần -259
263
+ ắt -260
264
+ ▁kho -261
265
+ áp -262
266
+ iền -263
267
+ ượng -264
268
+ ▁hàng -265
269
+ ▁trung -266
270
+ ▁biết -267
271
+ ▁dụ -268
272
+ iểu -269
273
+ ▁ạ -270
274
+ ▁mới -271
275
+ ▁thực -272
276
+ ìn -273
277
+ ếu -274
278
+ ▁sản -275
279
+ ▁thời -276
280
+ ạt -277
281
+ ạo -278
282
+ ặt -279
283
+ ánh -280
284
+ ▁điều -281
285
+ iển -282
286
+ ▁đấy -283
287
+ ▁phát -284
288
+ ▁hội -285
289
+ ▁sinh -286
290
+ ▁đang -287
291
+ ▁nghiệ -288
292
+ ▁lý -289
293
+ ▁ý -290
294
+ ▁thứ -291
295
+ ịch -292
296
+ ▁hình -293
297
+ ấp -294
298
+ ▁nguy -295
299
+ ▁nên -296
300
+ ▁bộ -297
301
+ ▁gia -298
302
+ ▁cùng -299
303
+ ▁cách -300
304
+ ▁nghĩ -301
305
+ ▁cơ -302
306
+ ▁hợ -303
307
+ ▁trình -304
308
+ ▁hợp -305
309
+ ọi -306
310
+ ▁chủ -307
311
+ ảng -308
312
+ ám -309
313
+ ▁hay -310
314
+ ễn -311
315
+ ốt -312
316
+ ▁thông -313
317
+ ứng -314
318
+ ▁đúng -315
319
+ ▁tiếp -316
320
+ áu -317
321
+ ▁trước -318
322
+ ▁tổ -319
323
+ ▁ngo -320
324
+ ▁khác -321
325
+ ▁vị -322
326
+ ▁tế -323
327
+ ỏi -324
328
+ ▁lo -325
329
+ ▁kinh -326
330
+ ▁dự -327
331
+ ính -328
332
+ ▁lên -329
333
+ ▁điểm -330
334
+ ▁chín -331
335
+ ▁nay -332
336
+ ▁tháng -333
337
+ ▁lớ -334
338
+ ▁viên -335
339
+ ▁chi -336
340
+ ởng -337
341
+ ▁qua -338
342
+ ▁bằng -339
343
+ áo -340
344
+ ▁đối -341
345
+ ▁tuy -342
346
+ êm -343
347
+ iêu -344
348
+ ▁rằng -345
349
+ ện -346
350
+ ▁bốn -347
351
+ ▁nhận -348
352
+ ▁tự -349
353
+ ▁tu -350
354
+ ọng -351
355
+ ▁mạ -352
356
+ ấu -353
357
+ ▁nếu -354
358
+ ▁tin -355
359
+ ▁à -356
360
+ ▁tới -357
361
+ iệt -358
362
+ ▁nghiệp -359
363
+ ▁muốn -360
364
+ ▁cuộc -361
365
+ ▁chưa -362
366
+ ▁cứ -363
367
+ ▁thị -364
368
+ ▁vụ -365
369
+ ▁a -366
370
+ ▁cao -367
371
+ ▁sao -368
372
+ ừa -369
373
+ ▁hành -370
374
+ ▁câu -371
375
+ ào -372
376
+ ▁tài -373
377
+ ▁dụng -374
378
+ ▁bản -375
379
+ ầy -376
380
+ ▁lúc -377
381
+ ▁ -378
382
+ n -379
383
+ h -380
384
+ t -381
385
+ c -382
386
+ i -383
387
+ g -384
388
+ a -385
389
+ m -386
390
+ à -387
391
+ đ -388
392
+ u -389
393
+ l -390
394
+ o -391
395
+ á -392
396
+ v -393
397
+ ư -394
398
+ r -395
399
+ y -396
400
+ b -397
401
+ p -398
402
+ k -399
403
+ ô -400
404
+ s -401
405
+ ó -402
406
+ ế -403
407
+ ạ -404
408
+ ì -405
409
+ ộ -406
410
+ ả -407
411
+ ờ -408
412
+ d -409
413
+ ê -410
414
+ ấ -411
415
+ â -412
416
+ ệ -413
417
+ ố -414
418
+ ớ -415
419
+ ề -416
420
+ ơ -417
421
+ ể -418
422
+ ủ -419
423
+ q -420
424
+ ợ -421
425
+ e -422
426
+ ậ -423
427
+ í -424
428
+ ă -425
429
+ ị -426
430
+ ú -427
431
+ x -428
432
+ ầ -429
433
+ ứ -430
434
+ ữ -431
435
+ ự -432
436
+ ọ -433
437
+ ở -434
438
+ ồ -435
439
+ ã -436
440
+ ụ -437
441
+ ắ -438
442
+ ừ -439
443
+ ò -440
444
+ ổ -441
445
+ ũ -442
446
+ ù -443
447
+ ẽ -444
448
+ ặ -445
449
+ ý -446
450
+ ỉ -447
451
+ ỏ -448
452
+ ằ -449
453
+ ử -450
454
+ é -451
455
+ ĩ -452
456
+ ẩ -453
457
+ ễ -454
458
+ ẫ -455
459
+ ỗ -456
460
+ ẹ -457
461
+ ẻ -458
462
+ ỹ -459
463
+ è -460
464
+ ỳ -461
465
+ õ -462
466
+ ẳ -463
467
+ ỡ -464
468
+ ỷ -465
469
+ f -466
470
+ w -467
471
+ ẵ -468
472
+ z -469
473
+ j -470
474
+ - -471
475
+ ỵ -472
476
+ 2 -473
477
+ 1 -474
478
+ ' -475
479
+ 3 -476
480
+ 0 -477
481
+ ε -478
482
+ λ -479
483
+ 4 -480
484
+ 9 -481
485
+ P -482
486
+ " -483
487
+ 6 -484
488
+ 7 -485
489
+ H -486
490
+ 5 -487
491
+ N -488
492
+ O -489
493
+ [ -490
494
+ ń -491
495
+ ǀ -492
496
+ ǹ -493
497
+ α -494
498
+ β -495
499
+ γ -496
500
+ δ -497