weewtr1121 committed
Commit bf19162
1 Parent(s): 54df004
README.md CHANGED
@@ -1,7 +1,7 @@
  ---
  library_name: transformers
  license: apache-2.0
- base_model: openai/whisper-large-v3
+ base_model: biodatlab/whisper-th-small-combined
  tags:
  - generated_from_trainer
  model-index:
@@ -14,10 +14,10 @@ should probably proofread and complete it, then remove this comment. -->
 
  # outs
 
- This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on an unknown dataset.
+ This model is a fine-tuned version of [biodatlab/whisper-th-small-combined](https://huggingface.co/biodatlab/whisper-th-small-combined) on an unknown dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.6104
- - Cer: 25.0366
+ - Loss: 0.1552
+ - Cer: 13.5275
 
  ## Model description
 
@@ -36,22 +36,27 @@ More information needed
  ### Training hyperparameters
 
  The following hyperparameters were used during training:
- - learning_rate: 8e-05
- - train_batch_size: 2
- - eval_batch_size: 2
+ - learning_rate: 2e-05
+ - train_batch_size: 4
+ - eval_batch_size: 4
  - seed: 42
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 8
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: linear
- - num_epochs: 3.0
+ - lr_scheduler_warmup_steps: 1000
+ - num_epochs: 10.0
  - mixed_precision_training: Native AMP
 
  ### Training results
 
- | Training Loss | Epoch | Step | Validation Loss | Cer     |
- |:-------------:|:-----:|:----:|:---------------:|:-------:|
- | 0.3211        | 1.0   | 1073 | 0.7999          | 39.8078 |
- | 0.0795        | 2.0   | 2146 | 0.6431          | 23.1977 |
- | 0.0101        | 3.0   | 3219 | 0.6104          | 25.0366 |
+ | Training Loss | Epoch  | Step | Validation Loss | Cer     |
+ |:-------------:|:------:|:----:|:---------------:|:-------:|
+ | 0.1733        | 1.8622 | 500  | 0.1206          | 7.2293  |
+ | 0.1159        | 3.7244 | 1000 | 0.1404          | 10.6943 |
+ | 0.0596        | 5.5866 | 1500 | 0.1665          | 12.2340 |
+ | 0.0399        | 7.4488 | 2000 | 0.1486          | 11.8316 |
+ | 0.0224        | 9.3110 | 2500 | 0.1552          | 13.5275 |
 
 
  ### Framework versions
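
The hyperparameter list in the new model card only records values. As a hedged illustration, the sketch below shows how those values would map onto transformers' Seq2SeqTrainingArguments; the argument names and output_dir are assumptions for illustration, not taken from the author's training script.

```python
from transformers import Seq2SeqTrainingArguments

# Minimal sketch, assuming the standard Trainer/Seq2SeqTrainer setup.
training_args = Seq2SeqTrainingArguments(
    output_dir="outs",              # hypothetical; echoes the model card title
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective (total) train batch size: 4 * 2 = 8
    warmup_steps=1000,
    num_train_epochs=10,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,                      # "Native AMP" mixed-precision training
)
```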
config.json CHANGED
@@ -1,8 +1,8 @@
  {
- "_name_or_path": "openai/whisper-large-v3",
+ "_name_or_path": "biodatlab/whisper-th-small-combined",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
- "apply_spec_augment": false,
+ "apply_spec_augment": true,
  "architectures": [
  "WhisperForConditionalGeneration"
  ],
@@ -13,38 +13,39 @@
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
- "d_model": 1280,
- "decoder_attention_heads": 20,
- "decoder_ffn_dim": 5120,
+ "d_model": 768,
+ "decoder_attention_heads": 12,
+ "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
- "decoder_layers": 32,
+ "decoder_layers": 12,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
- "encoder_attention_heads": 20,
- "encoder_ffn_dim": 5120,
+ "encoder_attention_heads": 12,
+ "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
- "encoder_layers": 32,
+ "encoder_layers": 12,
  "eos_token_id": 50257,
+ "forced_decoder_ids": null,
  "init_std": 0.02,
  "is_encoder_decoder": true,
- "mask_feature_length": 10,
+ "mask_feature_length": 64,
  "mask_feature_min_masks": 0,
- "mask_feature_prob": 0.0,
+ "mask_feature_prob": 0.1,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
- "mask_time_prob": 0.05,
+ "mask_time_prob": 0.1,
  "max_length": 448,
  "max_source_positions": 1500,
  "max_target_positions": 448,
  "median_filter_width": 7,
  "model_type": "whisper",
- "num_hidden_layers": 32,
- "num_mel_bins": 128,
- "pad_token_id": 50256,
+ "num_hidden_layers": 12,
+ "num_mel_bins": 80,
+ "pad_token_id": 50257,
  "scale_embedding": false,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "use_weighted_layer_sum": false,
- "vocab_size": 51866
+ "vocab_size": 51865
  }
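
The config.json change swaps the whisper-large-v3 geometry for the whisper-small geometry of the new base model and turns SpecAugment on. A minimal sanity-check sketch (not part of the commit; it assumes you run it from a local checkout containing the updated config.json):

```python
from transformers import WhisperConfig

config = WhisperConfig.from_pretrained(".")  # local checkout with the updated config.json

assert config.d_model == 768 and config.encoder_layers == 12  # whisper-small, not large-v3 (1280 / 32)
assert config.num_mel_bins == 80                              # matches feature_size in preprocessor_config.json
print(config.apply_spec_augment, config.mask_time_prob, config.mask_feature_prob)  # True 0.1 0.1
```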
generation_config.json CHANGED
@@ -1,44 +1,44 @@
  {
  "alignment_heads": [
  [
- 7,
- 0
+ 5,
+ 3
  ],
  [
- 10,
- 17
+ 5,
+ 9
  ],
  [
- 12,
- 18
+ 8,
+ 0
  ],
  [
- 13,
- 12
+ 8,
+ 4
  ],
  [
- 16,
- 1
+ 8,
+ 7
  ],
  [
- 17,
- 14
+ 8,
+ 8
  ],
  [
- 19,
- 11
+ 9,
+ 0
  ],
  [
- 21,
- 4
+ 9,
+ 7
  ],
  [
- 24,
- 1
+ 9,
+ 9
  ],
  [
- 25,
- 6
+ 10,
+ 5
  ]
  ],
  "begin_suppress_tokens": [
@@ -148,15 +148,14 @@
  "<|vi|>": 50278,
  "<|yi|>": 50335,
  "<|yo|>": 50325,
- "<|yue|>": 50358,
  "<|zh|>": 50260
  },
  "language": "th",
  "max_initial_timestamp_index": 50,
  "max_length": 448,
- "no_timestamps_token_id": 50364,
+ "no_timestamps_token_id": 50363,
  "pad_token_id": 50257,
- "prev_sot_token_id": 50362,
+ "prev_sot_token_id": 50361,
  "return_timestamps": false,
  "suppress_tokens": [
  1,
@@ -242,16 +241,16 @@
  49870,
  50254,
  50258,
+ 50358,
  50359,
  50360,
  50361,
- 50362,
- 50363
+ 50362
  ],
  "task": "transcribe",
  "task_to_id": {
- "transcribe": 50360,
- "translate": 50359
+ "transcribe": 50359,
+ "translate": 50358
  },
  "transformers_version": "4.44.2"
  }
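
The generation_config.json change re-points the alignment heads and special-token IDs (no_timestamps, prev_sot, task_to_id, suppress list) at the smaller multilingual vocabulary while keeping language="th" and task="transcribe". A hedged usage sketch; the repo id and audio file below are hypothetical placeholders:

```python
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="weewtr1121/outs",  # hypothetical repo id; replace with the actual checkpoint path
    chunk_length_s=30,        # matches chunk_length in preprocessor_config.json
)

# language/task mirror the committed generation config; passing them explicitly is optional
result = asr("thai_sample.wav", generate_kwargs={"language": "th", "task": "transcribe"})
print(result["text"])
```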
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5da00a2bc05aa8cc05b70eef258cc7194ba0c011fd1b352e94fdedac589feacd
+ oid sha256:fddfc79aa151f2ebdf6c2f7431ca578a9b15b5f750760ef25cb83c7877fc8969
  size 966995080
preprocessor_config.json CHANGED
@@ -1,7 +1,7 @@
  {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
- "feature_size": 128,
+ "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
runs/Oct08_17-42-57_897a695464dd/events.out.tfevents.1728409410.897a695464dd.2598.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:851eadf2d89d002bfb2651e1e29d4841b42c0145734ee7edf29ee58d16f219de
+ size 13114
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:06fad7890ce3cf6185ec9294223ebb0348aed4fde156a6b405a801d12bc04ca9
+ oid sha256:43f7ff2c50d3e04d4b8ff85050eae40b1a5219a4c6ebddf23606a80e1a42df42
  size 5304