ssiidd committed on
Commit
4967734
1 Parent(s): d231d7c

Add grabo dataset to app

Browse files
app.py CHANGED
@@ -8,6 +8,12 @@ from espnet2.utils.types import str_or_none
8
  from espnet2.bin.asr_inference import Speech2Text
9
  from subprocess import call
10
  import os
 
 
 
 
 
 
11
 
12
  with open('s3prl.sh', 'rb') as file:
13
  script = file.read()
@@ -35,6 +41,13 @@ speech2text_fsc = Speech2Text.from_pretrained(
35
  nbest=1
36
  )
37
 
 
 
 
 
 
 
 
38
  speech2text_catslu = Speech2Text.from_pretrained(
39
  asr_train_config="catslu/config.yaml",
40
  asr_model_file="catslu/valid.acc.ave_5best.pth",
@@ -42,6 +55,14 @@ speech2text_catslu = Speech2Text.from_pretrained(
42
  nbest=1
43
  )
44
 
 
 
 
 
 
 
 
 
45
  def inference(wav,data):
46
  with torch.no_grad():
47
  if data == "english_slurp":
@@ -78,6 +99,16 @@ def inference(wav,data):
78
  nbests = speech2text_catslu(speech)
79
  text, *_ = nbests[0]
80
  text=text.split(" ")[0]
 
 
 
 
 
 
 
 
 
 
81
  # intent=text.split(" ")[0]
82
  # action=intent.split("_")[0]
83
  # objects=intent.split("_")[1]
@@ -96,12 +127,12 @@ title = "ESPnet2-SLU"
96
  description = "Gradio demo for ESPnet2-SLU: Advancing Spoken Language Understanding through ESPnet. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
97
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
98
 
99
- examples=[['audio_slurp.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_catslu.wav',"chinese"]]
100
 
101
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
102
  gr.Interface(
103
  inference,
104
- [gr.inputs.Audio(label="input audio",source = "microphone", type="file"),gr.inputs.Radio(choices=["english_slurp","english_fsc","chinese"], type="value", default="english_slurp", label="Dataset")],
105
  gr.outputs.Textbox(type="str", label="Output"),
106
  title=title,
107
  description=description,
 
8
  from espnet2.bin.asr_inference import Speech2Text
9
  from subprocess import call
10
  import os
11
+ from espnet_model_zoo.downloader import ModelDownloader
12
+ d = ModelDownloader()
13
+ tag="ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best"
14
+ a1= (d.download_and_unpack(tag))
15
+ # print(a1)
16
+ # exit()
17
 
18
  with open('s3prl.sh', 'rb') as file:
19
  script = file.read()
 
41
  nbest=1
42
  )
43
 
44
+ speech2text_snips = Speech2Text.from_pretrained(
45
+ asr_train_config="espnet-slu-snips/config.yaml",
46
+ asr_model_file="espnet-slu-snips/valid.acc.ave_10best.pth",
47
+ # Decoding parameters are not included in the model file
48
+ nbest=1
49
+ )
50
+
51
  speech2text_catslu = Speech2Text.from_pretrained(
52
  asr_train_config="catslu/config.yaml",
53
  asr_model_file="catslu/valid.acc.ave_5best.pth",
 
55
  nbest=1
56
  )
57
 
58
+ speech2text_grabo = Speech2Text.from_pretrained(
59
+ asr_train_config="grabo/config.yaml",
60
+ asr_model_file="grabo/valid.acc.ave_10best.pth",
61
+ ctc_weight=0.0,
62
+ # Decoding parameters are not included in the model file
63
+ nbest=1
64
+ )
65
+
66
  def inference(wav,data):
67
  with torch.no_grad():
68
  if data == "english_slurp":
 
99
  nbests = speech2text_catslu(speech)
100
  text, *_ = nbests[0]
101
  text=text.split(" ")[0]
102
+ # elif data == "english_snips":
103
+ # print(wav.name)
104
+ # speech, rate = soundfile.read(wav.name)
105
+ # nbests = speech2text_snips(speech)
106
+ # text, *_ = nbests[0]
107
+ elif data == "dutch":
108
+ print(wav.name)
109
+ speech, rate = soundfile.read(wav.name)
110
+ nbests = speech2text_grabo(speech)
111
+ text, *_ = nbests[0]
112
  # intent=text.split(" ")[0]
113
  # action=intent.split("_")[0]
114
  # objects=intent.split("_")[1]
 
127
  description = "Gradio demo for ESPnet2-SLU: Advancing Spoken Language Understanding through ESPnet. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
128
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
129
 
130
+ examples=[['audio_slurp.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch"],['audio_catslu.wav',"chinese"]]
131
 
132
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
133
  gr.Interface(
134
  inference,
135
+ [gr.inputs.Audio(label="input audio",source = "microphone", type="file"),gr.inputs.Radio(choices=["english_slurp","english_fsc","dutch","chinese"], type="value", default="english_slurp", label="Dataset")],
136
  gr.outputs.Textbox(type="str", label="Output"),
137
  title=title,
138
  description=description,
audio_grabo.wav ADDED
Binary file (392 kB). View file
 
audio_snips.wav ADDED
Binary file (112 kB). View file
 
espnet-slu-snips/.DS_Store ADDED
Binary file (6.15 kB). View file
 
espnet-slu-snips/.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
espnet-slu-snips/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Fine-tuned on the SNIPS dataset for the SLU task, using a pretrained ASR model with HuBERT features
2
+ ---
3
+ language:
4
+ - en
5
+
6
+ recipe: "https://github.com/espnet/espnet/tree/master/egs2/snips/asr1"
7
+
8
+ datasets:
9
+ - snips: smart-lights-en-close-field
10
+
11
+ metrics:
12
+ - F1 score: 91.7
13
+ ---
espnet-slu-snips/config.yaml ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_asr_hubert_conformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/asr_train_asr_hubert_conformer_raw_bpe100_init_paramexpexp_hubert_large_ll60k_weighted_perturbasr_train_asr_conformer7_hubert_960hr_large_raw_en_bpe5000_sp26epoch.pth:::decoder.output_layer,decoder.embed.0,ctc.ctc_lo_sp
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: null
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 500
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - acc
39
+ - max
40
+ keep_nbest_models: 10
41
+ grad_clip: 5.0
42
+ grad_clip_type: 2.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ use_amp: false
49
+ log_interval: null
50
+ use_tensorboard: true
51
+ use_wandb: false
52
+ wandb_project: null
53
+ wandb_id: null
54
+ wandb_entity: null
55
+ wandb_name: null
56
+ wandb_model_log_interval: -1
57
+ detect_anomaly: false
58
+ pretrain_path: null
59
+ init_param:
60
+ - exp/exp_hubert_large_ll60k_weighted_perturb/asr_train_asr_conformer7_hubert_960hr_large_raw_en_bpe5000_sp/26epoch.pth:::decoder.output_layer,decoder.embed.0,ctc.ctc_lo
61
+ ignore_init_mismatch: false
62
+ freeze_param:
63
+ - frontend.upstream
64
+ num_iters_per_epoch: null
65
+ batch_size: 20
66
+ valid_batch_size: null
67
+ batch_bins: 1000000
68
+ valid_batch_bins: null
69
+ train_shape_file:
70
+ - exp/asr_stats_raw_bpe100_sp/train/speech_shape
71
+ - exp/asr_stats_raw_bpe100_sp/train/text_shape.bpe
72
+ valid_shape_file:
73
+ - exp/asr_stats_raw_bpe100_sp/valid/speech_shape
74
+ - exp/asr_stats_raw_bpe100_sp/valid/text_shape.bpe
75
+ batch_type: folded
76
+ valid_batch_type: null
77
+ fold_length:
78
+ - 80000
79
+ - 150
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ train_data_path_and_name_and_type:
87
+ - - dump/raw/train_sp/wav.scp
88
+ - speech
89
+ - sound
90
+ - - dump/raw/train_sp/text
91
+ - text
92
+ - text
93
+ valid_data_path_and_name_and_type:
94
+ - - dump/raw/dev/wav.scp
95
+ - speech
96
+ - sound
97
+ - - dump/raw/dev/text
98
+ - text
99
+ - text
100
+ allow_variable_data_keys: false
101
+ max_cache_size: 0.0
102
+ max_cache_fd: 32
103
+ valid_max_cache_size: null
104
+ optim: adam
105
+ optim_conf:
106
+ lr: 0.0002
107
+ scheduler: warmuplr
108
+ scheduler_conf:
109
+ warmup_steps: 2000
110
+ token_list:
111
+ - <blank>
112
+ - <unk>
113
+ - DECREASEBRIGHTNESS
114
+ - INCREASEBRIGHTNESS
115
+ - SETLIGHTBRIGHTNESS
116
+ - SETLIGHTCOLOR
117
+ - SWITCHLIGHTOFF
118
+ - SWITCHLIGHTON
119
+ - ▁
120
+ - ▁the
121
+ - ▁lights
122
+ - ▁to
123
+ - e
124
+ - ▁in
125
+ - ▁turn
126
+ - i
127
+ - s
128
+ - l
129
+ - d
130
+ - t
131
+ - ▁please
132
+ - o
133
+ - ▁room
134
+ - ▁light
135
+ - ke
136
+ - ▁brightness
137
+ - ▁i
138
+ - ▁off
139
+ - a
140
+ - ▁be
141
+ - ▁on
142
+ - m
143
+ - ▁ma
144
+ - nt
145
+ - ▁wa
146
+ - r
147
+ - ▁change
148
+ - u
149
+ - ▁set
150
+ - re
151
+ - ▁you
152
+ - y
153
+ - ▁can
154
+ - ▁li
155
+ - g
156
+ - ing
157
+ - ▁down
158
+ - ▁pink
159
+ - p
160
+ - ▁two
161
+ - v
162
+ - ▁lighting
163
+ - ▁of
164
+ - w
165
+ - ▁red
166
+ - at
167
+ - ting
168
+ - ▁bedroom
169
+ - ▁s
170
+ - ▁la
171
+ - ▁need
172
+ - ▁twenty
173
+ - ▁up
174
+ - ▁it
175
+ - eve
176
+ - ▁me
177
+ - f
178
+ - ou
179
+ - ▁green
180
+ - ld
181
+ - ▁increase
182
+ - ▁brighter
183
+ - ▁blue
184
+ - ▁color
185
+ - ▁bright
186
+ - ▁toilet
187
+ - ▁kitchen
188
+ - ▁dim
189
+ - ry
190
+ - ▁lower
191
+ - ▁bathroom
192
+ - ▁switch
193
+ - all
194
+ - ▁twelve
195
+ - ▁dark
196
+ - ▁basement
197
+ - ▁percent
198
+ - x
199
+ - j
200
+ - k
201
+ - c
202
+ - b
203
+ - n
204
+ - '0'
205
+ - '3'
206
+ - q
207
+ - z
208
+ - '4'
209
+ - h
210
+ - <sos/eos>
211
+ init: null
212
+ input_size: null
213
+ ctc_conf:
214
+ dropout_rate: 0.0
215
+ ctc_type: builtin
216
+ reduce: true
217
+ ignore_nan_grad: true
218
+ model_conf:
219
+ ctc_weight: 0.3
220
+ lsm_weight: 0.1
221
+ length_normalized_loss: false
222
+ extract_feats_in_collect_stats: false
223
+ use_preprocessor: true
224
+ token_type: bpe
225
+ bpemodel: data/token_list/bpe_unigram100/bpe.model
226
+ non_linguistic_symbols: null
227
+ cleaner: null
228
+ g2p: null
229
+ speech_volume_normalize: null
230
+ rir_scp: null
231
+ rir_apply_prob: 1.0
232
+ noise_scp: null
233
+ noise_apply_prob: 1.0
234
+ noise_db_range: '13_15'
235
+ frontend: s3prl
236
+ frontend_conf:
237
+ frontend_conf:
238
+ upstream: hubert_large_ll60k
239
+ download_dir: ./hub
240
+ multilayer_feature: true
241
+ fs: 16k
242
+ specaug: specaug
243
+ specaug_conf:
244
+ apply_time_warp: true
245
+ time_warp_window: 5
246
+ time_warp_mode: bicubic
247
+ apply_freq_mask: true
248
+ freq_mask_width_range:
249
+ - 0
250
+ - 30
251
+ num_freq_mask: 2
252
+ apply_time_mask: true
253
+ time_mask_width_range:
254
+ - 0
255
+ - 40
256
+ num_time_mask: 2
257
+ normalize: utterance_mvn
258
+ normalize_conf: {}
259
+ preencoder: linear
260
+ preencoder_conf:
261
+ input_size: 1024
262
+ output_size: 80
263
+ encoder: conformer
264
+ encoder_conf:
265
+ output_size: 512
266
+ attention_heads: 8
267
+ linear_units: 2048
268
+ num_blocks: 12
269
+ dropout_rate: 0.1
270
+ positional_dropout_rate: 0.1
271
+ attention_dropout_rate: 0.1
272
+ input_layer: conv2d
273
+ normalize_before: true
274
+ macaron_style: true
275
+ pos_enc_layer_type: rel_pos
276
+ selfattention_layer_type: rel_selfattn
277
+ activation_type: swish
278
+ use_cnn_module: true
279
+ cnn_module_kernel: 31
280
+ postencoder: null
281
+ postencoder_conf: {}
282
+ decoder: transformer
283
+ decoder_conf:
284
+ attention_heads: 8
285
+ linear_units: 2048
286
+ num_blocks: 6
287
+ dropout_rate: 0.1
288
+ positional_dropout_rate: 0.1
289
+ self_attention_dropout_rate: 0.1
290
+ src_attention_dropout_rate: 0.1
291
+ required:
292
+ - output_dir
293
+ - token_list
294
+ version: 0.10.3a3
295
+ distributed: false
espnet-slu-snips/valid.acc.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6976d53b33b25193e7d3a97cf9766d9ad41b1648b7ad0d807154c70ff23a33e
3
+ size 1701692895
grabo/config.yaml ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_asr.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/asr_conformer_mono16k_warmup800_lr2e-4_accum2
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 250
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - acc
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 2
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_tensorboard: true
52
+ use_wandb: false
53
+ wandb_project: null
54
+ wandb_id: null
55
+ wandb_entity: null
56
+ wandb_name: null
57
+ wandb_model_log_interval: -1
58
+ detect_anomaly: false
59
+ pretrain_path: null
60
+ init_param: []
61
+ ignore_init_mismatch: false
62
+ freeze_param: []
63
+ num_iters_per_epoch: null
64
+ batch_size: 20
65
+ valid_batch_size: null
66
+ batch_bins: 2000000
67
+ valid_batch_bins: null
68
+ train_shape_file:
69
+ - exp/asr_stats_raw_word_sp/train/speech_shape
70
+ - exp/asr_stats_raw_word_sp/train/text_shape.word
71
+ valid_shape_file:
72
+ - exp/asr_stats_raw_word_sp/valid/speech_shape
73
+ - exp/asr_stats_raw_word_sp/valid/text_shape.word
74
+ batch_type: numel
75
+ valid_batch_type: null
76
+ fold_length:
77
+ - 80000
78
+ - 150
79
+ sort_in_batch: descending
80
+ sort_batch: descending
81
+ multiple_iterator: false
82
+ chunk_length: 500
83
+ chunk_shift_ratio: 0.5
84
+ num_cache_chunks: 1024
85
+ train_data_path_and_name_and_type:
86
+ - - dump/raw/train_sp/wav.scp
87
+ - speech
88
+ - sound
89
+ - - dump/raw/train_sp/text
90
+ - text
91
+ - text
92
+ valid_data_path_and_name_and_type:
93
+ - - dump/raw/dev/wav.scp
94
+ - speech
95
+ - sound
96
+ - - dump/raw/dev/text
97
+ - text
98
+ - text
99
+ allow_variable_data_keys: false
100
+ max_cache_size: 0.0
101
+ max_cache_fd: 32
102
+ valid_max_cache_size: null
103
+ optim: adam
104
+ optim_conf:
105
+ lr: 0.0002
106
+ scheduler: warmuplr
107
+ scheduler_conf:
108
+ warmup_steps: 800
109
+ token_list:
110
+ - <blank>
111
+ - <unk>
112
+ - <move_rel-throttle="slow"-distance="little"-direction="backward"-/>
113
+ - <move_rel-throttle="slow"-distance="normal"-direction="backward"-/>
114
+ - <move_rel-throttle="slow"-distance="alot"-direction="backward"-/>
115
+ - <move_abs-throttle="fast"-pos_x="centerx"-pos_y="centery"-/>
116
+ - <move_abs-throttle="fast"-pos_x="left"-pos_y="up"-/>
117
+ - <move_abs-throttle="fast"-pos_x="right"-pos_y="down"-/>
118
+ - <move_abs-throttle="slow"-pos_x="centerx"-pos_y="centery"-/>
119
+ - <move_abs-throttle="slow"-pos_x="left"-pos_y="up"-/>
120
+ - <move_abs-throttle="slow"-pos_x="right"-pos_y="down"-/>
121
+ - <turn_rel-throttle="slow"-angle="south"-/>
122
+ - <move_rel-throttle="fast"-distance="little"-direction="forward"-/>
123
+ - <turn_rel-throttle="slow"-angle="east"-/>
124
+ - <turn_rel-throttle="slow"-angle="west"-/>
125
+ - <turn_rel-throttle="fast"-angle="south"-/>
126
+ - <turn_rel-throttle="fast"-angle="east"-/>
127
+ - <turn_rel-throttle="fast"-angle="west"-/>
128
+ - <turn_abs-angle="west"-/>
129
+ - <turn_abs-angle="east"-/>
130
+ - <turn_abs-angle="north"-/>
131
+ - <turn_abs-angle="south"-/>
132
+ - <lift-position="up"-/>
133
+ - <move_rel-throttle="fast"-distance="normal"-direction="forward"-/>
134
+ - <lift-position="down"-/>
135
+ - <approach-throttle="fast"-/>
136
+ - <approach-throttle="slow"-/>
137
+ - <grab-grabber="close"-/>
138
+ - <grab-grabber="open"-/>
139
+ - <pointer-state="off"-/>
140
+ - <pointer-state="on"-/>
141
+ - <move_rel-throttle="fast"-distance="alot"-direction="forward"-/>
142
+ - <move_rel-throttle="slow"-distance="little"-direction="forward"-/>
143
+ - <move_rel-throttle="slow"-distance="normal"-direction="forward"-/>
144
+ - <move_rel-throttle="slow"-distance="alot"-direction="forward"-/>
145
+ - <move_rel-throttle="fast"-distance="little"-direction="backward"-/>
146
+ - <move_rel-throttle="fast"-distance="normal"-direction="backward"-/>
147
+ - <move_rel-throttle="fast"-distance="alot"-direction="backward"-/>
148
+ - <sos/eos>
149
+ init: null
150
+ input_size: null
151
+ ctc_conf:
152
+ dropout_rate: 0.0
153
+ ctc_type: builtin
154
+ reduce: true
155
+ ignore_nan_grad: true
156
+ model_conf:
157
+ ctc_weight: 0.0
158
+ lsm_weight: 0.0
159
+ length_normalized_loss: false
160
+ use_preprocessor: true
161
+ token_type: word
162
+ bpemodel: null
163
+ non_linguistic_symbols: null
164
+ cleaner: null
165
+ g2p: null
166
+ speech_volume_normalize: null
167
+ rir_scp: null
168
+ rir_apply_prob: 1.0
169
+ noise_scp: null
170
+ noise_apply_prob: 1.0
171
+ noise_db_range: '13_15'
172
+ frontend: default
173
+ frontend_conf:
174
+ fs: 16000
175
+ specaug: specaug
176
+ specaug_conf:
177
+ apply_time_warp: true
178
+ time_warp_window: 5
179
+ time_warp_mode: bicubic
180
+ apply_freq_mask: true
181
+ freq_mask_width_range:
182
+ - 0
183
+ - 30
184
+ num_freq_mask: 2
185
+ apply_time_mask: true
186
+ time_mask_width_range:
187
+ - 0
188
+ - 40
189
+ num_time_mask: 2
190
+ normalize: global_mvn
191
+ normalize_conf:
192
+ stats_file: grabo/feats_stats.npz
193
+ preencoder: null
194
+ preencoder_conf: {}
195
+ encoder: conformer
196
+ encoder_conf:
197
+ output_size: 256
198
+ attention_heads: 4
199
+ linear_units: 2048
200
+ num_blocks: 12
201
+ dropout_rate: 0.1
202
+ positional_dropout_rate: 0.1
203
+ attention_dropout_rate: 0.0
204
+ input_layer: conv2d
205
+ normalize_before: true
206
+ macaron_style: true
207
+ rel_pos_type: legacy
208
+ pos_enc_layer_type: rel_pos
209
+ selfattention_layer_type: rel_selfattn
210
+ activation_type: swish
211
+ use_cnn_module: true
212
+ cnn_module_kernel: 15
213
+ postencoder: null
214
+ postencoder_conf: {}
215
+ decoder: transformer
216
+ decoder_conf:
217
+ attention_heads: 4
218
+ linear_units: 2048
219
+ num_blocks: 6
220
+ dropout_rate: 0.1
221
+ positional_dropout_rate: 0.1
222
+ self_attention_dropout_rate: 0.0
223
+ src_attention_dropout_rate: 0.0
224
+ required:
225
+ - output_dir
226
+ - token_list
227
+ version: 0.10.5a1
228
+ distributed: false
grabo/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
grabo/valid.acc.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94487c8e126609d0c78cf97de505ba83e671b4c3bc2285c6a1b02e131a4af7b9
3
+ size 172176105