File size: 4,436 Bytes
05390a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env bash
# Start from an empty PYTHONPATH, then activate the "espnet2" conda environment.
PYTHONPATH=""
export PYTHONPATH
source /esat/spchtemp/scratch/jponcele/anaconda3/bin/activate espnet2
# Show the version of the python that is now resolved from PATH.
python --version

# Strict mode, switched on only after the environment activation above:
#   -e           exit as soon as a command fails
#   -u           expanding an unset variable is an error
#   -o pipefail  a pipeline fails if any of its stages fails
# Command tracing (-x) is not enabled.
set -euo pipefail

####################################################
# Stage range handed to subs.sh (--stage / --stop_stage).
stage=12
stop_stage=12
####################################################

# notes: no speed perturbation, no LM, no word LM, no n-gram LM

# EXP: output root, experiment directory and experiment naming
outdir=/esat/spchtemp/scratch/jponcele/espnet2
expdir=${outdir}/exp/exp-hpc
st_tag=train_subtitling_chained_PL_C10_new_combined_nelf_all
st_stats_dir=/esat/spchtemp/scratch/jponcele/espnet2/exp/exp-st/st_stats_fbank_pitch_vl_joint_bpe5000_cgn_combined_tags_cased_punct_subs_train_all
token_dir_suffix=cgn_combined_tags_cased_punct_subs_train_all


# Job / GPU counts passed to subs.sh as --nj / --ngpu
nj=4
ngpu=1

# DATA
st_train_set=st_train_all_combined_stochastic_cased_punct_speed_perturbed_cgn_subset
st_valid_set=st_valid_nelf_12000h_combined_stochastic_cased_punct
# Space-separated list of evaluation sets. Every name is listed exactly once;
# a repeated "subs_annot" entry was dropped so subs.sh is not handed the same
# set twice.
st_test_set="st_valid_nelf_12000h_combined_stochastic_cased_punct_subs_only subs_annot cgn_test_combined_resample_stochastic_tags_cased_punct dev_s"


asr_train_set=train_s
asr_valid_set=valid_s
asr_test_set=dev_s
subs_train_set=subs_train
subs_valid_set=subs_valid
subs_test_set=subs_test


# ';'-separated component letters, embedded into the --local_data_opts string
traincomps="a;b;c;d;f;g;h;i;j;k;l;m;n;o"
decodecomps="b;f;g;h;i;j;k;l;m;n;o"
local_data_opts="--repstr false --lowercase true --outdir data --traincomps ${traincomps} --decodecomps ${decodecomps}"

# Subtitle source directory, embedded into the --local_subs_opts string
subs_dir=/users/spraak/jponcele/vrt-scraper/vrtnew_subtitles_4feb
local_subs_opts="--outdir data --subsdir ${subs_dir}"

feats_type=fbank_pitch

# LM
use_word_lm=false  # not yet supported!
use_lm=false
lm_config=conf/train_lm_transformer.yaml
use_ngram=false

# ST
feats_normalize=utterance_mvn  # recommended for pretrained models instead of globalmvn
st_config=conf/tuning/train_subtitling_chained_C10_new_6layers_transformer.yaml
inference_config=conf/st_decode_chained.yaml
inference_nj=64
# Previously used alternatives: valid.acc_asr.best.pth, averaged_model.pth, valid.acc_asr.ave.pth
inference_st_model=averaged_model_13epochs.pth
# Alternative: "--input_size 0" to use raw audio for w2v2 encoder
st_args="--batch_type custom_folded --valid_batch_type custom_folded"

# Run the subs.sh recipe with the configuration variables defined earlier in
# this script.
#
# The arguments are collected in an array and every expansion is quoted, so
# each value reaches subs.sh as exactly one argument (no word splitting or
# globbing). Optional flags can be commented out inside the array without
# depending on a trailing "\" line continuation.
subs_args=(
    --stage "${stage}"
    --stop_stage "${stop_stage}"
    --ngpu "${ngpu}"
    --nj "${nj}"
    --gpu_inference false
    --dumpdir "${outdir}/dump"
    --expdir "${expdir}"
    --feats_type "${feats_type}"
    --audio_format wav
    --min_wav_duration 0.1
    --max_wav_duration 30
    --token_joint true
    --src_token_type bpe
    --src_nbpe 5000
    --src_bpemode unigram
    --src_case lc
    --tgt_token_type bpe
    --tgt_nbpe 5000
    --tgt_bpemode unigram
    --tgt_case lc
    --oov "<unk>"
    --lang "vl"
    --src_lang "verbatim"
    --tgt_lang "subtitle"
    --local_subs_opts "${local_subs_opts}"
    --local_data_opts "${local_data_opts}"
    --use_lm "${use_lm}"
    --use_word_lm "${use_word_lm}"
    --lm_config "${lm_config}"
    --use_ngram "${use_ngram}"
    --st_config "${st_config}"
    --st_args "${st_args}"
    --st_tag "${st_tag}"
    --inference_config "${inference_config}"
    --inference_nj "${inference_nj}"
    --feats_normalize "${feats_normalize}"
    --st_train_set "${st_train_set}"
    --st_valid_set "${st_valid_set}"
    --st_test_set "${st_test_set}"
    --asr_train_set "${asr_train_set}"
    --asr_valid_set "${asr_valid_set}"
    --asr_test_set "${asr_test_set}"
    --subs_train_set "${subs_train_set}"
    --subs_valid_set "${subs_valid_set}"
    --subs_test_set "${subs_test_set}"
    --st_stats_dir "${st_stats_dir}"
    --inference_st_model "${inference_st_model}"
    --token_dir_suffix "${token_dir_suffix}"
    # Disabled: pretrained_asr is not defined in this script.
    # --pretrained_asr "${pretrained_asr}"
)

./subs.sh "${subs_args[@]}"

#train_set=train_si284
#valid_set=test_dev93
#test_sets="test_dev93 test_eval92"
#
#./asr.sh \
#    --lang "en" \
#    --use_lm true \
#    --token_type char \
#    --nbpe 80 \
#    --nlsyms_txt data/nlsyms.txt \
#    --lm_config conf/train_lm_transformer.yaml \
#    --asr_config conf/train_asr_transformer.yaml \
#    --inference_config conf/decode.yaml \
#    --train_set "${train_set}" \
#    --valid_set "${valid_set}" \
#    --test_sets "${test_sets}" \
#    --bpe_train_text "data/train_si284/text" \
#    --lm_train_text "data/train_si284/text data/local/other_text/text" "$@"