Spaces:
Running
Running
Yurii Paniv
committed on
Commit
•
7756f2b
1
Parent(s):
2a5583f
Update script for new dataset
Browse files- scripts/import_ukrainian.py +20 -5
scripts/import_ukrainian.py
CHANGED
@@ -22,6 +22,7 @@ from deepspeech_training.util.importers import (
|
|
22 |
print_import_report,
|
23 |
)
|
24 |
from ds_ctcdecoder import Alphabet
|
|
|
25 |
|
26 |
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
|
27 |
SAMPLE_RATE = 16000
|
@@ -108,6 +109,12 @@ def one_sample(sample):
|
|
108 |
return (counter, rows)
|
109 |
|
110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
|
112 |
# iterate over all data lists and write converted version near them
|
113 |
speaker_iterator = 1
|
@@ -124,8 +131,13 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
|
|
124 |
os.path.dirname(subdir), "wav")
|
125 |
file_dict = dict()
|
126 |
for row in file.readlines():
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
129 |
if file_name.endswith(".wav"):
|
130 |
pass
|
131 |
elif file_name.endswith(".mp3"):
|
@@ -133,8 +145,10 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
|
|
133 |
elif file_name.find(".") == -1:
|
134 |
file_name += ".wav"
|
135 |
|
136 |
-
file_name
|
137 |
-
|
|
|
|
|
138 |
|
139 |
file.close()
|
140 |
|
@@ -176,7 +190,8 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
|
|
176 |
print("Writing CSV file for DeepSpeech.py as: ", output_csv)
|
177 |
writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
|
178 |
writer.writeheader()
|
179 |
-
bar = progressbar.ProgressBar(
|
|
|
180 |
for filename, file_size, transcript, speaker in bar(rows):
|
181 |
if space_after_every_character:
|
182 |
writer.writerow(
|
|
|
22 |
print_import_report,
|
23 |
)
|
24 |
from ds_ctcdecoder import Alphabet
|
25 |
+
import re
|
26 |
|
27 |
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
|
28 |
SAMPLE_RATE = 16000
|
|
|
109 |
return (counter, rows)
|
110 |
|
111 |
|
112 |
+
def convert_transcript(transcript):
    """Normalize a raw transcript line for the importer.

    Three steps are applied, in order:

    1. A straight apostrophe (') sitting between two Cyrillic letters is
       replaced with the typographic apostrophe (’, U+2019). The letters
       on both sides are preserved.
    2. Every hyphen is replaced with a single space.
    3. Leading and trailing whitespace is stripped.

    Args:
        transcript: The raw transcript text.

    Returns:
        The normalized transcript string.
    """
    # Lookbehind/lookahead assert the neighbouring letters without consuming
    # them, so only the apostrophe itself is substituted. A plain
    # "[а-я](')[а-я]" pattern would replace the whole three-character match
    # and drop both letters. The range а-я / А-Я does not cover the Ukrainian
    # letters і, ї, є, ґ, so they are listed explicitly.
    transcript = re.sub(
        r"(?<=[а-яіїєґА-ЯІЇЄҐ])'(?=[а-яіїєґА-ЯІЇЄҐ])", "’", transcript)
    transcript = transcript.replace("-", " ")
    return transcript.strip()
|
116 |
+
|
117 |
+
|
118 |
def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
|
119 |
# iterate over all data lists and write converted version near them
|
120 |
speaker_iterator = 1
|
|
|
131 |
os.path.dirname(subdir), "wav")
|
132 |
file_dict = dict()
|
133 |
for row in file.readlines():
|
134 |
+
if row.isspace():
|
135 |
+
continue
|
136 |
+
splitted_row = row.replace("\n", "").replace(
|
137 |
+
" wav ", ".wav ").split(" ", 1)
|
138 |
+
if len(splitted_row) != 2:
|
139 |
+
continue
|
140 |
+
file_name, transcript = splitted_row
|
141 |
if file_name.endswith(".wav"):
|
142 |
pass
|
143 |
elif file_name.endswith(".mp3"):
|
|
|
145 |
elif file_name.find(".") == -1:
|
146 |
file_name += ".wav"
|
147 |
|
148 |
+
if file_name.startswith("/"):
|
149 |
+
file_name = file_name[1::]
|
150 |
+
file_name = os.path.join(dataset_dir, file_name)
|
151 |
+
file_dict[file_name] = convert_transcript(transcript)
|
152 |
|
153 |
file.close()
|
154 |
|
|
|
190 |
print("Writing CSV file for DeepSpeech.py as: ", output_csv)
|
191 |
writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
|
192 |
writer.writeheader()
|
193 |
+
bar = progressbar.ProgressBar(
|
194 |
+
max_value=len(rows), widgets=SIMPLE_BAR)
|
195 |
for filename, file_size, transcript, speaker in bar(rows):
|
196 |
if space_after_every_character:
|
197 |
writer.writerow(
|