Spaces:
Sleeping
Sleeping
Yurii Paniv
committed on
Commit
•
2a5583f
1
Parent(s):
4b03cbe
Improve file parsing from folder
Browse files - scripts/import_ukrainian.py +19 -10
scripts/import_ukrainian.py
CHANGED
@@ -111,34 +111,43 @@ def one_sample(sample):
|
|
111 |
def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
|
112 |
# iterate over all data lists and write converted version near them
|
113 |
speaker_iterator = 1
|
114 |
-
|
115 |
samples = []
|
|
|
116 |
for subdir, dirs, files in os.walk(dataset_dir):
|
117 |
for file in files:
|
118 |
# Get audiofile path and transcript for each sentence in tsv
|
119 |
-
if file
|
120 |
file_path = os.path.join(subdir, file)
|
121 |
file = open(file_path, mode="r")
|
122 |
data = []
|
|
|
|
|
|
|
123 |
for row in file.readlines():
|
124 |
file_name, transcript = row.replace(
|
125 |
" \n", "").split(" ", 1)
|
126 |
-
|
127 |
if file_name.endswith(".wav"):
|
128 |
pass
|
129 |
elif file_name.endswith(".mp3"):
|
130 |
pass
|
131 |
elif file_name.find(".") == -1:
|
132 |
file_name += ".wav"
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
(file_name, transcript, speaker_iterator))
|
137 |
-
speaker_iterator += 1
|
138 |
|
139 |
file.close()
|
140 |
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
if rows is None:
|
144 |
rows = []
|
@@ -199,7 +208,7 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
|
|
199 |
transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
|
200 |
try:
|
201 |
transformer.build(mp3_filename, wav_filename)
|
202 |
-
except Exception as e:
|
203 |
pass
|
204 |
|
205 |
|
|
|
111 |
def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
|
112 |
# iterate over all data lists and write converted version near them
|
113 |
speaker_iterator = 1
|
|
|
114 |
samples = []
|
115 |
+
total_file_dict = dict()
|
116 |
for subdir, dirs, files in os.walk(dataset_dir):
|
117 |
for file in files:
|
118 |
# Get audiofile path and transcript for each sentence in tsv
|
119 |
+
if file.endswith(".data"):
|
120 |
file_path = os.path.join(subdir, file)
|
121 |
file = open(file_path, mode="r")
|
122 |
data = []
|
123 |
+
file_folder = os.path.join(
|
124 |
+
os.path.dirname(subdir), "wav")
|
125 |
+
file_dict = dict()
|
126 |
for row in file.readlines():
|
127 |
file_name, transcript = row.replace(
|
128 |
" \n", "").split(" ", 1)
|
|
|
129 |
if file_name.endswith(".wav"):
|
130 |
pass
|
131 |
elif file_name.endswith(".mp3"):
|
132 |
pass
|
133 |
elif file_name.find(".") == -1:
|
134 |
file_name += ".wav"
|
135 |
+
|
136 |
+
file_name = os.path.join(file_folder, file_name)
|
137 |
+
file_dict[file_name] = transcript
|
|
|
|
|
138 |
|
139 |
file.close()
|
140 |
|
141 |
+
for wav_subdir, wav_dirs, wav_files in os.walk(file_folder):
|
142 |
+
for wav_file in wav_files:
|
143 |
+
wav_file_path = os.path.join(wav_subdir, wav_file)
|
144 |
+
if file_dict.get(wav_file_path) is not None:
|
145 |
+
total_file_dict[wav_file_path] = file_dict[wav_file_path]
|
146 |
+
|
147 |
+
for key in total_file_dict.keys():
|
148 |
+
samples.append((key, total_file_dict[key], speaker_iterator))
|
149 |
+
speaker_iterator += 1
|
150 |
+
del(total_file_dict)
|
151 |
|
152 |
if rows is None:
|
153 |
rows = []
|
|
|
208 |
transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
|
209 |
try:
|
210 |
transformer.build(mp3_filename, wav_filename)
|
211 |
+
except Exception as e: # TODO: improve exception handling
|
212 |
pass
|
213 |
|
214 |
|