|
|
|
|
|
|
|
|
|
|
|
""" This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py""" |
|
import pathlib |
|
import soundfile as sf |
|
import numpy as np |
|
import json |
|
import multiprocessing |
|
import tqdm |
|
|
|
|
|
def save(seq, fname, index, extension, samplerate=16000):
    """Concatenate audio pieces and write them to a numbered file.

    The output file is placed in ``fname.parent`` (created if missing) and is
    named ``<fname.stem>_<index><extension>``, with ``index`` zero-padded to
    four digits, e.g. ``chapter_0003.wav``.

    Args:
        seq: Sequence of arrays; they are joined with ``np.hstack``.
        fname: ``pathlib.Path`` giving the output directory (its parent) and
            the file-name prefix (its stem).
        index: Integer chunk number used as the file-name suffix.
        extension: File extension including the leading dot; it is appended
            verbatim to the file name.
        samplerate: Sample rate handed to ``sf.write``. Defaults to 16000,
            the value that was previously hard-coded here.
    """
    output = np.hstack(seq)
    out_dir = fname.parent
    # NOTE: only the stem of ``fname`` is used, so anything after the last
    # dot of its final component does not appear in the output name.
    file_name = out_dir / f"{fname.stem}_{index:04}{extension}"
    out_dir.mkdir(exist_ok=True, parents=True)
    sf.write(file_name, output, samplerate=samplerate)
|
|
|
|
|
def cut_sequence(path, vad, path_out, target_len_sec, out_extension):
    """Cut one audio file into chunks along its voice-activity segments.

    Consecutive VAD segments are stitched together until adding the next one
    would push the chunk past ``target_len_sec``; the chunk is then written
    with ``save`` and a new one is started. Segments are never split, so a
    single segment longer than ``target_len_sec`` is written as a chunk of
    its own.

    Args:
        path: Audio file to read with ``sf.read``.
        vad: Iterable of ``(start, end)`` pairs, in seconds.
        path_out: Output path prefix forwarded to ``save``; the chunk index
            and ``out_extension`` are appended there.
        target_len_sec: Target chunk duration in seconds.
        out_extension: Extension of the written files, forwarded to ``save``.

    Raises:
        ValueError: If the audio is not one-dimensional (mono) or is not
            sampled at 16000 Hz.
    """
    data, samplerate = sf.read(path)

    # Explicit checks rather than ``assert`` so they still run under
    # ``python -O``.
    if data.ndim != 1:
        raise ValueError(
            f"{path}: expected mono audio, got array of shape {data.shape}"
        )
    # The chunks are written by ``save`` at 16 kHz, so the input must match.
    if samplerate != 16000:
        raise ValueError(
            f"{path}: expected a 16000 Hz sample rate, got {samplerate}"
        )

    to_stitch = []
    length_accumulated = 0.0
    chunk_index = 0

    for start, end in vad:
        # Convert the segment bounds from seconds to sample offsets.
        segment = data[int(start * samplerate):int(end * samplerate)]
        duration = end - start

        # Flush the current chunk before it would overflow. The
        # ``length_accumulated > 0`` guard keeps an over-long first segment
        # from triggering a save of an empty chunk.
        if length_accumulated > 0 and length_accumulated + duration > target_len_sec:
            save(to_stitch, path_out, chunk_index, out_extension)
            to_stitch = []
            chunk_index += 1
            length_accumulated = 0.0

        to_stitch.append(segment)
        length_accumulated += duration

    # Write whatever is left after the last segment.
    if to_stitch:
        save(to_stitch, path_out, chunk_index, out_extension)
|
|
|
|
|
def cut_book(task):
    """Cut every recording of one book directory into VAD-aligned chunks.

    For each ``*.json`` metadata file in the book directory, the matching
    ``.flac`` file (same stem, same directory) is cut with ``cut_sequence``
    and written below ``root_out / <speaker> / <book id> /``, where the
    speaker is the name of the book directory's parent and the book id comes
    from the metadata.

    Args:
        task: Tuple ``(path_book, root_out, target_len_sec, extension)``,
            packed into one argument so the function can be mapped over a
            process pool. ``path_book`` is the book directory, ``root_out``
            the output root, and the last two are forwarded to
            ``cut_sequence``.
    """
    path_book, root_out, target_len_sec, extension = task

    # Accept plain strings as well as Path objects.
    path_book = pathlib.Path(path_book)
    root_out = pathlib.Path(root_out)
    speaker = path_book.parent.name

    for meta_file_path in path_book.glob("*.json"):
        with open(meta_file_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        # str() so that a numeric id in the metadata still joins into a path.
        book_id = str(meta["book_meta"]["id"])
        vad = meta["voice_activity"]

        sound_file = meta_file_path.with_suffix(".flac")
        path_out = root_out / speaker / book_id / meta_file_path.stem
        cut_sequence(sound_file, vad, path_out, target_len_sec, extension)
|
|
|
|
|
def cut_segments(
    input_dir, output_dir, target_len_sec=30, n_process=32, out_extension=".wav"
):
    """Cut all books found under ``input_dir`` into chunks, in parallel.

    Every directory matching ``input_dir/*/*`` is treated as one book and
    handed to ``cut_book`` in a worker process.

    Args:
        input_dir: Root of the input tree.
        output_dir: Root of the output tree; created if it does not exist.
        target_len_sec: Target chunk duration in seconds.
        n_process: Number of worker processes in the pool.
        out_extension: Extension of the written audio files.
    """
    pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)

    # Book directories sit exactly two levels below the input root.
    book_dirs = [
        entry for entry in pathlib.Path(input_dir).glob("*/*") if entry.is_dir()
    ]

    print(f"{len(book_dirs)} directories detected")
    print(f"Launching {n_process} processes")

    # One task tuple per book, in the layout ``cut_book`` unpacks.
    tasks = [
        (book_dir, output_dir, target_len_sec, out_extension)
        for book_dir in book_dirs
    ]

    with multiprocessing.Pool(processes=n_process) as pool:
        finished = pool.imap_unordered(cut_book, tasks)
        progress = tqdm.tqdm(finished, total=len(tasks))
        # The results themselves are unused; iterating only drives the
        # progress bar until every task has completed.
        for _ in progress:
            pass
|
|
|
|
|
if __name__ == "__main__":
    # Placeholder locations and settings: edit them before running the script.
    cut_segments(
        input_dir="/path/to/input_dir",
        output_dir="/path/to/output_dir",
        target_len_sec=10,
        n_process=16,
    )
|
|