Spaces:
Running
Running
# Copyright (c) 2023 Amphion. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
""" This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py""" | |
import pathlib | |
import soundfile as sf | |
import numpy as np | |
import json | |
import multiprocessing | |
import tqdm | |
def save(seq, fname, index, extension, samplerate=16000):
    """Concatenate audio chunks and write them to a numbered file.

    Args:
        seq: Sequence of arrays; they are joined with ``np.hstack`` in order.
        fname: ``pathlib.Path`` used as the naming template. The output is
            written to ``fname.parent`` as ``<fname.stem>_<index><extension>``.
        index: Integer chunk number, zero-padded to four digits in the name.
        extension: Suffix appended to the file name (e.g. ``".wav"``).
        samplerate: Sample rate handed to ``sf.write``. Defaults to 16000,
            the value that used to be hard-coded here.
    """
    output = np.hstack(seq)
    out_dir = fname.parent
    file_name = out_dir / f"{fname.stem}_{index:04}{extension}"
    # The destination directory may not exist yet for a new speaker/book.
    out_dir.mkdir(exist_ok=True, parents=True)
    sf.write(file_name, output, samplerate=samplerate)
def cut_sequence(path, vad, path_out, target_len_sec, out_extension):
    """Split one audio file into chunks assembled from its VAD segments.

    Voiced segments are collected in order. When adding the next segment
    would push a non-empty chunk past ``target_len_sec``, the chunk collected
    so far is written with ``save`` and a new chunk is started with that
    segment. Whatever is left after the last segment is written as well.
    """
    audio, sr = sf.read(path)
    assert audio.ndim == 1
    assert sr == 16000

    pending = []
    pending_sec = 0.0
    chunk_idx = 0

    for seg_start, seg_end in vad:
        # VAD boundaries are in seconds; convert them to sample offsets.
        first_sample = int(seg_start * sr)
        last_sample = int(seg_end * sr)
        segment = audio[first_sample:last_sample]
        seg_sec = seg_end - seg_start

        # Flush only when something is already buffered: a single segment
        # longer than the target is still emitted as one chunk.
        if pending_sec > 0 and pending_sec + seg_sec > target_len_sec:
            save(pending, path_out, chunk_idx, out_extension)
            chunk_idx += 1
            pending = []
            pending_sec = 0

        pending.append(segment)
        pending_sec += seg_sec

    # Write the trailing, possibly short, chunk.
    if pending:
        save(pending, path_out, chunk_idx, out_extension)
def cut_book(task):
    """Cut every recording found in one book directory.

    Args:
        task: Tuple ``(path_book, root_out, target_len_sec, extension)``.
            ``path_book`` is a ``pathlib.Path`` to the book directory; the
            name of its parent directory is used as the speaker. ``root_out``
            is the output root (string or path). The last two values are
            forwarded to ``cut_sequence``.

    Each ``*.json`` file in ``path_book`` is read for ``book_meta.id`` and
    ``voice_activity``. The audio is taken from the ``.flac`` file with the
    same stem, and chunks are written under
    ``root_out/<speaker>/<book_id>/``.
    """
    path_book, root_out, target_len_sec, extension = task
    root_out = pathlib.Path(root_out)
    speaker = path_book.parent.name
    for meta_file_path in path_book.glob("*.json"):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(meta_file_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        book_id = meta["book_meta"]["id"]
        vad = meta["voice_activity"]
        sound_file = meta_file_path.with_name(meta_file_path.stem + ".flac")
        # str() keeps the path join valid if the id was stored as a number.
        path_out = root_out / speaker / str(book_id) / meta_file_path.stem
        cut_sequence(sound_file, vad, path_out, target_len_sec, extension)
def cut_segments(
    input_dir, output_dir, target_len_sec=30, n_process=32, out_extension=".wav"
):
    """Cut all book directories under ``input_dir`` using a process pool.

    Every directory matching ``input_dir/*/*`` becomes one task for
    ``cut_book``. Tasks are run by ``n_process`` worker processes and a
    progress bar advances as each task finishes.
    """
    pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)

    book_dirs = [
        entry for entry in pathlib.Path(input_dir).glob("*/*") if entry.is_dir()
    ]
    print(f"{len(book_dirs)} directories detected")
    print(f"Launching {n_process} processes")

    # One task tuple per book directory, in the shape cut_book unpacks.
    jobs = []
    for book_dir in book_dirs:
        jobs.append((book_dir, output_dir, target_len_sec, out_extension))

    with multiprocessing.Pool(processes=n_process) as pool:
        finished = pool.imap_unordered(cut_book, jobs)
        # Results carry no value; the iterator is drained to drive the bar.
        for _ in tqdm.tqdm(finished, total=len(jobs)):
            pass
if __name__ == "__main__":
    # Placeholder locations; edit them before running the script.
    input_dir = "/path/to/input_dir"
    output_dir = "/path/to/output_dir"
    # Target chunk length in seconds and number of worker processes.
    target_len_sec = 10
    n_process = 16
    cut_segments(
        input_dir,
        output_dir,
        target_len_sec=target_len_sec,
        n_process=n_process,
    )