# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py
"""

import pathlib
import soundfile as sf
import numpy as np
import json
import multiprocessing
import tqdm


def save(seq, fname, index, extension, samplerate=16000):
    """Concatenate audio chunks and write them to a numbered file.

    The output file is ``<fname.parent>/<fname.stem>_<index:04><extension>``.
    Note that ``fname.stem`` is used, so any suffix carried by ``fname`` is
    dropped before the index is appended.

    Args:
        seq: Sequence of arrays to concatenate with ``np.hstack``.
        fname: ``pathlib.Path`` giving the output directory and base name.
        index: Integer chunk index, zero-padded to four digits in the name.
        extension: File extension including the leading dot (e.g. ``".wav"``).
        samplerate: Sample rate written to the file header. Defaults to
            16000, the value that was previously hard-coded.
    """
    output = np.hstack(seq)
    file_name = fname.parent / (fname.stem + f"_{index:04}{extension}")
    fname.parent.mkdir(exist_ok=True, parents=True)
    sf.write(file_name, output, samplerate=samplerate)


def cut_sequence(path, vad, path_out, target_len_sec, out_extension):
    """Cut one audio file into chunks of roughly ``target_len_sec`` seconds.

    Voiced segments listed in ``vad`` are extracted from the audio and
    stitched together greedily; the resulting chunks are written via
    :func:`save` with consecutive indices starting at 0.

    Args:
        path: Path of the audio file to read.
        vad: Iterable of ``(start, end)`` pairs, in seconds.
        path_out: ``pathlib.Path`` used as the base name for the output files.
        target_len_sec: Target duration of each output chunk, in seconds.
        out_extension: Extension of the output files, including the dot.

    Raises:
        AssertionError: If the audio is not one-dimensional (mono) or its
            sample rate is not 16000 Hz.
    """
    data, samplerate = sf.read(path)

    assert len(data.shape) == 1, f"expected mono audio, got shape {data.shape}"
    assert samplerate == 16000, f"expected 16000 Hz audio, got {samplerate}"

    to_stitch = []
    length_accumulated = 0.0
    chunk_index = 0

    # Iterate over VAD segments
    for start, end in vad:
        start_index = int(start * samplerate)
        end_index = int(end * samplerate)
        segment = data[start_index:end_index]

        # Flush what has been accumulated when adding this segment would
        # exceed the target length. Nothing is flushed while the buffer is
        # empty, so a single segment longer than the target is kept whole
        # rather than being split.
        if (
            length_accumulated + (end - start) > target_len_sec
            and length_accumulated > 0
        ):
            save(to_stitch, path_out, chunk_index, out_extension, samplerate)
            to_stitch = []
            chunk_index += 1
            length_accumulated = 0.0

        # Add the current segment to the list to be stitched
        to_stitch.append(segment)
        length_accumulated += end - start

    # Save any remaining segments
    if to_stitch:
        save(to_stitch, path_out, chunk_index, out_extension, samplerate)


def cut_book(task):
    """Process every metadata/audio pair found in one book directory.

    Args:
        task: Tuple ``(path_book, root_out, target_len_sec, extension)``.
            ``path_book`` is the ``pathlib.Path`` of the book directory; the
            name of its parent directory is used as the speaker name.
            ``root_out`` is the output root (``str`` or ``pathlib.Path``).

    For each ``*.json`` file in ``path_book``, the ``.flac`` file with the
    same stem is cut according to the ``"voice_activity"`` entry of the
    metadata, and the chunks are written under
    ``root_out/<speaker>/<book id>/``.
    """
    path_book, root_out, target_len_sec, extension = task

    # cut_segments forwards the output directory as given by the caller,
    # which may be a plain string; make the path arithmetic explicit.
    root_out = pathlib.Path(root_out)
    speaker = pathlib.Path(path_book.parent.name)

    for meta_file_path in path_book.glob("*.json"):
        with open(meta_file_path, "r", encoding="utf-8") as f:
            meta = json.load(f)

        # str() guards against ids stored as JSON numbers, which cannot be
        # joined onto a path directly.
        book_id = str(meta["book_meta"]["id"])
        vad = meta["voice_activity"]

        sound_file = meta_file_path.parent / (meta_file_path.stem + ".flac")
        path_out = root_out / speaker / book_id / (meta_file_path.stem)
        cut_sequence(sound_file, vad, path_out, target_len_sec, extension)


def cut_segments(
    input_dir, output_dir, target_len_sec=30, n_process=32, out_extension=".wav"
):
    """Main function to cut segments from audio files.

    Every directory matching ``<input_dir>/*/*`` is treated as one book and
    handed to :func:`cut_book` in a pool of worker processes.

    Args:
        input_dir: Root directory of the input dataset.
        output_dir: Root directory for the output; created if missing.
        target_len_sec: Target duration of each output chunk, in seconds.
        n_process: Number of worker processes.
        out_extension: Extension of the output files, including the dot.
    """
    pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)

    list_dir = [x for x in pathlib.Path(input_dir).glob("*/*") if x.is_dir()]

    print(f"{len(list_dir)} directories detected")
    print(f"Launching {n_process} processes")

    # Create tasks for multiprocessing
    tasks = [
        (path_book, output_dir, target_len_sec, out_extension)
        for path_book in list_dir
    ]

    # Process tasks in parallel; the loop only drains the iterator so that
    # tqdm can report progress as books complete.
    with multiprocessing.Pool(processes=n_process) as pool:
        for _ in tqdm.tqdm(pool.imap_unordered(cut_book, tasks), total=len(tasks)):
            pass


if __name__ == "__main__":
    input_dir = "/path/to/input_dir"
    output_dir = "/path/to/output_dir"
    target_len_sec = 10
    n_process = 16
    cut_segments(input_dir, output_dir, target_len_sec, n_process)