MusiConGen / preproc /dump_jsonl.py
fffiloni's picture
Upload 290 files
1eabf9f verified
raw
history blame
2.6 kB
import os
import json
import sys
import librosa
def traverse_dir(
        root_dir,
        extension,
        amount=None,
        str_include=None,
        str_exclude=None,
        is_pure=False,
        is_sort=False,
        is_ext=True):
    """Recursively collect files under ``root_dir`` whose names end with ``extension``.

    Args:
        root_dir: Directory walked with ``os.walk``.
        extension: Suffix (or tuple of suffixes) handed to ``str.endswith``.
        amount: Maximum number of paths to collect; ``None`` means no limit.
            The cut-off is applied in walk order, i.e. before any sorting.
        str_include: If given, keep only paths that contain this substring.
        str_exclude: If given, drop paths that contain this substring.
        is_pure: If true, return paths relative to ``root_dir``; otherwise
            return each path as ``os.path.join(walk_root, file_name)``.
        is_sort: If true, sort the collected paths before returning them.
        is_ext: If false, strip the trailing file extension from each path.

    Returns:
        A list of path strings.

    The substring filters are tested against the path form that will be
    returned (relative when ``is_pure`` is set), before the extension is
    stripped.
    """
    file_list = []
    for root, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(extension):
                continue

            mix_path = os.path.join(root, file)
            # relpath copes with a trailing separator on root_dir, which a
            # fixed-width slice (len(root_dir) + 1) does not.
            pure_path = os.path.relpath(mix_path, root_dir) if is_pure else mix_path

            # Stop as soon as the requested number of paths was gathered.
            if amount is not None and len(file_list) == amount:
                if is_sort:
                    file_list.sort()
                return file_list

            if str_include is not None and str_include not in pure_path:
                continue
            if str_exclude is not None and str_exclude in pure_path:
                continue

            if not is_ext:
                # splitext only looks at the final path component, so dots in
                # directory names are never mistaken for an extension.
                pure_path = os.path.splitext(pure_path)[0]
            file_list.append(pure_path)

    if is_sort:
        file_list.sort()
    return file_list
if __name__ == '__main__':
    # Write one JSON line per "no_vocal" wav clip found under root_dir,
    # using the metadata stored in the tags.json next to each clip.
    root_dir = '../audiocraft/dataset/example/clip'
    path_jsonl = '../audiocraft/egs/example/data.jsonl'

    filelist = traverse_dir(
        root_dir,
        extension='wav',
        str_include='no_vocal',
        is_sort=True)
    num_files = len(filelist)

    # open(..., "w") does not create missing parent directories.
    os.makedirs(os.path.dirname(path_jsonl), exist_ok=True)

    with open(path_jsonl, "w", encoding='utf-8') as train_file:
        for fidx, path_wave in enumerate(filelist):
            print(f'==={fidx}/{num_files}================')
            path_json = os.path.join(
                os.path.dirname(path_wave), 'tags.json')

            sr = librosa.get_samplerate(path_wave)
            print('path_wave:', path_wave)
            print('path_json:', path_json)

            with open(path_json, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Explicit check instead of `assert`: assertions are removed when
            # Python runs with -O, which would let mismatched entries through.
            if sr != data['sample_rate']:
                raise ValueError(
                    f"sample rate mismatch for {path_wave}: "
                    f"audio file reports {sr}, "
                    f"{path_json} says {data['sample_rate']}")

            # amplitude / weight / info_path are always emitted as null.
            final = {
                'path': data['path'],
                'duration': data['duration'],
                "sample_rate": data['sample_rate'],
                "bpm": data['bpm'],
                "amplitude": None,
                "weight": None,
                "info_path": None
            }
            train_file.write(json.dumps(final) + '\n')

    print('\n\n\n==================')
    print('num files:', num_files)