Spaces:
Sleeping
Sleeping
File size: 8,680 Bytes
96ea36d 03adfb9 96ea36d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import os
import json5
import utils
def check_json_script(data):
foreground_mandatory_attrs_map = {
'music': ['vol', 'len', 'desc'],
'sound_effect': ['vol', 'len', 'desc'],
'speech': ['vol', 'text']
}
background_mandatory_attrs_map = {
'music': ['vol', 'desc'],
'sound_effect': ['vol', 'desc'],
}
def check_by_audio_type(audio, mandatory_attrs_map, audio_str):
if audio['audio_type'] not in mandatory_attrs_map:
raise ValueError('audio_type is not allowed in this layout, audio={audio_str}')
for attr_name in mandatory_attrs_map[audio['audio_type']]:
if attr_name not in audio:
raise ValueError(f'{attr_name} does not exist, audio={audio_str}')
# Check json's format
for audio in data:
audio_str = json5.dumps(audio, indent=None)
if 'layout' not in audio:
raise ValueError(f'layout missing, audio={audio_str}')
elif 'audio_type' not in audio:
raise ValueError(f'audio_type missing, audio={audio_str}')
elif audio['layout'] == 'foreground':
check_by_audio_type(audio, foreground_mandatory_attrs_map, audio_str)
elif audio['layout'] == 'background':
if 'id' not in audio:
raise ValueError(f'id not in background audio, audio={audio_str}')
if 'action' not in audio:
raise ValueError(f'action not in background audio, audio={audio_str}')
if audio['action'] == 'begin':
check_by_audio_type(audio, background_mandatory_attrs_map, audio_str)
else:
if audio['action'] != 'end':
raise ValueError(f'Unknown action, audio={audio_str}')
else:
raise ValueError(f'Unknown layout, audio={audio_str}')
#except Exception as err:
# sys.stderr.write(f'PARSING ERROR: {err}, audio={json5.dumps(audio, indent=None)}\n')
# all_clear = False
def collect_and_check_audio_data(data):
fg_audio_id = 0
fg_audios = []
bg_audios = []
# Collect all the foreground and background audio ids used to calculate background audio length later
for audio in data:
if audio['layout'] == 'foreground':
audio['id'] = fg_audio_id
fg_audios.append(audio)
fg_audio_id += 1
else: # background
if audio['action'] == 'begin':
audio['begin_fg_audio_id'] = fg_audio_id
bg_audios.append(audio)
else: # ends
# find the backgound with the id, and update its 'end_fg_audio_id'
for bg_audio in bg_audios:
if bg_audio['id'] == audio['id'] and bg_audio['audio_type'] == audio['audio_type']:
bg_audio['end_fg_audio_id'] = fg_audio_id
break
# check if all background audios are valid
for bg_audio in bg_audios:
if 'begin_fg_audio_id' not in bg_audio:
raise ValueError(f'begin of background missing, audio={bg_audio}')
elif 'end_fg_audio_id' not in bg_audio:
raise ValueError(f'end of background missing, audio={bg_audio}')
if bg_audio['begin_fg_audio_id'] > bg_audio['end_fg_audio_id']:
raise ValueError(f'background audio ends before start, audio={bg_audio}')
elif bg_audio['begin_fg_audio_id'] == bg_audio['end_fg_audio_id']:
raise ValueError(f'background audio contains no foreground audio, audio={bg_audio}')
#except Exception as err:
# sys.stderr.write(f'ALIGNMENT ERROR: {err}, audio={bg_audio}\n')
# return None, None
return fg_audios, bg_audios
class AudioCodeGenerator:
def __init__(self):
self.wav_counters = {
'bg_sound_effect': 0,
'bg_music': 0,
'idle': 0,
'fg_sound_effect': 0,
'fg_music': 0,
'fg_speech': 0,
}
self.code = ''
def append_code(self, content):
self.code = f'{self.code}{content}\n'
def generate_code(self, fg_audios, bg_audios, output_path, result_filename):
def get_wav_name(audio):
audio_type = audio['audio_type']
layout = 'fg' if audio['layout'] == 'foreground' else 'bg'
wav_type = f'{layout}_{audio_type}' if layout else audio_type
desc = audio['text'] if 'text' in audio else audio['desc']
desc = utils.text_to_abbrev_prompt(desc)
wav_filename = f'{wav_type}_{self.wav_counters[wav_type]}_{desc}.wav'
self.wav_counters[wav_type] += 1
return wav_filename
header = f'''
import os
import sys
import datetime
from APIs import TTM, TTS, TTA, MIX, CAT, COMPUTE_LEN
fg_audio_lens = []
wav_path = \"{output_path.absolute()}/audio\"
os.makedirs(wav_path, exist_ok=True)
'''
self.append_code(header)
fg_audio_wavs = []
for fg_audio in fg_audios:
wav_name = get_wav_name(fg_audio)
if fg_audio['audio_type'] == 'sound_effect':
self.append_code(f'TTA(text=\"{fg_audio["desc"]}\", length={fg_audio["len"]}, volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
elif fg_audio['audio_type'] == 'music':
self.append_code(f'TTM(text=\"{fg_audio["desc"]}\", length={fg_audio["len"]}, volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
elif fg_audio['audio_type'] == 'speech':
npz_path = self.char_to_voice_map[fg_audio["character"]]["npz_path"]
npz_full_path = os.path.abspath(npz_path) if os.path.exists(npz_path) else npz_path
self.append_code(f'TTS(text=\"{fg_audio["text"]}\", speaker_id=\"{self.char_to_voice_map[fg_audio["character"]]["id"]}\", volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"), speaker_npz=\"{npz_full_path}\")')
fg_audio_wavs.append(wav_name)
self.append_code(f'fg_audio_lens.append(COMPUTE_LEN(os.path.join(wav_path, \"{wav_name}\")))\n')
# cat all foreground audio together
self.append_code(f'fg_audio_wavs = []')
for wav_filename in fg_audio_wavs:
self.append_code(f'fg_audio_wavs.append(os.path.join(wav_path, \"{wav_filename}\"))')
self.append_code(f'CAT(wavs=fg_audio_wavs, out_wav=os.path.join(wav_path, \"foreground.wav\"))')
bg_audio_wavs = []
self.append_code(f'\nbg_audio_offsets = []')
for bg_audio in bg_audios:
wav_name = get_wav_name(bg_audio)
self.append_code(f'bg_audio_len = sum(fg_audio_lens[{bg_audio["begin_fg_audio_id"]}:{bg_audio["end_fg_audio_id"]}])')
self.append_code(f'bg_audio_offset = sum(fg_audio_lens[:{bg_audio["begin_fg_audio_id"]}])')
if bg_audio['audio_type'] == 'sound_effect':
self.append_code(f'TTA(text=\"{bg_audio["desc"]}\", volume={bg_audio["vol"]}, length=bg_audio_len, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
elif bg_audio['audio_type'] == 'music':
self.append_code(f'TTM(text=\"{bg_audio["desc"]}\", volume={bg_audio["vol"]}, length=bg_audio_len, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
else:
raise ValueError()
bg_audio_wavs.append(wav_name)
self.append_code(f'bg_audio_offsets.append(bg_audio_offset)\n')
self.append_code(f'bg_audio_wavs = []')
for wav_filename in bg_audio_wavs:
self.append_code(f'bg_audio_wavs.append(os.path.join(wav_path, \"{wav_filename}\"))')
self.append_code(f'bg_audio_wav_offset_pairs = list(zip(bg_audio_wavs, bg_audio_offsets))')
self.append_code(f'bg_audio_wav_offset_pairs.append((os.path.join(wav_path, \"foreground.wav\"), 0))')
self.append_code(f'MIX(wavs=bg_audio_wav_offset_pairs, out_wav=os.path.join(wav_path, \"{result_filename}.wav\"))')
def init_char_to_voice_map(self, filename):
with open(filename, 'r') as file:
self.char_to_voice_map = json5.load(file)
def parse_and_generate(self, script_filename, char_to_voice_map_filename, output_path, result_filename='result'):
self.code = ''
self.init_char_to_voice_map(char_to_voice_map_filename)
with open(script_filename, 'r') as file:
data = json5.load(file)
check_json_script(data)
fg_audios, bg_audios = collect_and_check_audio_data(data)
self.generate_code(fg_audios, bg_audios, output_path, result_filename)
return self.code
|