File size: 6,570 Bytes
029074a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import os
import argparse
import json
def str2bool(v):
    """Parse a boolean-ish command-line string.

    argparse's ``type=bool`` is a classic trap: any non-empty string is
    truthy, so ``--add_auxiliary_data False`` used to evaluate to True.
    This parser accepts the usual spellings of true/false instead.
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {v!r}")


def read_anno_lines(path):
    """Return the lines of *path* if the file exists, else an empty list."""
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        return f.readlines()


def collect_speakers(annos, speakers=None):
    """Accumulate unique speaker names from ``path|speaker|text`` lines.

    First-appearance order is preserved so the speaker->ID mapping derived
    from the returned list is stable across the two annotation sources.
    """
    speakers = [] if speakers is None else speakers
    for line in annos:
        _, speaker, _ = line.split("|", maxsplit=2)
        if speaker not in speakers:
            speakers.append(speaker)
    return speakers


def apply_finetune_settings(hps, speaker2id):
    """Mutate the config dict *hps* in place with fine-tuning settings."""
    hps["data"]["n_speakers"] = len(speaker2id)
    hps["speakers"] = speaker2id
    hps["train"]["log_interval"] = 10
    hps["train"]["eval_interval"] = 100
    hps["train"]["batch_size"] = 16
    hps["data"]["training_files"] = "final_annotation_train.txt"
    hps["data"]["validation_files"] = "final_annotation_val.txt"
    return hps


def clean_annotations(annos, speaker2id, cleaners, drop_tag=None):
    """Clean annotation lines and replace speaker names with numeric IDs.

    Lines whose raw text exceeds 150 characters are dropped.  *drop_tag*,
    when given (e.g. ``"[ZH]"``), is removed from the cleaned text.
    """
    import text  # project-local module; deferred so helpers stay importable

    cleaned = []
    for line in annos:
        # maxsplit=2 keeps any '|' inside the transcript text intact
        # (a bare split("|") would raise ValueError on such lines).
        path, speaker, txt = line.split("|", maxsplit=2)
        if len(txt) > 150:
            continue
        cleaned_text = text._clean_text(txt, cleaners)
        if drop_tag is not None:
            cleaned_text = cleaned_text.replace(drop_tag, "")
        if not cleaned_text.endswith("\n"):
            cleaned_text += "\n"
        cleaned.append(f"{path}|{speaker2id[speaker]}|{cleaned_text}")
    return cleaned


def write_lines(path, lines):
    """Write *lines* (already newline-terminated) to *path*."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE: was ``type=bool``, which made every non-empty value (even the
    # string "False") truthy; str2bool parses the flag value properly while
    # still accepting the historical ``--add_auxiliary_data True`` spelling.
    parser.add_argument("--add_auxiliary_data", type=str2bool, default=False,
                        help="Whether to add extra data as fine-tuning helper")
    parser.add_argument("--languages", default="CJE")
    args = parser.parse_args()

    lang_tags = {
        "CJE": ["[ZH]", "[JA]", "[EN]"],
        "CJ": ["[ZH]", "[JA]"],
        "C": ["[ZH]"],
        "J": ["[JA]"],
    }
    if args.languages not in lang_tags:
        # Previously an unknown value left ``langs`` undefined and crashed
        # later with a NameError; fail fast with a clear message instead.
        parser.error(f"unsupported --languages value: {args.languages!r}")
    langs = lang_tags[args.languages]

    # Source 1: transcribed short audios; Source 2: long audio segments.
    new_annos = (read_anno_lines("short_character_anno.txt")
                 + read_anno_lines("./long_character_anno.txt"))

    speakers = collect_speakers(new_annos)
    assert len(speakers) != 0, "No audio file found. Please check your uploaded file structure."

    if args.add_auxiliary_data:
        # Source 3 (optional): sampled audios as extra training helpers,
        # filtered down to the supported language tags.
        with open("./sampled_audio4ft.txt", "r", encoding="utf-8") as f:
            old_annos = f.readlines()
        old_annos = [line for line in old_annos
                     if any(lang in line for lang in langs)]
        speakers = collect_speakers(old_annos, speakers)

        # STEP 1: balance new vs. old voices by duplicating the (usually
        # much smaller) new set; always keep at least one copy.
        cc_duplicate = max(1, len(old_annos) // len(new_annos))

        # STEP 2: modify the config file.
        with open("./configs/finetune_speaker.json", "r", encoding="utf-8") as f:
            hps = json.load(f)
        speaker2id = {speaker: i for i, speaker in enumerate(speakers)}
        apply_finetune_settings(hps, speaker2id)
        with open("./configs/modified_finetune_speaker.json", "w", encoding="utf-8") as f:
            json.dump(hps, f, indent=2)

        # STEP 3: clean annotations, replacing speaker names with IDs.
        cleaners = hps["data"]["text_cleaners"]
        cleaned_new_annos = clean_annotations(new_annos, speaker2id, cleaners)
        cleaned_old_annos = clean_annotations(old_annos, speaker2id, cleaners)
        final_annos = cleaned_old_annos + cc_duplicate * cleaned_new_annos
        write_lines("./final_annotation_train.txt", final_annos)
        # Validation reuses the user's own (new) recordings only.
        write_lines("./final_annotation_val.txt", cleaned_new_annos)
        print("finished")
    else:
        # No auxiliary helper data: only the user's own recordings are used.
        # NOTE(review): this branch reads a different base config than the
        # auxiliary branch — looks intentional for this fork, kept as is.
        with open("./configs/amitaro_jp_base.json", "r", encoding="utf-8") as f:
            hps = json.load(f)
        speaker2id = {speaker: i for i, speaker in enumerate(speakers)}
        apply_finetune_settings(hps, speaker2id)
        with open("./configs/modified_finetune_speaker.json", "w", encoding="utf-8") as f:
            json.dump(hps, f, indent=2)
        # This path additionally strips "[ZH]" tags from the cleaned text,
        # matching the original behavior.
        cleaned_new_annos = clean_annotations(new_annos, speaker2id,
                                              hps["data"]["text_cleaners"],
                                              drop_tag="[ZH]")
        write_lines("./final_annotation_train.txt", cleaned_new_annos)
        write_lines("./final_annotation_val.txt", cleaned_new_annos)
        print("finished")
|