Spaces:

Lycoris53
/

VITS-TTS-Japanese-Only-Amitaro

Running

App Files Files Community

VITS-TTS-Japanese-Only-Amitaro / short_audio_transcribe.py

Lycoris53

Create short_audio_transcribe.py

18893bc over 1 year ago

raw

history blame

4.02 kB

	import whisper
	import os
	import json
	import torchaudio
	import argparse
	import torch
	from tqdm import tqdm

	# Clips longer than this many seconds are skipped instead of transcribed.
	MAX_CLIP_SECONDS = 20


	def list_top_level(directory):
	    """Return ``(subdirectories, files)`` found directly inside *directory*.

	    Only the first entry produced by ``os.walk`` is consumed, so nested
	    folders are never traversed. A missing or unreadable directory yields
	    ``([], [])`` instead of raising ``IndexError``.
	    """
	    _, subdirs, files = next(os.walk(directory), (directory, [], []))
	    return subdirs, files


	def exceeds_max_duration(num_samples, sample_rate, max_seconds=MAX_CLIP_SECONDS):
	    """Return True when *num_samples* at *sample_rate* lasts longer than *max_seconds*.

	    *sample_rate* must be the rate the samples are currently stored at;
	    dividing a resampled length by the pre-resampling rate gives a wrong
	    duration.
	    """
	    return num_samples / sample_rate > max_seconds


	def format_annotation(save_path, speaker, transcript):
	    """Build one annotation line: ``path|speaker|[JA]transcript[JA]`` plus a newline.

	    Fields are separated by a plain ``|``. The previous ``"\\|"`` literal was
	    an invalid escape sequence and wrote a literal backslash into every line.
	    NOTE(review): presumably the downstream filelist loader splits on ``|``;
	    confirm against the consumer of short_character_anno.txt.
	    """
	    return f"{save_path}|{speaker}|[JA]{transcript}[JA]\n"


	def main():
	    """Resample each speaker's short clips and transcribe them with Whisper.

	    For every sub-folder of ./custom_character_voice/ (one per speaker), each
	    file not already named ``processed_*`` is loaded, down-mixed to mono,
	    resampled to the sampling rate given in ./configs/finetune_speaker.json,
	    saved as ``processed_<index>.wav`` and decoded as Japanese. One line per
	    clip is written to short_character_anno.txt.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--whisper_size", default="large")
	    args = parser.parse_args()

	    # Read the target sampling rate first: it is cheap and fails fast,
	    # before the expensive model load.
	    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
	        hps = json.load(f)
	    target_sr = hps['data']['sampling_rate']

	    # The model is loaded on CPU, so decoding below is requested with fp16=False.
	    model = whisper.load_model(args.whisper_size, device="cpu")
	    options = whisper.DecodingOptions(beam_size=5, language="ja", fp16=False)

	    parent_dir = "./custom_character_voice/"
	    speaker_names, _ = list_top_level(parent_dir)
	    speaker_annos = []

	    for speaker in speaker_names:
	        speaker_dir = parent_dir + speaker + "/"
	        _, filelist = list_top_level(speaker_dir)
	        progress = tqdm(enumerate(filelist), desc="Processing Audio:", total=len(filelist))
	        for i, wavfile in progress:
	            # Outputs of a previous run live in the same folder; skip them.
	            if wavfile.startswith("processed_"):
	                continue

	            wav, sr = torchaudio.load(
	                speaker_dir + wavfile,
	                frame_offset=0,
	                num_frames=-1,
	                normalize=True,
	                channels_first=True,
	            )

	            # Measure the duration before resampling, while the sample count
	            # still corresponds to ``sr``, and actually skip over-long clips.
	            if exceeds_max_duration(wav.shape[1], sr):
	                print(f"{wavfile} too long, ignoring\n")
	                continue

	            # Down-mix to a single channel, keeping the (1, frames) layout.
	            wav = wav.mean(dim=0).unsqueeze(0)
	            if sr != target_sr:
	                wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)

	            save_path = speaker_dir + f"processed_{i}.wav"
	            torchaudio.save(save_path, wav, target_sr, channels_first=True)

	            # Transcribe the saved clip: log-Mel spectrogram on the model's device.
	            audio = whisper.pad_or_trim(whisper.load_audio(save_path))
	            mel = whisper.log_mel_spectrogram(audio).to(model.device)
	            result = whisper.decode(model, mel, options)

	            speaker_annos.append(format_annotation(save_path, speaker, result.text))

	    if not speaker_annos:
	        print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
	        print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")

	    # The file is written even when empty.
	    with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
	        f.writelines(speaker_annos)


	if __name__ == "__main__":
	    main()