"""Resample short speaker clips and transcribe them with Whisper.

For every speaker directory directly under ``./custom_character_voice/`` each
file is loaded with torchaudio, downmixed to mono, resampled to the sampling
rate read from ``./configs/finetune_speaker.json``, saved next to the source
as ``processed_<i>.wav`` and decoded by Whisper with ``language="ja"``.
One ``<path>|<speaker>|[JA]<text>[JA]`` line per clip is written to
``short_character_anno.txt``.
"""
import argparse
import json
import os

import torch
import torchaudio
import whisper
from tqdm import tqdm

PARENT_DIR = "./custom_character_voice/"
CONFIG_PATH = "./configs/finetune_speaker.json"
ANNOTATION_PATH = "short_character_anno.txt"
# Clips longer than this many seconds are skipped.
MAX_CLIP_SECONDS = 20


def _list_dir(path):
    """Return ``(subdirectory names, file names)`` found directly in *path*.

    ``os.walk`` yields nothing for a directory that does not exist, so a
    missing *path* produces two empty lists instead of an ``IndexError``.
    Only the first ``os.walk`` entry is consumed; the tree below *path* is
    not traversed.
    """
    _, dirnames, filenames = next(os.walk(path), (path, [], []))
    return dirnames, filenames


def _prepare_clip(src_path, dst_path, target_sr):
    """Write a mono copy of *src_path*, resampled to *target_sr*, to *dst_path*.

    Returns ``True`` when the clip was written and ``False`` when it was
    rejected for exceeding ``MAX_CLIP_SECONDS`` (nothing is written then).
    """
    wav, sr = torchaudio.load(src_path, frame_offset=0, num_frames=-1,
                              normalize=True, channels_first=True)
    # Measure the duration before resampling: the frame count (dim 1, since
    # channels_first=True) is only meaningful relative to the rate it was
    # loaded at.
    if wav.shape[1] / sr > MAX_CLIP_SECONDS:
        return False
    # Average the channels, keeping a leading channel dimension of size 1.
    wav = wav.mean(dim=0).unsqueeze(0)
    if sr != target_sr:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
    torchaudio.save(dst_path, wav, target_sr, channels_first=True)
    return True


def _transcribe(model, audio_path, options):
    """Return the text Whisper decodes from *audio_path* using *options*."""
    audio = whisper.pad_or_trim(whisper.load_audio(audio_path))
    # NOTE(review): recent openai-whisper releases need
    # n_mels=model.dims.n_mels here for large-v3 checkpoints; older releases
    # do not accept that keyword. Confirm against the pinned version.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    return whisper.decode(model, mel, options).text


def main():
    """Parse the CLI, process every speaker directory, write the annotations."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--whisper_size", default="large")
    args = parser.parse_args()

    # Read the (cheap) config first so a bad config fails before the model
    # is loaded.
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        target_sr = json.load(f)['data']['sampling_rate']

    model = whisper.load_model(args.whisper_size, device="cpu")
    # The decoding options never change between files; build them once.
    options = whisper.DecodingOptions(beam_size=5, language="ja", fp16=False)

    speaker_names, _ = _list_dir(PARENT_DIR)
    speaker_annos = []
    for speaker in speaker_names:
        speaker_dir = PARENT_DIR + speaker + "/"
        _, filelist = _list_dir(speaker_dir)
        for i, wavfile in tqdm(enumerate(filelist), desc="Processing Audio:", total=len(filelist)):
            # Outputs of an earlier run live in the same directory.
            if wavfile.startswith("processed_"):
                continue

            # i counts skipped entries too, so output numbering may have gaps.
            save_path = speaker_dir + f"processed_{i}.wav"
            if not _prepare_clip(speaker_dir + wavfile, save_path, target_sr):
                print(f"{wavfile} too long, ignoring\n")
                continue

            text = "[JA]" + _transcribe(model, save_path, options) + "[JA]\n"
            speaker_annos.append(save_path + "|" + speaker + "|" + text)

    if not speaker_annos:
        print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
        print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    # The annotation file is written even when empty.
    with open(ANNOTATION_PATH, 'w', encoding='utf-8') as f:
        f.writelines(speaker_annos)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|