Why is the audio output file not created? (We need your help! 😭)
My code is below. It is a script that does training and inference at the same time. However, I need evaluation metrics such as precision, recall, and F1, as well as the synthesized audio file, but the following code does not create an audio file. What should we modify to get these additional evaluation metrics? Please help 😭
import os
import sys
import io
import torch
import torchaudio
import wandb  # wandb import
import torch.nn.functional as F
from TTS.config.shared_configs import BaseDatasetConfig, BaseAudioConfig
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
# Set stdout's encoding to UTF-8
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
class TrainerArgs:
    pass

class Trainer:
    def __init__(self, args, config, output_path, model, train_samples, eval_samples):
        self.args = args
        self.config = config
        self.output_path = output_path
        self.model = model
        self.train_samples = train_samples
        self.eval_samples = eval_samples
        self.optimizer = None  # placeholder; set later via set_optimizer()

    def set_optimizer(self, optimizer):
        self.optimizer = optimizer
    def load_audio(self, audio_file):
        waveform, sample_rate = torchaudio.load(audio_file)
        if sample_rate != self.config.audio.sample_rate:
            waveform = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=self.config.audio.sample_rate
            )(waveform)
        return waveform

    def train_epoch(self):
        self.model.train()
        total_loss = 0
        for sample in self.train_samples:
            self.optimizer.zero_grad()
            input_data = self.load_audio(sample['audio_file'])
            x_lengths = torch.tensor([input_data.shape[1]])
            target_data = self.load_audio(sample['audio_file'])
            y = target_data.unsqueeze(0)
            output = self.model(input_data, x_lengths, y)
            loss = self.compute_loss(output, y)
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
        average_loss = total_loss / len(self.train_samples)
        print(f"Training Loss: {average_loss}")
        return average_loss
    def evaluate(self, samples):
        self.model.eval()
        total_loss = 0
        with torch.no_grad():
            for sample in samples:
                input_data = self.load_audio(sample['audio_file'])
                x_lengths = torch.tensor([input_data.shape[1]])
                target_data = self.load_audio(sample['audio_file'])
                y = target_data.unsqueeze(0)
                output = self.model(input_data, x_lengths, y)
                loss = self.compute_loss(output, y)
                total_loss += loss.item()
        average_loss = total_loss / len(samples)
        return average_loss

    def compute_loss(self, output, target):
        loss = F.mse_loss(output, target)
        return loss
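
    # (Hedged sketch addressing the precision/recall/F1 question: regression-style
    # TTS has no single standard definition for these metrics, so one illustrative
    # option is to binarize frames by mean magnitude and score prediction against
    # target with scikit-learn. The method name, the 0.5 threshold, and the
    # assumption that `output` and `target` are same-shaped [*, T, C] tensors are
    # all illustrative, not part of the Coqui API.)
    def compute_frame_metrics(self, output, target, threshold=0.5):
        from sklearn.metrics import precision_recall_fscore_support
        # A frame counts as "active" when its mean absolute magnitude exceeds the threshold
        pred = (output.detach().abs().mean(dim=-1) > threshold).flatten().cpu().numpy()
        true = (target.detach().abs().mean(dim=-1) > threshold).flatten().cpu().numpy()
        precision, recall, f1, _ = precision_recall_fscore_support(
            true, pred, average="binary", zero_division=0
        )
        return precision, recall, f1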
def formatter(root_path, manifest_file, **kwargs):
    """Assumes each metadata line is formatted as:
    <wav_file>.wav|<text>
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    speaker_name = "my_Sherlock"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.strip().split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0])
            text = cols[1]
            # Print the file path and text for debugging
            print(f"Processing file: {wav_file}, Text: {text}")
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
    return items
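
# (Illustrative only: with the pipe-separated format parsed above, a line of
# metadata.txt would look like the following made-up example:)
#   audio_0001.wav|It is a capital mistake to theorize before one has data.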
def load_model(config, checkpoint_path):
    # Initialize the audio processor
    ap = AudioProcessor.init_from_config(config)
    # Initialize the tokenizer and refresh the config
    tokenizer, config = TTSTokenizer.init_from_config(config)
    # Initialize the GlowTTS model
    model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
    # Load the raw state_dict saved with torch.save(model.state_dict(), ...) below
    model.load_state_dict(torch.load(checkpoint_path))
    return model, ap, tokenizer
def infer(model, text, ap, tokenizer):
    # Convert the input text to token ids
    tokens = tokenizer.text_to_ids(text)
    # Convert the tokens to a tensor and add a batch dimension
    tokens = torch.LongTensor(tokens).unsqueeze(0)
    # Disable gradient computation for inference
    with torch.no_grad():
        # Run model inference
        outputs = model.inference(tokens)
    # Generate the audio waveform from the model outputs
    waveform = ap.invert_spectrogram(outputs["model_outputs"].squeeze(0))
    # Return the generated waveform
    return waveform
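
# (Hedged note: GlowTTS's "model_outputs" are mel-spectrogram frames, shaped
# roughly [1, T, C], while AudioProcessor.invert_spectrogram() targets linear
# spectrograms; Coqui also provides inv_melspectrogram(), which expects [C, T].
# If the saved WAV ever comes out as noise, a sketch of the mel-based path:
#     mel = outputs["model_outputs"].squeeze(0).T.cpu().numpy()
#     waveform = ap.inv_melspectrogram(mel)
# Griffin-Lim inversion is rough either way; a trained vocoder sounds better.)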
def perform_inference(config, checkpoint_path):
    # Initialize the model, audio processor, and tokenizer
    model, ap, tokenizer = load_model(config, checkpoint_path)
    # Get the input text from the user
    text = input("Please enter the text to synthesize!: ")
    # Run inference
    waveform = infer(model, text, ap, tokenizer)
    # Save the resulting audio file
    output_file = os.path.join(config.output_path, "output_0807_1511.wav")  # file naming scheme: output_<date>_<time>.wav
    ap.save_wav(waveform, output_file)
    print(f"Inference audio was generated from the text you entered after training! Inference audio: {output_file}")
def main():
    # Configure and initialize wandb
    wandb.init(project="sherlock")
    wandb.require("core")  # opt in to the new wandb backend
    # Set torch's default tensor type to a CPU float tensor
    torch.set_default_tensor_type(torch.FloatTensor)
    # Use the directory containing this script as the output path
    output_path = os.path.dirname(os.path.abspath(__file__))
    # Define the dataset config
    dataset_config = BaseDatasetConfig(
        formatter="Sherlock",
        meta_file_train="metadata.txt",  # adjust to the correct path
        path="MyTTSDataset",  # adjust to the correct path
    )
    audio_config = BaseAudioConfig(
        sample_rate=12050,
    )
    character_config = CharactersConfig(
        characters_class="TTS.tts.utils.text.characters.Graphemes",
        pad="_",
        eos="~",
        bos="^",
        blank="@",
        characters="Iabdfgijklmnprstuvxzɛəɣɨɫɱʃʒʲˈˌ̯͡β",
        punctuations="!,.?: -‘’“”…",
    )
    # Set phoneme_cache_path
    phoneme_cache_path = os.path.join(output_path, "phoneme_cache")
    # Create the phoneme_cache directory if it does not exist
    if not os.path.exists(phoneme_cache_path):
        os.makedirs(phoneme_cache_path)
    # Initialize the training config
    config = GlowTTSConfig(
        run_name="Testrun",
        run_description="Desc",
        batch_size=32,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=1000,
        text_cleaner="english_cleaners",
        use_phonemes=True,
        phoneme_language="en-us",
        phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
        print_step=25,
        print_eval=False,
        mixed_precision=True,
        output_path=output_path,
        datasets=[dataset_config],
        audio=audio_config,
        characters=character_config,
        eval_split_size=0.2,
        test_sentences=[],
    )
    # Initialize the audio processor
    ap = AudioProcessor.init_from_config(config)
    # Initialize the tokenizer
    tokenizer, config = TTSTokenizer.init_from_config(config)
    # Load the data samples
    train_samples, eval_samples = load_tts_samples(
        dataset_config,
        eval_split=True,
        formatter=formatter,
        eval_split_size=config.eval_split_size,
        eval_split_max_size=config.eval_split_max_size,
    )
    # Initialize the model
    model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
    # Initialize the trainer
    trainer_args = TrainerArgs()
    trainer = Trainer(
        trainer_args, config, output_path, model=model,
        train_samples=train_samples, eval_samples=eval_samples,
    )
    # Initialize the optimizer (Adam)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    trainer.set_optimizer(optimizer)
    # Start training
    for epoch in range(config.epochs):
        train_loss = trainer.train_epoch()
        eval_loss = trainer.evaluate(eval_samples)
        # Log the training and evaluation losses
        wandb.log({"epoch": epoch, "train_loss": train_loss, "eval_loss": eval_loss})
        print(f"Epoch {epoch}, Train Loss: {train_loss}, Eval Loss: {eval_loss}")
    # Save the model weights
    checkpoint_dir = os.path.join(output_path, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)  # torch.save does not create missing directories
    checkpoint_path = os.path.join(checkpoint_dir, "sherlock_model.pth")  # adjust to your checkpoint path
    torch.save(model.state_dict(), checkpoint_path)
    wandb.save(checkpoint_path)  # upload the model weights to wandb
    # Run inference
    perform_inference(config, checkpoint_path)
if __name__ == '__main__':
    main()