Working Hacky Code
This is code I'm using to poke the model and create output.
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
import numpy as np
# Batch text-to-speech script: reads a prompt file, feeds it to SpeechT5
# `batch_size` lines at a time, and writes one WAV file per batch
# (speech_0.wav, speech_1.wav, ...) into `output_dir`.

# Load the processor, the TTS model and the vocoder once, up front.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

prompt_file = '/docker/nix-tts/prompt.txt'
output_dir = '/docker/nix-tts/'
batch_size = 2  # Number of lines to process at a time

# Read the prompt with an explicit encoding so the result does not depend on
# the platform's default locale.
with open(prompt_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

num_lines = len(lines)
# Calculate the number of batches (ceiling division, so a trailing partial
# batch still gets its own output file).
num_batches = (num_lines + batch_size - 1) // batch_size

# Load the xvector containing the speaker's voice characteristics from a
# dataset. The same speaker is used for every batch, so this is done once
# here rather than being repeated on every loop iteration.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation")
# unsqueeze(0) adds a leading dimension to the single xvector.
speaker_embeddings = torch.tensor(
    embeddings_dataset[7306]["xvector"]).unsqueeze(0)

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    # Slicing past the end of the list is safe; the last batch is just shorter.
    batch_lines = lines[start_idx:end_idx]

    # Join the batch_lines into a single string (each line keeps its newline).
    prompt_text = "".join(batch_lines)
    inputs = processor(text=prompt_text, return_tensors="pt")

    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # One file per batch, so a single misread line can be regenerated without
    # redoing the whole script. `output_dir` must end with a path separator.
    output_filename = f"speech_{i}.wav"
    output_path = output_dir + output_filename
    sf.write(output_path, speech.numpy(), samplerate=16000)
    print(
        f"Batch {i+1}/{num_batches} processed. Output saved as {output_filename}")
It loads a file from a poorly named directory (Docker isn't involved) and batch-writes one output file for every two lines. I'm finding success with having a line of speech (not too long, about 500 characters) followed by a blank line for a pause (a line break). With that layout, a 46-line script I just wrote comes out in 30 files. They aren't long, and you have to stitch them together, but you can re-record portions as needed much more easily. In my case it misread an abbreviation that contained the letter I; I had to respell it "eye" to get it to read it correctly. Instead of redoing the entire script, I can redo just that line.
I figured I'd use Audacity or another audio editor to put the files together into a single output, after recording a second of silence to use for extra inserted pauses.
This script takes the WAV files generated above and combines them (stitches them together) into a single MP3:
from pydub import AudioSegment
import os
# Stitching script: concatenates the numbered WAV files in `wav_directory`
# (named like speech_0.wav, speech_1.wav, ...) in numeric order and exports
# the result as a single MP3.

# Set the directory where the WAV files are located
wav_directory = '/docker/nix-tts/'
# Set the output path and filename for the final MP3 file
output_path = '/docker/nix-tts/'
output_filename = 'output.mp3'


def _batch_index(filename):
    """Return the integer after the first underscore in the file's stem.

    For example ``speech_12.wav`` gives ``12``. Returns ``None`` when the name
    does not follow the ``<prefix>_<N>.wav`` pattern, so callers can skip the
    file instead of crashing on it.
    """
    stem = os.path.splitext(filename)[0]
    try:
        return int(stem.split("_")[1])
    except (IndexError, ValueError):
        return None


# Collect (index, filename) pairs for the numbered WAV files. Any other .wav
# in the directory (e.g. a hand-recorded silence clip) is skipped with a
# message rather than aborting the whole run.
indexed_files = []
for f in os.listdir(wav_directory):
    if not f.endswith(".wav"):
        continue
    index = _batch_index(f)
    if index is None:
        print(f"Skipping {f}: name does not match <prefix>_<number>.wav")
        continue
    indexed_files.append((index, f))

# Sort numerically (not lexically) so that speech_10.wav follows speech_9.wav.
wav_files = [f for _, f in sorted(indexed_files, key=lambda pair: pair[0])]

# Initialize an empty AudioSegment object to store the combined audio
combined_audio = AudioSegment.silent(duration=0)

# Iterate over the sorted WAV files
for filename in wav_files:
    wav_file = os.path.join(wav_directory, filename)
    # Load the WAV file using pydub
    audio = AudioSegment.from_wav(wav_file)
    # Append the current audio to the combined audio
    combined_audio += audio

# Export the combined audio as an MP3 file
combined_audio.export(os.path.join(output_path, output_filename), format="mp3")
print("Conversion to MP3 complete!")
Do you find that the model hallucinates? I've found that its output is great until it isn't, and then it just makes stuff up.