Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,677 Bytes
0fecc29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import torchaudio
from whisperspeech.pipeline import Pipeline
import argparse
def parse_args(argv=None):
    """Parse the command-line options of the text-to-audio script.

    Args:
        argv (list[str] | None): Argument strings to parse. When None,
            argparse reads ``sys.argv[1:]``, which is what this function
            always did before the parameter existed.

    Returns:
        argparse.Namespace: Namespace whose ``text`` attribute holds the
            value given for the required ``--text`` option.
    """
    parser = argparse.ArgumentParser(description="Convert text to audio.")
    parser.add_argument(
        "--text",
        type=str,
        required=True,
        help="The text to convert to audio.",
    )
    return parser.parse_args(argv)
def convert_text_to_audio(pipe: Pipeline, text: str, speaker=None):
    """Convert text to audio.

    Args:
        pipe (Pipeline): The pipeline to use for text-to-speech.
        text (str): The text to convert to audio.
        speaker: Optional speaker value forwarded to ``pipe.generate``,
            mirroring ``TTSProcessor.convert_text_to_audio``. None leaves
            the choice of voice to the pipeline.

    Returns:
        torch.Tensor: The generated audio, exactly as returned by
            ``pipe.generate``.
    """
    return pipe.generate(text, speaker=speaker)
def convert_text_to_audio_file(
    pipe: Pipeline, text: str, output_path: str, speaker=None
):
    """Convert text to audio and save it to a file.

    Args:
        pipe (Pipeline): The pipeline to use for text-to-speech.
        text (str): The text to convert to audio.
        output_path (str): The path to save the audio file.
        speaker: Optional speaker value forwarded to
            ``pipe.generate_to_file``, mirroring
            ``TTSProcessor.convert_text_to_audio_file``. None leaves the
            choice of voice to the pipeline.
    """
    # The pipeline expects the destination first and the text second.
    pipe.generate_to_file(output_path, text, speaker=speaker)
class TTSProcessor:
    """Thin wrapper around a WhisperSpeech ``Pipeline`` bound to one device."""

    def __init__(
        self,
        device: str,
        s2a_ref: str = "collabora/whisperspeech:s2a-q4-tiny-en+pl.model",
    ):
        """Initialize the TTS Processor with a specified device.

        Args:
            device (str): Device string handed to ``Pipeline`` (the script
                entry point of this file passes ``"cuda"``).
            s2a_ref (str): Model reference handed to ``Pipeline`` as
                ``s2a_ref``. Defaults to the reference that used to be
                hard-coded here, so existing callers are unaffected.
        """
        self.pipe = Pipeline(s2a_ref=s2a_ref, device=device)

    def get_reference_voice_embedding(self, path: str):
        """Get the reference voice embedding from the given audio file.

        Args:
            path (str): The path to the audio file.

        Returns:
            torch.Tensor: The reference voice embedding, after ``.cpu()``
                has been called on the pipeline's result.
        """
        return self.pipe.extract_spk_emb(path).cpu()

    def convert_text_to_audio(self, text: str, speaker=None):
        """Convert text to audio.

        Args:
            text (str): The text to convert to audio.
            speaker: Optional speaker value forwarded to the pipeline,
                presumably an embedding such as the one returned by
                ``get_reference_voice_embedding``. None leaves the choice
                of voice to the pipeline.

        Returns:
            torch.Tensor: The generated audio, exactly as returned by the
                pipeline.
        """
        return self.pipe.generate(text, speaker=speaker)

    def convert_text_to_audio_file(self, text: str, output_path: str, speaker=None):
        """Convert text to audio and save it to a file.

        Args:
            text (str): The text to convert to audio.
            output_path (str): The path to save the audio file.
            speaker: Optional speaker value forwarded to the pipeline,
                presumably an embedding such as the one returned by
                ``get_reference_voice_embedding``. None leaves the choice
                of voice to the pipeline.
        """
        # The pipeline expects the destination first and the text second.
        self.pipe.generate_to_file(output_path, text, speaker=speaker)
if __name__ == "__main__":
    # Local import: only the script entry point needs it.
    import os

    args = parse_args()
    processor = TTSProcessor("cuda")

    # The lower-cased text is both what gets synthesized and the basis of
    # the output file name.
    text = args.text.lower()

    # File stem: words joined by underscores, with one trailing period
    # dropped. endswith() is used instead of indexing [-1] so that an
    # empty --text value does not raise IndexError.
    text_split = "_".join(text.split(" "))
    if text_split.endswith("."):
        text_split = text_split[:-1]
    print(text_split)

    # Make sure the output directory exists before the pipeline writes to it.
    os.makedirs("./examples", exist_ok=True)
    path = f"./examples/{text_split}.wav"
    processor.convert_text_to_audio_file(text, path)
|