"""Gradio demo: text-to-speech with a fine-tuned SpeechT5 checkpoint."""

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Sampling rate reported to Gradio alongside the generated waveform.
SAMPLE_RATE = 16000

checkpoint = "burraco135/speecht5_finetuned_voxpopuli_it"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
# Loaded once at start-up; reloading it on every request is slow and pointless.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

SPEAKER_EMBEDDINGS_PATH = r"C:\Users\ester\OneDrive\Documenti\VSCode\Hugging Face\tts\speaker_embeddings.npy"


def _load_speaker_embeddings(path):
    """Load a speaker embedding stored as ``.npy`` and return a float tensor.

    A 1-D array is given a leading batch dimension, since the model expects a
    batched embedding.
    NOTE(review): assumes the file holds a single speaker vector — confirm
    against how ``speaker_embeddings.npy`` was produced.
    """
    embedding = torch.from_numpy(np.load(path)).float()
    if embedding.dim() == 1:
        embedding = embedding.unsqueeze(0)
    return embedding


speaker_embeddings = _load_speaker_embeddings(SPEAKER_EMBEDDINGS_PATH)


def predict(text, speaker=None):
    """Synthesize speech for ``text`` and return it in Gradio's numpy audio format.

    Args:
        text: The text to speak.
        speaker: Accepted for backward compatibility and currently ignored;
            only one speaker embedding is loaded.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D
        ``int16`` NumPy array. Blank input yields an empty array.
    """
    # If the text is empty, return an empty array at the 16000 sampling rate.
    if not text or not text.strip():
        return (SAMPLE_RATE, np.zeros(0).astype(np.int16))

    # Preprocess text.
    inputs = processor(text=text, return_tensors="pt")

    # Truncate to the maximum number of text positions the model supports.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

    # The vocoder emits a float waveform; convert it to 16-bit PCM so both
    # return paths hand Gradio the same dtype.
    waveform = np.clip(speech.detach().cpu().numpy(), -1.0, 1.0)
    return (SAMPLE_RATE, (waveform * 32767).astype(np.int16))


title = "SpeechT5: Speech Synthesis"

# (text, speaker label) pairs. The interface exposes only the text input, so
# the labels are kept as data but not passed to the UI.
_EXAMPLE_PAIRS = [
    ["It is not in the stars to hold our destiny but in ourselves.", "BDL (male)"],
    ["The octopus and Oliver went to the opera in October.", "CLB (female)"],
    ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "RMS (male)"],
    ["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "SLT (female)"],
    ["A synonym for cinnamon is a cinnamon synonym.", "BDL (male)"],
    ["How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.", "CLB (female)"],
]

# One column per input component: the interface has a single text input.
examples = [[example_text] for example_text, _speaker_label in _EXAMPLE_PAIRS]

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    examples=examples,
)

demo.launch()