import gradio as gr
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset, Audio

# Load the Glaswegian speech dataset and resample its audio to 16 kHz,
# the sampling rate SpeechT5 expects.
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))['train']
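
# The text processor/tokenizer comes from the base SpeechT5 checkpoint,
# while the acoustic model is the Glaswegian fine-tune.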
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
tokenizer = processor.tokenizer
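
# Collect every character that appears in the transcriptions so the set can
# be compared against the tokenizer's vocabulary.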
def extract_all_chars(batch):
    all_text = " ".join(batch["transcription"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)
dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}
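
# Speaker encoder: SpeechBrain's x-vector model turns a waveform into a
# fixed-size speaker embedding, which SpeechT5 conditions on.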
import os
from speechbrain.inference.speaker import EncoderClassifier

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
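
# Encode a waveform into a single normalised x-vector used as the speaker embedding.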
def create_speaker_embedding(waveform):
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

def prepare_dataset(example):
    # load the audio data; if necessary, this resamples the audio to 16kHz
    audio = example["audio"]

    # feature extraction and tokenization
    example = processor(
        text=example["transcription"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # strip off the batch dimension
    example["labels"] = example["labels"][0]

    # use SpeechBrain to obtain x-vector
    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])

    return example
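
# Quick sanity check: prepare one example and run its mel spectrogram through
# the HiFi-GAN vocoder to make sure the pipeline produces audio.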
processed_example = prepare_dataset(dataset[0])

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
spectrogram = torch.tensor(processed_example["labels"])
with torch.no_grad():
    speech = vocoder(spectrogram)

# Prepare the full dataset (spectrogram targets + speaker embeddings) and
# split off a small test set.
dataset = dataset.map(
    prepare_dataset, remove_columns=dataset.column_names,
)
dataset = dataset.train_test_split(test_size=0.1)
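
# Gradio callback: tokenise the input text, condition SpeechT5 on a fixed
# speaker embedding taken from the training split, generate a mel spectrogram,
# and vocode it to a 16 kHz waveform.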
def predict(text):
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")
    # optionally limit the input length to the model's maximum:
    # input_ids = inputs["input_ids"]
    # input_ids = input_ids[..., :model.config.max_text_positions]

    # use the speaker embedding of a fixed example as the reference voice
    example = dataset['train'][888]
    speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
    with torch.no_grad():
        speech = vocoder(spectrogram)

    # convert to 16-bit PCM for Gradio's numpy audio output
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)

title = "Glaswegian TTS"
article = "Model fine-tuned and Gradio demo generated thanks to this notebook: https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ#scrollTo=wm7B3zxrumfF"

gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    article=article,
).launch()