Spaces:
Runtime error
Runtime error
import gradio as gr | |
import librosa | |
import numpy as np | |
import torch | |
import re | |
from num2words import num2words | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
# The acoustic model is the Swedish fine-tune; the processor and the HiFi-GAN
# vocoder are loaded from the stock "microsoft/..." SpeechT5 repositories.
checkpoint = "GreenCounsel/speecht5_tts_common_voice_5_sv"
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker label (as offered in the UI) -> path of the saved embedding array
# that predict() loads with np.load.
speaker_embeddings = {
    "Female": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "Male": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "Experimental": "spkemb/embeddings.npy",
}
# One-pass character substitutions applied before tokenization: Swedish
# letters are transliterated to ASCII, hyphens and typographic quotes are
# dropped. Presumably the base SpeechT5 tokenizer has no tokens for these
# characters -- TODO confirm.
_CHAR_REPLACEMENTS = str.maketrans({
    'Ä': 'ae',
    'Å': 'o',
    'Ö': 'oe',
    'ä': 'ae',
    'å': 'o',
    'ö': 'oe',
    'ô': 'oe',
    '-': '',
    '‘': '',
    '’': '',
    '“': '',
    '”': '',
})

# A run of digits delimited by word boundaries on both sides.
_STANDALONE_NUMBER = re.compile(r'\b\d+\b')

_MAX_CHARS = 200
_SAMPLE_RATE = 16000


def _spell_out_numbers(text):
    """Return *text* with each standalone digit run written out in Swedish."""
    # Substituting match-by-match (instead of a global str.replace per
    # number) keeps "1" from clobbering the "1" inside "12", and handles
    # leading zeros such as "007" as a single number.
    return _STANDALONE_NUMBER.sub(
        lambda match: num2words(int(match.group()), lang="sv"),
        text,
    )


def predict(text, speaker):
    """Synthesize Swedish speech for *text* with the selected speaker voice.

    Args:
        text: Text to speak. If it is missing, blank, or longer than 200
            characters once stripped, a fixed Swedish notice about the
            length limits is spoken instead.
        speaker: Key of ``speaker_embeddings`` selecting the embedding file.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is an int16 NumPy
        array, the format consumed by the ``gr.Audio(type="numpy")`` output.

    Raises:
        KeyError: If *speaker* is not a key of ``speaker_embeddings``.
    """
    stripped = (text or "").strip()
    if not stripped or len(stripped) > _MAX_CHARS:
        text = "Du måste ha minst ett och max 200 tecken."

    # Numbers first: the Swedish number words may themselves contain
    # characters that the transliteration table has to rewrite.
    text = _spell_out_numbers(text)
    text = text.translate(_CHAR_REPLACEMENTS)

    inputs = processor(text=text, return_tensors="pt")

    # limit input length
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    speaker_embedding = np.load(speaker_embeddings[speaker])
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale to 16-bit PCM; clipping first guards against integer wrap-around
    # should any sample fall outside [-1, 1].
    samples = np.clip(speech.numpy(), -1.0, 1.0)
    speech = (samples * 32767).astype(np.int16)
    return (_SAMPLE_RATE, speech)
# Static text shown by the Gradio interface: page title, Markdown
# description above the demo, and the HTML article rendered below it.
title = "SpeechT5 finetuned Swedish, TTS "

description = """
SpeechT5 text-to-speech model finetuned on the Swedish language from the
Common Voice dataset. Inference runs on a basic CPU (2 vCPU, 16 GB ram) so
please have patience if it takes some time. As a company founded by a female
coder, our resources are extremely limited (female founders in tech only get approx.
1 % of the venture capital and the women who receive funding seldom are the
ones actually handling the tech). We are in a very biased sphere where
female coders' companies seldom get the resources which would normally
be necessary to do what they do. The app uses the SpeechT5 model
finetuned for swedish by GreenCounsel, available here: [https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv](https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv).
"""

article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original SpeechT5</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
<pre>
@article{Ao2021SpeechT5,
title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
author = {Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
eprint={2110.07205},
archivePrefix={arXiv},
primaryClass={eess.AS},
year={2021}
}
</pre>
</div>
"""

# Clickable sample inputs; each row is [input text, speaker label] in the
# same order as the interface inputs.
examples = [
    ["GreenCounsel grundades i Malmö för sex år sedan.", "Female"],
    ["Med hjälp av maskininlärning kan mycket av juridiken automatiseras samtidigt som juristerna fokuserar på frågor där de ger störst värde.", "Male"],
    ["GreenCounsel har byggt en chatbott som kan förstå frågor på många olika språk och ge kvalitetssäkrade svar.", "Female"],
    ["Vi har också byggt ett system för att automatisera arbetsflöden för juridiska tjänster via internet.", "Male"],
    ["Talsyntesen bygger på en engelsk modell och kan därför upplevas som att jag bryter lite på engelska.", "Female"],
]
# Wire predict() to a text box plus speaker selector on the input side and an
# audio player on the output side, then start serving the app.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=["Female", "Male", "Experimental"],
            value="Female",
        ),
    ],
    outputs=[gr.Audio(label="Generated Speech", type="numpy")],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()