burraco135 committed on
Commit
7e2ad98
1 Parent(s): 990d75e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -13
app.py CHANGED
@@ -1,18 +1,56 @@
1
  import gradio as gr
 
 
 
2
 
3
- # Load model directly
4
- from transformers import AutoProcessor, SpeechT5ForTextToSpeech
5
 
6
- processor = AutoProcessor.from_pretrained("burraco135/speecht5_finetuned_voxpopuli_it")
7
- model = SpeechT5ForTextToSpeech.from_pretrained("burraco135/speecht5_finetuned_voxpopuli_it")
 
 
 
 
 
 
8
 
9
- def tts(text):
10
  inputs = processor(text=text, return_tensors="pt")
11
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
12
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
13
-
14
- def greet(name):
15
- return "Hello " + name + "!!"
16
-
17
- iface = gr.Interface(fn=tts, inputs="text", outputs="audio")
18
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
 
6
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
7
 
8
# Fine-tuned SpeechT5 checkpoint; the processor and the acoustic model are
# both loaded from it so that tokenization matches the model.
checkpoint = "burraco135/speecht5_finetuned_voxpopuli_it"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# The vocoder handed to `model.generate_speech` must be a HiFi-GAN model that
# turns the predicted spectrogram into a waveform. Loading a SpeechT5Processor
# here (as before) yields a tokenizer/feature-extractor, not a vocoder.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
12
+
13
def predict(text, speaker="Surprise Me!"):
    """Synthesize speech for ``text`` and return it as int16 audio.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields an empty clip instead of calling the model.
        speaker: Either the literal ``"Surprise Me!"`` (a stored embedding is
            picked at random and then scrambled), or a label whose first
            three characters are a key of ``speaker_embeddings``. Defaults to
            ``"Surprise Me!"`` so the function can be driven by an interface
            that only supplies the text input.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is a NumPy ``int16``
        array, i.e. the (sample rate, data) pair returned to the numpy-typed
        audio output.
    """
    # NOTE(review): `speaker_embeddings` is not defined in the visible code.
    # It is used below as a module-level mapping whose values are handed to
    # `np.load` -- it must be defined before non-empty text can be synthesized.
    if not text or not text.strip():
        return (16000, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length to what the model's text positions allow.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    if speaker == "Surprise Me!":
        # Load one of the provided speaker embeddings at random.
        idx = np.random.randint(len(speaker_embeddings))
        key = list(speaker_embeddings.keys())[idx]
        speaker_embedding = np.load(speaker_embeddings[key])

        # Randomly shuffle the elements in place.
        np.random.shuffle(speaker_embedding)

        # Flip the sign of each value with probability 0.5. The mask takes
        # its shape and dtype from the embedding rather than a hard-coded 512.
        signs = np.where(
            np.random.rand(*speaker_embedding.shape) >= 0.5, 1.0, -1.0
        ).astype(speaker_embedding.dtype)
        speaker_embedding *= signs
    else:
        # The first three characters of the label are the lookup key.
        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])

    # Add a leading batch dimension for the model.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale the float waveform to the 16-bit integer range.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
47
+
48
# Build the UI: one text box in, one numpy-typed audio player out, both
# wired to `predict`, then start the app.
text_input = gr.Text(label="Input Text")
audio_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    fn=predict,
    inputs=[text_input],
    outputs=[audio_output],
)
demo.launch()