Korakoe committed
Commit d10c5e3 • 1 Parent(s): 0cad0bd

Create app.py

Files changed (1)
  1. app.py +212 -0
app.py ADDED
@@ -0,0 +1,212 @@
import gradio as gr
import spaces
from styletts2 import tts
import re
import numpy as np
from scipy.io.wavfile import write
import pyaudio
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

import torch

import phonemizer  # en-us

INTRO = """
<style>

.TitleContainer {
    background-color: #ffff;
    margin-bottom: 0rem;
    margin-left: auto;
    margin-right: auto;
    width: 40%;
    height: 30%;
    border-radius: 10rem;
    border: 0.5vw solid #ff593e;
    text-align: center;
    display: flex;
    justify-content: center;
    transition: .6s;
}

.TitleContainer:hover {
    transform: scale(1.05);
}

.VokanLogo {
    margin: auto;
    display: block;
}

</style>

<div class="TitleContainer">
    <img src="https://huggingface.co/spaces/ShoukanLabs/Vokan/resolve/main/Vokan.gif" class="VokanLogo">
</div>

<p align="center" style="font-size: 1vw; font-weight: bold; color: #ff593e;">A StyleTTS2 fine-tune, designed for expressiveness.</p>

<hr>
"""


js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""

theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(c100="#ffd7d1", c200="#ff593e", c300="#ff593e", c400="#ff593e", c50="#fff0f0", c500="#ff593e", c600="#ea580c", c700="#c2410c", c800="#9a3412", c900="#7c2d12", c950="#6c2e12"),
    secondary_hue="orange",
    radius_size=gr.themes.Size(lg="20px", md="8px", sm="6px", xl="30px", xs="4px", xxl="40px", xxs="2px"),
    font=[gr.themes.GoogleFont('M PLUS Rounded 1c'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
).set(
    block_background_fill='*neutral_50'
)

global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us',
                                                     preserve_punctuation=True,
                                                     with_stress=True,
                                                     language_switch="remove-flags",
                                                     tie=False)


def split_and_recombine_text(text, desired_length=200, max_length=300):
    """Split text into chunks of a desired length, trying to keep sentences intact."""
    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
    text = re.sub(r'\n\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[“”]', '"', text)

    rv = []
    in_quote = False
    current = ""
    split_pos = []
    pos = -1
    end_pos = len(text) - 1

    def seek(delta):
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
            if text[pos] == '"':
                in_quote = not in_quote
        return text[pos]

    def peek(delta):
        p = pos + delta
        return text[p] if p < end_pos and p >= 0 else ""

    def commit():
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []

    while pos < end_pos:
        c = seek(1)
        # do we need to force a split?
        if len(current) >= max_length:
            if len(split_pos) > 0 and len(current) > (desired_length / 2):
                # we have at least one sentence and we are over half the desired length, seek back to the last split
                d = pos - split_pos[-1]
                seek(-d)
            else:
                # no full sentences, seek back until we are not in the middle of a word and split there
                while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
                    c = seek(-1)
            commit()
        # check for sentence boundaries
        elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
            # seek forward if we have consecutive boundary markers but are still within the max length
            while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
                c = seek(1)
            split_pos.append(pos)
            if len(current) >= desired_length:
                commit()
        # treat the end of a quote as a boundary if it's followed by a space or newline
        elif in_quote and peek(1) == '"' and peek(2) in '\n ':
            seek(2)
            split_pos.append(pos)
    rv.append(current)

    # clean up, remove lines with only whitespace or punctuation
    rv = [s.strip() for s in rv]
    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]

    return rv


def text_to_phonemes(text):
    text = text.strip()
    print("Text before phonemization: ", text)
    ps = global_phonemizer.phonemize([text])
    print("Text after phonemization: ", ps)
    ps = word_tokenize(ps[0])
    ps = ' '.join(ps)
    print("Final text after tokenization: ", ps)
    return ps


@spaces.GPU
def generate(audio_path, ins, speed, alpha, beta, embedding, steps=100):
    # compute a style embedding from the reference audio
    ref_s = other_tts.compute_style(audio_path)
    print(ref_s.size())
    s_prev = None

    texts = split_and_recombine_text(ins)
    audio = np.array([])

    P = pyaudio.PyAudio()  # instantiated but never used

    # synthesize each chunk, carrying the previous style forward between segments
    for i in texts:
        i = text_to_phonemes(i)
        synthaud, s_prev = other_tts.long_inference_segment(i, diffusion_steps=steps,
                                                            alpha=alpha, beta=beta, is_phonemes=True,
                                                            embedding_scale=embedding, prev_s=s_prev, ref_s=ref_s,
                                                            speed=speed, t=0.7)
        audio = np.concatenate((audio, synthaud))
    # scale the concatenated float waveform to 16-bit PCM
    scaled = np.int16(audio / np.max(np.abs(audio)) * 32767)

    # gr.Audio expects (sample_rate, waveform)
    return 24000, scaled


# the fine-tuned checkpoint is only loaded when CUDA is available
if torch.cuda.is_available():
    other_tts = tts.StyleTTS2(model_checkpoint_path='./epoch_2nd_00012.pth', config_path="models/config_ft.yml")
else:
    other_tts = None

with gr.Blocks(theme=theme, js=js_func) as clone:
    gr.HTML(INTRO)
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Text", info="What do you want Vokan to say?", interactive=True)
            voice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#FF593E'})
            steps = gr.Slider(minimum=3, maximum=60, value=20, step=1, label="Diffusion Steps", info="Higher typically produces better results", interactive=True)
            embscale = gr.Slider(minimum=1, maximum=10, value=2, step=0.1, label="Embedding Scale", info="Defaults to 2 | low scales may produce unexpected results", interactive=True)
            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
            beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
            speed = gr.Slider(minimum=0.5, maximum=1.5, value=1, step=0.1, label="Speed of speech", info="Defaults to 1", interactive=True)
        with gr.Column(scale=1):
            clbtn = gr.Button("Synthesize", variant="primary")
            claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#FF593E'})
            clbtn.click(generate, inputs=[voice, inp, speed, alpha, beta, embscale, steps], outputs=[claudio], concurrency_limit=4)

if __name__ == "__main__":
    # demo.queue(api_open=False, max_size=15).launch(show_api=False)
    clone.queue(api_open=False, max_size=15).launch(show_api=False)
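
A minimal, hypothetical sketch of how the chunking helper behaves (not part of the commit; it assumes split_and_recombine_text and its re import have been copied into a standalone script, and the sample text is illustrative only):

# quick sanity check of the sentence-aware splitter
sample = (
    'Vokan reads long passages in pieces. Each piece should end on a sentence boundary. '
    '"Quoted text stays together," even when it contains punctuation. '
    'Very long runs without punctuation are force-split near the maximum length.'
)
for chunk in split_and_recombine_text(sample, desired_length=80, max_length=120):
    print(len(chunk), repr(chunk))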