kdrkdrkdr committed on
Commit
670af6d
1 Parent(s): 381866a

edit app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -11
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import json
2
  import os
3
  import re
 
4
  import librosa
5
  import numpy as np
6
  import torch
@@ -14,7 +15,6 @@ from mel_processing import spectrogram_torch
14
 
15
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
16
 
17
- max_length = 100
18
 
19
  def get_text(text, hps, is_phoneme):
20
  text_norm = text_to_sequence(text, hps.symbols, [] if is_phoneme else hps.data.text_cleaners)
@@ -28,7 +28,12 @@ def create_tts_fn(model, hps, speaker_ids):
28
  def tts_fn(text, speaker, speed, is_phoneme):
29
  if limitation:
30
  text_len = len(text)
31
- max_len = max_length
 
 
 
 
 
32
  if text_len > max_len:
33
  return "Error: Text is too long", None
34
 
@@ -46,6 +51,9 @@ def create_tts_fn(model, hps, speaker_ids):
46
  return tts_fn
47
 
48
 
 
 
 
49
  def create_to_phoneme_fn(hps):
50
  def to_phoneme_fn(text):
51
  return _clean_text(text, hps.data.text_cleaners) if text != "" else ""
@@ -94,31 +102,59 @@ if __name__ == '__main__':
94
 
95
  t = 'vits'
96
  models_tts.append((name, cover_path, speakers, lang, example,
97
- create_tts_fn(model, hps, speaker_ids),
98
  create_to_phoneme_fn(hps)))
99
-
100
  app = gr.Blocks(css=css)
101
 
102
  with app:
103
  gr.Markdown("# BlueArchive Hoshino TTS Using Vits Model\n"
104
  "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=kdrkdrkdr.HoshinoTTS)\n\n")
105
 
106
- for i, (name, cover_path, speakers, lang, example, tts_fn, to_phoneme_fn) in enumerate(models_tts):
 
107
 
108
  with gr.Column():
109
  gr.Markdown(f"## {name}\n\n"
110
  f"![cover](file/{cover_path})\n\n"
111
  f"lang: {lang}")
112
- tts_input1 = gr.TextArea(label=f"Text ({max_length} words limitation)", value=example,
113
  elem_id=f"tts-input{i}")
114
  tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
115
  type="index", value=speakers[0])
116
- tts_input3 = gr.Slider(label="Speed", value=0.9, minimum=0.5, maximum=2, step=0.1)
117
-
 
 
 
 
 
 
118
  tts_submit = gr.Button("Generate", variant="primary")
119
  tts_output1 = gr.Textbox(label="Output Message")
120
  tts_output2 = gr.Audio(label="Output Audio")
121
- tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3],
122
  [tts_output1, tts_output2])
123
-
124
- app.queue(concurrency_count=3).launch(show_api=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
  import re
4
+
5
  import librosa
6
  import numpy as np
7
  import torch
 
15
 
16
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
17
 
 
18
 
19
  def get_text(text, hps, is_phoneme):
20
  text_norm = text_to_sequence(text, hps.symbols, [] if is_phoneme else hps.data.text_cleaners)
 
28
  def tts_fn(text, speaker, speed, is_phoneme):
29
  if limitation:
30
  text_len = len(text)
31
+ max_len = 100
32
+ if is_phoneme:
33
+ max_len *= 3
34
+ else:
35
+ if len(hps.data.text_cleaners) > 0 and hps.data.text_cleaners[0] == "zh_ja_mixture_cleaners":
36
+ text_len = len(re.sub("(\[ZH\]|\[JA\])", "", text))
37
  if text_len > max_len:
38
  return "Error: Text is too long", None
39
 
 
51
  return tts_fn
52
 
53
 
54
+
55
+
56
+
57
  def create_to_phoneme_fn(hps):
58
  def to_phoneme_fn(text):
59
  return _clean_text(text, hps.data.text_cleaners) if text != "" else ""
 
102
 
103
  t = 'vits'
104
  models_tts.append((name, cover_path, speakers, lang, example,
105
+ hps.symbols, create_tts_fn(model, hps, speaker_ids),
106
  create_to_phoneme_fn(hps)))
107
+
108
  app = gr.Blocks(css=css)
109
 
110
  with app:
111
  gr.Markdown("# BlueArchive Hoshino TTS Using Vits Model\n"
112
  "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=kdrkdrkdr.HoshinoTTS)\n\n")
113
 
114
+ for i, (name, cover_path, speakers, lang, example, symbols, tts_fn,
115
+ to_phoneme_fn) in enumerate(models_tts):
116
 
117
  with gr.Column():
118
  gr.Markdown(f"## {name}\n\n"
119
  f"![cover](file/{cover_path})\n\n"
120
  f"lang: {lang}")
121
+ tts_input1 = gr.TextArea(label="Text (100 words limitation)", value=example,
122
  elem_id=f"tts-input{i}")
123
  tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
124
  type="index", value=speakers[0])
125
+ tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.1, maximum=2, step=0.1)
126
+ with gr.Accordion(label="Advanced Options", open=False):
127
+ phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
128
+ to_phoneme_btn = gr.Button("Covert text to phoneme")
129
+ phoneme_list = gr.Dataset(label="Phoneme list", components=[tts_input1],
130
+ samples=[[x] for x in symbols],
131
+ elem_id=f"phoneme-list{i}")
132
+ phoneme_list_json = gr.Json(value=symbols, visible=False)
133
  tts_submit = gr.Button("Generate", variant="primary")
134
  tts_output1 = gr.Textbox(label="Output Message")
135
  tts_output2 = gr.Audio(label="Output Audio")
136
+ tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
137
  [tts_output1, tts_output2])
138
+ to_phoneme_btn.click(to_phoneme_fn, [tts_input1], [tts_input1])
139
+ phoneme_list.click(None, [phoneme_list, phoneme_list_json], [],
140
+ _js=f"""
141
+ (i,phonemes) => {{
142
+ let root = document.querySelector("body > gradio-app");
143
+ if (root.shadowRoot != null)
144
+ root = root.shadowRoot;
145
+ let text_input = root.querySelector("#tts-input{i}").querySelector("textarea");
146
+ let startPos = text_input.selectionStart;
147
+ let endPos = text_input.selectionEnd;
148
+ let oldTxt = text_input.value;
149
+ let result = oldTxt.substring(0, startPos) + phonemes[i] + oldTxt.substring(endPos);
150
+ text_input.value = result;
151
+ let x = window.scrollX, y = window.scrollY;
152
+ text_input.focus();
153
+ text_input.selectionStart = startPos + phonemes[i].length;
154
+ text_input.selectionEnd = startPos + phonemes[i].length;
155
+ text_input.blur();
156
+ window.scrollTo(x, y);
157
+ return [];
158
+ }}""")
159
+
160
+ app.queue(concurrency_count=3).launch(show_api=False)