skytnt committed
Commit 3561130
Parent(s): 8935672

add models
app.py CHANGED
@@ -1,3 +1,4 @@
+import json
 import os
 
 os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
@@ -22,62 +23,64 @@ def get_text(text, hps):
     return text_norm
 
 
-def tts_fn(text, speaker):
-    if len(text) > 150:
-        return "Error: Text is too long", None
-    model, hps = models[model_idx[speaker]]
-    speaker_id = speaker_idx[speaker]
-    stn_tst = get_text(text, hps)
-    with no_grad():
-        x_tst = stn_tst.unsqueeze(0)
-        x_tst_lengths = LongTensor([stn_tst.size(0)])
-        sid = LongTensor([speaker_id])
-        audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
-            0, 0].data.cpu().float().numpy()
-    return "Success", (hps.data.sampling_rate, audio)
-
-
-def vc_fn(original_speaker, target_speaker, input_audio):
-    if input_audio is None:
-        return "You need to upload an audio", None
-    sampling_rate, audio = input_audio
-    duration = audio.shape[0] / sampling_rate
-    if duration > 30:
-        return "Error: Audio is too long", None
-    if model_idx[original_speaker] != model_idx[target_speaker]:
-        return "Error: Can not convert voice between different model", None
-
-    model, hps = models[model_idx[original_speaker]]
-    original_speaker_id = speaker_idx[original_speaker]
-    target_speaker_id = speaker_idx[target_speaker]
-
-    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-    if len(audio.shape) > 1:
-        audio = librosa.to_mono(audio.transpose(1, 0))
-    if sampling_rate != hps.data.sampling_rate:
-        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
-    y = torch.FloatTensor(audio)
-    y = y.unsqueeze(0)
-    spec = spectrogram_torch(y, hps.data.filter_length,
-                             hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                             center=False)
-    spec_lengths = LongTensor([spec.size(-1)])
-    sid_src = LongTensor([original_speaker_id])
-    sid_tgt = LongTensor([target_speaker_id])
-    with no_grad():
-        audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
-            0, 0].data.cpu().float().numpy()
-    return "Success", (hps.data.sampling_rate, audio)
+def create_tts_fn(model, hps, speaker_ids):
+    def tts_fn(text, speaker):
+        if len(text) > 150:
+            return "Error: Text is too long", None
+        speaker_id = speaker_ids[speaker]
+        stn_tst = get_text(text, hps)
+        with no_grad():
+            x_tst = stn_tst.unsqueeze(0)
+            x_tst_lengths = LongTensor([stn_tst.size(0)])
+            sid = LongTensor([speaker_id])
+            audio = \
+                model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
+                    0, 0].data.cpu().float().numpy()
+        return "Success", (hps.data.sampling_rate, audio)
+
+    return tts_fn
+
+
+def create_vc_fn(model, hps, speaker_ids):
+    def vc_fn(original_speaker, target_speaker, input_audio):
+        if input_audio is None:
+            return "You need to upload an audio", None
+        sampling_rate, audio = input_audio
+        duration = audio.shape[0] / sampling_rate
+        if duration > 30:
+            return "Error: Audio is too long", None
+        original_speaker_id = speaker_ids[original_speaker]
+        target_speaker_id = speaker_ids[target_speaker]
+
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
+        if sampling_rate != hps.data.sampling_rate:
+            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
+        y = torch.FloatTensor(audio)
+        y = y.unsqueeze(0)
+        spec = spectrogram_torch(y, hps.data.filter_length,
+                                 hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                                 center=False)
+        spec_lengths = LongTensor([spec.size(-1)])
+        sid_src = LongTensor([original_speaker_id])
+        sid_tgt = LongTensor([target_speaker_id])
+        with no_grad():
+            audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                0, 0].data.cpu().float().numpy()
+        return "Success", (hps.data.sampling_rate, audio)
+
+    return vc_fn
 
 
 if __name__ == '__main__':
     models = []
-    model_idx = []
-    speaker_idx = []
-    speakers = []
-    for i in range(0, 2):
+    with open("saved_model/names.json", "r", encoding="utf-8") as f:
+        models_names = json.load(f)
+    for i, models_name in models_names.items():
         config_path = f"saved_model/{i}/config.json"
         model_path = f"saved_model/{i}/model.pth"
+        cover_path = f"saved_model/{i}/cover.jpg"
         hps = utils.get_hparams_from_file(config_path)
         model = SynthesizerTrn(
             len(hps.symbols),
@@ -87,38 +90,50 @@ if __name__ == '__main__':
             **hps.model)
         utils.load_checkpoint(model_path, model, None)
         model.eval()
-        models.append((model, hps))
-        speakers = speakers + [f"model{i}/{x}" for x in hps.speakers]
-        model_idx = model_idx + [i] * len(hps.speakers)
-        speaker_idx = speaker_idx + list(range(0, len(hps.speakers)))
+        speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
+        speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
+
+        models.append((models_name, cover_path, speakers,
+                       create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids)))
 
     app = gr.Blocks()
 
     with app:
         gr.Markdown("# Moe Japanese TTS And Voice Conversion Using VITS Model\n\n"
                     "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=skytnt.moegoe)\n\n"
-                    "unofficial demo for [https://github.com/CjangCjengh/MoeGoe](https://github.com/CjangCjengh/MoeGoe)"
+                    "unofficial demo for \n\n"
+                    "- [https://github.com/CjangCjengh/MoeGoe](https://github.com/CjangCjengh/MoeGoe)\n"
+                    "- [https://github.com/Francis-Komizu/VITS](https://github.com/Francis-Komizu/VITS)"
                     )
         with gr.Tabs():
             with gr.TabItem("TTS"):
-                with gr.Column():
-                    tts_input1 = gr.TextArea(label="Text (150 words limitation)", value="こんにちは。")
-                    tts_input2 = gr.Dropdown(label="Speaker", choices=speakers, type="index", value=speakers[0])
-                    tts_submit = gr.Button("Generate", variant="primary")
-                    tts_output1 = gr.Textbox(label="Output Message")
-                    tts_output2 = gr.Audio(label="Output Audio")
+                with gr.Tabs():
+                    for i, (models_name, cover_path, speakers, tts_fn, vc_fn) in enumerate(models):
+                        with gr.TabItem(f"model{i}"):
+                            with gr.Column():
+                                gr.Markdown(f"## {models_name}\n\n"
+                                            f"![cover](file/{cover_path})")
+                                tts_input1 = gr.TextArea(label="Text (150 words limitation)", value="こんにちは。")
+                                tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
                                                         type="index", value=speakers[0])
+                                tts_submit = gr.Button("Generate", variant="primary")
+                                tts_output1 = gr.Textbox(label="Output Message")
+                                tts_output2 = gr.Audio(label="Output Audio")
+                                tts_submit.click(tts_fn, [tts_input1, tts_input2], [tts_output1, tts_output2])
             with gr.TabItem("Voice Conversion"):
-                with gr.Column():
-                    vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
-                                            value=speakers[0])
-                    vc_input2 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
-                                            value=speakers[1])
-                    vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
-                    vc_submit = gr.Button("Convert", variant="primary")
-                    vc_output1 = gr.Textbox(label="Output Message")
-                    vc_output2 = gr.Audio(label="Output Audio")
-
-                    tts_submit.click(tts_fn, [tts_input1, tts_input2], [tts_output1, tts_output2])
-                    vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
+                with gr.Tabs():
+                    for i, (models_name, cover_path, speakers, tts_fn, vc_fn) in enumerate(models):
+                        with gr.TabItem(f"model{i}"):
+                            gr.Markdown(f"## {models_name}\n\n"
+                                        f"![cover](file/{cover_path})")
+                            vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
+                                                    value=speakers[0])
+                            vc_input2 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
+                                                    value=speakers[1])
+                            vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
+                            vc_submit = gr.Button("Convert", variant="primary")
+                            vc_output1 = gr.Textbox(label="Output Message")
+                            vc_output2 = gr.Audio(label="Output Audio")
+                            vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
 
     app.launch()
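
Note on the refactor above: the commit replaces the module-level tts_fn/vc_fn with the factories create_tts_fn/create_vc_fn so that each Gradio tab created in the loop gets an inference function already bound to its own model, hps, and speaker_ids. The snippet below is a minimal, standalone illustration of that binding behaviour (it is not part of the commit): functions created directly inside a loop all share the loop variable, while routing the value through a factory freezes it per function.

def create_fn(value):
    # 'value' is bound per call to create_fn, so each returned fn keeps its own copy
    def fn():
        return value
    return fn

fns_factory = [create_fn(i) for i in range(3)]
fns_shared = [lambda: i for i in range(3)]  # every lambda closes over the same 'i'

print([f() for f in fns_factory])  # [0, 1, 2]
print([f() for f in fns_shared])   # [2, 2, 2]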
saved_model/0/cover.jpg ADDED

Git LFS Details

  • SHA256: 2d443da7d7eb5c5b054077ece85b68b2b94bf5db2b51001fe32404deea7f0717
  • Pointer size: 130 Bytes
  • Size of remote file: 39.9 kB
saved_model/1/cover.jpg ADDED

Git LFS Details

  • SHA256: 0123d1fa78031a85890869891b843b2f079c66fed12cf510cb6025e2e4db04c3
  • Pointer size: 130 Bytes
  • Size of remote file: 50.3 kB
saved_model/2/config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf80c1c1a185384ea9c4c8ea63964e2fc592e6d2aad3f8566b534a512ed90c28
+size 1294
saved_model/2/cover.jpg ADDED

Git LFS Details

  • SHA256: cf387dd1775ebf0f98245e433686a9f8f75bcc5aa8c4ceb192b8a98d0ec42432
  • Pointer size: 130 Bytes
  • Size of remote file: 60.2 kB
saved_model/2/model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16adcc6dd3f23ac4407176769f1e6843f86a5b16e04b8abb5a6a11132e6b9751
+size 476622149
saved_model/3/config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8507d589dd869adbc8f2f49f083930db85561184afaa9dc472006434e4bb1d7e
+size 1246
saved_model/3/cover.jpg ADDED

Git LFS Details

  • SHA256: 1284933d68ad829768df808feaee25ad68693b8b004c44f675462750b94dd1d8
  • Pointer size: 130 Bytes
  • Size of remote file: 47.3 kB
saved_model/3/model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60dfd6e56a1f895e3db4c054fd94d5a0362103dd5d2e19941e17dd1be41e6b11
+size 476796721
saved_model/names.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ea53575cc490320713273241bc46b30c0cb399fa1e49647a5c7e13cf5e2e444
+size 201
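
For context, the loader in app.py above iterates models_names.items() from saved_model/names.json, treating each key as a model directory index and each value as a display name. The file itself is only visible here as a Git LFS pointer, so its exact contents are not shown in this commit; the sketch below merely writes a file of the shape that loop appears to expect, with placeholder names.

import json

# Hypothetical contents of saved_model/names.json (assumption): keys match the
# saved_model/<i>/ directories added in this commit, values are placeholder display names.
names = {
    "0": "model name 0",
    "1": "model name 1",
    "2": "model name 2",
    "3": "model name 3",
}

with open("saved_model/names.json", "w", encoding="utf-8") as f:
    json.dump(names, f, ensure_ascii=False, indent=2)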