chilge committed on
Commit
84d6588
1 Parent(s): 8a55d7d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -57
app.py CHANGED
@@ -1,17 +1,12 @@
1
  import gradio as gr
2
- import os
3
- import random
4
- import IPython.display as ipd
5
  import commons
6
  import utils
7
- import json
8
- import torch
9
- import tempfile
10
- import numpy as np
11
  from models import SynthesizerTrn
12
  from text.symbols import symbols
13
  from text import text_to_sequence
14
- from scipy.io.wavfile import write
 
15
 
16
  def get_text(text, hps):
17
  text_norm = text_to_sequence(text, hps.data.text_cleaners)
@@ -19,16 +14,6 @@ def get_text(text, hps):
19
  text_norm = commons.intersperse(text_norm, 0)
20
  text_norm = torch.LongTensor(text_norm)
21
  return text_norm
22
-
23
- def get_text_byroma(text, hps):
24
- text_norm = []
25
- for i in text:
26
- text_norm.append(symbols.index(i))
27
- if hps.data.add_blank:
28
- text_norm = commons.intersperse(text_norm, 0)
29
- text_norm = torch.LongTensor(text_norm)
30
- return text_norm
31
-
32
  hps = utils.get_hparams_from_file("./configs/leo.json")
33
  net_g = SynthesizerTrn(
34
  len(symbols),
@@ -37,61 +22,90 @@ net_g = SynthesizerTrn(
37
  n_speakers=hps.data.n_speakers,
38
  **hps.model)
39
  _ = net_g.eval()
40
- _ = utils.load_checkpoint("logs/leo/G_4000.pth", net_g, None)
41
 
42
- # 随机抽取情感参考音频的根目录
43
- random_emotion_root = "wavs"
44
- emotion_dict = json.load(open("configs/leo.json", "r"))
45
-
46
- def tts(txt, emotion, temp_file_path):
47
- """emotion为参考情感音频路径或random_sample(随机抽取)"""
48
- if roma:
49
- stn_tst = get_text_byroma(txt, hps)
50
- else:
51
- stn_tst = get_text(txt, hps)
 
 
52
  with torch.no_grad():
53
  x_tst = stn_tst.unsqueeze(0)
54
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
55
  sid = torch.LongTensor([0])
56
- if os.path.exists(f"{emotion}.emo.npy"):
57
- emo = torch.FloatTensor(np.load(f"{emotion}.emo.npy")).unsqueeze(0)
 
 
58
  elif emotion == "random_sample":
59
- while True:
60
- rand_wav = random.sample(os.listdir(random_emotion_root), 1)[0]
61
- if rand_wav.endswith('wav') and os.path.exists(f"{random_emotion_root}/{rand_wav}.emo.npy"):
62
- break
63
- emo = torch.FloatTensor(np.load(f"{random_emotion_root}/{rand_wav}.emo.npy")).unsqueeze(0)
64
- print(f"{random_emotion_root}/{rand_wav}")
65
  elif emotion.endswith("wav"):
66
  import emotion_extract
67
  emo = torch.FloatTensor(emotion_extract.extract_wav(emotion))
68
  else:
69
- print("emotion参数不正确")
 
 
 
 
70
 
71
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.2, emo=emo)[0][0,0].data.float().numpy()
 
 
 
 
72
 
73
- # Save the numpy array as a temporary file
74
- write(temp_file_path, hps.data.sampling_rate, audio)
 
 
75
 
76
- # Display the audio
77
- ipd.display(ipd.Audio(temp_file_path, rate=hps.data.sampling_rate, normalize=False))
78
 
79
- # Delete the temporary file
80
- os.remove(temp_file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- return audio
83
 
84
- def generate_audio(txt, emotion):
85
- # Create a temporary file
86
- temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
87
- temp_file_path = temp_file.name
88
- audio = tts(txt, emotion, temp_file_path)
89
- return audio
90
 
91
- input_text = gr.inputs.Textbox(label="输入文本")
92
- input_emotion = gr.inputs.Dropdown(choices=["random_sample"] + os.listdir(random_emotion_root), label="参考情感音频")
93
- output_audio = gr.outputs.Audio(type="numpy", label="合成音频")
94
 
95
- iface = gr.Interface(fn=generate_audio, inputs=[input_text, input_emotion], outputs=output_audio)
96
- iface.launch()
97
 
 
1
  import gradio as gr
2
+ import torch
 
 
3
  import commons
4
  import utils
 
 
 
 
5
  from models import SynthesizerTrn
6
  from text.symbols import symbols
7
  from text import text_to_sequence
8
+ import numpy as np
9
+
10
 
11
  def get_text(text, hps):
12
  text_norm = text_to_sequence(text, hps.data.text_cleaners)
 
14
  text_norm = commons.intersperse(text_norm, 0)
15
  text_norm = torch.LongTensor(text_norm)
16
  return text_norm
 
 
 
 
 
 
 
 
 
 
17
  hps = utils.get_hparams_from_file("./configs/leo.json")
18
  net_g = SynthesizerTrn(
19
  len(symbols),
 
22
  n_speakers=hps.data.n_speakers,
23
  **hps.model)
24
  _ = net_g.eval()
 
25
 
26
+ _ = utils.load_checkpoint("./logs/leo/G_4000.pth", net_g, None)
27
+ all_emotions = np.load("all_emotions.npy")
28
+ emotion_dict = {
29
+ "小声": 0,
30
+ "激动": 1,
31
+ "平静1": 2,
32
+ "平静2": 3
33
+ }
34
+ import random
35
def tts(txt, emotion):
    """Synthesize speech for ``txt`` with the requested emotion embedding.

    ``emotion`` selects the vector passed to ``net_g.infer`` as ``emo``:

    * an ``int``: row index into ``all_emotions``;
    * ``"random"``: a fresh ``torch.randn([1, 1024])`` vector;
    * ``"random_sample"``: a randomly chosen row of ``all_emotions``;
    * a string ending in ``"wav"``: the result of
      ``emotion_extract.extract_wav(emotion)``;
    * any other string: a key of ``emotion_dict`` (unknown keys raise
      ``KeyError``).

    Returns:
        ``(audio, randsample)`` where ``audio`` is the synthesized waveform
        as a numpy array and ``randsample`` is the row index that was drawn
        for ``"random_sample"`` (``None`` for every other mode).
    """
    stn_tst = get_text(txt, hps)
    randsample = None
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([0])
        if isinstance(emotion, int):
            emo = torch.FloatTensor(all_emotions[emotion]).unsqueeze(0)
        elif emotion == "random":
            emo = torch.randn([1, 1024])
        elif emotion == "random_sample":
            # randrange excludes the upper bound; randint(0, n) could return
            # n itself and index one past the last row of all_emotions.
            randsample = random.randrange(all_emotions.shape[0])
            emo = torch.FloatTensor(all_emotions[randsample]).unsqueeze(0)
        elif emotion.endswith("wav"):
            # Imported lazily so the extractor is only loaded when a
            # reference wav is actually used.
            import emotion_extract
            emo = torch.FloatTensor(emotion_extract.extract_wav(emotion))
        else:
            emo = torch.FloatTensor(all_emotions[emotion_dict[emotion]]).unsqueeze(0)

        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1,
            emo=emo,
        )[0][0, 0].data.float().numpy()
    return audio, randsample
58
+
59
 
60
def tts1(text, emotion):
    """Gradio callback: synthesize ``text`` with the given ``emotion``.

    Returns a ``(message, audio)`` pair. Inputs longer than 150 characters
    are rejected with an error message and ``None`` audio; otherwise the
    audio is ``(hps.data.sampling_rate, waveform)``.
    """
    if len(text) <= 150:
        waveform, _ignored = tts(text, emotion)
        return "Success", (hps.data.sampling_rate, waveform)
    return "Error: Text is too long", None
65
 
66
def tts2(text):
    """Gradio callback: synthesize ``text`` in ``"random_sample"`` mode.

    Returns a ``(message, audio)`` pair. On success the message is the
    string form of the sample id reported by ``tts`` and the audio is
    ``(hps.data.sampling_rate, waveform)``. Inputs longer than 150
    characters are rejected with an error message and ``None`` audio.
    """
    if len(text) <= 150:
        waveform, sample_id = tts(text, "random_sample")
        return str(sample_id), (hps.data.sampling_rate, waveform)
    return "Error: Text is too long", None
 
72
 
73
def tts3(text, sample):
    """Gradio callback: synthesize ``text`` using an emotion sample id.

    Args:
        text: Text to synthesize; rejected if longer than 150 characters.
        sample: Value convertible with ``int()`` (the UI supplies a number),
            passed to ``tts`` as the emotion argument.

    Returns:
        A ``(message, audio)`` pair. On success the audio is
        ``(hps.data.sampling_rate, waveform)``; on any failure it is
        ``None`` and the message describes the problem.
    """
    if len(text) > 150:
        return "Error: Text is too long", None

    # Parse the id separately so a bad input is handled by a narrow except.
    try:
        sample_id = int(sample)
    except (TypeError, ValueError, OverflowError):
        return "输入参数不为整数或其他错误", None

    try:
        audio, _ = tts(text, sample_id)
        return "Success", (hps.data.sampling_rate, audio)
    except Exception:
        # Deliberate best-effort for the UI (e.g. an out-of-range id), but
        # no longer a bare except, so KeyboardInterrupt/SystemExit propagate.
        return "输入参数不为整数或其他错误", None
81
# Gradio front end: one tab per way of choosing the emotion embedding.
with gr.Blocks() as app:
    with gr.Tabs():
        # Tab 1: pick one of the named presets from emotion_dict.
        with gr.TabItem("使用预制情感合成"):
            preset_text = gr.TextArea(label="日语文本", value="こんにちは。私わあやちねねです。")
            preset_choice = gr.Dropdown(label="情感", choices=list(emotion_dict.keys()), value="平静1")
            preset_button = gr.Button("合成音频", variant="primary")
            preset_message = gr.Textbox(label="Message")
            preset_audio = gr.Audio(label="Output")
            preset_button.click(
                fn=tts1,
                inputs=[preset_text, preset_choice],
                outputs=[preset_message, preset_audio],
            )

        # Tab 2: let tts2 draw a random sample; the id is shown for reuse.
        with gr.TabItem("随机抽取训练集样本作为情感参数"):
            random_text = gr.TextArea(label="日语文本", value="こんにちは。私わあやちねねです。")
            random_button = gr.Button("合成音频", variant="primary")
            random_sample_id = gr.Textbox(label="随机样本id(可用于第三个tab中合成)")
            random_audio = gr.Audio(label="Output")
            random_button.click(
                fn=tts2,
                inputs=[random_text],
                outputs=[random_sample_id, random_audio],
            )

        # Tab 3: synthesize with an explicit sample id.
        with gr.TabItem("使用情感样本id作为情感参数"):
            by_id_text = gr.TextArea(label="日语文本", value="こんにちは。私わあやちねねです。")
            by_id_sample = gr.Number(label="情感样本id", value=2004)
            by_id_button = gr.Button("合成音频", variant="primary")
            by_id_message = gr.Textbox(label="Message")
            by_id_audio = gr.Audio(label="Output")
            by_id_button.click(
                fn=tts3,
                inputs=[by_id_text, by_id_sample],
                outputs=[by_id_message, by_id_audio],
            )

        # Tab 4: placeholder only; no callback is wired up.
        with gr.TabItem("使用参考音频作为情感参数"):
            reference_placeholder = gr.TextArea(label="text", value="暂未实现")

app.launch()
 
111