chilge committed on
Commit
04749f1
1 Parent(s): 2e910d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -38
app.py CHANGED
@@ -1,23 +1,18 @@
 
1
  import os
 
 
 
 
 
 
2
  import json
3
- import math
4
  import torch
5
- from torch import nn
6
- from torch.nn import functional as F
7
- from torch.utils.data import DataLoader
8
- from scipy.io.wavfile import write
9
  import numpy as np
10
-
11
- import gradio as gr
12
- import IPython.display as ipd
13
-
14
- import commons
15
- import utils
16
- from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
17
  from models import SynthesizerTrn
18
  from text.symbols import symbols
19
  from text import text_to_sequence
20
-
21
 
22
  def get_text(text, hps):
23
  text_norm = text_to_sequence(text, hps.data.text_cleaners)
@@ -26,7 +21,6 @@ def get_text(text, hps):
26
  text_norm = torch.LongTensor(text_norm)
27
  return text_norm
28
 
29
-
30
  def get_text_byroma(text, hps):
31
  text_norm = []
32
  for i in text:
@@ -36,26 +30,22 @@ def get_text_byroma(text, hps):
36
  text_norm = torch.LongTensor(text_norm)
37
  return text_norm
38
 
39
-
40
  hps = utils.get_hparams_from_file("./configs/leo.json")
41
  net_g = SynthesizerTrn(
42
  len(symbols),
43
  hps.data.filter_length // 2 + 1,
44
  hps.train.segment_size // hps.data.hop_length,
45
  n_speakers=hps.data.n_speakers,
46
- **hps.model
47
- )
48
  _ = net_g.eval()
49
-
50
  _ = utils.load_checkpoint("logs/leo/G_4000.pth", net_g, None)
51
 
52
  # 随机抽取情感参考音频的根目录
53
  random_emotion_root = "wavs"
54
  emotion_dict = json.load(open("configs/leo.json", "r"))
55
 
56
-
57
  def tts(txt, emotion, roma=False, length_scale=1):
58
- """emotion为参考情感音频路径 或random_sample(随机抽取)"""
59
  if roma:
60
  stn_tst = get_text_byroma(txt, hps)
61
  else:
@@ -82,21 +72,22 @@ def tts(txt, emotion, roma=False, length_scale=1):
82
  audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.2, emo=emo)[0][0,0].data.float().numpy()
83
  ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
84
 
85
-
86
- def run_tts(text, emotion, roma):
87
- tts(text, emotion, roma)
88
-
89
-
90
- iface = gr.Interface(
91
- fn=run_tts,
92
- inputs=["text", "text", "checkbox"],
93
- outputs="audio",
94
- layout="vertical",
95
- title="TTS Demo",
96
- description="Generative TTS Demo with Emotional Control",
97
- allow_flagging=False,
98
- theme="huggingface",
99
- flagging_dir="flagged",
100
- )
101
-
102
- iface.launch(inline=True)
 
 
1
+ import gradio as gr
2
  import os
3
+ import random
4
+ import IPython.display as ipd
5
+
6
+ import matplotlib.pyplot as plt
7
+ %matplotlib inline
8
+
9
  import json
 
10
  import torch
 
 
 
 
11
  import numpy as np
 
 
 
 
 
 
 
12
  from models import SynthesizerTrn
13
  from text.symbols import symbols
14
  from text import text_to_sequence
15
+ from scipy.io.wavfile import write
16
 
17
  def get_text(text, hps):
18
  text_norm = text_to_sequence(text, hps.data.text_cleaners)
 
21
  text_norm = torch.LongTensor(text_norm)
22
  return text_norm
23
 
 
24
  def get_text_byroma(text, hps):
25
  text_norm = []
26
  for i in text:
 
30
  text_norm = torch.LongTensor(text_norm)
31
  return text_norm
32
 
 
33
  hps = utils.get_hparams_from_file("./configs/leo.json")
34
  net_g = SynthesizerTrn(
35
  len(symbols),
36
  hps.data.filter_length // 2 + 1,
37
  hps.train.segment_size // hps.data.hop_length,
38
  n_speakers=hps.data.n_speakers,
39
+ **hps.model)
 
40
  _ = net_g.eval()
 
41
  _ = utils.load_checkpoint("logs/leo/G_4000.pth", net_g, None)
42
 
43
  # 随机抽取情感参考音频的根目录
44
  random_emotion_root = "wavs"
45
  emotion_dict = json.load(open("configs/leo.json", "r"))
46
 
 
47
  def tts(txt, emotion, roma=False, length_scale=1):
48
+ """emotion为参考情感音频路径或random_sample(随机抽取)"""
49
  if roma:
50
  stn_tst = get_text_byroma(txt, hps)
51
  else:
 
72
  audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.2, emo=emo)[0][0,0].data.float().numpy()
73
  ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
74
 
75
+ # 定义GUI界面的输入和输出
76
+ def generate_audio(txt, emotion):
77
+ tts(txt, emotion)
78
+ return "Audio Generated"
79
+
80
+ inputs = [
81
+ gr.inputs.Textbox(lines=2, label="Text Input"),
82
+ gr.inputs.Radio(["random_sample", "wavs/vo_bm_main2_07_20_0048.wav"], label="Emotion Reference"),
83
+ ]
84
+
85
+ # 创建GUI界面
86
+ title = "Emotion TTS"
87
+ description = "Enter the text and select the emotion reference to generate synthesized speech."
88
+ outputs = gr.outputs.Textbox(label="Audio Output")
89
+ examples = [["Hello, how are you?", "random_sample"]]
90
+ gr_interface = gr.Interface(fn=generate_audio, inputs=inputs, outputs=outputs, title=title, description=description, examples=examples)
91
+
92
+ # 运行GUI界面
93
+ gr_interface.launch()