File size: 8,864 Bytes
3d67931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f9c46f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
import gradio as gr

from textwrap import dedent

import edge_tts
import tempfile
from tts_voice import tts_order_voice

from english.translate import Translate
from english.split_text import sentence_split
from english.generator import generatorArticle

import random
import codecs
import torch
import librosa
from models import SynthesizerTrn

from scipy.io.wavfile import write
import utils
from mel_processing import mel_spectrogram_torch
from speaker_encoder.voice_encoder import SpeakerEncoder
from transformers import WavLMModel

# Voice table imported from tts_voice. Its keys populate the speaker dropdown in
# the UI, and the value looked up for the selected key is handed to
# edge_tts.Communicate as the voice.
language_dict = tts_order_voice

def parse_text(input):
    """Generate an article from *input* and return it as stripped plain text.

    The function previously also built an HTML-escaped copy of the article
    line by line, but that copy was never returned or stored anywhere, so it
    was pure dead work and has been removed. The returned value is exactly
    what it was before: the raw generated text with surrounding whitespace
    stripped, which is what the Textbox outputs of this app display.

    Args:
        input: the user-supplied words forwarded unchanged to
            ``generatorArticle``.

    Returns:
        The text produced by ``generatorArticle(input)`` after ``.strip()``.
    """
    return generatorArticle(input).strip()

def predict(input):
    """Yield the generated article once for each of the two bound outputs.

    This is a generator that produces a single ``(article, article)`` pair,
    so both output components receive the same text.
    """
    generated = parse_text(input)
    yield generated, generated

async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with edge-tts and return the path of the saved MP3.

    Args:
        text: the text to read aloud.
        language_code: key into ``language_dict`` selecting the voice.

    Returns:
        Path of a newly created ``.mp3`` temporary file. The file is created
        with ``delete=False``, so it is not removed automatically.
    """
    speaker = language_dict[language_code]
    synthesis = edge_tts.Communicate(text, speaker)

    # Only the file name is needed here: the handle is closed straight away
    # and the audio is written to that path afterwards.
    placeholder = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    mp3_path = placeholder.name
    placeholder.close()

    await synthesis.save(mp3_path)
    return mp3_path

def tran_2_chianese(text):
    """Translate *text* sentence by sentence and interleave the results.

    Each sentence returned by ``sentence_split`` is echoed to stdout, passed
    to ``Translate.translateToZh``, and emitted as the original sentence on
    one line followed by its translation on the next.

    Returns:
        A single string of alternating source/translation lines, each line
        terminated by a newline.
    """
    translator = Translate()
    pieces = []
    for sentence in sentence_split(text):
        print('\n' + sentence)
        pieces.append(sentence + '\n' + translator.translateToZh(sentence) + '\n')
    return ''.join(pieces)

def readWorldsFile(file_path):
    """Read a GB2312-encoded word list made of ``word|paraphrase`` lines.

    Args:
        file_path: path of the word file to read.

    Returns:
        A ``(worlds, paraphrase)`` pair of parallel lists: the stripped text
        before the first ``|`` of every entry, and the stripped text of the
        second ``|``-separated field.

    Lines that contain no ``|`` separator (blank lines, a trailing newline,
    stray headings) are skipped instead of raising ``IndexError``.
    """
    worlds, paraphrase = [], []
    # The context manager closes the file even if decoding fails part-way.
    with codecs.open(file_path, 'r', encoding='gb2312') as fp:
        for line in fp.readlines():
            fields = line.split('|')
            if len(fields) < 2:
                # No separator on this line, so there is no entry to record.
                continue
            worlds.append(fields[0].strip())
            paraphrase.append(fields[1].strip())
    return worlds, paraphrase

def generatorWorlds(file_path, count=15):
    """Pick random entries from a word file and format them for display.

    Args:
        file_path: path of the word file, read through ``readWorldsFile``.
        count: number of entries to draw; defaults to 15. Entries are drawn
            independently, so the same word may appear more than once.

    Returns:
        One line per drawn entry, each formatted as ``word,【paraphrase】``
        and terminated by a newline. The same text is echoed to stdout.

    Raises:
        ValueError: if the file yields no entries to choose from.
    """
    worlds, paraphrase = readWorldsFile(file_path)
    if not worlds:
        raise ValueError(f"No 'word|paraphrase' entries found in {file_path!r}")

    # randrange() excludes its upper bound. The earlier randint(0, length)
    # could return `length` itself and index one past the end of the lists.
    picks = [random.randrange(len(worlds)) for _ in range(count)]
    worlds_text = ''.join(
        f'{worlds[pick]},【{paraphrase[pick]}】\n' for pick in picks
    )

    print('\n' + worlds_text)
    return worlds_text

def choose_word_from_file(input):
    """Return randomly chosen words from the file uploaded in the UI.

    Args:
        input: the value of the ``gr.File`` component. It may be ``None``
            (nothing uploaded), a path string, or a file-like object.

    Returns:
        The formatted word list produced by ``generatorWorlds``.

    Raises:
        ValueError: if no file has been uploaded.
    """
    if input is None:
        raise ValueError("Please upload a word file before extracting words.")

    if isinstance(input, str):
        path = input
    else:
        # NOTE(review): Gradio's uploaded-file object exposes the server-side
        # temporary copy as `.name`, while `.orig_name` is only the original
        # file name and usually does not exist on the server. Prefer `.name`
        # and fall back to the previous attribute. Confirm against the
        # installed Gradio version.
        path = getattr(input, "name", None) or input.orig_name
    return generatorWorlds(path)

# Run every model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Speaker encoder built from a local checkpoint; convert() uses it to embed the
# target voice sample.
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading FreeVC(24k)...")
# Hyper-parameters for the 24 kHz FreeVC model; convert() also reads the audio
# settings (sampling rate, mel parameters) from this object.
hps = utils.get_hparams_from_file("configs/freevc-24.json")
# Only the 24 kHz FreeVC synthesizer is instantiated in this file.
freevc_24 = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
# Return values are bound to `_` because they are not needed afterwards.
_ = freevc_24.eval()
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)

print("Loading WavLM for content...")
# WavLM model; convert() takes its last hidden state for the source audio.
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
 

def convert(model, src, tgt):
    """Re-voice the speech in *src* using the voice of *tgt*.

    Args:
        model: model choice label; one of ``"FreeVC"``, ``"FreeVC-s"`` or
            ``"FreeVC (24kHz)"``.
        src: path of the audio whose spoken content is kept.
        tgt: path of the audio sample whose voice is imitated.

    Returns:
        Path of a newly written ``.wav`` temporary file holding the result.
        A fresh file is used for every call so that overlapping requests do
        not overwrite each other's output.

    Raises:
        ValueError: if either audio path is missing, if *model* is not a
            known choice, or if the chosen synthesizer has not been loaded
            by this module.
    """
    if not src or not tgt:
        raise ValueError(
            "Both a source audio file and a target voice sample are required."
        )

    # Resolve the synthesizer before any audio work, so that an unavailable
    # choice fails immediately with a clear message instead of a NameError
    # after the expensive preprocessing.
    synthesizer_names = {
        "FreeVC": "freevc",
        "FreeVC-s": "freevc_s",
        "FreeVC (24kHz)": "freevc_24",
    }
    if model not in synthesizer_names:
        raise ValueError(f"Unknown model choice: {model!r}")
    synthesizer = globals().get(synthesizer_names[model])
    if synthesizer is None:
        raise ValueError(f"Model {model!r} is not loaded in this application.")

    with torch.no_grad():
        # Target voice: load, trim silence, then build the conditioning input.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        if model == "FreeVC-s":
            # This variant is conditioned on the target mel-spectrogram.
            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
            mel_tgt = mel_spectrogram_torch(
                wav_tgt,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )
            conditioning = {"mel": mel_tgt}
        else:
            # The other variants are conditioned on a speaker embedding.
            g_tgt = smodel.embed_utterance(wav_tgt)
            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
            conditioning = {"g": g_tgt}

        # Source speech: content features from the WavLM model.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)

        audio = synthesizer.infer(c, **conditioning)
        audio = audio[0][0].data.cpu().float().numpy()

    # Same output rates as before: 24 kHz for the 24 kHz model, otherwise the
    # configured sampling rate.
    sample_rate = 24000 if model == "FreeVC (24kHz)" else hps.data.sampling_rate

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        out = tmp_file.name
    write(out, sample_rate, audio)
    return out

# Gradio UI: word input -> generated article -> translation -> text-to-speech
# -> voice cloning. Components are declared first and wired to the handler
# functions defined above.
with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo:
    gr.HTML("<center>"
            "<h1>OpenAI + 声音克隆:根据单词生成短文,帮助理解单词使用的语境!!</h1>"
            "</center>")

    # Static usage notes shown in an accordion that starts expanded.
    with gr.Accordion("📒 相关信息", open=True):
        # NOTE(review): the first line of this literal has no leading
        # whitespace, so dedent() finds no common indentation to remove and the
        # bullet lines stay indented. Confirm the Markdown renders as intended.
        _ = f"""OpenAI Prompt 的可选参数信息:
            * 输入 10-15 个单词为宜
            * prompt = '你是一个非常厉害的英语助手,请将'{'words'}'组成一篇英语文章,字数限制在100 字以内'
            * Open AI 用的是限制账号,每分钟请求 3 次
            * 单词文件:每个单词及解释单独成行,单词与注释同行,用 “|” 分割
            """
        gr.Markdown(dedent(_))

    # Word source: an uploaded word file, or words typed directly.
    with gr.Row():

        file = gr.File()
        chooseBtn = gr.Button("从文件提取或输入 -》", variant="secondary")
        user_input = gr.Textbox(
                    max_lines=5,
                    lines=3,
                    label="单词用逗号分割:",
                    placeholder="10-15 words will be better",
                )
        
    # Article generation trigger and the textbox that receives the article.
    with gr.Column(scale=1):
        submitBtn = gr.Button("开始生成英语短文", variant="primary")
        chatbot = gr.Textbox(label="英语短文:", lines = 5, max_lines=8)
        
    # Fill the word textbox with entries drawn from the uploaded file.
    chooseBtn.click(
        choose_word_from_file, 
        inputs=[file], 
        outputs=[user_input],        
        show_progress="full",
        api_name="choose_word_from_file"
    )

    # Translation output and its trigger button.
    with gr.Column(scale=3):
        with gr.Row():
            tran_result = gr.Textbox(label="翻译结果", lines = 5,max_lines=8,scale=2)
            tran_btn = gr.Button("翻译", variant="primary")
    
    # Translate the current article text into the translation textbox.
    tran_btn.click(
        tran_2_chianese, 
        inputs=[chatbot], 
        outputs=[tran_result],        
        show_progress="full",
        api_name="tran_2_chianese"
    )
    
    # Text-to-speech: voice selection, trigger button and the resulting audio.
    with gr.Column(min_width=32, scale=2):
        with gr.Row():
            with gr.Column():
                language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
                tts_btn = gr.Button("生成对应的音频吧", variant="primary")
            output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)

    # Read the article aloud with the selected voice.
    tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio])

    # Voice cloning: the generated speech is the source, the uploaded sample is
    # the target voice. The model dropdown is hidden and fixed to its default.
    with gr.Row():
        model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False) 
        audio1 = output_audio
        audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
        clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
        audio_cloned =  gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')

    clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
 
    # Pressing Enter in the word box and clicking the generate button both run
    # predict, which writes the same article to both output textboxes.
    user_input.submit(
        predict,
        [user_input],
        [chatbot,tran_result],
        show_progress="full",
    )

    submitBtn.click(
        predict,
        [user_input],
        [chatbot,tran_result],
        show_progress="full",
        api_name="predict",
    )
    # submitBtn.click(reset_user_input, [], [user_input])

# Enable the request queue, then start the server with errors shown in the UI.
demo.queue().launch(show_error=True, debug=True)