File size: 5,188 Bytes
ec1133b
 
 
d37479b
 
 
cc6e289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec1133b
cc6e289
 
 
 
 
 
 
8d62e2f
0770c01
 
 
8d62e2f
cc6e289
 
0770c01
cc6e289
 
0770c01
cc6e289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0770c01
cc6e289
 
 
 
 
0770c01
 
cc6e289
 
0770c01
cc6e289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
import os

# Install the PaddleSpeech CLI (invoked later as the `paddlespeech` command)
# and the PaddlePaddle package at startup.
#
# `sys.executable -m pip` is used instead of a bare `pip` shell command so the
# packages land in the interpreter that is actually running this script, and
# the argument list avoids going through a shell at all.
import subprocess
import sys

for _package in ("paddlespeech", "paddlepaddle"):
    _pip_result = subprocess.run(
        [sys.executable, "-m", "pip", "install", _package],
        check=False,  # best-effort, like the original: do not abort startup
    )
    if _pip_result.returncode != 0:
        # Surface the failure in the logs instead of ignoring it silently.
        print(
            f"warning: 'pip install {_package}' exited with status "
            f"{_pip_result.returncode}",
            file=sys.stderr,
        )

from transformers import AutoModel, AutoTokenizer
from TTS.api import TTS

# Keyword arguments shared by both TTS model instances below.
_TTS_COMMON_KWARGS = {"progress_bar": False, "gpu": True}

# Voice-conversion model; `chinese` calls its `voice_conversion_to_file`.
tts = TTS(
    model_name="voice_conversion_models/multilingual/vctk/freevc24",
    **_TTS_COMMON_KWARGS,
)

# Multilingual text-to-speech model; `english` calls its `tts_to_file`.
tts1 = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    **_TTS_COMMON_KWARGS,
)

import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# Speech-enhancement model applied to every synthesized clip before it is
# returned to the UI. Loaded onto the CUDA device.
_ENHANCER_SOURCE = "speechbrain/metricgan-plus-voicebank"
_ENHANCER_SAVEDIR = "pretrained_models/metricgan-plus-voicebank"
enhance_model = SpectralMaskEnhancement.from_hparams(
    source=_ENHANCER_SOURCE,
    savedir=_ENHANCER_SAVEDIR,
    run_opts={"device": "cuda"},
)

# Chat model and its tokenizer, used by `predict`. The model is converted to
# half precision, moved to the GPU and switched to evaluation mode.
_CHAT_MODEL_ID = "THUDM/chatglm-6b"
tokenizer = AutoTokenizer.from_pretrained(_CHAT_MODEL_ID, trust_remote_code=True)
model = (
    AutoModel.from_pretrained(_CHAT_MODEL_ID, trust_remote_code=True)
    .half()
    .cuda()
    .eval()
)

def inference(text):
    """Synthesize *text* to speech with the PaddleSpeech command-line tool.

    Runs ``paddlespeech tts --input <text> --output output.wav``.

    Args:
        text: The text to synthesize. It is passed to the tool verbatim as a
            single argument, so quotes and shell metacharacters are harmless.

    Returns:
        The path of the written audio file, always ``"output.wav"``.

    Raises:
        FileNotFoundError: If the ``paddlespeech`` executable is not on PATH.
        subprocess.CalledProcessError: If the tool exits with a non-zero
            status, so a stale ``output.wav`` from an earlier call is never
            mistaken for the result of this one.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    output_path = "output.wav"
    # An argument list (no shell) prevents command injection: the text comes
    # from the chat model / user and may contain quotes or `;`.
    subprocess.run(
        ["paddlespeech", "tts", "--input", text, "--output", output_path],
        check=True,
    )
    return output_path

def predict(input, history=None):
    """Run one chat turn through the module-level ChatGLM model.

    Args:
        input: The user's message for this turn.
        history: Conversation history from earlier turns; a fresh empty list
            is used when it is ``None``.

    Returns:
        A 3-tuple ``(history, history, response)``: the updated history twice
        (the click/submit handlers route it to both the chatbot and the
        state component) followed by the model's latest reply.
    """
    turns = [] if history is None else history
    reply, turns = model.chat(tokenizer, input, turns)
    return turns, turns, reply

def chinese(text_cn, upload1, VoiceMicrophone1):
    """Speak *text_cn* in a reference voice and return the enhanced audio.

    The text is first synthesized by ``inference``; the result is then
    converted towards the reference voice with the voice-conversion model
    and finally passed through the speech-enhancement model.

    Args:
        text_cn: Text to speak.
        upload1: Path of an uploaded reference recording, or ``None``.
        VoiceMicrophone1: Path of a microphone reference recording, used
            only when ``upload1`` is ``None``.

    Returns:
        The path of the enhanced audio file, always ``"enhanced.wav"``.

    Raises:
        ValueError: If the text is empty/blank or no reference recording was
            provided. Raised before any synthesis work is started.
    """
    # The uploaded file takes precedence over the microphone recording.
    reference_wav = upload1 if upload1 is not None else VoiceMicrophone1
    if reference_wav is None:
        raise ValueError(
            "请先上传或录制一段参考声音 "
            "(please upload or record a reference voice first)"
        )
    if not text_cn or not text_cn.strip():
        raise ValueError(
            "没有可合成的文本 (there is no text to synthesize yet)"
        )

    converted_path = "output0.wav"
    tts.voice_conversion_to_file(
        source_wav=inference(text_cn),
        target_wav=reference_wav,
        file_path=converted_path,
    )

    # unsqueeze(0) adds the leading batch dimension for enhance_batch.
    noisy = enhance_model.load_audio(converted_path).unsqueeze(0)
    # NOTE(review): lengths=[1.] presumably marks the clip as full-length
    # (relative length) — confirm against the SpeechBrain API.
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.0]))

    enhanced_path = "enhanced.wav"
    torchaudio.save(enhanced_path, enhanced.cpu(), 16000)
    return enhanced_path

def english(text_en, upload, VoiceMicrophone):
    """Speak *text_en* in a reference voice and return the enhanced audio.

    The stripped text is synthesized directly in the reference speaker's
    voice by the multilingual TTS model (language ``"en"``), then passed
    through the speech-enhancement model.

    Args:
        text_en: Text to speak; surrounding whitespace is stripped.
        upload: Path of an uploaded reference recording, or ``None``.
        VoiceMicrophone: Path of a microphone reference recording, used only
            when ``upload`` is ``None``.

    Returns:
        The path of the enhanced audio file, always ``"enhanced.wav"``.

    Raises:
        ValueError: If the text is empty/blank or no reference recording was
            provided. Raised before any synthesis work is started.
    """
    # The uploaded file takes precedence over the microphone recording.
    speaker_wav = upload if upload is not None else VoiceMicrophone
    if speaker_wav is None:
        raise ValueError(
            "请先上传或录制一段参考声音 "
            "(please upload or record a reference voice first)"
        )
    if not text_en or not text_en.strip():
        raise ValueError(
            "没有可合成的文本 (there is no text to synthesize yet)"
        )

    synthesized_path = "output.wav"
    tts1.tts_to_file(
        text_en.strip(),
        speaker_wav=speaker_wav,
        language="en",
        file_path=synthesized_path,
    )

    # unsqueeze(0) adds the leading batch dimension for enhance_batch.
    noisy = enhance_model.load_audio(synthesized_path).unsqueeze(0)
    # NOTE(review): lengths=[1.] presumably marks the clip as full-length
    # (relative length) — confirm against the SpeechBrain API.
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.0]))

    enhanced_path = "enhanced.wav"
    torchaudio.save(enhanced_path, enhanced.cpu(), 16000)
    return enhanced_path

# Build the Gradio UI: a chat area backed by `predict`, followed by a
# voice-cloning area backed by `chinese` / `english`. Components appear on
# the page in the order they are created here.
with gr.Blocks() as demo:
    # Page title and tagline.
    gr.Markdown(
            """ # <center>🥳💬💕 - TalktoAI,随时随地,谈天说地!</center>
            
            ### <center>🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!</center>
            
      """
    )
    # Conversation history, passed into and returned from `predict`.
    state = gr.State([])
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
    # Receives the third output of `predict` (the latest reply).
    res = gr.Textbox(lines=1, placeholder="最新的回答在这里", show_label = False).style(container=False)
    with gr.Row():
#        with gr.Column(scale=4):
        txt = gr.Textbox(label = "说点什么吧(中英皆可)", lines=1)
#        with gr.Column(scale=1):
        button = gr.Button("开始对话吧")
    # Pressing Enter in the textbox and clicking the button both run one chat turn.
    txt.submit(predict, [txt, state], [chatbot, state, res])
    button.click(predict, [txt, state], [chatbot, state, res])
    
    # Voice-cloning inputs: the text to speak plus a reference voice.
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        # The latest-reply textbox doubles as the text input for synthesis.
        inp3 = res
        # Reference voice: either an uploaded file or a microphone recording,
        # both delivered to the handlers as file paths.
        inp4 = gr.Audio(source="upload", label = "请上传您喜欢的声音(wav/mp3文件);长语音(90s左右)效果更好", type="filepath")
        inp5 = gr.Audio(source="microphone", type="filepath", label = '请用麦克风上传您喜欢的声音,与文件上传二选一即可')
        btn1 = gr.Button("用喜欢的声音听一听吧(中文)")

        btn2 = gr.Button("用喜欢的声音听一听吧(英文)")
    # Players for the synthesized results.
    with gr.Row():
        out1 = gr.Audio(label="为您合成的专属声音(中文)")
        out2 = gr.Audio(label="为您合成的专属声音(英文)")
    # Both buttons take the same three inputs; they differ in handler and output.
    btn1.click(chinese, [inp3, inp4, inp5], [out1])
    btn2.click(english, [inp3, inp4, inp5], [out2])

    # Usage disclaimer and credits.
    gr.Markdown(
            """ ### <center>注意❗:请不要输入或生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关,请自觉合法合规使用,违反者一切后果自负。</center>
            
            ### <center>Model by [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b). Thanks to [THUDM](https://github.com/THUDM). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>
            
      """
        )
        
    # Footer with a quote and a note about the voice-conversion approach.
    gr.HTML('''
        <div class="footer">
                    <p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
                    </p>
                    <p>注:中文声音克隆实际上是通过声音转换(Voice Conversion)实现,所以输出结果可能更像是一种新的声音,效果不一定很理想,希望大家多多包涵,之后我们也会不断迭代该程序的!为了实现更好的效果,使用中文声音克隆时请尽量上传女声。
                    </p>
        </div>
        ''')     

# Enable request queuing and start the app; show_error=True asks Gradio to
# display handler errors in the browser.
demo.queue().launch(show_error=True)