# Hugging Face Space: Kevin676/Telephone-Interviewing_PpaddleSpeech-TTS
# Space status when captured: Build error
# File size: 5,188 Bytes

import gradio as gr
import os

# Install the PaddleSpeech packages at start-up; the `paddlespeech` CLI is
# invoked later by `inference`. The return codes of these installs are ignored.
# NOTE(review): presumably done here because the packages are not in the
# Space's requirements file -- confirm.
os.system('pip install paddlespeech')
os.system('pip install paddlepaddle')

from transformers import AutoModel, AutoTokenizer
from TTS.api import TTS

# Voice-conversion model (FreeVC24, per the model name); used by `chinese`
# through `voice_conversion_to_file`. GPU use is requested via `gpu=True`.
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)

# Multilingual TTS model (YourTTS, per the model name); used by `english`
# through `tts_to_file` with a reference speaker recording.
tts1 = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)

import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# Speech-enhancement model applied to every synthesized clip before it is
# returned to the UI. Files are stored under `pretrained_models/` and the
# model is placed on the CUDA device.
enhance_model = SpectralMaskEnhancement.from_hparams(
source="speechbrain/metricgan-plus-voicebank",
savedir="pretrained_models/metricgan-plus-voicebank",
run_opts={"device":"cuda"},
)

# ChatGLM-6B chat model and tokenizer used by `predict`.
# NOTE(review): `trust_remote_code=True` lets the model repository supply its
# own Python code -- confirm the repository/revision is trusted.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# Converted to half precision and moved to CUDA right after loading.
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
# Switch to evaluation mode; the model is only used for inference here.
model = model.eval()

def inference(text):
    """Synthesize ``text`` to ``output.wav`` with the ``paddlespeech`` CLI.

    The command is executed as an argument list without a shell, so quotes
    or shell metacharacters inside ``text`` are passed through verbatim and
    can neither break the command line nor run additional commands.

    Args:
        text: The text to be spoken.

    Returns:
        The path of the generated audio file, always ``"output.wav"``.

    Raises:
        FileNotFoundError: If the ``paddlespeech`` executable is not found.
        subprocess.CalledProcessError: If the CLI exits with a non-zero
            status, so callers never continue with a stale or missing
            ``output.wav``.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    subprocess.run(
        ["paddlespeech", "tts", "--input", text, "--output", "output.wav"],
        check=True,
    )
    return "output.wav"

def predict(input, history=None):
    """Run one chat turn through the module-level ChatGLM ``model``.

    Args:
        input: The user's message for this turn.
        history: The conversation so far as accepted by ``model.chat``;
            ``None`` starts a fresh conversation.

    Returns:
        A ``(history, history, response)`` tuple: the updated history twice
        (the UI wires these to the chatbot display and the state store) and
        the text of the newest reply.
    """
    past_turns = [] if history is None else history
    response, updated_history = model.chat(tokenizer, input, past_turns)
    return updated_history, updated_history, response

def chinese(text_cn, upload1, VoiceMicrophone1):
    """Speak ``text_cn`` in the voice of a user-supplied reference recording.

    Steps: synthesize the text with ``inference``, convert that audio to the
    reference speaker with the voice-conversion model ``tts`` (written to
    ``output0.wav``), then run ``enhance_model`` over the result and save it
    as ``enhanced.wav``.

    Args:
        text_cn: The text to synthesize.
        upload1: Path of an uploaded reference recording, or ``None``.
            Takes precedence over the microphone recording.
        VoiceMicrophone1: Path of a microphone recording, used only when
            ``upload1`` is ``None``.

    Returns:
        The path of the enhanced audio file, always ``"enhanced.wav"``.

    Raises:
        ValueError: If ``text_cn`` is empty/blank, or if neither reference
            recording was provided. Both are checked before any model runs.
    """
    if not text_cn or not text_cn.strip():
        raise ValueError("没有可合成的文本 / No text to synthesize.")

    # The uploaded file wins; the microphone recording is the fallback.
    reference_wav = upload1 if upload1 is not None else VoiceMicrophone1
    if reference_wav is None:
        raise ValueError(
            "请先上传或录制一段参考声音 / "
            "Please upload or record a reference voice first."
        )

    tts.voice_conversion_to_file(
        source_wav=inference(text_cn),
        target_wav=reference_wav,
        file_path="output0.wav",
    )

    # Add a leading batch dimension for `enhance_batch`.
    noisy = enhance_model.load_audio("output0.wav").unsqueeze(0)
    # NOTE(review): lengths=[1.] presumably marks the single clip as
    # full-length (relative length) -- confirm against SpeechBrain docs.
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return "enhanced.wav"

def english(text_en, upload, VoiceMicrophone):
    """Speak ``text_en`` in English in the voice of a reference recording.

    Steps: synthesize the stripped text with the TTS model ``tts1`` using
    the reference recording as ``speaker_wav`` (written to ``output.wav``),
    then run ``enhance_model`` over the result and save it as
    ``enhanced.wav``.

    Args:
        text_en: The text to synthesize; surrounding whitespace is removed.
        upload: Path of an uploaded reference recording, or ``None``.
            Takes precedence over the microphone recording.
        VoiceMicrophone: Path of a microphone recording, used only when
            ``upload`` is ``None``.

    Returns:
        The path of the enhanced audio file, always ``"enhanced.wav"``.

    Raises:
        ValueError: If ``text_en`` is empty/blank, or if neither reference
            recording was provided. Both are checked before any model runs.
    """
    cleaned_text = text_en.strip() if text_en else ""
    if not cleaned_text:
        raise ValueError("没有可合成的文本 / No text to synthesize.")

    # The uploaded file wins; the microphone recording is the fallback.
    reference_wav = upload if upload is not None else VoiceMicrophone
    if reference_wav is None:
        raise ValueError(
            "请先上传或录制一段参考声音 / "
            "Please upload or record a reference voice first."
        )

    tts1.tts_to_file(
        cleaned_text,
        speaker_wav=reference_wav,
        language="en",
        file_path="output.wav",
    )

    # Add a leading batch dimension for `enhance_batch`.
    noisy = enhance_model.load_audio("output.wav").unsqueeze(0)
    # NOTE(review): lengths=[1.] presumably marks the single clip as
    # full-length (relative length) -- confirm against SpeechBrain docs.
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return "enhanced.wav"

# Gradio UI: a chat area driven by `predict`, plus two buttons that speak the
# latest reply in a user-supplied voice through `chinese` / `english`.
with gr.Blocks() as demo:
    # Page title and tagline.
    gr.Markdown(
            """ # <center>🥳💬💕 - TalktoAI，随时随地，谈天说地！</center>
            
            ### <center>🤖 - 让有人文关怀的AI造福每一个人！AI向善，文明璀璨！TalktoAI - Enable the future！</center>
            
      """
    )
    # Chat history store; passed into `predict` and overwritten by its output.
    state = gr.State([])
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
    # Receives the newest reply from `predict`; also reused below as the text
    # input for both speech buttons.
    res = gr.Textbox(lines=1, placeholder="最新的回答在这里", show_label = False).style(container=False)
    with gr.Row():
        # The two commented-out `gr.Column` lines below are disabled layout code.
#        with gr.Column(scale=4):
        txt = gr.Textbox(label = "说点什么吧(中英皆可)", lines=1)
#        with gr.Column(scale=1):
        button = gr.Button("开始对话吧")
    # Pressing Enter in the textbox and clicking the button both run `predict`
    # with the same inputs and outputs.
    txt.submit(predict, [txt, state], [chatbot, state, res])
    button.click(predict, [txt, state], [chatbot, state, res])
    
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        # Text to synthesize is the latest chat reply.
        inp3 = res
        # Reference voice: either an uploaded file or a microphone recording,
        # both delivered to the handlers as file paths.
        inp4 = gr.Audio(source="upload", label = "请上传您喜欢的声音(wav/mp3文件)；长语音(90s左右)效果更好", type="filepath")
        inp5 = gr.Audio(source="microphone", type="filepath", label = '请用麦克风上传您喜欢的声音，与文件上传二选一即可')
        btn1 = gr.Button("用喜欢的声音听一听吧(中文)")

        btn2 = gr.Button("用喜欢的声音听一听吧(英文)")
    with gr.Row():
        out1 = gr.Audio(label="为您合成的专属声音(中文)")
        out2 = gr.Audio(label="为您合成的专属声音(英文)")
    # Both buttons take the same three inputs; each writes to its own player.
    btn1.click(chinese, [inp3, inp4, inp5], [out1])
    btn2.click(english, [inp3, inp4, inp5], [out2])

    # Usage disclaimer and credits.
    gr.Markdown(
            """ ### <center>注意❗：请不要输入或生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关，请自觉合法合规使用，违反者一切后果自负。</center>
            
            ### <center>Model by [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b). Thanks to [THUDM](https://github.com/THUDM). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>
            
      """
        )
        
    # Footer with a quote and a note about the Chinese voice-cloning path.
    gr.HTML('''
        <div class="footer">
                    <p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
                    </p>
                    <p>注：中文声音克隆实际上是通过声音转换(Voice Conversion)实现，所以输出结果可能更像是一种新的声音，效果不一定很理想，希望大家多多包涵，之后我们也会不断迭代该程序的！为了实现更好的效果，使用中文声音克隆时请尽量上传女声。
                    </p>
        </div>
        ''')     

# Enable request queuing and start the app; `show_error=True` asks Gradio to
# surface handler errors in the UI.
demo.queue().launch(show_error=True)