# Hugging Face Space: ATForest/english
# Status: Runtime error
# File size: 8,864 bytes

import gradio as gr

from textwrap import dedent

import edge_tts
import tempfile
from tts_voice import tts_order_voice

from english.translate import Translate
from english.split_text import sentence_split
from english.generator import generatorArticle

import random
import codecs
import torch
import librosa
from models import SynthesizerTrn

from scipy.io.wavfile import write
import utils
from mel_processing import mel_spectrogram_torch
from speaker_encoder.voice_encoder import SpeakerEncoder
from transformers import WavLMModel

# Alias for the voice table imported from tts_voice: its keys populate the
# speaker dropdown in the UI and its values are passed to edge_tts.Communicate
# as the voice name.
language_dict = tts_order_voice

def parse_text(input):
    """Generate an article from *input* and return it as plain text.

    Args:
        input: The user's word list, forwarded unchanged to
            ``generatorArticle``.

    Returns:
        The generated article with leading and trailing whitespace removed.
    """
    # The previous implementation also built an HTML-escaped, <br>-joined copy
    # of the article line by line, but then returned the raw text, so that
    # work was discarded on every call. The result feeds plain gr.Textbox
    # components, so the raw text is the value callers rely on; only the dead
    # computation has been removed.
    return generatorArticle(input).strip()

def predict(input):
    """Yield the generated article once for each of the two output boxes.

    The same text is emitted twice because the event handlers bind this
    generator to two output components.
    """
    generated = parse_text(input)
    outputs = (generated, generated)
    yield outputs

async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with edge-tts and return the path of the MP3 file.

    Args:
        text: The text to read aloud.
        language_code: A key of ``language_dict`` selecting the voice.

    Returns:
        Path of a temporary ``.mp3`` file; it is created with
        ``delete=False`` so it remains on disk after this coroutine returns.
    """
    speaker = edge_tts.Communicate(text, language_dict[language_code])

    # Only the file name is needed: the handle is closed again before
    # edge-tts writes the audio to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        mp3_path = handle.name

    await speaker.save(mp3_path)
    return mp3_path

def tran_2_chianese(text):
    """Translate *text* sentence by sentence and interleave the results.

    Each sentence produced by ``sentence_split`` is echoed to stdout, then
    translated with ``Translate.translateToZh``.

    Returns:
        A string in which every source sentence is followed, on the next
        line, by its translation.
    """
    translator = Translate()
    pairs = []
    for sentence in sentence_split(text):
        print('\n'+sentence)
        translated = translator.translateToZh(sentence)
        pairs.append(sentence+'\n'+translated+'\n')
    return ''.join(pairs)

def readWorldsFile(file_path):
    """Read a GB2312-encoded word list into parallel word/paraphrase lists.

    Every usable line has the form ``word|paraphrase``. The full-width bar
    ``｜`` is accepted as a fallback separator when a line contains no ASCII
    ``|``, because the on-page instructions tell users to separate with
    ``｜``. Only the first two fields of a line are used.

    Lines that cannot be split into at least two fields (blank lines, lines
    without a separator) are skipped so a trailing newline or a stray note
    does not abort the whole import.

    Args:
        file_path: Path of the word file to read.

    Returns:
        A ``(worlds, paraphrase)`` tuple of equally long lists of stripped
        strings.
    """
    worlds, paraphrase = [], []
    # The context manager closes the file even if decoding raises part-way.
    with codecs.open(file_path, 'r', encoding='gb2312') as fp:
        for line in fp.readlines():
            # Prefer the ASCII bar so lines that parsed before keep parsing
            # the same way; fall back to the full-width bar otherwise.
            separator = '|' if '|' in line else '｜'
            fields = line.split(separator)
            if len(fields) < 2:
                continue
            worlds.append(fields[0].strip())
            paraphrase.append(fields[1].strip())
    return worlds, paraphrase

def generatorWorlds(file_path):
    """Pick 15 random entries from a word file and format them as text.

    Entries are drawn independently, so the same word may appear more than
    once. Each entry is rendered as ``word,【paraphrase】`` on its own line.
    The resulting text is also echoed to stdout.

    Args:
        file_path: Path of the word file, as accepted by ``readWorldsFile``.

    Returns:
        The formatted text, or an empty string when the file yields no
        entries.
    """
    worlds, paraphrase = readWorldsFile(file_path)
    length = len(worlds)
    if length == 0:
        # Nothing to sample from; avoid indexing into an empty list.
        return ''

    picked = []
    for _ in range(15):
        # randrange excludes the upper bound. The former randint(0, length)
        # could return `length` itself and index one past the end.
        num = random.randrange(length)
        picked.append(f'{worlds[num]},【{paraphrase[num]}】\n')
    worlds_text = ''.join(picked)

    print('\n' + worlds_text)
    return worlds_text

def choose_word_from_file(input):
    """Return randomly chosen words from the uploaded file.

    Args:
        input: The uploaded-file object; its ``orig_name`` attribute is used
            as the path handed to ``generatorWorlds``.
    """
    return generatorWorlds(input.orig_name)

# Module-level model initialisation: everything below runs once at import
# time, and convert() relies on the resulting globals.

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Speaker encoder; convert() uses it to embed the target voice sample.
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading FreeVC(24k)...")
# Hyperparameters for the 24 kHz FreeVC model, read from its JSON config.
hps = utils.get_hparams_from_file("configs/freevc-24.json")
# Build the synthesizer from the config values and move it to the device.
# NOTE(review): the two positional arguments look like the spectrogram bin
# count and the segment length in frames — confirm against SynthesizerTrn.
freevc_24 = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
# Switch to evaluation mode; the return values are bound to `_` and ignored.
_ = freevc_24.eval()
# Load the checkpoint into the model.
# NOTE(review): the trailing None is presumably the optimizer slot — confirm
# against utils.load_checkpoint.
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)

print("Loading WavLM for content...")
# WavLM model; convert() feeds the source audio through it and uses
# last_hidden_state as the content representation.
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
 

def convert(model, src, tgt):
    """Re-voice the speech in *src* with the speaker of *tgt*.

    Only the 24 kHz FreeVC synthesizer (``freevc_24``) is loaded by this
    file. The earlier "FreeVC" and "FreeVC-s" branches referenced
    ``freevc`` / ``freevc_s``, which are never created, so they could only
    end in a ``NameError`` after the expensive WavLM pass. Those requests
    are now rejected up front with a clear message.

    Args:
        model: Model label from the UI; must be ``"FreeVC (24kHz)"``.
        src: Path of the audio whose spoken content is kept.
        tgt: Path of the audio sample whose voice is cloned.

    Returns:
        The path of the written result, ``"out.wav"`` (24 kHz).

    Raises:
        ValueError: If *model* is not the loaded model, or if *src* or *tgt*
            is missing.
    """
    if model != "FreeVC (24kHz)":
        raise ValueError(
            f"Unsupported model {model!r}: only 'FreeVC (24kHz)' is loaded."
        )
    if not src or not tgt:
        raise ValueError(
            "Both a source audio file and a target voice sample are required."
        )

    with torch.no_grad():
        # Target voice: load at the configured sampling rate, trim leading
        # and trailing quiet parts, then embed it with the speaker encoder.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)

        # Source speech: WavLM's last hidden state is the content input.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)

        # Inference, then back to a NumPy float array for the WAV writer.
        audio = freevc_24.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
        write("out.wav", 24000, audio)

    out = "out.wav"
    return out

# Build the Gradio UI: word input -> article generation -> translation ->
# text-to-speech -> voice cloning. Components and their event handlers are
# declared in page order inside the Blocks context.
with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo:
    gr.HTML("<center>"
            "<h1>OpenAI + 声音克隆：根据单词生成短文，帮助理解单词使用的语境！！</h1>"
            "</center>")

    # Usage notes shown to the user; dedent() strips the source indentation
    # before the text is rendered as Markdown.
    with gr.Accordion("📒 相关信息", open=True):
        _ = f"""OpenAI Prompt 的可选参数信息：
            * 输入 10-15 个单词为宜
            * prompt = '你是一个非常厉害的英语助手,请将'{'words'}'组成一篇英语文章,字数限制在100 字以内'
            * Open AI 用的是限制账号，每分钟请求 3 次
            * 单词文件：每个单词及解释单独成行，单词与注释同行，用 “｜” 分割
            """
        gr.Markdown(dedent(_))

    # Word source: upload a word file and extract words from it, or type the
    # words directly into the text box.
    with gr.Row():

        file = gr.File()
        chooseBtn = gr.Button("从文件提取或输入 -》", variant="secondary")
        user_input = gr.Textbox(
                    max_lines=5,
                    lines=3,
                    label="单词用逗号分割：",
                    placeholder="10-15 words will be better",
                )
        
    # Article generation: trigger button plus the box holding the article.
    with gr.Column(scale=1):
        submitBtn = gr.Button("开始生成英语短文", variant="primary")
        chatbot = gr.Textbox(label="英语短文：", lines = 5, max_lines=8)
        
    # Fill the word box with entries picked from the uploaded file.
    chooseBtn.click(
        choose_word_from_file, 
        inputs=[file], 
        outputs=[user_input],        
        show_progress="full",
        api_name="choose_word_from_file"
    )

    # Translation output and its trigger button.
    with gr.Column(scale=3):
        with gr.Row():
            tran_result = gr.Textbox(label="翻译结果", lines = 5,max_lines=8,scale=2)
            tran_btn = gr.Button("翻译", variant="primary")
    
    # Translate the current article text into the translation box.
    tran_btn.click(
        tran_2_chianese, 
        inputs=[chatbot], 
        outputs=[tran_result],        
        show_progress="full",
        api_name="tran_2_chianese"
    )
    
    # Text-to-speech: voice selection, trigger button and audio output.
    with gr.Column(min_width=32, scale=2):
        with gr.Row():
            with gr.Column():
                language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
                tts_btn = gr.Button("生成对应的音频吧", variant="primary")
            output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)

    # Read the article aloud with the selected voice.
    tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio])

    # Voice cloning: the generated TTS audio is the source, the uploaded
    # sample is the target voice.
    with gr.Row():
        # Hidden selector, so convert() always receives its default value
        # unless the visibility is changed.
        model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False) 
        # Reuse the TTS output component as the cloning source.
        audio1 = output_audio
        audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
        clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
        audio_cloned =  gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')

    clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
 
    # Pressing Enter in the word box and clicking the generate button both
    # run predict(); its two yielded values fill the article box and the
    # translation box.
    user_input.submit(
        predict,
        [user_input],
        [chatbot,tran_result],
        show_progress="full",
    )

    submitBtn.click(
        predict,
        [user_input],
        [chatbot,tran_result],
        show_progress="full",
        api_name="predict",
    )
    # submitBtn.click(reset_user_input, [], [user_input])

# Enable request queuing and start the server; errors are surfaced in the UI.
demo.queue().launch(show_error=True, debug=True)