|
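# Learn English By AI: a Gradio app that turns a word list into a short English
# article (via OpenAI), translates it to Chinese, reads it aloud with edge-tts,
# and clones a user-provided voice with FreeVC.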
import gradio as gr |
|
|
|
from textwrap import dedent |
|
|
|
import edge_tts |
|
import tempfile |
|
from tts_voice import tts_order_voice |
|
|
|
from english.translate import Translate |
|
from english.split_text import sentence_split |
|
from english.generator import generatorArticle |
|
|
|
import random |
|
import codecs |
|
import torch |
|
import librosa |
|
from models import SynthesizerTrn |
|
|
|
from scipy.io.wavfile import write |
|
import utils |
|
from mel_processing import mel_spectrogram_torch |
|
from speaker_encoder.voice_encoder import SpeakerEncoder |
|
from transformers import WavLMModel |
|
|
|
# Display-name -> edge-tts voice-id mapping provided by tts_voice.py.
language_dict = tts_order_voice
|
|
|
def parse_text(input): |
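    # Generate an article from the input words, then HTML-escape any lines that
    # fall inside ``` code fences so they can be rendered safely as HTML.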
|
text = generatorArticle(input).strip() |
|
|
|
lines = text.split("\n") |
|
lines = [line for line in lines if line != ""] |
|
count = 0 |
|
for i, line in enumerate(lines): |
|
if "```" in line: |
|
count += 1 |
|
items = line.split("`") |
|
if count % 2 == 1: |
|
lines[i] = f'<pre><code class="language-{items[-1]}">' |
|
else: |
|
lines[i] = "<br></code></pre>" |
|
else: |
|
if i > 0: |
|
if count % 2 == 1: |
|
line = line.replace("`", r"\`") |
|
line = line.replace("<", "<") |
|
line = line.replace(">", ">") |
|
line = line.replace(" ", " ") |
|
line = line.replace("*", "*") |
|
line = line.replace("_", "_") |
|
line = line.replace("-", "-") |
|
line = line.replace(".", ".") |
|
line = line.replace("!", "!") |
|
line = line.replace("(", "(") |
|
line = line.replace(")", ")") |
|
line = line.replace("$", "$") |
|
lines[i] = "<br>" + line |
|
    # NOTE: the HTML-escaped `lines` above are only needed when rendering as
    # HTML; the plain article text is returned for the gr.Textbox outputs.
    return text
|
|
|
def predict(input):
    article = parse_text(input)
    # Show the generated article in both the article box and the translation box.
    yield article, article
|
|
|
async def text_to_speech_edge(text, language_code): |
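    # Synthesize `text` with the selected edge-tts voice and return the path of
    # a temporary MP3 file for the gr.Audio output.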
|
voice = language_dict[language_code] |
|
communicate = edge_tts.Communicate(text, voice) |
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file: |
|
tmp_path = tmp_file.name |
|
await communicate.save(tmp_path) |
|
|
|
return tmp_path |
|
|
|
def tran_2_chinese(text):
    # Translate sentence by sentence, interleaving each English sentence
    # with its Chinese translation.
    translate = Translate()
    result = ''
    for sentence in sentence_split(text):
        print('\n' + sentence)
        result += sentence + '\n' + translate.translateToZh(sentence) + '\n'
    return result
|
|
|
def readWordsFile(file_path):
    # One entry per line: the word and its paraphrase separated by '|',
    # e.g. (hypothetical) "apple | 苹果". The file is GB2312-encoded.
    words, paraphrase = [], []
    with codecs.open(file_path, 'r', encoding='gb2312') as fp:
        for line in fp:
            tmp = line.split('|')
            if len(tmp) < 2:
                continue  # skip lines without a '|' separator
            words.append(tmp[0].strip())
            paraphrase.append(tmp[1].strip())
    return words, paraphrase
|
|
|
def generatorWords(file_path):
    words, paraphrase = readWordsFile(file_path)
    length = len(words)

    # Pick 15 random entries (duplicates possible). random.randrange stays in
    # bounds; the original random.randint(0, length) could index one past the
    # end of the list.
    words_text = ''
    for _ in range(15):
        num = random.randrange(length)
        words_text += f'{words[num]},【{paraphrase[num]}】\n'

    print('\n' + words_text)
    return words_text
|
|
|
def choose_word_from_file(input):
    # `.name` is the path of Gradio's uploaded temp file; `.orig_name` is only
    # the original filename and is not guaranteed to be a readable path.
    return generatorWords(input.name)
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
|
|
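# Speaker encoder that turns the target audio into the utterance-level
# embedding FreeVC conditions on.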
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') |
|
|
|
print("Loading FreeVC(24k)...") |
|
hps = utils.get_hparams_from_file("configs/freevc-24.json") |
|
freevc_24 = SynthesizerTrn( |
|
hps.data.filter_length // 2 + 1, |
|
hps.train.segment_size // hps.data.hop_length, |
|
**hps.model).to(device) |
|
_ = freevc_24.eval() |
|
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None) |
|
|
|
print("Loading WavLM for content...") |
|
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) |
|
|
|
|
|
def convert(model, src, tgt): |
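    # Voice conversion: WavLM extracts content features from the source audio;
    # the target audio supplies a speaker embedding (FreeVC / FreeVC 24kHz) or
    # a mel-spectrogram (FreeVC-s), and FreeVC resynthesizes the speech.
    # NOTE: only the FreeVC (24kHz) checkpoint is loaded in this script, so the
    # "FreeVC" and "FreeVC-s" branches below would fail with a NameError.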
|
with torch.no_grad(): |
|
|
|
wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) |
|
wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) |
|
if model == "FreeVC" or model == "FreeVC (24kHz)": |
|
g_tgt = smodel.embed_utterance(wav_tgt) |
|
g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) |
|
else: |
|
wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device) |
|
mel_tgt = mel_spectrogram_torch( |
|
wav_tgt, |
|
hps.data.filter_length, |
|
hps.data.n_mel_channels, |
|
hps.data.sampling_rate, |
|
hps.data.hop_length, |
|
hps.data.win_length, |
|
hps.data.mel_fmin, |
|
hps.data.mel_fmax |
|
) |
|
|
|
wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) |
|
wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) |
|
c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) |
|
|
|
if model == "FreeVC": |
|
audio = freevc.infer(c, g=g_tgt) |
|
elif model == "FreeVC-s": |
|
audio = freevc_s.infer(c, mel=mel_tgt) |
|
else: |
|
audio = freevc_24.infer(c, g=g_tgt) |
|
audio = audio[0][0].data.cpu().float().numpy() |
|
if model == "FreeVC" or model == "FreeVC-s": |
|
write("out.wav", hps.data.sampling_rate, audio) |
|
else: |
|
write("out.wav", 24000, audio) |
|
out = "out.wav" |
|
return out |
|
|
|
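# Gradio UI: word file / word input -> English article -> translation,
# edge-tts audio, and FreeVC voice cloning.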
with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo: |
|
gr.HTML("<center>" |
|
"<h1>OpenAI + 声音克隆:根据单词生成短文,帮助理解单词使用的语境!!</h1>" |
|
"</center>") |
|
|
|
with gr.Accordion("📒 相关信息", open=True): |
|
_ = f"""OpenAI Prompt 的可选参数信息: |
|
* 输入 10-15 个单词为宜 |
|
* prompt = '你是一个非常厉害的英语助手,请将'{'words'}'组成一篇英语文章,字数限制在100 字以内' |
|
* Open AI 用的是限制账号,每分钟请求 3 次 |
|
* 单词文件:每个单词及解释单独成行,单词与注释同行,用 “|” 分割 |
|
""" |
|
gr.Markdown(dedent(_)) |
|
|
|
with gr.Row(): |
|
|
|
file = gr.File() |
|
        chooseBtn = gr.Button("从文件提取或输入 →", variant="secondary")
|
user_input = gr.Textbox( |
|
max_lines=5, |
|
lines=3, |
|
label="单词用逗号分割:", |
|
placeholder="10-15 words will be better", |
|
) |
|
|
|
with gr.Column(scale=1): |
|
submitBtn = gr.Button("开始生成英语短文", variant="primary") |
|
            chatbot = gr.Textbox(label="英语短文:", lines=5, max_lines=8)
|
|
|
chooseBtn.click( |
|
choose_word_from_file, |
|
inputs=[file], |
|
outputs=[user_input], |
|
show_progress="full", |
|
api_name="choose_word_from_file" |
|
) |
|
|
|
with gr.Column(scale=3): |
|
with gr.Row(): |
|
                tran_result = gr.Textbox(label="翻译结果", lines=5, max_lines=8, scale=2)
|
tran_btn = gr.Button("翻译", variant="primary") |
|
|
|
tran_btn.click( |
|
            tran_2_chinese,
|
inputs=[chatbot], |
|
outputs=[tran_result], |
|
show_progress="full", |
|
api_name="tran_2_chianese" |
|
) |
|
|
|
with gr.Column(min_width=32, scale=2): |
|
with gr.Row(): |
|
with gr.Column(): |
|
language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人") |
|
tts_btn = gr.Button("生成对应的音频吧", variant="primary") |
|
output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False) |
|
|
|
tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio]) |
|
|
|
with gr.Row(): |
|
model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False) |
|
                audio1 = output_audio  # reuse the generated TTS audio as the conversion source
|
audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath') |
|
clone_btn = gr.Button("开始AI声音克隆吧", variant="primary") |
|
audio_cloned = gr.Audio(label="为您生成的专属声音克隆音频", type='filepath') |
|
|
|
clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned]) |
|
|
|
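    # Pressing Enter in the word textbox triggers the same generation as
    # clicking submitBtn.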
user_input.submit( |
|
predict, |
|
[user_input], |
|
        [chatbot, tran_result],
|
show_progress="full", |
|
) |
|
|
|
submitBtn.click( |
|
predict, |
|
[user_input], |
|
        [chatbot, tran_result],
|
show_progress="full", |
|
api_name="predict", |
|
) |
|
|
|
|
|
demo.queue().launch(show_error=True, debug=True) |
|
|