Spaces:

Mahiruoshi
/

Lovelive_Nijigasaki_VITS

Running

App Files Files Community

Mahiruoshi committed on Mar 30, 2023

Commit

73140b0

•

1 Parent(s): d1871c9

Update app.py

Browse files

Files changed (1) hide show

app.py +261 -17

app.py CHANGED Viewed

@@ -1,24 +1,268 @@
 import gradio as gr
-import random
 import time
-with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox()
-    clear = gr.Button("Clear")
-    def user(user_message, history):
-        return "", history + [[user_message, None]]
-    def bot(history):
-        bot_message = random.choice(["Yes", "No"])
-        history[-1][1] = bot_message
-        time.sleep(1)
-        return history
-    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-        bot, chatbot, chatbot
-    )
-    clear.click(lambda: None, None, chatbot, queue=False)
-demo.launch()

+import logging
+logging.getLogger('numba').setLevel(logging.WARNING)
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+logging.getLogger('urllib3').setLevel(logging.WARNING)
+import json
+import re
+import numpy as np
+import IPython.display as ipd
+import torch
+import commons
+import utils
+from models import SynthesizerTrn
+from text.symbols import symbols
+from text import text_to_sequence
 import gradio as gr
 import time
+import datetime
+import os
+import pickle
+import openai
+from scipy.io.wavfile import write
+def is_japanese(string):
+    """Return True when *string* contains at least one kana character.
+
+    A character counts when its code point lies strictly between U+3040 and
+    U+30FF (both bounds excluded); kanji are not looked at.
+    """
+    return any(0x3040 < ord(ch) < 0x30FF for ch in string)
+def is_english(string):
+    """Return True when *string* is non-empty and purely ASCII "English" text.
+
+    Accepted characters are letters, digits, spaces and the punctuation
+    listed in the pattern; anything else (including an empty string) gives
+    False.
+    """
+    allowed = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
+    return allowed.fullmatch(string) is not None
+def extrac(text):
+    """Split *text* into short sentences that are synthesized one at a time.
+
+    Steps, in order:
+      1. Everything enclosed in ``<...>`` is removed.
+      2. The text is split on newlines.
+      3. Lines accepted by ``is_english`` are converted to katakana with
+         ``romajitable`` when that package can be imported.
+      4. Spaces are removed; lines of one character or fewer are dropped.
+      5. Lines longer than 20 characters are split on ``。`` / ``！`` and
+         every piece longer than one character gets a trailing ``。``.
+
+    Returns:
+        The list of sentences (it is also printed to stdout).
+    """
+    try:
+        # romajitable was referenced here without ever being imported, so
+        # every English line used to raise NameError.
+        import romajitable
+    except ImportError:
+        # Without the package English lines are passed through unchanged.
+        romajitable = None
+    sentences = []
+    for line in re.split(r'\n', re.sub("<[^>]*>", "", text)):
+        if romajitable is not None and is_english(line):
+            line = romajitable.to_kana(line).katakana
+        line = line.replace('\n', '').replace(' ', '')
+        if len(line) <= 1:
+            continue
+        # Current length limit of a single sentence: 20
+        if len(line) <= 20:
+            sentences.append(line)
+            continue
+        sentences.extend(
+            piece + '。' for piece in re.split(r'。|！', line) if len(piece) > 1
+        )
+    print(sentences)
+    return sentences
+def to_numpy(tensor: torch.Tensor):
+    """Return *tensor* as a NumPy array detached from the autograd graph.
+
+    The tensor is always moved to the CPU first. The previous version skipped
+    ``.cpu()`` when ``requires_grad`` was False, which raises for tensors that
+    live on a CUDA device.
+    """
+    return tensor.detach().cpu().numpy()
+def chatgpt(text):
+    """Send *text* to gpt-3.5-turbo together with the saved history.
+
+    The conversation is kept in ``log.pickle`` in the working directory. A
+    missing or unreadable log starts a fresh conversation. After the reply is
+    appended, a history of exactly 12 messages is trimmed to 10 by dropping
+    the entries at positions 6 and 7, and the log is rewritten.
+
+    Args:
+        text: The user message.
+
+    Returns:
+        The assistant reply as a string.
+
+    Errors from the OpenAI call are no longer swallowed: the old bare
+    ``except`` appended the user message a second time and repeated the call.
+    """
+    # NOTE(security): pickle.load can execute arbitrary code; log.pickle must
+    # only ever be written by this process.
+    try:
+        with open('log.pickle', 'rb') as f:
+            messages = pickle.load(f)
+    except (OSError, EOFError, pickle.UnpicklingError):
+        messages = []
+    if not isinstance(messages, list):
+        # A log with unexpected content is treated like a missing one.
+        messages = []
+    messages.append({"role": "user", "content": text})
+    chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
+    reply = chat.choices[0].message.content
+    messages.append({"role": "assistant", "content": reply})
+    print(messages[-1])
+    if len(messages) == 12:
+        # Same result as the old slice-assign-then-delete pair.
+        del messages[6:8]
+    with open('log.pickle', 'wb') as f:
+        pickle.dump(messages, f)
+    return reply
+def get_symbols_from_json(path):
+    """Read the JSON file at *path* and return its ``symbols`` entry.
+
+    Asserts that *path* is an existing regular file before opening it.
+    """
+    assert os.path.isfile(path)
+    with open(path, 'r') as handle:
+        return json.load(handle)['symbols']
+def sle(language,text):
+    """Strip whitespace from *text* and wrap it in the tag for *language*.
+
+    Newlines, carriage returns and spaces are removed. The UI labels map to
+    ``[ZH]`` ("中文"), ``[JA]`` ("日文") and ``[EN]`` ("英文"); "自动" picks
+    ``[JA]`` when ``is_japanese`` accepts the text and ``[ZH]`` otherwise;
+    "手动" returns the stripped text untagged. Any other label returns None.
+    """
+    cleaned = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
+    if language == "手动":
+        return cleaned
+    if language == "自动":
+        tag = "[JA]" if is_japanese(cleaned) else "[ZH]"
+    else:
+        tag = {"中文": "[ZH]", "日文": "[JA]", "英文": "[EN]"}.get(language)
+        if tag is None:
+            return None
+    return tag + cleaned + tag
+def get_text(text,hps_ms):
+    """Convert *text* into a ``torch.LongTensor`` of symbol ids.
+
+    The text goes through ``text_to_sequence`` with the cleaners named in
+    ``hps_ms.data.text_cleaners``; when ``hps_ms.data.add_blank`` is truthy
+    the ids are passed through ``commons.intersperse`` with 0.
+    """
+    sequence = text_to_sequence(text, hps_ms.data.text_cleaners)
+    if hps_ms.data.add_blank:
+        sequence = commons.intersperse(sequence, 0)
+    return torch.LongTensor(sequence)
+def create_tts_fn(net_g,hps,speaker_id):
+    """Create the Gradio click handler that speaks with one fixed speaker.
+
+    Args:
+        net_g: Model whose ``infer`` method produces the waveform.
+        hps: Hyperparameters; ``hps.data`` supplies the text cleaners and the
+            sampling rate reported back to Gradio.
+        speaker_id: Speaker index, converted with ``int``.
+
+    Returns:
+        ``tts_fn``. Its positional parameters are, in order: chat history,
+        ChatGPT switch, API key, ffmpeg switch, export path, ffmpeg repeat
+        count, text, language label, sentence-splitting switch, and the
+        three inference scales. It returns
+        ``(history, subtitle_path, (sampling_rate, audio))``.
+
+    The closure reads the module-level ``dev`` device at call time.
+    """
+    import subprocess  # function-scope: the module itself does not import it
+
+    speaker_id = int(speaker_id)
+
+    def _srt_timestamp(delta):
+        """Format a timedelta as ``HH:MM:SS,mmm`` with zero-padded fields."""
+        total_ms = delta // datetime.timedelta(milliseconds=1)
+        hours, total_ms = divmod(total_ms, 3600000)
+        minutes, total_ms = divmod(total_ms, 60000)
+        seconds, millis = divmod(total_ms, 1000)
+        return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)
+
+    def _synthesize(sentence, language, n_scale, n_scale_w, l_scale):
+        """Run the model on one piece of text and return the waveform array."""
+        stn_tst = get_text(sle(language,sentence),hps)
+        with torch.no_grad():
+            x_tst = stn_tst.unsqueeze(0).to(dev)
+            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+            sid = torch.LongTensor([speaker_id]).to(dev)
+            return net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+
+    def _export(audiopath, is_audio, repeat_time, samples):
+        """Best-effort copy of the audio to the user path, plus ffmpeg resampling."""
+        try:
+            write(audiopath + '.wav',22050,samples)
+            if is_audio:
+                for i in range(repeat_time):
+                    # Argument list instead of a shell string: audiopath comes
+                    # from the web UI and must not be interpreted by a shell.
+                    subprocess.run(['ffmpeg', '-y', '-i', audiopath + '.wav', '-ar', '44100', audiopath.replace('temp','temp'+str(i))])
+        except Exception as err:
+            # Still best-effort (the default path is only a placeholder), but
+            # the reason is reported instead of being silently dropped.
+            print("audio export skipped:", err)
+
+    def tts_fn(history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+        repeat_time = int(repeat_time)
+        if is_gpt:
+            openai.api_key = api_key
+            text = chatgpt(text)
+        # The concatenation used to be evaluated and thrown away, so the
+        # chat history returned to Gradio never changed.
+        history = (history or []) + [[text, None]]
+        file_path = "subtitles.srt"
+        if not extract:
+            print(text)
+            t1 = time.time()
+            audio = _synthesize(text, language, n_scale, n_scale_w, l_scale)
+            print("推理时间为："+str(time.time()-t1)+"s")
+            # The fixed output directory may not exist on a fresh checkout.
+            os.makedirs('moe', exist_ok=True)
+            write('moe/temp.wav',22050,audio)
+            _export(audiopath, is_audio, repeat_time, audio)
+            return history,file_path,(hps.data.sampling_rate,audio)
+
+        # Every kind of bracket becomes <...> so extrac() strips the content.
+        text = text.translate(str.maketrans('【[(（】])）', '<<<<>>>>'))
+        final_list = extrac(text.replace('“','').replace('”',''))
+        audio_fin = []
+        t = datetime.timedelta(seconds=0)
+        # "with" guarantees the subtitles are flushed and closed before the
+        # path is handed back to Gradio.
+        with open(file_path,'w',encoding='utf-8') as srt:
+            for c, sentence in enumerate(final_list, start=1):
+                t1 = time.time()
+                audio = _synthesize(sentence, language, n_scale, n_scale_w, l_scale)
+                print("第"+str(c)+"句的推理时间为："+str(time.time()-t1)+"s")
+                time_start = _srt_timestamp(t)
+                t += datetime.timedelta(seconds=len(audio)/float(22050))
+                time_end = _srt_timestamp(t)
+                print(time_end)
+                # NOTE(review): cue numbers stay 0-based as before; SRT
+                # conventionally starts at 1 - confirm what consumers expect.
+                srt.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
+                audio_fin.append(audio)
+                # Exported after every sentence, as in the original code.
+                _export(audiopath, is_audio, repeat_time, np.concatenate(audio_fin))
+        if not audio_fin:
+            # np.concatenate([]) raises; return silence when nothing survived
+            # the sentence splitting.
+            return history,file_path,(hps.data.sampling_rate, np.zeros(0, dtype=np.float32))
+        return history,file_path,(hps.data.sampling_rate, np.concatenate(audio_fin))
+    return tts_fn
+# Script entry point: load the hyperparameters, build one model per entry of
+# checkpoints/info.json, then assemble and launch the Gradio interface.
+if __name__ == '__main__':
+    hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
+    # Module-level device; the tts_fn closures read it at call time.
+    dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    # One list of (sid, name, title, example, tts_fn) tuples per info.json entry.
+    models = []
+    schools = ["Nijigasaki High School","Seisho-Nijigasaki(Recommend)","Seisho Music Academy","Rinmeikan Girls School","Frontier School of Arts","Siegfeld Institute of Music"]
+    # Language labels offered in the UI: Chinese, Japanese, automatic, manual.
+    lan = ["中文","日文","自动","手动"]
+    with open("checkpoints/info.json", "r", encoding="utf-8") as f:
+        models_info = json.load(f)
+    for i in models_info:
+        school = models_info[i]
+        speakers = school["speakers"]
+        checkpoint = school["checkpoint"]
+        # NOTE(review): phone_dict is not referenced anywhere else in the
+        # visible code.
+        phone_dict = {
+            symbol: i for i, symbol in enumerate(symbols)
+        }
+        net_g = SynthesizerTrn(
+            len(symbols),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            **hps.model).to(dev)
+        _ = net_g.eval()
+        _ = utils.load_checkpoint(checkpoint, net_g)
+        content = []
+        for j in speakers:
+            sid = int(speakers[j]['sid'])
+            # NOTE(review): title holds the whole per-school dict, not a string.
+            title = school
+            example = speakers[j]['speech']
+            name = speakers[j]["name"]
+            content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
+        models.append(content)
+    with gr.Blocks() as app:
+        with gr.Tabs():
+            for i in schools:
+                with gr.TabItem(i):
+                    # NOTE(review): models is indexed by position in `schools`,
+                    # which assumes info.json lists its entries in the same
+                    # order and has at least as many of them - confirm.
+                    for (sid, name,  title, example, tts_fn) in models[schools.index(i)]:
+                        with gr.TabItem(name):
+                            with gr.Column():
+                                with gr.Row():
+                                    with gr.Row():
+                                        gr.Markdown(
+                                            '<div align="center">'
+                                            f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
+                                            '</div>'
+                                        )
+                                    chatbot = gr.Chatbot(elem_id="History")
+                                with gr.Row():
+                                    with gr.Column(scale=0.85):
+                                        input1 = gr.TextArea(label="Text", value=example,lines = 1)
+                                    with gr.Column(scale=0.15, min_width=0):
+                                        btnVC = gr.Button("Send")
+                                output1 = gr.Audio(label="采样率22050")
+                                with gr.Accordion(label="Setting", open=False):
+                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
+                                    input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
+                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale)，以控制情感", value=0.467)
+                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w)，以控制音素长短", value=0.7)
+                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
+                                with gr.Accordion(label="Advanced Setting", open=False):
+                                    # NOTE(review): the choices are ints while the
+                                    # default value is the string '0'.
+                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
+                                    api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
+                                    api_input2 = gr.TextArea(label="api-key",lines=1,value = '见 https://openai.com/blog/openai-api')
+                                    output2 = gr.outputs.File(label="字幕文件：subtitles.srt")
+                                    audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
+                                    audio_input2 = gr.TextArea(label="音频路径",lines=1,value = '#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
+                        # The inputs are listed in the order of tts_fn's positional
+                        # parameters; the outputs match its 3-tuple return value.
+                        btnVC.click(tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1])
+    app.launch()