Mahiruoshi committed
Commit: 1c576d6
Parent(s): 9b16e15

Update app.py

app.py CHANGED
@@ -16,6 +16,9 @@ import gradio as gr
 import time
 import datetime
 import os
+import pickle
+import openai
+from scipy.io.wavfile import write
 def is_japanese(string):
     for ch in string:
         if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
@@ -58,6 +61,67 @@ def to_numpy(tensor: torch.Tensor):
     return tensor.detach().cpu().numpy() if tensor.requires_grad \
         else tensor.detach().numpy()
 
+def to_html(chat_history):
+    chat_html = ""
+    for item in chat_history:
+        if item['role'] == 'user':
+            chat_html += f"""
+            <div style="margin-bottom: 20px;">
+                <div style="text-align: right; margin-right: 20px;">
+                    <span style="background-color: #4CAF50; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
+                        {item['content']}
+                    </span>
+                </div>
+            </div>
+            """
+        else:
+            chat_html += f"""
+            <div style="margin-bottom: 20px;">
+                <div style="text-align: left; margin-left: 20px;">
+                    <span style="background-color: white; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
+                        {item['content']}
+                    </span>
+                </div>
+            </div>
+            """
+    output_html = f"""
+    <div style="height: 400px; overflow-y: scroll; padding: 10px;">
+        {chat_html}
+    </div>
+    """
+    return output_html
+
+
+
+def chatgpt(text):
+    messages = []
+    try:
+        with open('log.pickle', 'rb') as f:
+            messages = pickle.load(f)
+        messages.append({"role": "user", "content": text},)
+        chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
+        reply = chat.choices[0].message.content
+        messages.append({"role": "assistant", "content": reply})
+        print(messages[-1])
+        if len(messages) == 12:
+            messages[6:10] = messages[8:]
+            del messages[-2:]
+        with open('log.pickle', 'wb') as f:
+            pickle.dump(messages, f)
+        return reply,messages
+    except:
+        messages.append({"role": "user", "content": text},)
+        chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
+        reply = chat.choices[0].message.content
+        messages.append({"role": "assistant", "content": reply})
+        print(messages[-1])
+        if len(messages) == 12:
+            messages[6:10] = messages[8:]
+            del messages[-2:]
+        with open('log.pickle', 'wb') as f:
+            pickle.dump(messages, f)
+        return reply,messages
+
 def get_symbols_from_json(path):
     assert os.path.isfile(path)
     with open(path, 'r') as f:
@@ -90,7 +154,14 @@ def get_text(text,hps_ms):
 
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
-    def tts_fn(text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+    def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+        repeat_time = int(repeat_time)
+        if is_gpt:
+            openai.api_key = api_key
+            text,messages = chatgpt(text)
+            htm = to_html(messages)
+        else:
+            htm = ""
         if not extract:
             print(text)
             t1 = time.time()
@@ -104,7 +175,12 @@ def create_tts_fn(net_g,hps,speaker_id):
             spending_time = "推理时间为:"+str(t2-t1)+"s"
             print(spending_time)
             file_path = "subtitles.srt"
-
+            write(audiopath + '.wav',22050,audio)
+            if is_audio:
+                for i in range(repeat_time):
+                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
+                    os.system(cmd)
+            return (hps.data.sampling_rate, audio),file_path,htm
         else:
             a = ['【','[','(','(']
             b = ['】',']',')',')']
@@ -119,7 +195,7 @@ def create_tts_fn(net_g,hps,speaker_id):
             f1 = open("subtitles.srt",'w',encoding='utf-8')
             for sentence in final_list:
                 c +=1
-                stn_tst = get_text(sle(language,
+                stn_tst = get_text(sle(language,sentence),hps)
                 with torch.no_grad():
                     x_tst = stn_tst.unsqueeze(0).to(dev)
                     x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
@@ -136,8 +212,13 @@ def create_tts_fn(net_g,hps,speaker_id):
                     print(time_end)
                     f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
                     audio_fin.append(audio)
+            write(audiopath + '.wav',22050,np.concatenate(audio_fin))
+            if is_audio:
+                for i in range(repeat_time):
+                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
+                    os.system(cmd)
             file_path = "subtitles.srt"
-            return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
+            return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path,htm
         return tts_fn
 
 
@@ -174,36 +255,38 @@ if __name__ == '__main__':
         models.append(content)
 
     with gr.Blocks() as app:
-        gr.Markdown(
-            "## <center> LoveLive-ShojoKageki 中日双语Vits\n"
-            "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
-            '<div align="center"><a>参数说明:由于模型仍然在训练初期阶段,少歌部分质量较差,建议将噪声比例调节至0.2-0.3区间,ShojoKageki模型则可以尝试默认的0.667;噪声偏差对应着每个字之间的间隔,对普通话影响较大,建议0.6-0.8;duration代表整体语速</div>'
-            '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
         with gr.Tabs():
            for i in schools:
                with gr.TabItem(i):
                    for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
                        with gr.TabItem(name):
-                            with gr.
-                                with gr.
+                            with gr.Column():
+                                with gr.Accordion(label="Setting", open=False):
+                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
+                                    input3 = gr.Checkbox(value=True, label="长句切割(小说合成)")
+                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
+                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
+                                with gr.Accordion(label="Advanced Setting", open=False):
+                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
+                                    api_input1 = gr.Checkbox(value=True, label="接入chatgpt")
+                                    api_input2 = gr.TextArea(label="api-key",lines=1,value = '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')
+                                    output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
+                                    audio_input1 = gr.Checkbox(value=True, label="修改音频路径(live2d)")
+                                    audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
+
+                                with gr.Row():
                                     with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-                                        input1 = gr.TextArea(label="Text", value=example)
-                                        input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
-                                        input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
-                                        input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
-                                        btnVC = gr.Button("Submit")
-                                        output1 = gr.Audio(label="采样率22050")
-                                        btnVC.click(tts_fn, inputs=[input1, input2,input3,input4, input5, input6], outputs=[output1,output2])
+                                        gr.Markdown(
+                                            '<div align="center">'
+                                            f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
+                                            '</div>'
+                                        )
+                                    with gr.Row():
+                                        output_UI = gr.outputs.HTML()
+                                        input1 = gr.TextArea(label="Text", value=example, lines=1)
+                                        btnVC = gr.Button("Submit")
+                                        output1 = gr.Audio(label="采样率22050")
+                                        btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
 
-    app.launch()
+    app.launch()
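
The to_html() helper added above renders the chat history as stacked bubbles: user turns right-aligned on a green background, assistant turns left-aligned on white, all wrapped in a scrollable 400px container. A minimal usage sketch with toy data (illustrative only, not part of the commit):

    # Toy history; the app itself passes the list returned by chatgpt().
    history = [
        {"role": "user", "content": "こんにちは"},
        {"role": "assistant", "content": "こんにちは、何を合成しますか?"},
    ]
    html = to_html(history)   # one scrollable <div> containing both bubbles
    print(html[:120])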
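
In chatgpt(), the pickle-backed log is capped once it reaches 12 entries: the slice assignment plus the deletion drops the two messages at positions 6 and 7 and keeps everything else, so the earlier entries (presumably a fixed prompt seeded into log.pickle) survive while the oldest later exchange is discarded. A small sketch of just that trimming rule (stand-in strings, not the app's data):

    # Stand-in history of 12 entries (indices 0-11).
    history = [f"msg{i}" for i in range(12)]
    if len(history) == 12:
        history[6:10] = history[8:]   # copy the last four entries forward
        del history[-2:]              # drop the now-duplicated tail
    print(history)  # msg0..msg5, msg8..msg11 -> entries 6 and 7 removed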
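
The new export path writes a 22050 Hz WAV with scipy.io.wavfile.write and then calls ffmpeg through os.system to emit 44100 Hz copies whose names substitute 'temp' with 'temp0', 'temp1', and so on. Below is a sketch of the same resampling step using subprocess.run with an argument list; this is an alternative shown for clarity, not the committed code, and it assumes ffmpeg is on PATH:

    import subprocess

    def resample_copies(src_wav: str, repeat_time: int) -> None:
        # Re-encode src_wav at 44100 Hz, writing one output file per iteration.
        for i in range(repeat_time):
            dst = src_wav.replace("temp", "temp" + str(i))
            # -y: overwrite, -i: input file, -ar: target sample rate
            subprocess.run(["ffmpeg", "-y", "-i", src_wav, "-ar", "44100", dst],
                           check=True)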
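
The rewired btnVC.click() passes its eleven components to tts_fn positionally, so the inputs list must line up with the new signature: (api_input1, api_input2, audio_input1, audio_input2, audio_input3, input1, input2, input3, input4, input5, input6) maps onto (is_gpt, api_key, is_audio, audiopath, repeat_time, text, language, extract, n_scale, n_scale_w, l_scale). A toy sketch of that wiring pattern with hypothetical components, not the app's UI:

    import gradio as gr

    def echo(use_upper, text):
        # Parameter order matches the inputs list in btn.click below.
        return text.upper() if use_upper else text

    with gr.Blocks() as demo:
        use_upper = gr.Checkbox(label="Uppercase", value=False)
        text_in = gr.Textbox(label="Text")
        text_out = gr.Textbox(label="Result")
        btn = gr.Button("Submit")
        btn.click(echo, inputs=[use_upper, text_in], outputs=[text_out])

    # demo.launch()  # uncomment to serve locally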