ATForest commited on
Commit
3d67931
0 Parent(s):

Duplicate from ATForest/test

Browse files
.flake8 ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [flake8]
2
+ ignore =
3
+ # E203 whitespace before ':'
4
+ E203
5
+ D203,
6
+ # line too long
7
+ E501
8
+ per-file-ignores =
9
+ # imported but unused
10
+ # __init__.py: F401
11
+ test_*.py: F401
12
+ exclude =
13
+ .git,
14
+ __pycache__,
15
+ docs/source/conf.py,
16
+ old,
17
+ build,
18
+ dist,
19
+ .venv
20
+ pad*.py
21
+ max-complexity = 25
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitattributes copy ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__
2
+ flagged
3
+ call-activate.bat
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Test
3
+ emoji: 😻
4
+ colorFrom: purple
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.36.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: ATForest/test
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from textwrap import dedent
4
+
5
+ import edge_tts
6
+ import tempfile
7
+ from tts_voice import tts_order_voice
8
+
9
+ from english.translate import Translate
10
+ from english.split_text import sentence_split
11
+ from english.generator import generatorArticle
12
+
13
+ import random
14
+ import codecs
15
+ import torch
16
+ import librosa
17
+ from models import SynthesizerTrn
18
+
19
+ from scipy.io.wavfile import write
20
+ import utils
21
+ from mel_processing import mel_spectrogram_torch
22
+ from speaker_encoder.voice_encoder import SpeakerEncoder
23
+ from transformers import WavLMModel
24
+
25
+ language_dict = tts_order_voice
26
+
27
+ def parse_text(input):
28
+ text = generatorArticle(input).strip()
29
+
30
+ lines = text.split("\n")
31
+ lines = [line for line in lines if line != ""]
32
+ count = 0
33
+ for i, line in enumerate(lines):
34
+ if "```" in line:
35
+ count += 1
36
+ items = line.split("`")
37
+ if count % 2 == 1:
38
+ lines[i] = f'<pre><code class="language-{items[-1]}">'
39
+ else:
40
+ lines[i] = "<br></code></pre>"
41
+ else:
42
+ if i > 0:
43
+ if count % 2 == 1:
44
+ line = line.replace("`", r"\`")
45
+ line = line.replace("<", "&lt;")
46
+ line = line.replace(">", "&gt;")
47
+ line = line.replace(" ", "&nbsp;")
48
+ line = line.replace("*", "&ast;")
49
+ line = line.replace("_", "&lowbar;")
50
+ line = line.replace("-", "&#45;")
51
+ line = line.replace(".", "&#46;")
52
+ line = line.replace("!", "&#33;")
53
+ line = line.replace("(", "&#40;")
54
+ line = line.replace(")", "&#41;")
55
+ line = line.replace("$", "&#36;")
56
+ lines[i] = "<br>" + line
57
+ return text
58
+
59
+ def predict(input):
60
+ article = parse_text(input)
61
+ yield article,article
62
+
63
+ async def text_to_speech_edge(text, language_code):
64
+ voice = language_dict[language_code]
65
+ communicate = edge_tts.Communicate(text, voice)
66
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
67
+ tmp_path = tmp_file.name
68
+ await communicate.save(tmp_path)
69
+
70
+ return tmp_path
71
+
72
+ def tran_2_chianese(text):
73
+ translate = Translate()
74
+ sentence_str = sentence_split(text)
75
+ i = 0
76
+ result=''
77
+ length = len(sentence_str)
78
+ while(i < length):
79
+ tmp = sentence_str[i]
80
+ print('\n'+tmp)
81
+ tran = translate.translateToZh(tmp)
82
+ result = result+tmp+'\n'+tran+'\n'
83
+ i+=1
84
+ return result
85
+
86
+ def readWorldsFile(file_path):
87
+ fp = codecs.open(file_path, 'r', encoding='gb2312')
88
+ lines = fp.readlines()
89
+ worlds ,paraphrase = [],[]
90
+ for line in lines:
91
+ tmp = line.split('|')
92
+ worlds.append(tmp[0].strip())
93
+ paraphrase.append(tmp[1].strip())
94
+ fp.close()
95
+ return worlds, paraphrase
96
+
97
+ def generatorWorlds(file_path):
98
+ worlds,paraphrase = readWorldsFile(file_path)
99
+ length = len(worlds)
100
+
101
+ index = 0
102
+ worlds_text = ''
103
+
104
+ while index < 15:
105
+ num = random.randint(0,length)
106
+ worlds_text += f'{worlds[num]},【{paraphrase[num]}】\n'
107
+ index += 1
108
+
109
+ print('\n' + worlds_text)
110
+ return worlds_text
111
+
112
+ def choose_word_from_file(input):
113
+ result = generatorWorlds(input.orig_name)
114
+ return result
115
+
116
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
117
+
118
+ smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
119
+
120
+ print("Loading FreeVC(24k)...")
121
+ hps = utils.get_hparams_from_file("configs/freevc-24.json")
122
+ freevc_24 = SynthesizerTrn(
123
+ hps.data.filter_length // 2 + 1,
124
+ hps.train.segment_size // hps.data.hop_length,
125
+ **hps.model).to(device)
126
+ _ = freevc_24.eval()
127
+ _ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
128
+
129
+ print("Loading WavLM for content...")
130
+ cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
131
+
132
+
133
+ def convert(model, src, tgt):
134
+ with torch.no_grad():
135
+ # tgt
136
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
137
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
138
+ if model == "FreeVC" or model == "FreeVC (24kHz)":
139
+ g_tgt = smodel.embed_utterance(wav_tgt)
140
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
141
+ else:
142
+ wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
143
+ mel_tgt = mel_spectrogram_torch(
144
+ wav_tgt,
145
+ hps.data.filter_length,
146
+ hps.data.n_mel_channels,
147
+ hps.data.sampling_rate,
148
+ hps.data.hop_length,
149
+ hps.data.win_length,
150
+ hps.data.mel_fmin,
151
+ hps.data.mel_fmax
152
+ )
153
+ # src
154
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
155
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
156
+ c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
157
+ # infer
158
+ if model == "FreeVC":
159
+ audio = freevc.infer(c, g=g_tgt)
160
+ elif model == "FreeVC-s":
161
+ audio = freevc_s.infer(c, mel=mel_tgt)
162
+ else:
163
+ audio = freevc_24.infer(c, g=g_tgt)
164
+ audio = audio[0][0].data.cpu().float().numpy()
165
+ if model == "FreeVC" or model == "FreeVC-s":
166
+ write("out.wav", hps.data.sampling_rate, audio)
167
+ else:
168
+ write("out.wav", 24000, audio)
169
+ out = "out.wav"
170
+ return out
171
+
172
+ with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo:
173
+ gr.HTML("<center>"
174
+ "<h1>OpenAI + 声音克隆:根据单词生成短文,帮助理解单词使用的语境!!</h1>"
175
+ "</center>")
176
+
177
+ with gr.Accordion("📒 相关信息", open=True):
178
+ _ = f"""OpenAI Prompt 的可选参数信息:
179
+ * 输入 10-15 个单词为宜
180
+ * prompt = '你是一个非常厉害的英语助手,请将'{'words'}'组成一篇英语文章,字数限制在100 字以内'
181
+ * Open AI 用的是限制账号,每分钟请求 3 次
182
+ * 单词文件:每个单词及解释单独成行,单词与注释同行,用 “|” 分割
183
+ """
184
+ gr.Markdown(dedent(_))
185
+
186
+ with gr.Row():
187
+
188
+ file = gr.File()
189
+ chooseBtn = gr.Button("从文件提取或输入 -》", variant="secondary")
190
+ user_input = gr.Textbox(
191
+ max_lines=5,
192
+ lines=3,
193
+ label="单词用逗号分割:",
194
+ placeholder="10-15 words will be better",
195
+ )
196
+
197
+ with gr.Column(scale=1):
198
+ submitBtn = gr.Button("开始生成英语短文", variant="primary")
199
+ chatbot = gr.Textbox(label="英语短文:", lines = 5, max_lines=8)
200
+
201
+ chooseBtn.click(
202
+ choose_word_from_file,
203
+ inputs=[file],
204
+ outputs=[user_input],
205
+ show_progress="full",
206
+ api_name="choose_word_from_file"
207
+ )
208
+
209
+ with gr.Column(scale=3):
210
+ with gr.Row():
211
+ tran_result = gr.Textbox(label="翻译结果", lines = 5,max_lines=8,scale=2)
212
+ tran_btn = gr.Button("翻译", variant="primary")
213
+
214
+ tran_btn.click(
215
+ tran_2_chianese,
216
+ inputs=[chatbot],
217
+ outputs=[tran_result],
218
+ show_progress="full",
219
+ api_name="tran_2_chianese"
220
+ )
221
+
222
+ with gr.Column(min_width=32, scale=2):
223
+ with gr.Row():
224
+ with gr.Column():
225
+ language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
226
+ tts_btn = gr.Button("生成对应的音频吧", variant="primary")
227
+ output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)
228
+
229
+ tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio])
230
+
231
+ with gr.Row():
232
+ model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
233
+ audio1 = output_audio
234
+ audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
235
+ clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
236
+ audio_cloned = gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')
237
+
238
+ clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
239
+
240
+ user_input.submit(
241
+ predict,
242
+ [user_input],
243
+ [chatbot,tran_result],
244
+ show_progress="full",
245
+ )
246
+
247
+ submitBtn.click(
248
+ predict,
249
+ [user_input],
250
+ [chatbot,tran_result],
251
+ show_progress="full",
252
+ api_name="predict",
253
+ )
254
+ # submitBtn.click(reset_user_input, [], [user_input])
255
+
256
+ demo.queue().launch(show_error=True, debug=True)
app_backup.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import librosa
4
+ import gradio as gr
5
+ from scipy.io.wavfile import write
6
+ from transformers import WavLMModel
7
+
8
+ import utils
9
+ from models import SynthesizerTrn
10
+ from mel_processing import mel_spectrogram_torch
11
+ from speaker_encoder.voice_encoder import SpeakerEncoder
12
+
13
+ import time
14
+ from textwrap import dedent
15
+
16
+ import mdtex2html
17
+ from loguru import logger
18
+ from transformers import AutoModel, AutoTokenizer
19
+
20
+ from tts_voice import tts_order_voice
21
+ import edge_tts
22
+ import tempfile
23
+ import anyio
24
+
25
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
+
27
+ smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
28
+
29
+ print("Loading FreeVC(24k)...")
30
+ hps = utils.get_hparams_from_file("configs/freevc-24.json")
31
+ freevc_24 = SynthesizerTrn(
32
+ hps.data.filter_length // 2 + 1,
33
+ hps.train.segment_size // hps.data.hop_length,
34
+ **hps.model).to(device)
35
+ _ = freevc_24.eval()
36
+ _ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
37
+
38
+ print("Loading WavLM for content...")
39
+ cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
40
+
41
+ def convert(model, src, tgt):
42
+ with torch.no_grad():
43
+ # tgt
44
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
45
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
46
+ if model == "FreeVC" or model == "FreeVC (24kHz)":
47
+ g_tgt = smodel.embed_utterance(wav_tgt)
48
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
49
+ else:
50
+ wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
51
+ mel_tgt = mel_spectrogram_torch(
52
+ wav_tgt,
53
+ hps.data.filter_length,
54
+ hps.data.n_mel_channels,
55
+ hps.data.sampling_rate,
56
+ hps.data.hop_length,
57
+ hps.data.win_length,
58
+ hps.data.mel_fmin,
59
+ hps.data.mel_fmax
60
+ )
61
+ # src
62
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
63
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
64
+ c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
65
+ # infer
66
+ if model == "FreeVC":
67
+ audio = freevc.infer(c, g=g_tgt)
68
+ elif model == "FreeVC-s":
69
+ audio = freevc_s.infer(c, mel=mel_tgt)
70
+ else:
71
+ audio = freevc_24.infer(c, g=g_tgt)
72
+ audio = audio[0][0].data.cpu().float().numpy()
73
+ if model == "FreeVC" or model == "FreeVC-s":
74
+ write("out.wav", hps.data.sampling_rate, audio)
75
+ else:
76
+ write("out.wav", 24000, audio)
77
+ out = "out.wav"
78
+ return out
79
+
80
+ # GLM2
81
+
82
+ language_dict = tts_order_voice
83
+
84
+ # fix timezone in Linux
85
+ os.environ["TZ"] = "Asia/Shanghai"
86
+ try:
87
+ time.tzset() # type: ignore # pylint: disable=no-member
88
+ except Exception:
89
+ # Windows
90
+ logger.warning("Windows, cant run time.tzset()")
91
+
92
+ # model_name = "THUDM/chatglm2-6b"
93
+ model_name = "THUDM/chatglm2-6b-int4"
94
+
95
+ RETRY_FLAG = False
96
+
97
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
98
+
99
+ # model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda()
100
+
101
+ # 4/8 bit
102
+ # model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).quantize(4).cuda()
103
+
104
+ has_cuda = torch.cuda.is_available()
105
+
106
+ # has_cuda = False # force cpu
107
+
108
+ if has_cuda:
109
+ model_glm = (
110
+ AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda().half()
111
+ ) # 3.92G
112
+ else:
113
+ model_glm = AutoModel.from_pretrained(
114
+ model_name, trust_remote_code=True
115
+ ).float() # .float() .half().float()
116
+
117
+ model_glm = model_glm.eval()
118
+
119
+ _ = """Override Chatbot.postprocess"""
120
+
121
+
122
+ def postprocess(self, y):
123
+ if y is None:
124
+ return []
125
+ for i, (message, response) in enumerate(y):
126
+ y[i] = (
127
+ None if message is None else mdtex2html.convert((message)),
128
+ None if response is None else mdtex2html.convert(response),
129
+ )
130
+ return y
131
+
132
+
133
+ gr.Chatbot.postprocess = postprocess
134
+
135
+
136
+ def parse_text(text):
137
+ """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
138
+ lines = text.split("\n")
139
+ lines = [line for line in lines if line != ""]
140
+ count = 0
141
+ for i, line in enumerate(lines):
142
+ if "```" in line:
143
+ count += 1
144
+ items = line.split("`")
145
+ if count % 2 == 1:
146
+ lines[i] = f'<pre><code class="language-{items[-1]}">'
147
+ else:
148
+ lines[i] = "<br></code></pre>"
149
+ else:
150
+ if i > 0:
151
+ if count % 2 == 1:
152
+ line = line.replace("`", r"\`")
153
+ line = line.replace("<", "&lt;")
154
+ line = line.replace(">", "&gt;")
155
+ line = line.replace(" ", "&nbsp;")
156
+ line = line.replace("*", "&ast;")
157
+ line = line.replace("_", "&lowbar;")
158
+ line = line.replace("-", "&#45;")
159
+ line = line.replace(".", "&#46;")
160
+ line = line.replace("!", "&#33;")
161
+ line = line.replace("(", "&#40;")
162
+ line = line.replace(")", "&#41;")
163
+ line = line.replace("$", "&#36;")
164
+ lines[i] = "<br>" + line
165
+ text = "".join(lines)
166
+ return text
167
+
168
+ def predict(
169
+ RETRY_FLAG, input, chatbot, max_length, top_p, temperature, history, past_key_values
170
+ ):
171
+ try:
172
+ chatbot.append((parse_text(input), ""))
173
+ except Exception as exc:
174
+ logger.error(exc)
175
+ logger.debug(f"{chatbot=}")
176
+ _ = """
177
+ if chatbot:
178
+ chatbot[-1] = (parse_text(input), str(exc))
179
+ yield chatbot, history, past_key_values
180
+ # """
181
+ yield chatbot, history, past_key_values
182
+
183
+ for response, history, past_key_values in model_glm.stream_chat(
184
+ tokenizer,
185
+ input,
186
+ history,
187
+ past_key_values=past_key_values,
188
+ return_past_key_values=True,
189
+ max_length=max_length,
190
+ top_p=top_p,
191
+ temperature=temperature,
192
+ ):
193
+ chatbot[-1] = (parse_text(input), parse_text(response))
194
+ # chatbot[-1][-1] = parse_text(response)
195
+
196
+ yield chatbot, history, past_key_values, parse_text(response)
197
+
198
+ def trans_api(input, max_length=4096, top_p=0.8, temperature=0.2):
199
+ if max_length < 10:
200
+ max_length = 4096
201
+ if top_p < 0.1 or top_p > 1:
202
+ top_p = 0.85
203
+ if temperature <= 0 or temperature > 1:
204
+ temperature = 0.01
205
+ try:
206
+ res, _ = model_glm.chat(
207
+ tokenizer,
208
+ input,
209
+ history=[],
210
+ past_key_values=None,
211
+ max_length=max_length,
212
+ top_p=top_p,
213
+ temperature=temperature,
214
+ )
215
+ # logger.debug(f"{res=} \n{_=}")
216
+ except Exception as exc:
217
+ logger.error(f"{exc=}")
218
+ res = str(exc)
219
+
220
+ return res
221
+
222
+ def reset_user_input():
223
+ return gr.update(value="")
224
+
225
+
226
+ def reset_state():
227
+ return [], [], None, ""
228
+
229
+
230
+ # Delete last turn
231
+ def delete_last_turn(chat, history):
232
+ if chat and history:
233
+ chat.pop(-1)
234
+ history.pop(-1)
235
+ return chat, history
236
+
237
+ # Regenerate response
238
+ def retry_last_answer(
239
+ user_input, chatbot, max_length, top_p, temperature, history, past_key_values
240
+ ):
241
+ if chatbot and history:
242
+ # Removing the previous conversation from chat
243
+ chatbot.pop(-1)
244
+ # Setting up a flag to capture a retry
245
+ RETRY_FLAG = True
246
+ # Getting last message from user
247
+ user_input = history[-1][0]
248
+ # Removing bot response from the history
249
+ history.pop(-1)
250
+
251
+ yield from predict(
252
+ RETRY_FLAG, # type: ignore
253
+ user_input,
254
+ chatbot,
255
+ max_length,
256
+ top_p,
257
+ temperature,
258
+ history,
259
+ past_key_values,
260
+ )
261
+
262
+ # print
263
+
264
+ def print(text):
265
+ return text
266
+
267
+ # TTS
268
+
269
+ async def text_to_speech_edge(text, language_code):
270
+ voice = language_dict[language_code]
271
+ communicate = edge_tts.Communicate(text, voice)
272
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
273
+ tmp_path = tmp_file.name
274
+
275
+ await communicate.save(tmp_path)
276
+
277
+ return tmp_path
278
+
279
+ with gr.Blocks(title="ChatGLM2-6B-int4", theme=gr.themes.Soft(text_size="sm")) as demo:
280
+ gr.HTML("<center>"
281
+ "<h1>🥳💕🎶 - ChatGLM2 + 声音克隆:和你喜欢的角色畅所欲言吧!</h1>"
282
+ "</center>")
283
+
284
+ with gr.Accordion("📒 相关信息", open=False):
285
+ _ = f""" ChatGLM2的可选参数信息:
286
+ * Low temperature: responses will be more deterministic and focused; High temperature: responses more creative.
287
+ * Suggested temperatures -- translation: up to 0.3; chatting: > 0.4
288
+ * Top P controls dynamic vocabulary selection based on context.\n
289
+ 如果您想让ChatGLM2进行角色扮演并与之对话,请先输入恰当的提示词,如“请你扮演成动漫角色蜡笔小新并和我进行对话”;您也可以为ChatGLM2提供自定义的角色设定\n
290
+ 当您使用声音克隆功能时,请先在此程序的对应位置上传一段您喜欢的音频
291
+ """
292
+ gr.Markdown(dedent(_))
293
+
294
+ chatbot = gr.Chatbot(height=300)
295
+ with gr.Row():
296
+ with gr.Column(scale=4):
297
+ with gr.Column(scale=12):
298
+ user_input = gr.Textbox(
299
+ label="请在此处和GLM2聊天 (按回车键即可发送)",
300
+ placeholder="聊点什么吧",
301
+ )
302
+ RETRY_FLAG = gr.Checkbox(value=False, visible=False)
303
+ with gr.Column(min_width=32, scale=1):
304
+ with gr.Row():
305
+ submitBtn = gr.Button("开始和GLM2交流吧", variant="primary")
306
+ deleteBtn = gr.Button("删除最新一轮对话", variant="secondary")
307
+ retryBtn = gr.Button("重新生成最新一轮对话", variant="secondary")
308
+
309
+ with gr.Accordion("🔧 更多设置", open=False):
310
+ with gr.Row():
311
+ emptyBtn = gr.Button("清空所有聊天记录")
312
+ max_length = gr.Slider(
313
+ 0,
314
+ 32768,
315
+ value=8192,
316
+ step=1.0,
317
+ label="Maximum length",
318
+ interactive=True,
319
+ )
320
+ top_p = gr.Slider(
321
+ 0, 1, value=0.85, step=0.01, label="Top P", interactive=True
322
+ )
323
+ temperature = gr.Slider(
324
+ 0.01, 1, value=0.95, step=0.01, label="Temperature", interactive=True
325
+ )
326
+
327
+
328
+ with gr.Row():
329
+ test1 = gr.Textbox(label="GLM2的最新回答 (可编辑)", lines = 3)
330
+ with gr.Column():
331
+ language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
332
+ tts_btn = gr.Button("生成对应的音频吧", variant="primary")
333
+ output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)
334
+
335
+ tts_btn.click(text_to_speech_edge, inputs=[test1, language], outputs=[output_audio])
336
+
337
+ with gr.Row():
338
+ model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
339
+ audio1 = output_audio
340
+ audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
341
+ clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
342
+ audio_cloned = gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')
343
+
344
+ clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
345
+
346
+ history = gr.State([])
347
+ past_key_values = gr.State(None)
348
+
349
+ user_input.submit(
350
+ predict,
351
+ [
352
+ RETRY_FLAG,
353
+ user_input,
354
+ chatbot,
355
+ max_length,
356
+ top_p,
357
+ temperature,
358
+ history,
359
+ past_key_values,
360
+ ],
361
+ [chatbot, history, past_key_values, test1],
362
+ show_progress="full",
363
+ )
364
+ submitBtn.click(
365
+ predict,
366
+ [
367
+ RETRY_FLAG,
368
+ user_input,
369
+ chatbot,
370
+ max_length,
371
+ top_p,
372
+ temperature,
373
+ history,
374
+ past_key_values,
375
+ ],
376
+ [chatbot, history, past_key_values, test1],
377
+ show_progress="full",
378
+ api_name="predict",
379
+ )
380
+ submitBtn.click(reset_user_input, [], [user_input])
381
+
382
+ emptyBtn.click(
383
+ reset_state, outputs=[chatbot, history, past_key_values, test1], show_progress="full"
384
+ )
385
+
386
+ retryBtn.click(
387
+ retry_last_answer,
388
+ inputs=[
389
+ user_input,
390
+ chatbot,
391
+ max_length,
392
+ top_p,
393
+ temperature,
394
+ history,
395
+ past_key_values,
396
+ ],
397
+ # outputs = [chatbot, history, last_user_message, user_message]
398
+ outputs=[chatbot, history, past_key_values, test1],
399
+ )
400
+ deleteBtn.click(delete_last_turn, [chatbot, history], [chatbot, history])
401
+
402
+ with gr.Accordion("For Chat/Translation API", open=False):
403
+ input_text = gr.Text()
404
+ tr_btn = gr.Button("Go", variant="primary")
405
+ out_text = gr.Text()
406
+ tr_btn.click(
407
+ trans_api,
408
+ [input_text, max_length, top_p, temperature],
409
+ out_text,
410
+ # show_progress="full",
411
+ api_name="tr",
412
+ )
413
+ _ = """
414
+ input_text.submit(
415
+ trans_api,
416
+ [input_text, max_length, top_p, temperature],
417
+ out_text,
418
+ show_progress="full",
419
+ api_name="tr1",
420
+ )
421
+ # """
422
+
423
+ gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
424
+ gr.Markdown("<center>💡 - 如何使用此程序:输入您对ChatGLM的提问后,依次点击“开始和GLM2交流吧”、“生成对应的音频吧”、“开始AI声音克隆吧”三个按键即可;使用声音克隆功能时,请先上传一段您喜欢的音频</center>")
425
+ gr.HTML('''
426
+ <div class="footer">
427
+ <p>Reedit by Forest, Thanks 明·顾璘</p>
428
+ </div>
429
+ ''')
430
+
431
+ demo.queue().launch(show_error=True, debug=True)
checkpoints/freevc-24.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b39a86fefbc9ec6e30be8d26ee2a6aa5ffe6d235f6ab15773d01cdf348e5b20
3
+ size 472644351
commons.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def init_weights(m, mean=0.0, std=0.01):
9
+ classname = m.__class__.__name__
10
+ if classname.find("Conv") != -1:
11
+ m.weight.data.normal_(mean, std)
12
+
13
+
14
+ def get_padding(kernel_size, dilation=1):
15
+ return int((kernel_size*dilation - dilation)/2)
16
+
17
+
18
+ def convert_pad_shape(pad_shape):
19
+ l = pad_shape[::-1]
20
+ pad_shape = [item for sublist in l for item in sublist]
21
+ return pad_shape
22
+
23
+
24
+ def intersperse(lst, item):
25
+ result = [item] * (len(lst) * 2 + 1)
26
+ result[1::2] = lst
27
+ return result
28
+
29
+
30
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
31
+ """KL(P||Q)"""
32
+ kl = (logs_q - logs_p) - 0.5
33
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34
+ return kl
35
+
36
+
37
+ def rand_gumbel(shape):
38
+ """Sample from the Gumbel distribution, protect from overflows."""
39
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40
+ return -torch.log(-torch.log(uniform_samples))
41
+
42
+
43
+ def rand_gumbel_like(x):
44
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45
+ return g
46
+
47
+
48
+ def slice_segments(x, ids_str, segment_size=4):
49
+ ret = torch.zeros_like(x[:, :, :segment_size])
50
+ for i in range(x.size(0)):
51
+ idx_str = ids_str[i]
52
+ idx_end = idx_str + segment_size
53
+ ret[i] = x[i, :, idx_str:idx_end]
54
+ return ret
55
+
56
+
57
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
58
+ b, d, t = x.size()
59
+ if x_lengths is None:
60
+ x_lengths = t
61
+ ids_str_max = x_lengths - segment_size + 1
62
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63
+ ret = slice_segments(x, ids_str, segment_size)
64
+ return ret, ids_str
65
+
66
+
67
+ def rand_spec_segments(x, x_lengths=None, segment_size=4):
68
+ b, d, t = x.size()
69
+ if x_lengths is None:
70
+ x_lengths = t
71
+ ids_str_max = x_lengths - segment_size
72
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
73
+ ret = slice_segments(x, ids_str, segment_size)
74
+ return ret, ids_str
75
+
76
+
77
+ def get_timing_signal_1d(
78
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
79
+ position = torch.arange(length, dtype=torch.float)
80
+ num_timescales = channels // 2
81
+ log_timescale_increment = (
82
+ math.log(float(max_timescale) / float(min_timescale)) /
83
+ (num_timescales - 1))
84
+ inv_timescales = min_timescale * torch.exp(
85
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
86
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
87
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
88
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
89
+ signal = signal.view(1, channels, length)
90
+ return signal
91
+
92
+
93
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
94
+ b, channels, length = x.size()
95
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
96
+ return x + signal.to(dtype=x.dtype, device=x.device)
97
+
98
+
99
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
100
+ b, channels, length = x.size()
101
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
102
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
103
+
104
+
105
+ def subsequent_mask(length):
106
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
107
+ return mask
108
+
109
+
110
+ @torch.jit.script
111
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
112
+ n_channels_int = n_channels[0]
113
+ in_act = input_a + input_b
114
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
115
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
116
+ acts = t_act * s_act
117
+ return acts
118
+
119
+
120
+ def convert_pad_shape(pad_shape):
121
+ l = pad_shape[::-1]
122
+ pad_shape = [item for sublist in l for item in sublist]
123
+ return pad_shape
124
+
125
+
126
+ def shift_1d(x):
127
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
128
+ return x
129
+
130
+
131
+ def sequence_mask(length, max_length=None):
132
+ if max_length is None:
133
+ max_length = length.max()
134
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
135
+ return x.unsqueeze(0) < length.unsqueeze(1)
136
+
137
+
138
+ def generate_path(duration, mask):
139
+ """
140
+ duration: [b, 1, t_x]
141
+ mask: [b, 1, t_y, t_x]
142
+ """
143
+ device = duration.device
144
+
145
+ b, _, t_y, t_x = mask.shape
146
+ cum_duration = torch.cumsum(duration, -1)
147
+
148
+ cum_duration_flat = cum_duration.view(b * t_x)
149
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
150
+ path = path.view(b, t_x, t_y)
151
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
152
+ path = path.unsqueeze(1).transpose(2,3) * mask
153
+ return path
154
+
155
+
156
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
157
+ if isinstance(parameters, torch.Tensor):
158
+ parameters = [parameters]
159
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
160
+ norm_type = float(norm_type)
161
+ if clip_value is not None:
162
+ clip_value = float(clip_value)
163
+
164
+ total_norm = 0
165
+ for p in parameters:
166
+ param_norm = p.grad.data.norm(norm_type)
167
+ total_norm += param_norm.item() ** norm_type
168
+ if clip_value is not None:
169
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
170
+ total_norm = total_norm ** (1. / norm_type)
171
+ return total_norm
configs/freevc-24.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 10000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 64,
11
+ "fp16_run": false,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8640,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "use_sr": true,
19
+ "max_speclen": 128,
20
+ "port": "8008"
21
+ },
22
+ "data": {
23
+ "training_files":"filelists/train.txt",
24
+ "validation_files":"filelists/val.txt",
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 16000,
27
+ "filter_length": 1280,
28
+ "hop_length": 320,
29
+ "win_length": 1280,
30
+ "n_mel_channels": 80,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null
33
+ },
34
+ "model": {
35
+ "inter_channels": 192,
36
+ "hidden_channels": 192,
37
+ "filter_channels": 768,
38
+ "n_heads": 2,
39
+ "n_layers": 6,
40
+ "kernel_size": 3,
41
+ "p_dropout": 0.1,
42
+ "resblock": "1",
43
+ "resblock_kernel_sizes": [3,7,11],
44
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
45
+ "upsample_rates": [10,6,4,2],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [16,16,4,4],
48
+ "n_layers_q": 3,
49
+ "use_spectral_norm": false,
50
+ "gin_channels": 256,
51
+ "ssl_dim": 1024,
52
+ "use_spk": true
53
+ }
54
+ }
english/generator.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import openai
3
+ openai.organization = ""
4
+
5
+
6
+ TEMPERATURE = 1
7
+ # COMPLETION_MODEL = "gpt-3.5-turbo-0613"
8
+ COMPLETION_MODEL ='text-davinci-003'
9
+
10
+ def generatorByOpenAI(worlds):
11
+ prompt = f'你是一个非常厉害的英语助手,请将{worlds}组成一篇英语文章,字数限制在100 字以内'
12
+ response = openai.Completion.create(
13
+ prompt=prompt,
14
+ engine = COMPLETION_MODEL,
15
+ temperature = TEMPERATURE,
16
+ max_tokens = 512,
17
+ n = 1,
18
+ stop = None,
19
+ timeout = 20
20
+ )
21
+
22
+ # print(response)
23
+ return response
24
+
25
+ def generatorArticle(worlds):
26
+ result = generatorByOpenAI(worlds)
27
+ return result.choices[0].text
english/models/deltalm/configuration_deltalm.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """ deltalm model configuration"""
5
+
6
+ import warnings
7
+ from transformers.configuration_utils import PretrainedConfig
8
+ from transformers.utils import logging
9
+ logger = logging.get_logger(__name__)
10
+
11
+ BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
12
+ "IDEA/Deltalm": "https://huggingface.co/facebook/bart-large/resolve/main/config.json",
13
+ # See all deltalm models at https://huggingface.co/models?filter=deltam
14
+ }
15
+
16
+
17
+ class DeltalmConfig(PretrainedConfig):
18
+ r"""
19
+ This is the configuration class to store the configuration of a [`DeltalmModel`]. It is used to instantiate a Deltalm
20
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
21
+ defaults will yield a similar configuration to that of the BART
22
+ [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
23
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
24
+ documentation from [`PretrainedConfig`] for more information.
25
+ Args:
26
+ vocab_size (`int`, *optional*, defaults to 50265):
27
+ Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
28
+ `inputs_ids` passed when calling [`BartModel`] or [`TFBartModel`].
29
+ d_model (`int`, *optional*, defaults to 1024):
30
+ Dimensionality of the layers and the pooler layer.
31
+ encoder_layers (`int`, *optional*, defaults to 12):
32
+ Number of encoder layers.
33
+ decoder_layers (`int`, *optional*, defaults to 12):
34
+ Number of decoder layers.
35
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
36
+ Number of attention heads for each attention layer in the Transformer encoder.
37
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
38
+ Number of attention heads for each attention layer in the Transformer decoder.
39
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
40
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
41
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
42
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
43
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
44
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
45
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
46
+ dropout (`float`, *optional*, defaults to 0.1):
47
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
48
+ attention_dropout (`float`, *optional*, defaults to 0.0):
49
+ The dropout ratio for the attention probabilities.
50
+ activation_dropout (`float`, *optional*, defaults to 0.0):
51
+ The dropout ratio for activations inside the fully connected layer.
52
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
53
+ The dropout ratio for classifier.
54
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
55
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
56
+ just in case (e.g., 512 or 1024 or 2048).
57
+ init_std (`float`, *optional*, defaults to 0.02):
58
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
59
+ encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
60
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
61
+ for more details.
62
+ decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
63
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
64
+ for more details.
65
+ scale_embedding (`bool`, *optional*, defaults to `False`):
66
+ Scale embeddings by diving by sqrt(d_model).
67
+ use_cache (`bool`, *optional*, defaults to `True`):
68
+ Whether or not the model should return the last key/values attentions (not used by all models).
69
+ num_labels: (`int`, *optional*, defaults to 3):
70
+ The number of labels to use in [`BartForSequenceClassification`].
71
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
72
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
73
+ `eos_token_id`.
74
+ Example:
75
+ ```python
76
+ >>> from transformers import BartModel, BartConfig
77
+ >>> # Initializing a BART facebook/bart-large style configuration
78
+ >>> configuration = BartConfig()
79
+ >>> # Initializing a model from the facebook/bart-large style configuration
80
+ >>> model = BartModel(configuration)
81
+ >>> # Accessing the model configuration
82
+ >>> configuration = model.config
83
+ ```"""
84
+ model_type = "Deltalm"
85
+ keys_to_ignore_at_inference = ["past_key_values"]
86
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
87
+
88
+ def __init__(
89
+ self,
90
+ vocab_size=250001,
91
+ max_position_embeddings=1024,
92
+ encoder_layers=12,
93
+ encoder_ffn_dim=3072,
94
+ encoder_attention_heads=12,
95
+ decoder_layers=6,
96
+ decoder_ffn_dim=3072,
97
+ decoder_attention_heads=12,
98
+ encoder_layerdrop=0.0,
99
+ decoder_layerdrop=0.0,
100
+ activation_function="gelu",
101
+ d_model=1024,
102
+ dropout=0.1,
103
+ attention_dropout=0.0,
104
+ activation_dropout=0.0,
105
+ init_std=0.02,
106
+ classifier_dropout=0.0,
107
+ scale_embedding=False,
108
+ use_cache=True,
109
+ num_labels=3,
110
+ pad_token_id=1,
111
+ bos_token_id=0,
112
+ eos_token_id=2,
113
+ is_encoder_decoder=True,
114
+ decoder_start_token_id=0,
115
+ forced_eos_token_id=2,
116
+ label_smoothing=0.1,
117
+ length_penalty=1.0,
118
+ encoder_normalize_before=False,
119
+ **kwargs
120
+ ):
121
+ self.vocab_size = vocab_size
122
+ self.max_position_embeddings = max_position_embeddings
123
+ self.d_model = d_model
124
+ self.encoder_ffn_dim = encoder_ffn_dim
125
+ self.encoder_layers = encoder_layers
126
+ self.encoder_attention_heads = encoder_attention_heads
127
+ self.decoder_ffn_dim = decoder_ffn_dim
128
+ self.decoder_layers = decoder_layers
129
+ self.decoder_attention_heads = decoder_attention_heads
130
+ self.dropout = dropout
131
+ self.attention_dropout = attention_dropout
132
+ self.activation_dropout = activation_dropout
133
+ self.activation_function = activation_function
134
+ self.init_std = init_std
135
+ self.encoder_layerdrop = encoder_layerdrop
136
+ self.decoder_layerdrop = decoder_layerdrop
137
+ self.classifier_dropout = classifier_dropout
138
+ self.use_cache = use_cache
139
+ self.num_hidden_layers = encoder_layers
140
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
141
+ self.label_smoothing = label_smoothing
142
+ self.encoder_normalize_before = encoder_normalize_before
143
+
144
+ super().__init__(
145
+ num_labels=num_labels,
146
+ pad_token_id=pad_token_id,
147
+ bos_token_id=bos_token_id,
148
+ eos_token_id=eos_token_id,
149
+ is_encoder_decoder=is_encoder_decoder,
150
+ decoder_start_token_id=decoder_start_token_id,
151
+ forced_eos_token_id=forced_eos_token_id,
152
+ length_penalty=length_penalty,
153
+ **kwargs,
154
+ )
155
+
156
+ # ensure backward compatibility for BART CNN models
157
+ if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
158
+ self.forced_bos_token_id = self.bos_token_id
159
+ warnings.warn(
160
+ f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
161
+ "The config can simply be saved and uploaded again to be fixed."
162
+ )
163
+
164
+ @property
165
+ def num_attention_heads(self) -> int:
166
+ return self.encoder_attention_heads
167
+
168
+ @property
169
+ def hidden_size(self) -> int:
170
+ return self.d_model
english/models/deltalm/modeling_deltalm.py ADDED
@@ -0,0 +1,1551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ import copy
6
+ import math
7
+ import random
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.utils.checkpoint
11
+ from torch.nn import CrossEntropyLoss
12
+ from typing import List, Optional, Tuple, Union
13
+
14
+ from transformers.modeling_utils import PreTrainedModel
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (
17
+ BaseModelOutput,
18
+ BaseModelOutputWithPastAndCrossAttentions,
19
+ CausalLMOutputWithCrossAttentions,
20
+ Seq2SeqModelOutput,
21
+ Seq2SeqLMOutput,
22
+ )
23
+ from transformers.file_utils import (
24
+ add_end_docstrings,
25
+ add_start_docstrings,
26
+ add_start_docstrings_to_model_forward,
27
+ replace_return_docstrings,
28
+ )
29
+
30
+ import logging
31
+ from .configuration_deltalm import DeltalmConfig
32
+ logger = logging.getLogger(__name__)
33
+
34
+ _CHECKPOINT_FOR_DOC = "IDEA-CCNL/Randeng-Deltalm-362M-En-Zn"
35
+ _CONFIG_FOR_DOC = "DeltalmConfig"
36
+ _TOKENIZER_FOR_DOC = "DeltalmTokenizer"
37
+
38
+ # Base model docstring
39
+ _EXPECTED_OUTPUT_SHAPE = [1, 8, 768]
40
+
41
+
42
+ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
43
+ """
44
+ Shift input ids one token to the right.
45
+ """
46
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
47
+ shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
48
+ shifted_input_ids[:, 0] = decoder_start_token_id
49
+
50
+ if pad_token_id is None:
51
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
52
+ # replace possible -100 values in labels by `pad_token_id`
53
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
54
+
55
+ return shifted_input_ids
56
+
57
+
58
+ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
59
+ """
60
+ Make causal mask used for bi-directional self-attention.
61
+ """
62
+ bsz, tgt_len = input_ids_shape
63
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(float("-inf")))
64
+ mask_cond = torch.arange(mask.size(-1))
65
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
66
+ mask = mask.to(dtype)
67
+
68
+ if past_key_values_length > 0:
69
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
70
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
71
+
72
+
73
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
74
+ """
75
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
76
+ """
77
+ bsz, src_len = mask.size()
78
+ tgt_len = tgt_len if tgt_len is not None else src_len
79
+
80
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
81
+
82
+ inverted_mask = 1.0 - expanded_mask
83
+
84
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
85
+
86
+
87
+ class DeltalmLearnedPositionalEmbedding(nn.Embedding):
88
+ """
89
+ This module learns positional embeddings up to a fixed maximum size.
90
+ """
91
+
92
+ def __init__(self, num_embeddings: int, embedding_dim: int):
93
+ # Deltalm is set up so that if padding_idx is specified then offset the embedding ids by 2
94
+ # and adjust num_embeddings appropriately. Other models don't have this hack
95
+ self.offset = 2
96
+ super().__init__(num_embeddings + self.offset, embedding_dim)
97
+
98
+ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
99
+ """`input_ids_shape` is expected to be [bsz x seqlen]."""
100
+ bsz, seq_len = input_ids_shape[:2]
101
+ positions = torch.arange(
102
+ past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
103
+ )
104
+ return super().forward(positions + self.offset)
105
+
106
+
107
+ class DeltalmAttention(nn.Module):
108
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
109
+
110
+ def __init__(
111
+ self,
112
+ embed_dim: int,
113
+ num_heads: int,
114
+ dropout: float = 0.0,
115
+ is_decoder: bool = False,
116
+ bias: bool = True,
117
+ ):
118
+ super().__init__()
119
+ self.embed_dim = embed_dim
120
+ self.num_heads = num_heads
121
+ self.dropout = dropout
122
+ self.head_dim = embed_dim // num_heads
123
+
124
+ if (self.head_dim * num_heads) != self.embed_dim:
125
+ raise ValueError(
126
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
127
+ f" and `num_heads`: {num_heads})."
128
+ )
129
+ self.scaling = self.head_dim**-0.5
130
+ self.is_decoder = is_decoder
131
+
132
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
133
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
134
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
135
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
136
+
137
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
138
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
139
+
140
+ def forward(
141
+ self,
142
+ hidden_states: torch.Tensor,
143
+ key_value_states: Optional[torch.Tensor] = None,
144
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
145
+ attention_mask: Optional[torch.Tensor] = None,
146
+ layer_head_mask: Optional[torch.Tensor] = None,
147
+ output_attentions: bool = False,
148
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
149
+ """Input shape: Batch x Time x Channel"""
150
+
151
+ # if key_value_states are provided this layer is used as a cross-attention layer
152
+ # for the decoder
153
+ is_cross_attention = key_value_states is not None
154
+
155
+ bsz, tgt_len, _ = hidden_states.size()
156
+
157
+ # get query proj
158
+ query_states = self.q_proj(hidden_states) * self.scaling
159
+ # get key, value proj
160
+ if is_cross_attention and past_key_value is not None:
161
+ # reuse k,v, cross_attentions
162
+ key_states = past_key_value[0]
163
+ value_states = past_key_value[1]
164
+ elif is_cross_attention:
165
+ # cross_attentions
166
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
167
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
168
+ elif past_key_value is not None:
169
+ # reuse k, v, self_attention
170
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
171
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
172
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
173
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
174
+ else:
175
+ # self_attention
176
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
177
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
178
+
179
+ if self.is_decoder:
180
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
181
+ # Further calls to cross_attention layer can then reuse all cross-attention
182
+ # key/value_states (first "if" case)
183
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
184
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
185
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
186
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
187
+ past_key_value = (key_states, value_states)
188
+
189
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
190
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
191
+ key_states = key_states.view(*proj_shape)
192
+ value_states = value_states.view(*proj_shape)
193
+
194
+ src_len = key_states.size(1)
195
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
196
+
197
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
198
+ raise ValueError(
199
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
200
+ f" {attn_weights.size()}"
201
+ )
202
+
203
+ if attention_mask is not None:
204
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
205
+ raise ValueError(
206
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
207
+ )
208
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
209
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
210
+
211
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
212
+
213
+ if layer_head_mask is not None:
214
+ if layer_head_mask.size() != (self.num_heads,):
215
+ raise ValueError(
216
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
217
+ f" {layer_head_mask.size()}"
218
+ )
219
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
220
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
221
+
222
+ if output_attentions:
223
+ # this operation is a bit awkward, but it's required to
224
+ # make sure that attn_weights keeps its gradient.
225
+ # In order to do so, attn_weights have to be reshaped
226
+ # twice and have to be reused in the following
227
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
228
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
229
+ else:
230
+ attn_weights_reshaped = None
231
+
232
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
233
+
234
+ attn_output = torch.bmm(attn_probs, value_states)
235
+
236
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
237
+ raise ValueError(
238
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
239
+ f" {attn_output.size()}"
240
+ )
241
+
242
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
243
+ attn_output = attn_output.transpose(1, 2)
244
+
245
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
246
+ # partitioned across GPUs when using tensor-parallelism.
247
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
248
+
249
+ attn_output = self.out_proj(attn_output)
250
+
251
+ return attn_output, attn_weights_reshaped, past_key_value
252
+
253
+
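The caching comments in `DeltalmAttention.forward` are easier to follow with concrete shapes. A minimal sketch, assuming cached tensors shaped `(bsz, num_heads, seq_len, head_dim)` as produced by the `_shape` helper, of how the self-attention cache grows by one time step per decoded token:

```python
import torch

bsz, num_heads, head_dim = 2, 4, 8

# keys/values for the 5 tokens decoded so far
cached_k = torch.randn(bsz, num_heads, 5, head_dim)
cached_v = torch.randn(bsz, num_heads, 5, head_dim)

# projections for the current token only (seq_len == 1 during incremental decoding)
new_k = torch.randn(bsz, num_heads, 1, head_dim)
new_v = torch.randn(bsz, num_heads, 1, head_dim)

# same concatenation as the "elif past_key_value is not None" branch above
cached_k = torch.cat([cached_k, new_k], dim=2)
cached_v = torch.cat([cached_v, new_v], dim=2)
assert cached_k.shape == (2, 4, 6, 8) and cached_v.shape == (2, 4, 6, 8)
```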
254
+ class DeltalmEncoderLayer(nn.Module):
255
+ def __init__(self, config: DeltalmConfig):
256
+ super().__init__()
257
+ self.embed_dim = config.d_model
258
+ self.self_attn = DeltalmAttention(
259
+ embed_dim=self.embed_dim,
260
+ num_heads=config.encoder_attention_heads,
261
+ dropout=config.attention_dropout,
262
+ )
263
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
264
+ self.dropout = config.dropout
265
+ self.activation_fn = ACT2FN[config.activation_function]
266
+ self.activation_dropout = config.activation_dropout
267
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
268
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
269
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
270
+
271
+ def forward(
272
+ self,
273
+ hidden_states: torch.FloatTensor,
274
+ attention_mask: torch.FloatTensor,
275
+ layer_head_mask: torch.FloatTensor,
276
+ output_attentions: Optional[bool] = False,
277
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
278
+ """
279
+ Args:
280
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
281
+ attention_mask (`torch.FloatTensor`): attention mask of size
282
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
283
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
284
+ `(encoder_attention_heads,)`.
285
+ output_attentions (`bool`, *optional*):
286
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
287
+ returned tensors for more detail.
288
+ """
289
+ residual = hidden_states
290
+ hidden_states, attn_weights, _ = self.self_attn(
291
+ hidden_states=hidden_states,
292
+ attention_mask=attention_mask,
293
+ layer_head_mask=layer_head_mask,
294
+ output_attentions=output_attentions,
295
+ )
296
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
297
+ hidden_states = residual + hidden_states
298
+ hidden_states = self.self_attn_layer_norm(hidden_states)
299
+
300
+ residual = hidden_states
301
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
302
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
303
+ hidden_states = self.fc2(hidden_states)
304
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
305
+ hidden_states = residual + hidden_states
306
+ hidden_states = self.final_layer_norm(hidden_states)
307
+
308
+ if hidden_states.dtype == torch.float16 and (
309
+ torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
310
+ ):
311
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
312
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
313
+
314
+ outputs = (hidden_states,)
315
+
316
+ if output_attentions:
317
+ outputs += (attn_weights,)
318
+
319
+ return outputs
320
+
321
+
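The float16 guard at the end of `DeltalmEncoderLayer.forward` clamps activations that overflowed to `inf`/`NaN` back into the representable range. A small standalone sketch of the same idea, with illustrative values only:

```python
import torch

x = torch.tensor([1.0, float("inf"), float("-inf")], dtype=torch.float16)
if x.dtype == torch.float16 and (torch.isinf(x).any() or torch.isnan(x).any()):
    clamp_value = torch.finfo(x.dtype).max - 1000
    x = torch.clamp(x, min=-clamp_value, max=clamp_value)
print(x)  # the infinities are replaced by large finite float16 values
```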
322
+ class DeltalmDecoderLayer(nn.Module):
323
+ def __init__(self, config: DeltalmConfig):
324
+ super().__init__()
325
+ self.embed_dim = config.d_model
326
+
327
+ self.self_attn = DeltalmAttention(
328
+ embed_dim=self.embed_dim,
329
+ num_heads=config.decoder_attention_heads,
330
+ dropout=config.attention_dropout,
331
+ is_decoder=True,
332
+ )
333
+ self.dropout = config.dropout
334
+ self.activation_fn = ACT2FN[config.activation_function]
335
+ self.activation_dropout = config.activation_dropout
336
+
337
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
338
+ self.encoder_attn = DeltalmAttention(
339
+ self.embed_dim,
340
+ config.decoder_attention_heads,
341
+ dropout=config.attention_dropout,
342
+ is_decoder=True,
343
+ )
344
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
345
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
346
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
347
+ self.fc3 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
348
+ self.fc4 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
349
+
350
+ self.ffn_layer_norm = nn.LayerNorm(self.embed_dim)
351
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
352
+
353
+ def forward(
354
+ self,
355
+ hidden_states: torch.Tensor,
356
+ attention_mask: Optional[torch.Tensor] = None,
357
+ encoder_hidden_states: Optional[torch.Tensor] = None,
358
+ encoder_attention_mask: Optional[torch.Tensor] = None,
359
+ layer_head_mask: Optional[torch.Tensor] = None,
360
+ cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
361
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
362
+ output_attentions: Optional[bool] = False,
363
+ use_cache: Optional[bool] = True,
364
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
365
+ """
366
+ Args:
367
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
368
+ attention_mask (`torch.FloatTensor`): attention mask of size
369
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
370
+ encoder_hidden_states (`torch.FloatTensor`):
371
+ cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
372
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
373
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
374
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
375
+ `(decoder_attention_heads,)`.
376
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
377
+ size `(decoder_attention_heads,)`.
378
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
379
+ output_attentions (`bool`, *optional*):
380
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
381
+ returned tensors for more detail.
382
+ """
383
+ residual = hidden_states
384
+
385
+ # Self Attention
386
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
387
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
388
+ # add present self-attn cache to positions 1,2 of present_key_value tuple
389
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
390
+ hidden_states=hidden_states,
391
+ past_key_value=self_attn_past_key_value,
392
+ attention_mask=attention_mask,
393
+ layer_head_mask=layer_head_mask,
394
+ output_attentions=output_attentions,
395
+ )
396
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
397
+ hidden_states = residual + hidden_states
398
+ hidden_states = self.self_attn_layer_norm(hidden_states)
399
+
400
+ # Add another FFN after self-attention to keep the structure the same as the encoder layer
+ # (a schematic of the resulting sublayer order follows this class)
401
+ residual = hidden_states
402
+ hidden_states = self.activation_fn(self.fc3(hidden_states))
403
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
404
+ hidden_states = self.fc4(hidden_states)
405
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
406
+ hidden_states = residual + hidden_states
407
+ hidden_states = self.ffn_layer_norm(hidden_states)
408
+
409
+ # Cross-Attention Block
410
+ cross_attn_present_key_value = None
411
+ cross_attn_weights = None
412
+ if encoder_hidden_states is not None:
413
+ residual = hidden_states
414
+
415
+ # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
416
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
417
+ hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
418
+ hidden_states=hidden_states,
419
+ key_value_states=encoder_hidden_states,
420
+ attention_mask=encoder_attention_mask,
421
+ layer_head_mask=cross_attn_layer_head_mask,
422
+ past_key_value=cross_attn_past_key_value,
423
+ output_attentions=output_attentions,
424
+ )
425
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
426
+ hidden_states = residual + hidden_states
427
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
428
+
429
+ # add cross-attn to positions 3,4 of present_key_value tuple
430
+ present_key_value = present_key_value + cross_attn_present_key_value
431
+
432
+ # Fully Connected
433
+ residual = hidden_states
434
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
435
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
436
+ hidden_states = self.fc2(hidden_states)
437
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
438
+ hidden_states = residual + hidden_states
439
+ hidden_states = self.final_layer_norm(hidden_states)
440
+
441
+ outputs = (hidden_states,)
442
+
443
+ if output_attentions:
444
+ outputs += (self_attn_weights, cross_attn_weights)
445
+
446
+ if use_cache:
447
+ outputs += (present_key_value,)
448
+
449
+ return outputs
450
+
451
+
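Compared with a standard Transformer decoder layer (self-attention, cross-attention, feed-forward), `DeltalmDecoderLayer` interleaves an extra feed-forward block (`fc3`/`fc4`) between self-attention and cross-attention. A schematic sketch of the resulting sublayer order; the function and argument names are illustrative, not part of the module's API:

```python
def deltalm_decoder_sublayers(x, self_attn, ffn_a, cross_attn, ffn_b):
    """Each block is residual + LayerNorm in the real module; omitted here for brevity."""
    x = self_attn(x)   # 1. masked self-attention
    x = ffn_a(x)       # 2. extra feed-forward block (fc3 -> activation -> fc4)
    x = cross_attn(x)  # 3. cross-attention over the encoder hidden states
    x = ffn_b(x)       # 4. final feed-forward block (fc1 -> activation -> fc2)
    return x

identity = lambda t: t
out = deltalm_decoder_sublayers(1.0, identity, identity, identity, identity)
```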
452
+ class DeltalmPretrainedModel(PreTrainedModel):
453
+ config_class = DeltalmConfig
454
+ base_model_prefix = "model"
455
+ supports_gradient_checkpointing = True
456
+
457
+ def _init_weights(self, module):
458
+ std = self.config.init_std
459
+ if isinstance(module, nn.Linear):
460
+ module.weight.data.normal_(mean=0.0, std=std)
461
+ if module.bias is not None:
462
+ module.bias.data.zero_()
463
+ elif isinstance(module, nn.Embedding):
464
+ module.weight.data.normal_(mean=0.0, std=std)
465
+ if module.padding_idx is not None:
466
+ module.weight.data[module.padding_idx].zero_()
467
+
468
+ def _set_gradient_checkpointing(self, module, value=False):
469
+ if isinstance(module, (DeltalmDecoder, DeltalmEncoder)):
470
+ module.gradient_checkpointing = value
471
+
472
+
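A small sketch of the embedding initialisation performed by `_init_weights`: weights are drawn from a normal distribution with the configured standard deviation, and the row for the padding index is zeroed. The sizes and `init_std` value below are placeholders, not the model's real configuration:

```python
import torch
from torch import nn

pad_token_id, init_std = 1, 0.02  # placeholder values
emb = nn.Embedding(10, 4, padding_idx=pad_token_id)

emb.weight.data.normal_(mean=0.0, std=init_std)
emb.weight.data[emb.padding_idx].zero_()
assert torch.all(emb.weight[pad_token_id] == 0)
```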
473
+ class DeltalmDecoder(DeltalmPretrainedModel):
474
+ """
475
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DeltalmDecoderLayer`]
476
+ Args:
477
+ config: DeltalmConfig
478
+ embed_tokens (nn.Embedding): output embedding
479
+ """
480
+
481
+ def __init__(self, config: DeltalmConfig, embed_tokens: Optional[nn.Embedding] = None):
482
+ super().__init__(config)
483
+ self.dropout = config.dropout
484
+ self.layerdrop = config.decoder_layerdrop
485
+ self.padding_idx = config.pad_token_id
486
+ self.max_target_positions = config.max_position_embeddings
487
+ self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
488
+
489
+ if embed_tokens is not None:
490
+ self.embed_tokens = embed_tokens
491
+ else:
492
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
493
+
494
+ self.embed_positions = DeltalmLearnedPositionalEmbedding(
495
+ config.max_position_embeddings,
496
+ config.d_model,
497
+ )
498
+ self.layers = nn.ModuleList([DeltalmDecoderLayer(config) for _ in range(config.decoder_layers)])
499
+ self.layernorm_embedding = nn.LayerNorm(config.d_model)
500
+
501
+ self.gradient_checkpointing = False
502
+ # Initialize weights and apply final processing
503
+ self.post_init()
504
+
505
+ # fairseq applies nn.init.normal_(self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5) to the final output projection weights; should the same normal initialization be applied here?
506
+
507
+ def get_input_embeddings(self):
508
+ return self.embed_tokens
509
+
510
+ def set_input_embeddings(self, value):
511
+ self.embed_tokens = value
512
+
513
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
514
+ # create causal mask
515
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
516
+ combined_attention_mask = None
517
+ if input_shape[-1] > 1:
518
+ combined_attention_mask = _make_causal_mask(
519
+ input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
520
+ ).to(inputs_embeds.device)
521
+
522
+ if attention_mask is not None:
523
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
524
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
525
+ combined_attention_mask = (
526
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
527
+ )
528
+
529
+ return combined_attention_mask
530
+
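A minimal, self-contained sketch of the combined mask this method builds for a tiny batch. It re-creates the causal and padding parts inline with a generic large negative constant instead of calling the `_make_causal_mask`/`_expand_mask` helpers defined earlier in the file, so the exact fill values are illustrative:

```python
import torch

tgt_len = 4
attention_mask = torch.tensor([[1, 1, 1, 0]])  # last position is padding
neg = -1e9  # stands in for the large negative fill value produced by the mask helpers

# causal part: position i may only attend to positions <= i
causal = torch.triu(torch.full((tgt_len, tgt_len), neg), diagonal=1)[None, None]

# padding part: masked positions (0 in attention_mask) become large negative values
padding = (1.0 - attention_mask[:, None, None, :].float()) * neg

combined = causal + padding  # additive combination, as in the method above
print(combined[0, 0])
```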
531
+ def forward(
532
+ self,
533
+ input_ids: torch.LongTensor = None,
534
+ attention_mask: Optional[torch.Tensor] = None,
535
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
536
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
537
+ head_mask: Optional[torch.Tensor] = None,
538
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
539
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
540
+ inputs_embeds: Optional[torch.FloatTensor] = None,
541
+ use_cache: Optional[bool] = None,
542
+ output_attentions: Optional[bool] = None,
543
+ output_hidden_states: Optional[bool] = None,
544
+ return_dict: Optional[bool] = None,
545
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
546
+ r"""
547
+ Args:
548
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
549
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
550
+ provide it.
551
+ Indices can be obtained using [`DeltalmTokenizer`]. See [`PreTrainedTokenizer.encode`] and
552
+ [`PreTrainedTokenizer.__call__`] for details.
553
+ [What are input IDs?](../glossary#input-ids)
554
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
555
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
556
+ - 1 for tokens that are **not masked**,
557
+ - 0 for tokens that are **masked**.
558
+ [What are attention masks?](../glossary#attention-mask)
559
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
560
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
561
+ of the decoder.
562
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
563
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
564
+ selected in `[0, 1]`:
565
+ - 1 for tokens that are **not masked**,
566
+ - 0 for tokens that are **masked**.
567
+ [What are attention masks?](../glossary#attention-mask)
568
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
569
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
570
+ - 1 indicates the head is **not masked**,
571
+ - 0 indicates the head is **masked**.
572
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
573
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
574
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
575
+ - 1 indicates the head is **not masked**,
576
+ - 0 indicates the head is **masked**.
577
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
578
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
579
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
580
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
581
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
582
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
583
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
584
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
585
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
586
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
587
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
588
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
589
+ than the model's internal embedding lookup matrix.
590
+ output_attentions (`bool`, *optional*):
591
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
592
+ returned tensors for more detail.
593
+ output_hidden_states (`bool`, *optional*):
594
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
595
+ for more detail.
596
+ return_dict (`bool`, *optional*):
597
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
598
+ """
599
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
600
+ output_hidden_states = (
601
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
602
+ )
603
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
604
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
605
+
606
+ # retrieve input_ids and inputs_embeds
607
+ if input_ids is not None and inputs_embeds is not None:
608
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
609
+ elif input_ids is not None:
610
+ input_shape = input_ids.size()
611
+ input_ids = input_ids.view(-1, input_shape[-1])
612
+ elif inputs_embeds is not None:
613
+ input_shape = inputs_embeds.size()[:-1]
614
+ else:
615
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
616
+
617
+ # past_key_values_length
618
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
619
+
620
+ if inputs_embeds is None:
621
+ inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
622
+
623
+ attention_mask = self._prepare_decoder_attention_mask(
624
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
625
+ )
626
+
627
+ # expand encoder attention mask
628
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
629
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
630
+ encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
631
+
632
+ # embed positions
633
+ positions = self.embed_positions(input_shape, past_key_values_length)
634
+
635
+ hidden_states = inputs_embeds + positions
636
+ hidden_states = self.layernorm_embedding(hidden_states)
637
+
638
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
639
+
640
+ # decoder layers
641
+ all_hidden_states = () if output_hidden_states else None
642
+ all_self_attns = () if output_attentions else None
643
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
644
+ next_decoder_cache = () if use_cache else None
645
+
646
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
647
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
648
+ if attn_mask is not None:
649
+ if attn_mask.size()[0] != (len(self.layers)):
650
+ raise ValueError(
651
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
652
+ f" {head_mask.size()[0]}."
653
+ )
654
+
655
+ for idx, decoder_layer in enumerate(self.layers):
656
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
657
+ if output_hidden_states:
658
+ all_hidden_states += (hidden_states,)
659
+ dropout_probability = random.uniform(0, 1)
660
+ if self.training and (dropout_probability < self.layerdrop):
661
+ continue
662
+
663
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
664
+
665
+ if self.gradient_checkpointing and self.training:
666
+
667
+ if use_cache:
668
+ logger.warning(
669
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
670
+ )
671
+ use_cache = False
672
+
673
+ def create_custom_forward(module):
674
+ def custom_forward(*inputs):
675
+ # None for past_key_value
676
+ return module(*inputs, output_attentions, use_cache)
677
+
678
+ return custom_forward
679
+
680
+ layer_outputs = torch.utils.checkpoint.checkpoint(
681
+ create_custom_forward(decoder_layer),
682
+ hidden_states,
683
+ attention_mask,
684
+ encoder_hidden_states,
685
+ encoder_attention_mask,
686
+ head_mask[idx] if head_mask is not None else None,
687
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
688
+ None,
689
+ )
690
+ else:
691
+
692
+ layer_outputs = decoder_layer(
693
+ hidden_states,
694
+ attention_mask=attention_mask,
695
+ encoder_hidden_states=encoder_hidden_states,
696
+ encoder_attention_mask=encoder_attention_mask,
697
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
698
+ cross_attn_layer_head_mask=(
699
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
700
+ ),
701
+ past_key_value=past_key_value,
702
+ output_attentions=output_attentions,
703
+ use_cache=use_cache,
704
+ )
705
+ hidden_states = layer_outputs[0]
706
+
707
+ if use_cache:
708
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
709
+
710
+ if output_attentions:
711
+ all_self_attns += (layer_outputs[1],)
712
+
713
+ if encoder_hidden_states is not None:
714
+ all_cross_attentions += (layer_outputs[2],)
715
+
716
+ # add hidden states from the last decoder layer
717
+ if output_hidden_states:
718
+ all_hidden_states += (hidden_states,)
719
+
720
+ next_cache = next_decoder_cache if use_cache else None
721
+ if not return_dict:
722
+ return tuple(
723
+ v
724
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
725
+ if v is not None
726
+ )
727
+ return BaseModelOutputWithPastAndCrossAttentions(
728
+ last_hidden_state=hidden_states,
729
+ past_key_values=next_cache,
730
+ hidden_states=all_hidden_states,
731
+ attentions=all_self_attns,
732
+ cross_attentions=all_cross_attentions,
733
+ )
734
+
735
+
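The `random.uniform` check inside the layer loop implements LayerDrop: during training, whole decoder layers are skipped with probability `config.decoder_layerdrop`. A minimal standalone sketch of that sampling logic, with a made-up drop probability:

```python
import random

layerdrop = 0.1   # illustrative probability; the model reads config.decoder_layerdrop
training = True

kept = []
for idx in range(6):                        # pretend there are 6 decoder layers
    if training and random.uniform(0, 1) < layerdrop:
        continue                            # skip this layer entirely for this batch
    kept.append(idx)
print(kept)
```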
736
+ DELTALM_START_DOCSTRING = r"""
737
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
738
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
739
+ etc.)
740
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
741
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
742
+ and behavior.
743
+ Parameters:
744
+ config ([`DeltalmConfig`]):
745
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
746
+ load the weights associated with the model, only the configuration. Check out the
747
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
748
+ """
749
+
750
+ DELTALM_GENERATION_EXAMPLE = r"""
751
+ Summarization example:
752
+ ```python
753
+ >>> from transformers import DeltalmTokenizer, DeltalmForConditionalGeneration
754
+ >>> model = DeltalmForConditionalGeneration.from_pretrained("facebook/deltalm-large-cnn")
755
+ >>> tokenizer = DeltalmTokenizer.from_pretrained("facebook/deltalm-large-cnn")
756
+ >>> ARTICLE_TO_SUMMARIZE = (
757
+ ... "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
758
+ ... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
759
+ ... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
760
+ ... )
761
+ >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
762
+ >>> # Generate Summary
763
+ >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
764
+ >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
765
+ 'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
766
+ ```
767
+ Mask filling example:
768
+ ```python
769
+ >>> from transformers import DeltalmTokenizer, DeltalmForConditionalGeneration
770
+ >>> tokenizer = DeltalmTokenizer.from_pretrained("facebook/deltalm-base")
771
+ >>> model = DeltalmForConditionalGeneration.from_pretrained("facebook/deltalm-base")
772
+ >>> TXT = "My friends are <mask> but they eat too many carbs."
773
+ >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
774
+ >>> logits = model(input_ids).logits
775
+ >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
776
+ >>> probs = logits[0, masked_index].softmax(dim=0)
777
+ >>> values, predictions = probs.topk(5)
778
+ >>> tokenizer.decode(predictions).split()
779
+ ['not', 'good', 'healthy', 'great', 'very']
780
+ ```
781
+ """
782
+
783
+ DELTALM_INPUTS_DOCSTRING = r"""
784
+ Args:
785
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
786
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
787
+ it.
788
+ Indices can be obtained using [`DeltalmTokenizer`]. See [`PreTrainedTokenizer.encode`] and
789
+ [`PreTrainedTokenizer.__call__`] for details.
790
+ [What are input IDs?](../glossary#input-ids)
791
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
792
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
793
+ - 1 for tokens that are **not masked**,
794
+ - 0 for tokens that are **masked**.
795
+ [What are attention masks?](../glossary#attention-mask)
796
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
797
+ Indices of decoder input sequence tokens in the vocabulary.
798
+ Indices can be obtained using [`DeltalmTokenizer`]. See [`PreTrainedTokenizer.encode`] and
799
+ [`PreTrainedTokenizer.__call__`] for details.
800
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
801
+ Deltalm uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
802
+ is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
803
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
804
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
805
+ for denoising pre-training following the paper.
806
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
807
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
808
+ be used by default.
809
+ If you want to change padding behavior, you should read [`modeling_deltalm._prepare_decoder_attention_mask`]
810
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
811
+ information on the default strategy.
812
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
813
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
814
+ - 1 indicates the head is **not masked**,
815
+ - 0 indicates the head is **masked**.
816
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
817
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
818
+ - 1 indicates the head is **not masked**,
819
+ - 0 indicates the head is **masked**.
820
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
821
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
822
+ 1]`:
823
+ - 1 indicates the head is **not masked**,
824
+ - 0 indicates the head is **masked**.
825
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
826
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
827
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
828
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
829
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
830
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
831
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
832
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
833
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
834
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
835
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
836
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
837
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
838
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
839
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful
840
+ if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
841
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
842
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
843
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
844
+ input (see `past_key_values`). This is useful if you want more control over how to convert
845
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
846
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
847
+ of `inputs_embeds`.
848
+ use_cache (`bool`, *optional*):
849
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
850
+ `past_key_values`).
851
+ output_attentions (`bool`, *optional*):
852
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
853
+ tensors for more detail.
854
+ output_hidden_states (`bool`, *optional*):
855
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
856
+ more detail.
857
+ return_dict (`bool`, *optional*):
858
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
859
+ """
860
+
861
+
862
+ class DeltalmEncoder(DeltalmPretrainedModel):
863
+ """
864
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
865
+ [`DeltalmEncoderLayer`].
866
+ Args:
867
+ config: DeltalmConfig
868
+ embed_tokens (nn.Embedding): output embedding
869
+ """
870
+
871
+ def __init__(self, config: DeltalmConfig, embed_tokens: Optional[nn.Embedding] = None):
872
+ super().__init__(config)
873
+
874
+ self.dropout = config.dropout
875
+ self.layerdrop = config.encoder_layerdrop
876
+
877
+ embed_dim = config.d_model
878
+ self.padding_idx = config.pad_token_id
879
+ self.max_source_positions = config.max_position_embeddings
880
+ self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
881
+
882
+ if embed_tokens is not None:
883
+ self.embed_tokens = embed_tokens
884
+ else:
885
+ self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
886
+
887
+ self.embed_positions = DeltalmLearnedPositionalEmbedding(
888
+ config.max_position_embeddings,
889
+ embed_dim,
890
+ )
891
+ self.layers = nn.ModuleList([DeltalmEncoderLayer(config) for _ in range(config.encoder_layers)])
892
+ self.layernorm_embedding = nn.LayerNorm(embed_dim)
893
+
894
+ self.gradient_checkpointing = False
895
+ if config.encoder_normalize_before:
896
+ self.layer_norm = nn.LayerNorm(embed_dim)
897
+ else:
898
+ self.layer_norm = None
899
+ # Initialize weights and apply final processing
900
+ self.post_init()
901
+
902
+ def get_input_embeddings(self):
903
+ return self.embed_tokens
904
+
905
+ def set_input_embeddings(self, value):
906
+ self.embed_tokens = value
907
+
908
+ def forward(
909
+ self,
910
+ input_ids: torch.LongTensor = None,
911
+ attention_mask: Optional[torch.Tensor] = None,
912
+ head_mask: Optional[torch.Tensor] = None,
913
+ inputs_embeds: Optional[torch.FloatTensor] = None,
914
+ output_attentions: Optional[bool] = None,
915
+ output_hidden_states: Optional[bool] = None,
916
+ return_dict: Optional[bool] = None,
917
+ ) -> Union[Tuple, BaseModelOutput]:
918
+ r"""
919
+ Args:
920
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
921
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
922
+ provide it.
923
+ Indices can be obtained using [`DeltalmTokenizer`]. See [`PreTrainedTokenizer.encode`] and
924
+ [`PreTrainedTokenizer.__call__`] for details.
925
+ [What are input IDs?](../glossary#input-ids)
926
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
927
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
928
+ - 1 for tokens that are **not masked**,
929
+ - 0 for tokens that are **masked**.
930
+ [What are attention masks?](../glossary#attention-mask)
931
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
932
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
933
+ - 1 indicates the head is **not masked**,
934
+ - 0 indicates the head is **masked**.
935
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
936
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
937
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
938
+ than the model's internal embedding lookup matrix.
939
+ output_attentions (`bool`, *optional*):
940
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
941
+ returned tensors for more detail.
942
+ output_hidden_states (`bool`, *optional*):
943
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
944
+ for more detail.
945
+ return_dict (`bool`, *optional*):
946
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
947
+ """
948
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
949
+ output_hidden_states = (
950
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
951
+ )
952
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
953
+
954
+ # retrieve input_ids and inputs_embeds
955
+ if input_ids is not None and inputs_embeds is not None:
956
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
957
+ elif input_ids is not None:
958
+ input_shape = input_ids.size()
959
+ input_ids = input_ids.view(-1, input_shape[-1])
960
+ elif inputs_embeds is not None:
961
+ input_shape = inputs_embeds.size()[:-1]
962
+ else:
963
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
964
+
965
+ if inputs_embeds is None:
966
+ inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
967
+
968
+ embed_pos = self.embed_positions(input_shape)
969
+
970
+ hidden_states = inputs_embeds + embed_pos
971
+ hidden_states = self.layernorm_embedding(hidden_states)
972
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
973
+
974
+ # expand attention_mask
975
+ if attention_mask is not None:
976
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
977
+ attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
978
+
979
+ encoder_states = () if output_hidden_states else None
980
+ all_attentions = () if output_attentions else None
981
+
982
+ # check if head_mask has a correct number of layers specified if desired
983
+ if head_mask is not None:
984
+ if head_mask.size()[0] != (len(self.layers)):
985
+ raise ValueError(
986
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
987
+ f" {head_mask.size()[0]}."
988
+ )
989
+
990
+ for idx, encoder_layer in enumerate(self.layers):
991
+ if output_hidden_states:
992
+ encoder_states = encoder_states + (hidden_states,)
993
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
994
+ dropout_probability = random.uniform(0, 1)
995
+ if self.training and (dropout_probability < self.layerdrop): # skip the layer
996
+ layer_outputs = (None, None)
997
+ else:
998
+ if self.gradient_checkpointing and self.training:
999
+
1000
+ def create_custom_forward(module):
1001
+ def custom_forward(*inputs):
1002
+ return module(*inputs, output_attentions)
1003
+
1004
+ return custom_forward
1005
+
1006
+ layer_outputs = torch.utils.checkpoint.checkpoint(
1007
+ create_custom_forward(encoder_layer),
1008
+ hidden_states,
1009
+ attention_mask,
1010
+ (head_mask[idx] if head_mask is not None else None),
1011
+ )
1012
+ else:
1013
+ layer_outputs = encoder_layer(
1014
+ hidden_states,
1015
+ attention_mask,
1016
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
1017
+ output_attentions=output_attentions,
1018
+ )
1019
+
1020
+ hidden_states = layer_outputs[0]
1021
+
1022
+ if output_attentions:
1023
+ all_attentions = all_attentions + (layer_outputs[1],)
1024
+
1025
+ if self.layer_norm is not None:
1026
+ hidden_states = self.layer_norm(hidden_states)
1027
+ # hidden_states = self.layernorm_embedding(hidden_states)
1028
+
1029
+ if output_hidden_states:
1030
+ encoder_states = encoder_states + (hidden_states,)
1031
+
1032
+ if not return_dict:
1033
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
1034
+ return BaseModelOutput(
1035
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
1036
+ )
1037
+
1038
+
1039
+ class DeltalmModel(DeltalmPretrainedModel):
1040
+ def __init__(self, config: DeltalmConfig):
1041
+ super().__init__(config)
1042
+
1043
+ padding_idx, vocab_size = config.pad_token_id, config.vocab_size
1044
+ self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
1045
+
1046
+ self.encoder = DeltalmEncoder(config, self.shared)
1047
+ self.decoder = DeltalmDecoder(config, self.shared)
1048
+
1049
+ # Initialize weights and apply final processing
1050
+ self.post_init()
1051
+
1052
+ def get_input_embeddings(self):
1053
+ return self.shared
1054
+
1055
+ def set_input_embeddings(self, value):
1056
+ self.shared = value
1057
+ self.encoder.embed_tokens = self.shared
1058
+ self.decoder.embed_tokens = self.shared
1059
+
1060
+ def get_encoder(self):
1061
+ return self.encoder
1062
+
1063
+ def get_decoder(self):
1064
+ return self.decoder
1065
+
1066
+ @add_start_docstrings_to_model_forward(DELTALM_INPUTS_DOCSTRING)
1067
+ # @add_code_sample_docstrings(
1068
+ # processor_class=_TOKENIZER_FOR_DOC,
1069
+ # checkpoint=_CHECKPOINT_FOR_DOC,
1070
+ # output_type=Seq2SeqModelOutput,
1071
+ # config_class=_CONFIG_FOR_DOC,
1072
+ # expected_output=_EXPECTED_OUTPUT_SHAPE,
1073
+ # )
1074
+ def forward(
1075
+ self,
1076
+ input_ids: torch.LongTensor = None,
1077
+ attention_mask: Optional[torch.Tensor] = None,
1078
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1079
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1080
+ head_mask: Optional[torch.Tensor] = None,
1081
+ decoder_head_mask: Optional[torch.Tensor] = None,
1082
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1083
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
1084
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1085
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1086
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1087
+ use_cache: Optional[bool] = None,
1088
+ output_attentions: Optional[bool] = None,
1089
+ output_hidden_states: Optional[bool] = None,
1090
+ return_dict: Optional[bool] = None,
1091
+ ) -> Union[Tuple, Seq2SeqModelOutput]:
1092
+
1093
+ # unlike other models, Deltalm automatically creates decoder_input_ids from
1094
+ # input_ids if no decoder_input_ids are provided
1095
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1096
+ if input_ids is None:
1097
+ raise ValueError(
1098
+ "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
1099
+ "passed, `input_ids` cannot be `None`. Please pass either "
1100
+ "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
1101
+ )
1102
+
1103
+ decoder_input_ids = shift_tokens_right(
1104
+ input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
1105
+ )
1106
+
1107
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1108
+ output_hidden_states = (
1109
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1110
+ )
1111
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1112
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1113
+
1114
+ if encoder_outputs is None:
1115
+ encoder_outputs = self.encoder(
1116
+ input_ids=input_ids,
1117
+ attention_mask=attention_mask,
1118
+ head_mask=head_mask,
1119
+ inputs_embeds=inputs_embeds,
1120
+ output_attentions=output_attentions,
1121
+ output_hidden_states=output_hidden_states,
1122
+ return_dict=return_dict,
1123
+ )
1124
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
1125
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
1126
+ encoder_outputs = BaseModelOutput(
1127
+ last_hidden_state=encoder_outputs[0],
1128
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1129
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1130
+ )
1131
+
1132
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
1133
+ decoder_outputs = self.decoder(
1134
+ input_ids=decoder_input_ids,
1135
+ attention_mask=decoder_attention_mask,
1136
+ encoder_hidden_states=encoder_outputs[0],
1137
+ encoder_attention_mask=attention_mask,
1138
+ head_mask=decoder_head_mask,
1139
+ cross_attn_head_mask=cross_attn_head_mask,
1140
+ past_key_values=past_key_values,
1141
+ inputs_embeds=decoder_inputs_embeds,
1142
+ use_cache=use_cache,
1143
+ output_attentions=output_attentions,
1144
+ output_hidden_states=output_hidden_states,
1145
+ return_dict=return_dict,
1146
+ )
1147
+
1148
+ if not return_dict:
1149
+ return decoder_outputs + encoder_outputs
1150
+
1151
+ logger.debug("last_hidden_state.size: %s", decoder_outputs.last_hidden_state)
1152
+ return Seq2SeqModelOutput(
1153
+ last_hidden_state=decoder_outputs.last_hidden_state,
1154
+ past_key_values=decoder_outputs.past_key_values,
1155
+ decoder_hidden_states=decoder_outputs.hidden_states,
1156
+ decoder_attentions=decoder_outputs.attentions,
1157
+ cross_attentions=decoder_outputs.cross_attentions,
1158
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1159
+ encoder_hidden_states=encoder_outputs.hidden_states,
1160
+ encoder_attentions=encoder_outputs.attentions,
1161
+ )
1162
+
1163
+
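A hypothetical end-to-end shape check for `DeltalmModel`. It assumes `DeltalmConfig` (from the accompanying configuration file) accepts the BART-style fields read throughout this file and provides sensible defaults for the rest; the tiny sizes are placeholders:

```python
import torch

# DeltalmConfig / DeltalmModel are the classes defined in this commit; the keyword arguments
# below are assumptions based on the config fields used throughout this file.
config = DeltalmConfig(
    vocab_size=128, d_model=16,
    encoder_layers=2, decoder_layers=2,
    encoder_attention_heads=2, decoder_attention_heads=2,
    encoder_ffn_dim=32, decoder_ffn_dim=32,
    max_position_embeddings=64,
)
model = DeltalmModel(config)

input_ids = torch.randint(0, 128, (2, 7))
decoder_input_ids = torch.randint(0, 128, (2, 5))
out = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
print(out.last_hidden_state.shape)  # expected: torch.Size([2, 5, 16])
```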
1164
+ @add_start_docstrings(
1165
+ "The DELTALM Model with a language modeling head. Can be used for translation.", DELTALM_START_DOCSTRING
1166
+ )
1167
+ class DeltalmForConditionalGeneration(DeltalmPretrainedModel):
1168
+ base_model_prefix = "model"
1169
+ _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"]
1170
+
1171
+ def __init__(self, config: DeltalmConfig):
1172
+ super().__init__(config)
1173
+ self.model = DeltalmModel(config)
1174
+ self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
1175
+ self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
1176
+
1177
+ # Initialize weights and apply final processing
1178
+ self.post_init()
1179
+
1180
+ def get_encoder(self):
1181
+ return self.model.get_encoder()
1182
+
1183
+ def get_decoder(self):
1184
+ return self.model.get_decoder()
1185
+
1186
+ def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
1187
+ new_embeddings = super().resize_token_embeddings(new_num_tokens)
1188
+ self._resize_final_logits_bias(new_num_tokens)
1189
+ return new_embeddings
1190
+
1191
+ def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
1192
+ logger.debug("Debug: coming to _resize_final_logits_bias")
1193
+ old_num_tokens = self.final_logits_bias.shape[-1]
1194
+ if new_num_tokens <= old_num_tokens:
1195
+ new_bias = self.final_logits_bias[:, :new_num_tokens]
1196
+ else:
1197
+ extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
1198
+ new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
1199
+ self.register_buffer("final_logits_bias", new_bias)
1200
+
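`_resize_final_logits_bias` either truncates or zero-pads the `(1, vocab_size)` bias buffer. A standalone sketch of the same pad-or-truncate step on a plain tensor:

```python
import torch

old_bias = torch.randn(1, 6)

def resize_bias(bias, new_num_tokens):
    old_num_tokens = bias.shape[-1]
    if new_num_tokens <= old_num_tokens:
        return bias[:, :new_num_tokens]                      # truncate
    extra = torch.zeros((1, new_num_tokens - old_num_tokens), device=bias.device)
    return torch.cat([bias, extra], dim=1)                   # zero-pad

assert resize_bias(old_bias, 4).shape == (1, 4)
assert resize_bias(old_bias, 9).shape == (1, 9)
```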
1201
+ def get_output_embeddings(self):
1202
+ return self.lm_head
1203
+
1204
+ def set_output_embeddings(self, new_embeddings):
1205
+ self.lm_head = new_embeddings
1206
+
1207
+ @add_start_docstrings_to_model_forward(DELTALM_INPUTS_DOCSTRING)
1208
+ @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
1209
+ @add_end_docstrings(DELTALM_GENERATION_EXAMPLE)
1210
+ def forward(
1211
+ self,
1212
+ input_ids: torch.LongTensor = None,
1213
+ attention_mask: Optional[torch.Tensor] = None,
1214
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1215
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1216
+ head_mask: Optional[torch.Tensor] = None,
1217
+ decoder_head_mask: Optional[torch.Tensor] = None,
1218
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1219
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
1220
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1221
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1222
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1223
+ labels: Optional[torch.LongTensor] = None,
1224
+ use_cache: Optional[bool] = None,
1225
+ output_attentions: Optional[bool] = None,
1226
+ output_hidden_states: Optional[bool] = None,
1227
+ return_dict: Optional[bool] = None,
1228
+ ) -> Union[Tuple, Seq2SeqLMOutput]:
1229
+ r"""
1230
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1231
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1232
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1233
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1234
+ Returns:
1235
+ """
1236
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1237
+
1238
+ logger.debug("Comming to Generation!")
1239
+
1240
+ if labels is not None:
1241
+ logger.debug("Debug: *************** Before label ***************** ")
1242
+ logger.debug("Debug: %s", labels.size())
1243
+ if use_cache:
1244
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
1245
+ use_cache = False
1246
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1247
+ decoder_input_ids = shift_tokens_right(
1248
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
1249
+ )
1250
+
1251
+ logger.debug("Debug: ************ After labels ************")
1252
+ logger.debug("Debug: %s", labels.size())
1253
+
1254
+ outputs = self.model(
1255
+ input_ids,
1256
+ attention_mask=attention_mask,
1257
+ decoder_input_ids=decoder_input_ids,
1258
+ encoder_outputs=encoder_outputs,
1259
+ decoder_attention_mask=decoder_attention_mask,
1260
+ head_mask=head_mask,
1261
+ decoder_head_mask=decoder_head_mask,
1262
+ cross_attn_head_mask=cross_attn_head_mask,
1263
+ past_key_values=past_key_values,
1264
+ inputs_embeds=inputs_embeds,
1265
+ decoder_inputs_embeds=decoder_inputs_embeds,
1266
+ use_cache=use_cache,
1267
+ output_attentions=output_attentions,
1268
+ output_hidden_states=output_hidden_states,
1269
+ return_dict=return_dict,
1270
+ )
1271
+ lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
1272
+ # print(self.lm_head)
1273
+ logger.debug("Debug: logit_size: %s", lm_logits.size())
1274
+
1275
+ # logger.debug("Debug: change logit size: ", lm_logits.view(-1, self.config.vocab_size).size())
1276
+ # logger.debug("Debug: change label size: ", labels.view(-1).size())
1277
+ masked_lm_loss = None
1278
+
1279
+ if labels is not None:
1280
+ # logger.debug("Debug: model label_size: %s", labels.size())
1281
+ # loss_fct = CrossEntropyLoss()
1282
+ # masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1283
+ loss_fct = CrossEntropyLoss()
1284
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1285
+ # label_smoothing = self.config.label_smoothing
1286
+ # # logger.debug("Debug: label.size: ", )
1287
+ # if label_smoothing == 0:
1288
+ # # compute label smoothed loss
1289
+ # loss_fct = CrossEntropyLoss()
1290
+ # masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1291
+ # else:
1292
+ # m = torch.nn.LogSoftmax(dim=-1)
1293
+ # lprobs = m(lm_logits.float())
1294
+ # # lprobs = m(lm_logits)
1295
+ # # # torch.set_printoptions(linewidth=200)
1296
+ # loss_fn = label_smoothed_nll_loss
1297
+ # masked_lm_loss, _ = loss_fn(lprobs.view(-1, lprobs.size(-1)), labels.view(-1), label_smoothing, self.config.pad_token_id)
1298
+
1299
+ if not return_dict:
1300
+ logger.debug("Debug: not return dict")
1301
+ output = (lm_logits,) + outputs[1:]
1302
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1303
+
1304
+ return Seq2SeqLMOutput(
1305
+ loss=masked_lm_loss,
1306
+ logits=lm_logits,
1307
+ past_key_values=outputs.past_key_values,
1308
+ decoder_hidden_states=outputs.decoder_hidden_states,
1309
+ decoder_attentions=outputs.decoder_attentions,
1310
+ cross_attentions=outputs.cross_attentions,
1311
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1312
+ encoder_hidden_states=outputs.encoder_hidden_states,
1313
+ encoder_attentions=outputs.encoder_attentions,
1314
+ )
1315
+
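The commented-out branch above refers to a `label_smoothed_nll_loss` helper. For reference, here is one common, fairseq-style formulation of label-smoothed NLL over log-probabilities; it is a sketch and not necessarily identical to the helper the comment refers to:

```python
import torch
import torch.nn.functional as F

def smoothed_nll(lprobs, target, epsilon, ignore_index):
    """One common label-smoothing formulation (fairseq-style); illustrative only."""
    target = target.unsqueeze(-1)
    nll_loss = -lprobs.gather(dim=-1, index=target)
    smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
    pad_mask = target.eq(ignore_index)
    nll_loss = nll_loss.masked_fill(pad_mask, 0.0)
    smooth_loss = smooth_loss.masked_fill(pad_mask, 0.0)
    eps_i = epsilon / lprobs.size(-1)
    return (1.0 - epsilon) * nll_loss.sum() + eps_i * smooth_loss.sum()

logits = torch.randn(4, 10)            # (tokens, vocab)
labels = torch.tensor([1, 3, 0, 9])    # 0 stands in for the pad id here
loss = smoothed_nll(F.log_softmax(logits, dim=-1), labels, epsilon=0.1, ignore_index=0)
```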
1316
+ def prepare_inputs_for_generation(
1317
+ self,
1318
+ decoder_input_ids,
1319
+ past=None,
1320
+ attention_mask=None,
1321
+ head_mask=None,
1322
+ decoder_head_mask=None,
1323
+ cross_attn_head_mask=None,
1324
+ use_cache=None,
1325
+ encoder_outputs=None,
1326
+ **kwargs
1327
+ ):
1328
+ # cut decoder_input_ids if past is used
1329
+ if past is not None:
1330
+ decoder_input_ids = decoder_input_ids[:, -1:]
1331
+
1332
+ return {
1333
+ "input_ids": None, # encoder_outputs is defined. input_ids not needed
1334
+ "encoder_outputs": encoder_outputs,
1335
+ "past_key_values": past,
1336
+ "decoder_input_ids": decoder_input_ids,
1337
+ "attention_mask": attention_mask,
1338
+ "head_mask": head_mask,
1339
+ "decoder_head_mask": decoder_head_mask,
1340
+ "cross_attn_head_mask": cross_attn_head_mask,
1341
+ "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
1342
+ }
1343
+
1344
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
1345
+ return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
1346
+
1347
+ @staticmethod
1348
+ def _reorder_cache(past, beam_idx):
1349
+ reordered_past = ()
1350
+ for layer_past in past:
1351
+ # cached cross_attention states don't have to be reordered -> they are always the same
1352
+ reordered_past += (
1353
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
1354
+ )
1355
+ return reordered_past
1356
+
1357
+
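`_reorder_cache` permutes the batch dimension of every cached self-attention tensor so it tracks the beams selected at each beam-search step. A minimal sketch of that `index_select` operation on a dummy cache entry:

```python
import torch

# one layer's cached keys: (batch * beams, num_heads, seq_len, head_dim)
past_keys = torch.arange(4).float().view(4, 1, 1, 1).expand(4, 2, 3, 5)
beam_idx = torch.tensor([2, 2, 0, 1])  # beams chosen at this step

reordered = past_keys.index_select(0, beam_idx)
assert torch.equal(reordered[0], past_keys[2])
```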
1358
+ class DeltalmDecoderWrapper(DeltalmPretrainedModel):
1359
+ """
1360
+ This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
1361
+ used in combination with the [`EncoderDecoderModel`] framework.
1362
+ """
1363
+
1364
+ def __init__(self, config):
1365
+ super().__init__(config)
1366
+ self.decoder = DeltalmDecoder(config)
1367
+
1368
+ def forward(self, *args, **kwargs):
1369
+ return self.decoder(*args, **kwargs)
1370
+
1371
+
1372
+ class DeltalmForCausalLM(DeltalmPretrainedModel):
1373
+ def __init__(self, config):
1374
+ config = copy.deepcopy(config)
1375
+ config.is_decoder = True
1376
+ config.is_encoder_decoder = False
1377
+ super().__init__(config)
1378
+ self.model = DeltalmDecoderWrapper(config)
1379
+
1380
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1381
+
1382
+ # Initialize weights and apply final processing
1383
+ self.post_init()
1384
+
1385
+ def get_input_embeddings(self):
1386
+ return self.model.decoder.embed_tokens
1387
+
1388
+ def set_input_embeddings(self, value):
1389
+ self.model.decoder.embed_tokens = value
1390
+
1391
+ def get_output_embeddings(self):
1392
+ return self.lm_head
1393
+
1394
+ def set_output_embeddings(self, new_embeddings):
1395
+ self.lm_head = new_embeddings
1396
+
1397
+ def set_decoder(self, decoder):
1398
+ self.model.decoder = decoder
1399
+
1400
+ def get_decoder(self):
1401
+ return self.model.decoder
1402
+
1403
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
1404
+ def forward(
1405
+ self,
1406
+ input_ids: torch.LongTensor = None,
1407
+ attention_mask: Optional[torch.Tensor] = None,
1408
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1409
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1410
+ head_mask: Optional[torch.Tensor] = None,
1411
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1412
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1413
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1414
+ labels: Optional[torch.LongTensor] = None,
1415
+ use_cache: Optional[bool] = None,
1416
+ output_attentions: Optional[bool] = None,
1417
+ output_hidden_states: Optional[bool] = None,
1418
+ return_dict: Optional[bool] = None,
1419
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
1420
+ r"""
1421
+ Args:
1422
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1423
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
1424
+ provide it.
1425
+ Indices can be obtained using [`DeltalmTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1426
+ [`PreTrainedTokenizer.__call__`] for details.
1427
+ [What are input IDs?](../glossary#input-ids)
1428
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1429
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1430
+ - 1 for tokens that are **not masked**,
1431
+ - 0 for tokens that are **masked**.
1432
+ [What are attention masks?](../glossary#attention-mask)
1433
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1434
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
1435
+ if the model is configured as a decoder.
1436
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1437
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
1438
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
1439
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1440
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
1441
+ - 1 indicates the head is **not masked**,
1442
+ - 0 indicates the head is **masked**.
1443
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1444
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
1445
+ - 1 indicates the head is **not masked**,
1446
+ - 0 indicates the head is **masked**.
1447
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1448
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1449
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
1450
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
1451
+ tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
1452
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
1453
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
1454
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
1455
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
1456
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1457
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1458
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1459
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1460
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1461
+ use_cache (`bool`, *optional*):
1462
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1463
+ (see `past_key_values`).
1466
+ output_attentions (`bool`, *optional*):
1467
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1468
+ returned tensors for more detail.
1469
+ output_hidden_states (`bool`, *optional*):
1470
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1471
+ for more detail.
1472
+ return_dict (`bool`, *optional*):
1473
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1474
+ Returns:
1475
+ Example:
1476
+ ```python
1477
+ >>> from transformers import DeltalmTokenizer, DeltalmForCausalLM
1478
+ >>> tokenizer = DeltalmTokenizer.from_pretrained("facebook/deltalm-base")
1479
+ >>> model = DeltalmForCausalLM.from_pretrained("facebook/deltalm-base", add_cross_attention=False)
1480
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
1481
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1482
+ >>> outputs = model(**inputs)
1483
+ >>> logits = outputs.logits
1484
+ >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
1485
+ >>> list(logits.shape) == expected_shape
1486
+ True
1487
+ ```"""
1488
+
1489
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1490
+ output_hidden_states = (
1491
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1492
+ )
1493
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1494
+
1495
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1496
+ outputs = self.model.decoder(
1497
+ input_ids=input_ids,
1498
+ attention_mask=attention_mask,
1499
+ encoder_hidden_states=encoder_hidden_states,
1500
+ encoder_attention_mask=encoder_attention_mask,
1501
+ head_mask=head_mask,
1502
+ cross_attn_head_mask=cross_attn_head_mask,
1503
+ past_key_values=past_key_values,
1504
+ inputs_embeds=inputs_embeds,
1505
+ use_cache=use_cache,
1506
+ output_attentions=output_attentions,
1507
+ output_hidden_states=output_hidden_states,
1508
+ return_dict=return_dict,
1509
+ )
1510
+
1511
+ logits = self.lm_head(outputs[0])
1512
+
1513
+ loss = None
1514
+ if labels is not None:
1515
+ loss_fct = CrossEntropyLoss()
1516
+ loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
1517
+
1518
+ if not return_dict:
1519
+ output = (logits,) + outputs[1:]
1520
+ return (loss,) + output if loss is not None else output
1521
+
1522
+ return CausalLMOutputWithCrossAttentions(
1523
+ loss=loss,
1524
+ logits=logits,
1525
+ past_key_values=outputs.past_key_values,
1526
+ hidden_states=outputs.hidden_states,
1527
+ attentions=outputs.attentions,
1528
+ cross_attentions=outputs.cross_attentions,
1529
+ )
1530
+
1531
+ def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs):
1532
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
1533
+ if attention_mask is None:
1534
+ attention_mask = input_ids.new_ones(input_ids.shape)
1535
+
1536
+ if past:
1537
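+ # With cached key/values, only the most recent token has to be fed; earlier positions are already stored in `past`.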
+ input_ids = input_ids[:, -1:]
1538
+ # first step, decoder_cached_states are empty
1539
+ return {
1540
+ "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
1541
+ "attention_mask": attention_mask,
1542
+ "past_key_values": past,
1543
+ "use_cache": use_cache,
1544
+ }
1545
+
1546
+ @staticmethod
1547
+ def _reorder_cache(past, beam_idx):
1548
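+ # Reorder every cached key/value tensor along the batch dimension so the cache follows the beams kept at this step.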
+ reordered_past = ()
1549
+ for layer_past in past:
1550
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1551
+ return reordered_past
english/models/deltalm/tokenizer_deltalm.py ADDED
@@ -0,0 +1,323 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import re
6
+ import warnings
7
+ from shutil import copyfile
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ import sentencepiece as spm
11
+
12
+ from transformers.tokenization_utils import PreTrainedTokenizer
13
+ from transformers.utils import logging
14
+
15
+
16
+ SPIECE_UNDERLINE = "▁"
17
+
18
+ VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
19
+
20
+ PRETRAINED_VOCAB_FILES_MAP = {
21
+ "vocab_file": {"IDEA-CCNL/deltalm": "https://huggingface.co/IDEA-CCNL/Randeng-Deltalm-362M-En-Zn/resolve/main/spm.model"}
22
+ }
23
+
24
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
25
+ "IDEA-CCNL/deltalm": 512,
26
+ }
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+
32
+ class DeltalmTokenizer(PreTrainedTokenizer):
33
+ """
34
+ Construct a DeltaLM tokenizer (adapted from the T5 tokenizer). Based on [SentencePiece](https://github.com/google/sentencepiece).
35
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
36
+ this superclass for more information regarding those methods.
37
+ Args:
38
+ vocab_file (`str`):
39
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
40
+ contains the vocabulary necessary to instantiate a tokenizer.
41
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
42
+ The end of sequence token.
43
+ <Tip>
44
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
45
+ The token used is the `sep_token`.
46
+ </Tip>
47
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
48
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
49
+ token instead.
50
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
51
+ The token used for padding, for example when batching sequences of different lengths.
52
+ extra_ids (`int`, *optional*, defaults to 100):
53
+ Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
54
+ accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
55
+ indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
56
+ like in T5 preprocessing see
57
+ [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
58
+ additional_special_tokens (`List[str]`, *optional*):
59
+ Additional special tokens used by the tokenizer.
60
+ sp_model_kwargs (`dict`, *optional*):
61
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
62
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
63
+ to set:
64
+ - `enable_sampling`: Enable subword regularization.
65
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
66
+ - `nbest_size = {0,1}`: No sampling is performed.
67
+ - `nbest_size > 1`: samples from the nbest_size results.
68
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
69
+ using forward-filtering-and-backward-sampling algorithm.
70
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
71
+ BPE-dropout.
72
+ Attributes:
73
+ sp_model (`SentencePieceProcessor`):
74
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
75
+ """
76
+
77
+ vocab_files_names = VOCAB_FILES_NAMES
78
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
79
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
80
+ model_input_names = ["input_ids", "attention_mask"]
81
+
82
+ def __init__(
83
+ self,
84
+ vocab_file,
85
+ bos_token="<s>",
86
+ eos_token="</s>",
87
+ unk_token="<unk>",
88
+ pad_token="<pad>",
89
+ extra_ids=0,
90
+ additional_special_tokens=None,
91
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
92
+ **kwargs
93
+ ) -> None:
94
+ # Add extra_ids to the special token list
95
+ if extra_ids > 0 and additional_special_tokens is None:
96
+ additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
97
+ elif extra_ids > 0 and additional_special_tokens is not None:
98
+ # Check that we have the right number of extra_id special tokens
99
+ extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
100
+ if extra_tokens != extra_ids:
101
+ raise ValueError(
102
+ f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
103
+ " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
104
+ " tokens"
105
+ )
106
+
107
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
108
+ super().__init__(
109
+ bos_token=bos_token,
110
+ eos_token=eos_token,
111
+ unk_token=unk_token,
112
+ pad_token=pad_token,
113
+ additional_special_tokens=additional_special_tokens,
114
+ extra_ids=extra_ids,
115
+ sp_model_kwargs=self.sp_model_kwargs,
116
+ **kwargs,
117
+ )
118
+
119
+ self.vocab_file = vocab_file
120
+ self.offset = 1
121
+ self._extra_ids = extra_ids
122
+
123
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
124
+ self.sp_model.Load(vocab_file)
125
+
126
+ self.encoder: Dict[int, str] = {
127
+ 0: self.bos_token,
128
+ 1: self.pad_token,
129
+ 2: self.eos_token,
130
+ 3: self.unk_token,
131
+ }
132
+
133
+ self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
134
+
135
+ @staticmethod
136
+ def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
137
+ if pretrained_model_name_or_path in DeltalmTokenizer.max_model_input_sizes:
138
+ deprecated_max_model_length = DeltalmTokenizer.max_model_input_sizes[pretrained_model_name_or_path]
139
+ if init_max_model_length is not None and init_max_model_length != max_model_length:
140
+ return init_max_model_length
141
+ elif init_max_model_length is None:
142
+ warnings.warn(
143
+ "This tokenizer was incorrectly instantiated with a model max length of"
144
+ f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
145
+ " behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
146
+ " `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
147
+ f" {pretrained_model_name_or_path} automatically truncating your input to"
148
+ f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
149
+ f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
150
+ " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
151
+ " instantiate this tokenizer with `model_max_length` set to your preferred value.",
152
+ FutureWarning,
153
+ )
154
+
155
+ return max_model_length
156
+
157
+ @property
158
+ def vocab_size(self):
159
+ return self.sp_model.get_piece_size() # + self._extra_ids
160
+
161
+ def get_vocab(self):
162
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
163
+ vocab.update(self.added_tokens_encoder)
164
+ return vocab
165
+
166
+ def get_special_tokens_mask(
167
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
168
+ ) -> List[int]:
169
+ """
170
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
171
+ special tokens using the tokenizer `prepare_for_model` method.
172
+ Args:
173
+ token_ids_0 (`List[int]`):
174
+ List of IDs.
175
+ token_ids_1 (`List[int]`, *optional*):
176
+ Optional second list of IDs for sequence pairs.
177
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
178
+ Whether or not the token list is already formatted with special tokens for the model.
179
+ Returns:
180
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
181
+ """
182
+ if already_has_special_tokens:
183
+ return super().get_special_tokens_mask(
184
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
185
+ )
186
+
187
+ # normal case: some special tokens
188
+ if token_ids_1 is None:
189
+ return ([0] * len(token_ids_0)) + [1]
190
+ return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
191
+
192
+ def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
193
+ """Do not add eos again if user already added it."""
194
+ if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
195
+ warnings.warn(
196
+ f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
197
+ " eos tokens being added."
198
+ )
199
+ return token_ids
200
+ else:
201
+ return token_ids + [self.eos_token_id]
202
+
203
+ def create_token_type_ids_from_sequences(
204
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
205
+ ) -> List[int]:
206
+ """
207
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. DeltaLM does not make
208
+ use of token type ids, therefore a list of zeros is returned.
209
+ Args:
210
+ token_ids_0 (`List[int]`):
211
+ List of IDs.
212
+ token_ids_1 (`List[int]`, *optional*):
213
+ Optional second list of IDs for sequence pairs.
214
+ Returns:
215
+ `List[int]`: List of zeros.
216
+ """
217
+ eos = [self.eos_token_id]
218
+
219
+ if token_ids_1 is None:
220
+ return len(token_ids_0 + eos) * [0]
221
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
222
+
223
+ def build_inputs_with_special_tokens(
224
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
225
+ ) -> List[int]:
226
+ """
227
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
228
+ adding special tokens. A sequence has the following format:
229
+ - single sequence: `X </s>`
230
+ - pair of sequences: `A </s> B </s>`
231
+ Args:
232
+ token_ids_0 (`List[int]`):
233
+ List of IDs to which the special tokens will be added.
234
+ token_ids_1 (`List[int]`, *optional*):
235
+ Optional second list of IDs for sequence pairs.
236
+ Returns:
237
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
238
+ """
239
+ token_ids_0 = self._add_eos_if_not_present(token_ids_0)
240
+ if token_ids_1 is None:
241
+ return token_ids_0
242
+ else:
243
+ token_ids_1 = self._add_eos_if_not_present(token_ids_1)
244
+ return token_ids_0 + token_ids_1
245
+
246
+ def __getstate__(self):
247
+ state = self.__dict__.copy()
248
+ state["sp_model"] = None
249
+ return state
250
+
251
+ def __setstate__(self, d):
252
+ self.__dict__ = d
253
+
254
+ # for backward compatibility
255
+ if not hasattr(self, "sp_model_kwargs"):
256
+ self.sp_model_kwargs = {}
257
+
258
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
259
+ self.sp_model.Load(self.vocab_file)
260
+
261
+ def _tokenize(self, text: str) -> List[str]:
262
+ """Take as input a string and return a list of strings (tokens) for words/sub-words"""
263
+ return self.sp_model.encode(text, out_type=str)
264
+
265
+ def _convert_token_to_id(self, token):
266
+ """Converts a token (str) in an id using the vocab."""
267
+ if token.startswith("<extra_id_"):
268
+ match = re.match(r"<extra_id_(\d+)>", token)
269
+ num = int(match.group(1))
270
+ return self.vocab_size - num - 1
271
+ elif token in self.decoder:
272
+ return self.decoder[token]
273
+
274
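+ # Tokens that are not special fall through to SentencePiece; the piece id is shifted by `self.offset` to match the model's vocabulary layout.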
+ sp_id = self.sp_model.piece_to_id(token)
275
+ return sp_id + self.offset
276
+
277
+ def _convert_id_to_token(self, index):
278
+ """Converts an index (integer) in a token (str) using the vocab."""
279
+ # if index < self.sp_model.get_piece_size():
280
+ # token = self.sp_model.IdToPiece(index)
281
+ # else:
282
+ # token = f"<extra_id_{self.vocab_size - 1 - index}>"
283
+ # return token
284
+ if index in self.encoder:
285
+ return self.encoder[index]
286
+ elif index in self.added_tokens_encoder:
287
+ return self.added_tokens_encoder[index]
288
+ elif index < self.sp_model.get_piece_size() + 4:
289
+ token = self.sp_model.IdToPiece(index-self.offset)
290
+ else:
291
+ token = f"<extra_id_{self.vocab_size - 1 - index}>"
292
+ return token
293
+
294
+ def convert_tokens_to_string(self, tokens):
295
+ """Converts a sequence of tokens (string) in a single string."""
296
+ current_sub_tokens = []
297
+ out_string = ""
298
+ for token in tokens:
299
+ # make sure that special tokens are not decoded using sentencepiece model
300
+ if token in self.all_special_tokens:
301
+ out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " "
302
+ current_sub_tokens = []
303
+ else:
304
+ current_sub_tokens.append(token)
305
+ out_string += self.sp_model.decode_pieces(current_sub_tokens)
306
+ return out_string.strip()
307
+
308
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
309
+ if not os.path.isdir(save_directory):
310
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
311
+ return
312
+ out_vocab_file = os.path.join(
313
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
314
+ )
315
+
316
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
317
+ copyfile(self.vocab_file, out_vocab_file)
318
+ elif not os.path.isfile(self.vocab_file):
319
+ with open(out_vocab_file, "wb") as fi:
320
+ content_spiece_model = self.sp_model.serialized_model_proto()
321
+ fi.write(content_spiece_model)
322
+
323
+ return (out_vocab_file,)
english/split_text.py ADDED
@@ -0,0 +1,19 @@
1
+ from nltk.tokenize import sent_tokenize
2
+
3
+ def sentence_token_nltk(text):
4
+ print("start split sentence_token_nltk...\n")
5
+ sent_tokenize_list = sent_tokenize(text)
6
+ return sent_tokenize_list
7
+
8
+ def sentence_split(str_sentence):
9
+ list_ret = list()
10
+ for s_str in str_sentence.split('.'):
11
+ if '?' in s_str:
12
+ list_ret.extend(s_str.split('?'))
13
+ elif '!' in s_str:
14
+ list_ret.extend(s_str.split('!'))
15
+ else:
16
+ list_ret.append(s_str)
17
+ return list_ret
18
+
19
+
english/translate.py ADDED
@@ -0,0 +1,18 @@
1
+ from english.models.deltalm.modeling_deltalm import DeltalmForConditionalGeneration
2
+ from transformers import AutoTokenizer
3
+ import os
4
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
5
+
6
+ class Translate:
7
+ def __init__(self):
8
+ super().__init__()
9
+ self.model = DeltalmForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Deltalm-362M-En-Zn")
10
+ self.tokenizer = AutoTokenizer.from_pretrained("microsoft/infoxlm-base")
11
+
12
+ def translateToZh(self, text):
13
+ inputs = self.tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
14
+ generate_ids = self.model.generate(inputs["input_ids"], max_length=512)
15
+ output = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
16
+ print(output)
17
+
18
+ return output
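A minimal usage sketch for `english/translate.py`, assuming the repository root is on `PYTHONPATH` (so the module is importable as `english.translate`) and that the two checkpoints referenced above can be downloaded; the input sentence is only an illustration:

```python
# Hedged example: instantiate the Translate wrapper defined above and
# translate a single English sentence to Chinese.
from english.translate import Translate

translator = Translate()  # loads the DeltaLM model and the InfoXLM tokenizer once
zh_text = translator.translateToZh("The weather is nice today.")
print(zh_text)  # the Chinese translation returned by the model
```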
mel_processing.py ADDED
@@ -0,0 +1,112 @@
1
+ import math
2
+ import os
3
+ import random
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ import torch.utils.data
8
+ import numpy as np
9
+ import librosa
10
+ import librosa.util as librosa_util
11
+ from librosa.util import normalize, pad_center, tiny
12
+ from scipy.signal import get_window
13
+ from scipy.io.wavfile import read
14
+ from librosa.filters import mel as librosa_mel_fn
15
+
16
+ MAX_WAV_VALUE = 32768.0
17
+
18
+
19
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20
+ """
21
+ PARAMS
22
+ ------
23
+ C: compression factor
24
+ """
25
+ return torch.log(torch.clamp(x, min=clip_val) * C)
26
+
27
+
28
+ def dynamic_range_decompression_torch(x, C=1):
29
+ """
30
+ PARAMS
31
+ ------
32
+ C: compression factor used to compress
33
+ """
34
+ return torch.exp(x) / C
35
+
36
+
37
+ def spectral_normalize_torch(magnitudes):
38
+ output = dynamic_range_compression_torch(magnitudes)
39
+ return output
40
+
41
+
42
+ def spectral_de_normalize_torch(magnitudes):
43
+ output = dynamic_range_decompression_torch(magnitudes)
44
+ return output
45
+
46
+
47
+ mel_basis = {}
48
+ hann_window = {}
49
+
50
+
51
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52
+ if torch.min(y) < -1.:
53
+ print('min value is ', torch.min(y))
54
+ if torch.max(y) > 1.:
55
+ print('max value is ', torch.max(y))
56
+
57
+ global hann_window
58
+ dtype_device = str(y.dtype) + '_' + str(y.device)
59
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
60
+ if wnsize_dtype_device not in hann_window:
61
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62
+
63
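+ # Reflect-pad the waveform by (n_fft - hop_size) / 2 on each side so the STFT below (center=False) yields roughly one frame per hop_size samples.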
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64
+ y = y.squeeze(1)
65
+
66
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
68
+
69
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70
+ return spec
71
+
72
+
73
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74
+ global mel_basis
75
+ dtype_device = str(spec.dtype) + '_' + str(spec.device)
76
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
77
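+ # Build the mel filterbank once per (fmax, dtype, device) combination and cache it in the module-level dict.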
+ if fmax_dtype_device not in mel_basis:
78
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
79
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81
+ spec = spectral_normalize_torch(spec)
82
+ return spec
83
+
84
+
85
+ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86
+ if torch.min(y) < -1.:
87
+ print('min value is ', torch.min(y))
88
+ if torch.max(y) > 1.:
89
+ print('max value is ', torch.max(y))
90
+
91
+ global mel_basis, hann_window
92
+ dtype_device = str(y.dtype) + '_' + str(y.device)
93
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
94
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
95
+ if fmax_dtype_device not in mel_basis:
96
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
97
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98
+ if wnsize_dtype_device not in hann_window:
99
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100
+
101
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102
+ y = y.squeeze(1)
103
+
104
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
106
+
107
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108
+
109
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110
+ spec = spectral_normalize_torch(spec)
111
+
112
+ return spec
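A small sketch of how these helpers are typically called; the STFT/mel parameters below are illustrative placeholders rather than values taken from this repository's configuration files:

```python
# Illustrative parameters only; real values come from the project's config.
import torch
from mel_processing import mel_spectrogram_torch

wav = torch.rand(1, 16000) * 2 - 1  # (batch, samples), roughly in [-1, 1]
mel = mel_spectrogram_torch(
    wav, n_fft=1024, num_mels=80, sampling_rate=16000,
    hop_size=256, win_size=1024, fmin=0.0, fmax=None,
)
print(mel.shape)  # -> torch.Size([1, 80, num_frames])
```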
models.py ADDED
@@ -0,0 +1,351 @@
1
+ import copy
2
+ import math
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ import commons
8
+ import modules
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from commons import init_weights, get_padding
13
+
14
+
15
+ class ResidualCouplingBlock(nn.Module):
16
+ def __init__(self,
17
+ channels,
18
+ hidden_channels,
19
+ kernel_size,
20
+ dilation_rate,
21
+ n_layers,
22
+ n_flows=4,
23
+ gin_channels=0):
24
+ super().__init__()
25
+ self.channels = channels
26
+ self.hidden_channels = hidden_channels
27
+ self.kernel_size = kernel_size
28
+ self.dilation_rate = dilation_rate
29
+ self.n_layers = n_layers
30
+ self.n_flows = n_flows
31
+ self.gin_channels = gin_channels
32
+
33
+ self.flows = nn.ModuleList()
34
+ for i in range(n_flows):
35
+ self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
36
+ self.flows.append(modules.Flip())
37
+
38
+ def forward(self, x, x_mask, g=None, reverse=False):
39
+ if not reverse:
40
+ for flow in self.flows:
41
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
42
+ else:
43
+ for flow in reversed(self.flows):
44
+ x = flow(x, x_mask, g=g, reverse=reverse)
45
+ return x
46
+
47
+
48
+ class Encoder(nn.Module):
49
+ def __init__(self,
50
+ in_channels,
51
+ out_channels,
52
+ hidden_channels,
53
+ kernel_size,
54
+ dilation_rate,
55
+ n_layers,
56
+ gin_channels=0):
57
+ super().__init__()
58
+ self.in_channels = in_channels
59
+ self.out_channels = out_channels
60
+ self.hidden_channels = hidden_channels
61
+ self.kernel_size = kernel_size
62
+ self.dilation_rate = dilation_rate
63
+ self.n_layers = n_layers
64
+ self.gin_channels = gin_channels
65
+
66
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
67
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
68
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
69
+
70
+ def forward(self, x, x_lengths, g=None):
71
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
72
+ x = self.pre(x) * x_mask
73
+ x = self.enc(x, x_mask, g=g)
74
+ stats = self.proj(x) * x_mask
75
+ m, logs = torch.split(stats, self.out_channels, dim=1)
76
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
77
+ return z, m, logs, x_mask
78
+
79
+
80
+ class Generator(torch.nn.Module):
81
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
82
+ super(Generator, self).__init__()
83
+ self.num_kernels = len(resblock_kernel_sizes)
84
+ self.num_upsamples = len(upsample_rates)
85
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
86
+ resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
87
+
88
+ self.ups = nn.ModuleList()
89
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
90
+ self.ups.append(weight_norm(
91
+ ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
92
+ k, u, padding=(k-u)//2)))
93
+
94
+ self.resblocks = nn.ModuleList()
95
+ for i in range(len(self.ups)):
96
+ ch = upsample_initial_channel//(2**(i+1))
97
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
98
+ self.resblocks.append(resblock(ch, k, d))
99
+
100
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
101
+ self.ups.apply(init_weights)
102
+
103
+ if gin_channels != 0:
104
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
105
+
106
+ def forward(self, x, g=None):
107
+ x = self.conv_pre(x)
108
+ if g is not None:
109
+ x = x + self.cond(g)
110
+
111
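+ # HiFi-GAN-style upsampling: each transposed-conv stage is followed by several ResBlocks whose outputs are averaged (multi-receptive-field fusion).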
+ for i in range(self.num_upsamples):
112
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
113
+ x = self.ups[i](x)
114
+ xs = None
115
+ for j in range(self.num_kernels):
116
+ if xs is None:
117
+ xs = self.resblocks[i*self.num_kernels+j](x)
118
+ else:
119
+ xs += self.resblocks[i*self.num_kernels+j](x)
120
+ x = xs / self.num_kernels
121
+ x = F.leaky_relu(x)
122
+ x = self.conv_post(x)
123
+ x = torch.tanh(x)
124
+
125
+ return x
126
+
127
+ def remove_weight_norm(self):
128
+ print('Removing weight norm...')
129
+ for l in self.ups:
130
+ remove_weight_norm(l)
131
+ for l in self.resblocks:
132
+ l.remove_weight_norm()
133
+
134
+
135
+ class DiscriminatorP(torch.nn.Module):
136
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
137
+ super(DiscriminatorP, self).__init__()
138
+ self.period = period
139
+ self.use_spectral_norm = use_spectral_norm
140
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
141
+ self.convs = nn.ModuleList([
142
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
143
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
144
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
145
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
146
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
147
+ ])
148
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
149
+
150
+ def forward(self, x):
151
+ fmap = []
152
+
153
+ # 1d to 2d
154
+ b, c, t = x.shape
155
+ if t % self.period != 0: # pad first
156
+ n_pad = self.period - (t % self.period)
157
+ x = F.pad(x, (0, n_pad), "reflect")
158
+ t = t + n_pad
159
+ x = x.view(b, c, t // self.period, self.period)
160
+
161
+ for l in self.convs:
162
+ x = l(x)
163
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
164
+ fmap.append(x)
165
+ x = self.conv_post(x)
166
+ fmap.append(x)
167
+ x = torch.flatten(x, 1, -1)
168
+
169
+ return x, fmap
170
+
171
+
172
+ class DiscriminatorS(torch.nn.Module):
173
+ def __init__(self, use_spectral_norm=False):
174
+ super(DiscriminatorS, self).__init__()
175
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
176
+ self.convs = nn.ModuleList([
177
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
178
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
179
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
180
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
181
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
182
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
183
+ ])
184
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
185
+
186
+ def forward(self, x):
187
+ fmap = []
188
+
189
+ for l in self.convs:
190
+ x = l(x)
191
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
192
+ fmap.append(x)
193
+ x = self.conv_post(x)
194
+ fmap.append(x)
195
+ x = torch.flatten(x, 1, -1)
196
+
197
+ return x, fmap
198
+
199
+
200
+ class MultiPeriodDiscriminator(torch.nn.Module):
201
+ def __init__(self, use_spectral_norm=False):
202
+ super(MultiPeriodDiscriminator, self).__init__()
203
+ periods = [2,3,5,7,11]
204
+
205
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
206
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
207
+ self.discriminators = nn.ModuleList(discs)
208
+
209
+ def forward(self, y, y_hat):
210
+ y_d_rs = []
211
+ y_d_gs = []
212
+ fmap_rs = []
213
+ fmap_gs = []
214
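+ # Run real (y) and generated (y_hat) audio through every sub-discriminator, collecting scores and feature maps for the feature-matching loss.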
+ for i, d in enumerate(self.discriminators):
215
+ y_d_r, fmap_r = d(y)
216
+ y_d_g, fmap_g = d(y_hat)
217
+ y_d_rs.append(y_d_r)
218
+ y_d_gs.append(y_d_g)
219
+ fmap_rs.append(fmap_r)
220
+ fmap_gs.append(fmap_g)
221
+
222
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
223
+
224
+
225
+ class SpeakerEncoder(torch.nn.Module):
226
+ def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
227
+ super(SpeakerEncoder, self).__init__()
228
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
229
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
230
+ self.relu = nn.ReLU()
231
+
232
+ def forward(self, mels):
233
+ self.lstm.flatten_parameters()
234
+ _, (hidden, _) = self.lstm(mels)
235
+ embeds_raw = self.relu(self.linear(hidden[-1]))
236
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
237
+
238
+ def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
239
+ mel_slices = []
240
+ for i in range(0, total_frames-partial_frames, partial_hop):
241
+ mel_range = torch.arange(i, i+partial_frames)
242
+ mel_slices.append(mel_range)
243
+
244
+ return mel_slices
245
+
246
+ def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
247
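+ # Long utterances are split into overlapping partial windows; each window is embedded and the partial embeddings are averaged.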
+ mel_len = mel.size(1)
248
+ last_mel = mel[:,-partial_frames:]
249
+
250
+ if mel_len > partial_frames:
251
+ mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
252
+ mels = list(mel[:,s] for s in mel_slices)
253
+ mels.append(last_mel)
254
+ mels = torch.stack(tuple(mels), 0).squeeze(1)
255
+
256
+ with torch.no_grad():
257
+ partial_embeds = self(mels)
258
+ embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
259
+ #embed = embed / torch.linalg.norm(embed, 2)
260
+ else:
261
+ with torch.no_grad():
262
+ embed = self(last_mel)
263
+
264
+ return embed
265
+
266
+
267
+ class SynthesizerTrn(nn.Module):
268
+ """
269
+ Synthesizer for Training
270
+ """
271
+
272
+ def __init__(self,
273
+ spec_channels,
274
+ segment_size,
275
+ inter_channels,
276
+ hidden_channels,
277
+ filter_channels,
278
+ n_heads,
279
+ n_layers,
280
+ kernel_size,
281
+ p_dropout,
282
+ resblock,
283
+ resblock_kernel_sizes,
284
+ resblock_dilation_sizes,
285
+ upsample_rates,
286
+ upsample_initial_channel,
287
+ upsample_kernel_sizes,
288
+ gin_channels,
289
+ ssl_dim,
290
+ use_spk,
291
+ **kwargs):
292
+
293
+ super().__init__()
294
+ self.spec_channels = spec_channels
295
+ self.inter_channels = inter_channels
296
+ self.hidden_channels = hidden_channels
297
+ self.filter_channels = filter_channels
298
+ self.n_heads = n_heads
299
+ self.n_layers = n_layers
300
+ self.kernel_size = kernel_size
301
+ self.p_dropout = p_dropout
302
+ self.resblock = resblock
303
+ self.resblock_kernel_sizes = resblock_kernel_sizes
304
+ self.resblock_dilation_sizes = resblock_dilation_sizes
305
+ self.upsample_rates = upsample_rates
306
+ self.upsample_initial_channel = upsample_initial_channel
307
+ self.upsample_kernel_sizes = upsample_kernel_sizes
308
+ self.segment_size = segment_size
309
+ self.gin_channels = gin_channels
310
+ self.ssl_dim = ssl_dim
311
+ self.use_spk = use_spk
312
+
313
+ self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
314
+ self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
315
+ self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
316
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
317
+
318
+ if not self.use_spk:
319
+ self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)
320
+
321
+ def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
322
+ if c_lengths is None:
323
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
324
+ if spec_lengths is None:
325
+ spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
326
+
327
+ if not self.use_spk:
328
+ g = self.enc_spk(mel.transpose(1,2))
329
+ g = g.unsqueeze(-1)
330
+
331
+ _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
332
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
333
+ z_p = self.flow(z, spec_mask, g=g)
334
+
335
+ z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size)
336
+ o = self.dec(z_slice, g=g)
337
+
338
+ return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
339
+
340
+ def infer(self, c, g=None, mel=None, c_lengths=None):
341
+ if c_lengths is None:
342
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
343
+ if not self.use_spk:
344
+ g = self.enc_spk.embed_utterance(mel.transpose(1,2))
345
+ g = g.unsqueeze(-1)
346
+
347
+ z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
348
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
349
+ o = self.dec(z * c_mask, g=g)
350
+
351
+ return o
modules.py ADDED
@@ -0,0 +1,342 @@
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ import commons
13
+ from commons import init_weights, get_padding
14
+
15
+
16
+ LRELU_SLOPE = 0.1
17
+
18
+
19
+ class LayerNorm(nn.Module):
20
+ def __init__(self, channels, eps=1e-5):
21
+ super().__init__()
22
+ self.channels = channels
23
+ self.eps = eps
24
+
25
+ self.gamma = nn.Parameter(torch.ones(channels))
26
+ self.beta = nn.Parameter(torch.zeros(channels))
27
+
28
+ def forward(self, x):
29
+ x = x.transpose(1, -1)
30
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
31
+ return x.transpose(1, -1)
32
+
33
+
34
+ class ConvReluNorm(nn.Module):
35
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
36
+ super().__init__()
37
+ self.in_channels = in_channels
38
+ self.hidden_channels = hidden_channels
39
+ self.out_channels = out_channels
40
+ self.kernel_size = kernel_size
41
+ self.n_layers = n_layers
42
+ self.p_dropout = p_dropout
43
+ assert n_layers > 1, "Number of layers should be larger than 1."
44
+
45
+ self.conv_layers = nn.ModuleList()
46
+ self.norm_layers = nn.ModuleList()
47
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
48
+ self.norm_layers.append(LayerNorm(hidden_channels))
49
+ self.relu_drop = nn.Sequential(
50
+ nn.ReLU(),
51
+ nn.Dropout(p_dropout))
52
+ for _ in range(n_layers-1):
53
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
54
+ self.norm_layers.append(LayerNorm(hidden_channels))
55
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
56
+ self.proj.weight.data.zero_()
57
+ self.proj.bias.data.zero_()
58
+
59
+ def forward(self, x, x_mask):
60
+ x_org = x
61
+ for i in range(self.n_layers):
62
+ x = self.conv_layers[i](x * x_mask)
63
+ x = self.norm_layers[i](x)
64
+ x = self.relu_drop(x)
65
+ x = x_org + self.proj(x)
66
+ return x * x_mask
67
+
68
+
69
+ class DDSConv(nn.Module):
70
+ """
71
+ Dialted and Depth-Separable Convolution
72
+ """
73
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
74
+ super().__init__()
75
+ self.channels = channels
76
+ self.kernel_size = kernel_size
77
+ self.n_layers = n_layers
78
+ self.p_dropout = p_dropout
79
+
80
+ self.drop = nn.Dropout(p_dropout)
81
+ self.convs_sep = nn.ModuleList()
82
+ self.convs_1x1 = nn.ModuleList()
83
+ self.norms_1 = nn.ModuleList()
84
+ self.norms_2 = nn.ModuleList()
85
+ for i in range(n_layers):
86
+ dilation = kernel_size ** i
87
+ padding = (kernel_size * dilation - dilation) // 2
88
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
89
+ groups=channels, dilation=dilation, padding=padding
90
+ ))
91
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
92
+ self.norms_1.append(LayerNorm(channels))
93
+ self.norms_2.append(LayerNorm(channels))
94
+
95
+ def forward(self, x, x_mask, g=None):
96
+ if g is not None:
97
+ x = x + g
98
+ for i in range(self.n_layers):
99
+ y = self.convs_sep[i](x * x_mask)
100
+ y = self.norms_1[i](y)
101
+ y = F.gelu(y)
102
+ y = self.convs_1x1[i](y)
103
+ y = self.norms_2[i](y)
104
+ y = F.gelu(y)
105
+ y = self.drop(y)
106
+ x = x + y
107
+ return x * x_mask
108
+
109
+
110
+ class WN(torch.nn.Module):
111
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
112
+ super(WN, self).__init__()
113
+ assert(kernel_size % 2 == 1)
114
+ self.hidden_channels =hidden_channels
115
+ self.kernel_size = kernel_size,
116
+ self.dilation_rate = dilation_rate
117
+ self.n_layers = n_layers
118
+ self.gin_channels = gin_channels
119
+ self.p_dropout = p_dropout
120
+
121
+ self.in_layers = torch.nn.ModuleList()
122
+ self.res_skip_layers = torch.nn.ModuleList()
123
+ self.drop = nn.Dropout(p_dropout)
124
+
125
+ if gin_channels != 0:
126
+ cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
127
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
128
+
129
+ for i in range(n_layers):
130
+ dilation = dilation_rate ** i
131
+ padding = int((kernel_size * dilation - dilation) / 2)
132
+ in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
133
+ dilation=dilation, padding=padding)
134
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
135
+ self.in_layers.append(in_layer)
136
+
137
+ # last one is not necessary
138
+ if i < n_layers - 1:
139
+ res_skip_channels = 2 * hidden_channels
140
+ else:
141
+ res_skip_channels = hidden_channels
142
+
143
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
144
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
145
+ self.res_skip_layers.append(res_skip_layer)
146
+
147
+ def forward(self, x, x_mask, g=None, **kwargs):
148
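+ # WaveNet-style stack: each dilated conv feeds a gated tanh/sigmoid unit (optionally conditioned on g); skip contributions are summed into `output`.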
+ output = torch.zeros_like(x)
149
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
150
+
151
+ if g is not None:
152
+ g = self.cond_layer(g)
153
+
154
+ for i in range(self.n_layers):
155
+ x_in = self.in_layers[i](x)
156
+ if g is not None:
157
+ cond_offset = i * 2 * self.hidden_channels
158
+ g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
159
+ else:
160
+ g_l = torch.zeros_like(x_in)
161
+
162
+ acts = commons.fused_add_tanh_sigmoid_multiply(
163
+ x_in,
164
+ g_l,
165
+ n_channels_tensor)
166
+ acts = self.drop(acts)
167
+
168
+ res_skip_acts = self.res_skip_layers[i](acts)
169
+ if i < self.n_layers - 1:
170
+ res_acts = res_skip_acts[:,:self.hidden_channels,:]
171
+ x = (x + res_acts) * x_mask
172
+ output = output + res_skip_acts[:,self.hidden_channels:,:]
173
+ else:
174
+ output = output + res_skip_acts
175
+ return output * x_mask
176
+
177
+ def remove_weight_norm(self):
178
+ if self.gin_channels != 0:
179
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
180
+ for l in self.in_layers:
181
+ torch.nn.utils.remove_weight_norm(l)
182
+ for l in self.res_skip_layers:
183
+ torch.nn.utils.remove_weight_norm(l)
184
+
185
+
186
+ class ResBlock1(torch.nn.Module):
187
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
188
+ super(ResBlock1, self).__init__()
189
+ self.convs1 = nn.ModuleList([
190
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
191
+ padding=get_padding(kernel_size, dilation[0]))),
192
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
193
+ padding=get_padding(kernel_size, dilation[1]))),
194
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
195
+ padding=get_padding(kernel_size, dilation[2])))
196
+ ])
197
+ self.convs1.apply(init_weights)
198
+
199
+ self.convs2 = nn.ModuleList([
200
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
201
+ padding=get_padding(kernel_size, 1))),
202
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
203
+ padding=get_padding(kernel_size, 1))),
204
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
205
+ padding=get_padding(kernel_size, 1)))
206
+ ])
207
+ self.convs2.apply(init_weights)
208
+
209
+ def forward(self, x, x_mask=None):
210
+ for c1, c2 in zip(self.convs1, self.convs2):
211
+ xt = F.leaky_relu(x, LRELU_SLOPE)
212
+ if x_mask is not None:
213
+ xt = xt * x_mask
214
+ xt = c1(xt)
215
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
216
+ if x_mask is not None:
217
+ xt = xt * x_mask
218
+ xt = c2(xt)
219
+ x = xt + x
220
+ if x_mask is not None:
221
+ x = x * x_mask
222
+ return x
223
+
224
+ def remove_weight_norm(self):
225
+ for l in self.convs1:
226
+ remove_weight_norm(l)
227
+ for l in self.convs2:
228
+ remove_weight_norm(l)
229
+
230
+
231
+ class ResBlock2(torch.nn.Module):
232
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
233
+ super(ResBlock2, self).__init__()
234
+ self.convs = nn.ModuleList([
235
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
236
+ padding=get_padding(kernel_size, dilation[0]))),
237
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
238
+ padding=get_padding(kernel_size, dilation[1])))
239
+ ])
240
+ self.convs.apply(init_weights)
241
+
242
+ def forward(self, x, x_mask=None):
243
+ for c in self.convs:
244
+ xt = F.leaky_relu(x, LRELU_SLOPE)
245
+ if x_mask is not None:
246
+ xt = xt * x_mask
247
+ xt = c(xt)
248
+ x = xt + x
249
+ if x_mask is not None:
250
+ x = x * x_mask
251
+ return x
252
+
253
+ def remove_weight_norm(self):
254
+ for l in self.convs:
255
+ remove_weight_norm(l)
256
+
257
+
258
+ class Log(nn.Module):
259
+ def forward(self, x, x_mask, reverse=False, **kwargs):
260
+ if not reverse:
261
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
262
+ logdet = torch.sum(-y, [1, 2])
263
+ return y, logdet
264
+ else:
265
+ x = torch.exp(x) * x_mask
266
+ return x
267
+
268
+
269
+ class Flip(nn.Module):
270
+ def forward(self, x, *args, reverse=False, **kwargs):
271
+ x = torch.flip(x, [1])
272
+ if not reverse:
273
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
274
+ return x, logdet
275
+ else:
276
+ return x
277
+
278
+
279
+ class ElementwiseAffine(nn.Module):
280
+ def __init__(self, channels):
281
+ super().__init__()
282
+ self.channels = channels
283
+ self.m = nn.Parameter(torch.zeros(channels,1))
284
+ self.logs = nn.Parameter(torch.zeros(channels,1))
285
+
286
+ def forward(self, x, x_mask, reverse=False, **kwargs):
287
+ if not reverse:
288
+ y = self.m + torch.exp(self.logs) * x
289
+ y = y * x_mask
290
+ logdet = torch.sum(self.logs * x_mask, [1,2])
291
+ return y, logdet
292
+ else:
293
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
294
+ return x
295
+
296
+
297
+ class ResidualCouplingLayer(nn.Module):
298
+ def __init__(self,
299
+ channels,
300
+ hidden_channels,
301
+ kernel_size,
302
+ dilation_rate,
303
+ n_layers,
304
+ p_dropout=0,
305
+ gin_channels=0,
306
+ mean_only=False):
307
+ assert channels % 2 == 0, "channels should be divisible by 2"
308
+ super().__init__()
309
+ self.channels = channels
310
+ self.hidden_channels = hidden_channels
311
+ self.kernel_size = kernel_size
312
+ self.dilation_rate = dilation_rate
313
+ self.n_layers = n_layers
314
+ self.half_channels = channels // 2
315
+ self.mean_only = mean_only
316
+
317
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
318
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
319
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
320
+ self.post.weight.data.zero_()
321
+ self.post.bias.data.zero_()
322
+
323
+ def forward(self, x, x_mask, g=None, reverse=False):
324
+ x0, x1 = torch.split(x, [self.half_channels]*2, 1)
325
+ h = self.pre(x0) * x_mask
326
+ h = self.enc(h, x_mask, g=g)
327
+ stats = self.post(h) * x_mask
328
+ if not self.mean_only:
329
+ m, logs = torch.split(stats, [self.half_channels]*2, 1)
330
+ else:
331
+ m = stats
332
+ logs = torch.zeros_like(m)
333
+
334
+ if not reverse:
335
+ x1 = m + x1 * torch.exp(logs) * x_mask
336
+ x = torch.cat([x0, x1], 1)
337
+ logdet = torch.sum(logs, [1,2])
338
+ return x, logdet
339
+ else:
340
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
341
+ x = torch.cat([x0, x1], 1)
342
+ return x
requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ numpy==1.22.0
2
+ scipy
3
+ torch
4
+ transformers
5
+ librosa==0.8.1
6
+ webrtcvad==2.0.10
7
+ protobuf
8
+ cpm_kernels
9
+ mdtex2html
10
+ sentencepiece
11
+ accelerate
12
+ loguru
13
+ edge_tts
14
+ altair
15
+ gradio==3.36.1
16
+ nltk
17
+ openai
speaker_encoder/__init__.py ADDED
File without changes
speaker_encoder/audio.py ADDED
@@ -0,0 +1,107 @@
1
+ from scipy.ndimage import binary_dilation
2
+ from speaker_encoder.params_data import *
3
+ from pathlib import Path
4
+ from typing import Optional, Union
5
+ import numpy as np
6
+ import webrtcvad
7
+ import librosa
8
+ import struct
9
+
10
+ int16_max = (2 ** 15) - 1
11
+
12
+
13
+ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
14
+ source_sr: Optional[int] = None):
15
+ """
16
+ Applies the preprocessing operations used in training the Speaker Encoder to a waveform
17
+ either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
18
+
19
+ :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
20
+ just .wav), either the waveform as a numpy array of floats.
21
+ :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
22
+ preprocessing. After preprocessing, the waveform's sampling rate will match the data
23
+ hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
24
+ this argument will be ignored.
25
+ """
26
+ # Load the wav from disk if needed
27
+ if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
28
+ wav, source_sr = librosa.load(fpath_or_wav, sr=None)
29
+ else:
30
+ wav = fpath_or_wav
31
+
32
+ # Resample the wav if needed
33
+ if source_sr is not None and source_sr != sampling_rate:
34
+ wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
35
+
36
+ # Apply the preprocessing: normalize volume and shorten long silences
37
+ wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
38
+ wav = trim_long_silences(wav)
39
+
40
+ return wav
41
+
42
+
43
+ def wav_to_mel_spectrogram(wav):
44
+ """
45
+ Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
46
+ Note: this not a log-mel spectrogram.
47
+ """
48
+ frames = librosa.feature.melspectrogram(
49
+ y=wav,
50
+ sr=sampling_rate,
51
+ n_fft=int(sampling_rate * mel_window_length / 1000),
52
+ hop_length=int(sampling_rate * mel_window_step / 1000),
53
+ n_mels=mel_n_channels
54
+ )
55
+ return frames.astype(np.float32).T
56
+
57
+
58
+ def trim_long_silences(wav):
59
+ """
60
+ Ensures that segments without voice in the waveform remain no longer than a
61
+ threshold determined by the VAD parameters in params.py.
62
+
63
+ :param wav: the raw waveform as a numpy array of floats
64
+ :return: the same waveform with silences trimmed away (length <= original wav length)
65
+ """
66
+ # Compute the voice detection window size
67
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
68
+
69
+ # Trim the end of the audio to have a multiple of the window size
70
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
71
+
72
+ # Convert the float waveform to 16-bit mono PCM
73
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
74
+
75
+ # Perform voice activation detection
76
+ voice_flags = []
77
+ vad = webrtcvad.Vad(mode=3)
78
+ for window_start in range(0, len(wav), samples_per_window):
79
+ window_end = window_start + samples_per_window
80
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
81
+ sample_rate=sampling_rate))
82
+ voice_flags = np.array(voice_flags)
83
+
84
+ # Smooth the voice detection with a moving average
85
+ def moving_average(array, width):
86
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
87
+ ret = np.cumsum(array_padded, dtype=float)
88
+ ret[width:] = ret[width:] - ret[:-width]
89
+ return ret[width - 1:] / width
90
+
91
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
92
+ audio_mask = np.round(audio_mask).astype(bool)  # np.bool alias was removed in recent NumPy
93
+
94
+ # Dilate the voiced regions
95
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
96
+ audio_mask = np.repeat(audio_mask, samples_per_window)
97
+
98
+ return wav[audio_mask]
99
+
100
+
101
+ def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
102
+ if increase_only and decrease_only:
103
+ raise ValueError("Both increase only and decrease only are set")
104
+ dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
105
+ if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
106
+ return wav
107
+ return wav * (10 ** (dBFS_change / 20))
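
A minimal usage sketch of the two entry points above, assuming the speaker_encoder package is importable and that a hypothetical audio file sample.wav exists on disk (the filename is illustrative, not part of this commit):

from speaker_encoder.audio import preprocess_wav, wav_to_mel_spectrogram

# Load, resample to the configured sampling_rate, normalize volume and trim long silences
wav = preprocess_wav("sample.wav")  # "sample.wav" is a hypothetical input file

# Convert to a (n_frames, mel_n_channels) float32 mel spectrogram ready for the encoder
frames = wav_to_mel_spectrogram(wav)
print(wav.shape, frames.shape)  # e.g. (n_samples,) and (n_frames, 40)
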
speaker_encoder/ckpt/pretrained_bak_5805000.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca
3
+ size 17090379
speaker_encoder/compute_embed.py ADDED
@@ -0,0 +1,40 @@
1
+ from speaker_encoder import inference as encoder
2
+ from multiprocessing.pool import Pool
3
+ from functools import partial
4
+ from pathlib import Path
5
+ # from utils import logmmse
6
+ # from tqdm import tqdm
7
+ # import numpy as np
8
+ # import librosa
9
+
10
+
11
+ def embed_utterance(fpaths, encoder_model_fpath):
12
+ if not encoder.is_loaded():
13
+ encoder.load_model(encoder_model_fpath)
14
+
15
+ # Compute the speaker embedding of the utterance
16
+ wav_fpath, embed_fpath = fpaths
17
+ wav = np.load(wav_fpath)
18
+ wav = encoder.preprocess_wav(wav)
19
+ embed = encoder.embed_utterance(wav)
20
+ np.save(embed_fpath, embed, allow_pickle=False)
21
+
22
+
23
+ def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
24
+
25
+ wav_dir = outdir_root.joinpath("audio")
26
+ metadata_fpath = outdir_root.joinpath("train.txt")  # fixed: synthesizer_root was not defined in this scope
27
+ assert wav_dir.exists() and metadata_fpath.exists()
28
+ embed_dir = outdir_root.joinpath("embeds")  # fixed: synthesizer_root was not defined in this scope
29
+ embed_dir.mkdir(exist_ok=True)
30
+
31
+ # Gather the input wave filepath and the target output embed filepath
32
+ with metadata_fpath.open("r") as metadata_file:
33
+ metadata = [line.split("|") for line in metadata_file]
34
+ fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
35
+
36
+ # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
37
+ # Embed the utterances in separate threads
38
+ func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
39
+ job = Pool(n_processes).imap(func, fpaths)
40
+ list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
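
A hedged sketch of how create_embeddings above might be invoked. The dataset path is a hypothetical synthesizer-style output root (containing audio/ and train.txt); the checkpoint path is the one added in this commit:

from pathlib import Path
from speaker_encoder.compute_embed import create_embeddings

# Hypothetical preprocessed root holding audio/ (as .npy wavs) and train.txt
outdir_root = Path("datasets/SV2TTS/synthesizer")
# Pretrained encoder checkpoint shipped with this commit
encoder_ckpt = Path("speaker_encoder/ckpt/pretrained_bak_5805000.pt")

# wav_dir is recomputed inside the function from outdir_root; pass the same root here
create_embeddings(outdir_root, outdir_root.joinpath("audio"), encoder_ckpt, n_processes=4)
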
speaker_encoder/config.py ADDED
@@ -0,0 +1,45 @@
1
+ librispeech_datasets = {
2
+ "train": {
3
+ "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
4
+ "other": ["LibriSpeech/train-other-500"]
5
+ },
6
+ "test": {
7
+ "clean": ["LibriSpeech/test-clean"],
8
+ "other": ["LibriSpeech/test-other"]
9
+ },
10
+ "dev": {
11
+ "clean": ["LibriSpeech/dev-clean"],
12
+ "other": ["LibriSpeech/dev-other"]
13
+ },
14
+ }
15
+ libritts_datasets = {
16
+ "train": {
17
+ "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
18
+ "other": ["LibriTTS/train-other-500"]
19
+ },
20
+ "test": {
21
+ "clean": ["LibriTTS/test-clean"],
22
+ "other": ["LibriTTS/test-other"]
23
+ },
24
+ "dev": {
25
+ "clean": ["LibriTTS/dev-clean"],
26
+ "other": ["LibriTTS/dev-other"]
27
+ },
28
+ }
29
+ voxceleb_datasets = {
30
+ "voxceleb1" : {
31
+ "train": ["VoxCeleb1/wav"],
32
+ "test": ["VoxCeleb1/test_wav"]
33
+ },
34
+ "voxceleb2" : {
35
+ "train": ["VoxCeleb2/dev/aac"],
36
+ "test": ["VoxCeleb2/test_wav"]
37
+ }
38
+ }
39
+
40
+ other_datasets = [
41
+ "LJSpeech-1.1",
42
+ "VCTK-Corpus/wav48",
43
+ ]
44
+
45
+ anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
speaker_encoder/data_objects/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
2
+ from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
speaker_encoder/data_objects/random_cycler.py ADDED
@@ -0,0 +1,37 @@
1
+ import random
2
+
3
+ class RandomCycler:
4
+ """
5
+ Creates an internal copy of a sequence and allows access to its items in a constrained random
6
+ order. For a source sequence of n items and one or several consecutive queries of a total
7
+ of m items, the following guarantees hold (one implies the other):
8
+ - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
9
+ - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
10
+ """
11
+
12
+ def __init__(self, source):
13
+ if len(source) == 0:
14
+ raise Exception("Can't create RandomCycler from an empty collection")
15
+ self.all_items = list(source)
16
+ self.next_items = []
17
+
18
+ def sample(self, count: int):
19
+ shuffle = lambda l: random.sample(l, len(l))
20
+
21
+ out = []
22
+ while count > 0:
23
+ if count >= len(self.all_items):
24
+ out.extend(shuffle(list(self.all_items)))
25
+ count -= len(self.all_items)
26
+ continue
27
+ n = min(count, len(self.next_items))
28
+ out.extend(self.next_items[:n])
29
+ count -= n
30
+ self.next_items = self.next_items[n:]
31
+ if len(self.next_items) == 0:
32
+ self.next_items = shuffle(list(self.all_items))
33
+ return out
34
+
35
+ def __next__(self):
36
+ return self.sample(1)[0]
37
+
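
A small sketch illustrating the sampling guarantee stated in the docstring above: for n source items and m total queries, each item comes back between m // n and ((m - 1) // n) + 1 times. The item names are arbitrary placeholders:

from collections import Counter
from speaker_encoder.data_objects.random_cycler import RandomCycler

cycler = RandomCycler(["a", "b", "c"])   # n = 3 items
drawn = cycler.sample(8)                 # m = 8 queries in total

# Each item appears between 8 // 3 = 2 and ((8 - 1) // 3) + 1 = 3 times
print(drawn, Counter(drawn))
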
speaker_encoder/data_objects/speaker.py ADDED
@@ -0,0 +1,40 @@
1
+ from speaker_encoder.data_objects.random_cycler import RandomCycler
2
+ from speaker_encoder.data_objects.utterance import Utterance
3
+ from pathlib import Path
4
+
5
+ # Contains the set of utterances of a single speaker
6
+ class Speaker:
7
+ def __init__(self, root: Path):
8
+ self.root = root
9
+ self.name = root.name
10
+ self.utterances = None
11
+ self.utterance_cycler = None
12
+
13
+ def _load_utterances(self):
14
+ with self.root.joinpath("_sources.txt").open("r") as sources_file:
15
+ sources = [l.split(",") for l in sources_file]
16
+ sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
17
+ self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
18
+ self.utterance_cycler = RandomCycler(self.utterances)
19
+
20
+ def random_partial(self, count, n_frames):
21
+ """
22
+ Samples a batch of <count> unique partial utterances from the disk in a way that all
23
+ utterances come up at least once every two cycles and in a random order every time.
24
+
25
+ :param count: The number of partial utterances to sample from the set of utterances from
26
+ that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
27
+ the number of utterances available.
28
+ :param n_frames: The number of frames in the partial utterance.
29
+ :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
30
+ frames are the frames of the partial utterances and range is the range of the partial
31
+ utterance with regard to the complete utterance.
32
+ """
33
+ if self.utterances is None:
34
+ self._load_utterances()
35
+
36
+ utterances = self.utterance_cycler.sample(count)
37
+
38
+ a = [(u,) + u.random_partial(n_frames) for u in utterances]
39
+
40
+ return a
speaker_encoder/data_objects/speaker_batch.py ADDED
@@ -0,0 +1,12 @@
1
+ import numpy as np
2
+ from typing import List
3
+ from speaker_encoder.data_objects.speaker import Speaker
4
+
5
+ class SpeakerBatch:
6
+ def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
7
+ self.speakers = speakers
8
+ self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
9
+
10
+ # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
11
+ # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
12
+ self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
speaker_encoder/data_objects/speaker_verification_dataset.py ADDED
@@ -0,0 +1,56 @@
1
+ from speaker_encoder.data_objects.random_cycler import RandomCycler
2
+ from speaker_encoder.data_objects.speaker_batch import SpeakerBatch
3
+ from speaker_encoder.data_objects.speaker import Speaker
4
+ from speaker_encoder.params_data import partials_n_frames
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from pathlib import Path
7
+
8
+ # TODO: improve with a pool of speakers for data efficiency
9
+
10
+ class SpeakerVerificationDataset(Dataset):
11
+ def __init__(self, datasets_root: Path):
12
+ self.root = datasets_root
13
+ speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
14
+ if len(speaker_dirs) == 0:
15
+ raise Exception("No speakers found. Make sure you are pointing to the directory "
16
+ "containing all preprocessed speaker directories.")
17
+ self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
18
+ self.speaker_cycler = RandomCycler(self.speakers)
19
+
20
+ def __len__(self):
21
+ return int(1e10)
22
+
23
+ def __getitem__(self, index):
24
+ return next(self.speaker_cycler)
25
+
26
+ def get_logs(self):
27
+ log_string = ""
28
+ for log_fpath in self.root.glob("*.txt"):
29
+ with log_fpath.open("r") as log_file:
30
+ log_string += "".join(log_file.readlines())
31
+ return log_string
32
+
33
+
34
+ class SpeakerVerificationDataLoader(DataLoader):
35
+ def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
36
+ batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
37
+ worker_init_fn=None):
38
+ self.utterances_per_speaker = utterances_per_speaker
39
+
40
+ super().__init__(
41
+ dataset=dataset,
42
+ batch_size=speakers_per_batch,
43
+ shuffle=False,
44
+ sampler=sampler,
45
+ batch_sampler=batch_sampler,
46
+ num_workers=num_workers,
47
+ collate_fn=self.collate,
48
+ pin_memory=pin_memory,
49
+ drop_last=False,
50
+ timeout=timeout,
51
+ worker_init_fn=worker_init_fn
52
+ )
53
+
54
+ def collate(self, speakers):
55
+ return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
56
+
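
A brief sketch of how the dataset and loader above are wired together, assuming a hypothetical clean_data_root directory that already contains preprocessed speaker subdirectories (as produced by speaker_encoder/preprocess.py):

from pathlib import Path
from speaker_encoder.data_objects import SpeakerVerificationDataset, SpeakerVerificationDataLoader

clean_data_root = Path("datasets/SV2TTS/encoder")  # hypothetical preprocessed root
dataset = SpeakerVerificationDataset(clean_data_root)
loader = SpeakerVerificationDataLoader(dataset, speakers_per_batch=4, utterances_per_speaker=5)

# Each batch is a SpeakerBatch whose .data array has shape
# (speakers_per_batch * utterances_per_speaker, partials_n_frames, mel_n_channels)
batch = next(iter(loader))
print(batch.data.shape)  # (20, 160, 40) with the parameters above
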
speaker_encoder/data_objects/utterance.py ADDED
@@ -0,0 +1,26 @@
1
+ import numpy as np
2
+
3
+
4
+ class Utterance:
5
+ def __init__(self, frames_fpath, wave_fpath):
6
+ self.frames_fpath = frames_fpath
7
+ self.wave_fpath = wave_fpath
8
+
9
+ def get_frames(self):
10
+ return np.load(self.frames_fpath)
11
+
12
+ def random_partial(self, n_frames):
13
+ """
14
+ Crops the frames into a partial utterance of n_frames
15
+
16
+ :param n_frames: The number of frames of the partial utterance
17
+ :return: the partial utterance frames and a tuple indicating the start and end of the
18
+ partial utterance in the complete utterance.
19
+ """
20
+ frames = self.get_frames()
21
+ if frames.shape[0] == n_frames:
22
+ start = 0
23
+ else:
24
+ start = np.random.randint(0, frames.shape[0] - n_frames)
25
+ end = start + n_frames
26
+ return frames[start:end], (start, end)
speaker_encoder/hparams.py ADDED
@@ -0,0 +1,31 @@
1
+ ## Mel-filterbank
2
+ mel_window_length = 25 # In milliseconds
3
+ mel_window_step = 10 # In milliseconds
4
+ mel_n_channels = 40
5
+
6
+
7
+ ## Audio
8
+ sampling_rate = 16000
9
+ # Number of spectrogram frames in a partial utterance
10
+ partials_n_frames = 160 # 1600 ms
11
+
12
+
13
+ ## Voice Activation Detection
14
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
15
+ # This sets the granularity of the VAD. Should not need to be changed.
16
+ vad_window_length = 30 # In milliseconds
17
+ # Number of frames to average together when performing the moving average smoothing.
18
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
19
+ vad_moving_average_width = 8
20
+ # Maximum number of consecutive silent frames a segment can have.
21
+ vad_max_silence_length = 6
22
+
23
+
24
+ ## Audio volume normalization
25
+ audio_norm_target_dBFS = -30
26
+
27
+
28
+ ## Model parameters
29
+ model_hidden_size = 256
30
+ model_embedding_size = 256
31
+ model_num_layers = 3
speaker_encoder/inference.py ADDED
@@ -0,0 +1,177 @@
1
+ from speaker_encoder.params_data import *
2
+ from speaker_encoder.model import SpeakerEncoder
3
+ from speaker_encoder.audio import preprocess_wav # We want to expose this function from here
4
+ from matplotlib import cm
5
+ from speaker_encoder import audio
6
+ from pathlib import Path
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import torch
10
+
11
+ _model = None # type: SpeakerEncoder
12
+ _device = None # type: torch.device
13
+
14
+
15
+ def load_model(weights_fpath: Path, device=None):
16
+ """
17
+ Loads the model in memory. If this function is not explicitly called, it will be run on the
18
+ first call to embed_frames() with the default weights file.
19
+
20
+ :param weights_fpath: the path to saved model weights.
21
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
22
+ model will be loaded and will run on this device. Outputs will however always be on the cpu.
23
+ If None, will default to your GPU if it's available, otherwise your CPU.
24
+ """
25
+ # TODO: I think the slow loading of the encoder might have something to do with the device it
26
+ # was saved on. Worth investigating.
27
+ global _model, _device
28
+ if device is None:
29
+ _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
+ elif isinstance(device, str):
31
+ _device = torch.device(device)
32
+ _model = SpeakerEncoder(_device, torch.device("cpu"))
33
+ checkpoint = torch.load(weights_fpath)
34
+ _model.load_state_dict(checkpoint["model_state"])
35
+ _model.eval()
36
+ print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
37
+
38
+
39
+ def is_loaded():
40
+ return _model is not None
41
+
42
+
43
+ def embed_frames_batch(frames_batch):
44
+ """
45
+ Computes embeddings for a batch of mel spectrograms.
46
+
47
+ :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
48
+ (batch_size, n_frames, n_channels)
49
+ :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
50
+ """
51
+ if _model is None:
52
+ raise Exception("Model was not loaded. Call load_model() before inference.")
53
+
54
+ frames = torch.from_numpy(frames_batch).to(_device)
55
+ embed = _model.forward(frames).detach().cpu().numpy()
56
+ return embed
57
+
58
+
59
+ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
60
+ min_pad_coverage=0.75, overlap=0.5):
61
+ """
62
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
63
+ partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
64
+ spectrogram slices are returned, so as to make each partial utterance waveform correspond to
65
+ its spectrogram. This function assumes that the mel spectrogram parameters used are those
66
+ defined in params_data.py.
67
+
68
+ The returned ranges may be indexing further than the length of the waveform. It is
69
+ recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
70
+
71
+ :param n_samples: the number of samples in the waveform
72
+ :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
73
+ utterance
74
+ :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
75
+ enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
76
+ then the last partial utterance will be considered, as if we padded the audio. Otherwise,
77
+ it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
78
+ utterance, this parameter is ignored so that the function always returns at least 1 slice.
79
+ :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
80
+ utterances are entirely disjoint.
81
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
82
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
83
+ utterances.
84
+ """
85
+ assert 0 <= overlap < 1
86
+ assert 0 < min_pad_coverage <= 1
87
+
88
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
89
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
90
+ frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
91
+
92
+ # Compute the slices
93
+ wav_slices, mel_slices = [], []
94
+ steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
95
+ for i in range(0, steps, frame_step):
96
+ mel_range = np.array([i, i + partial_utterance_n_frames])
97
+ wav_range = mel_range * samples_per_frame
98
+ mel_slices.append(slice(*mel_range))
99
+ wav_slices.append(slice(*wav_range))
100
+
101
+ # Evaluate whether extra padding is warranted or not
102
+ last_wav_range = wav_slices[-1]
103
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
104
+ if coverage < min_pad_coverage and len(mel_slices) > 1:
105
+ mel_slices = mel_slices[:-1]
106
+ wav_slices = wav_slices[:-1]
107
+
108
+ return wav_slices, mel_slices
109
+
110
+
111
+ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
112
+ """
113
+ Computes an embedding for a single utterance.
114
+
115
+ # TODO: handle multiple wavs to benefit from batching on GPU
116
+ :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
117
+ :param using_partials: if True, then the utterance is split in partial utterances of
118
+ <partial_utterance_n_frames> frames and the utterance embedding is computed from their
119
+ normalized average. If False, the utterance is instead computed from feeding the entire
120
+ spectrogram to the network.
121
+ :param return_partials: if True, the partial embeddings will also be returned along with the
122
+ wav slices that correspond to the partial embeddings.
123
+ :param kwargs: additional arguments to compute_partial_slices()
124
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
125
+ <return_partials> is True, the partial utterances as a numpy array of float32 of shape
126
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
127
+ returned. If <using_partials> is simultaneously set to False, both these values will be None
128
+ instead.
129
+ """
130
+ # Process the entire utterance if not using partials
131
+ if not using_partials:
132
+ frames = audio.wav_to_mel_spectrogram(wav)
133
+ embed = embed_frames_batch(frames[None, ...])[0]
134
+ if return_partials:
135
+ return embed, None, None
136
+ return embed
137
+
138
+ # Compute where to split the utterance into partials and pad if necessary
139
+ wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
140
+ max_wave_length = wave_slices[-1].stop
141
+ if max_wave_length >= len(wav):
142
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
143
+
144
+ # Split the utterance into partials
145
+ frames = audio.wav_to_mel_spectrogram(wav)
146
+ frames_batch = np.array([frames[s] for s in mel_slices])
147
+ partial_embeds = embed_frames_batch(frames_batch)
148
+
149
+ # Compute the utterance embedding from the partial embeddings
150
+ raw_embed = np.mean(partial_embeds, axis=0)
151
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
152
+
153
+ if return_partials:
154
+ return embed, partial_embeds, wave_slices
155
+ return embed
156
+
157
+
158
+ def embed_speaker(wavs, **kwargs):
159
+ raise NotImplementedError()
160
+
161
+
162
+ def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
163
+ if ax is None:
164
+ ax = plt.gca()
165
+
166
+ if shape is None:
167
+ height = int(np.sqrt(len(embed)))
168
+ shape = (height, -1)
169
+ embed = embed.reshape(shape)
170
+
171
+ cmap = cm.get_cmap()
172
+ mappable = ax.imshow(embed, cmap=cmap)
173
+ cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
174
+ mappable.set_clim(*color_range)  # Colorbar.set_clim was removed in newer Matplotlib; set limits on the mappable
175
+
176
+ ax.set_xticks([]), ax.set_yticks([])
177
+ ax.set_title(title)
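
A minimal sketch tying the inference helpers above together, assuming the pretrained checkpoint added in this commit and a hypothetical input file sample.wav:

from pathlib import Path
import numpy as np
from speaker_encoder import inference as encoder

# Load the encoder weights once; subsequent calls reuse the loaded model
encoder.load_model(Path("speaker_encoder/ckpt/pretrained_bak_5805000.pt"))

wav = encoder.preprocess_wav("sample.wav")   # "sample.wav" is a hypothetical input file
embed = encoder.embed_utterance(wav)         # L2-normed, shape (model_embedding_size,)
print(embed.shape, np.linalg.norm(embed))    # (256,) and ~1.0
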
speaker_encoder/model.py ADDED
@@ -0,0 +1,135 @@
1
+ from speaker_encoder.params_model import *
2
+ from speaker_encoder.params_data import *
3
+ from scipy.interpolate import interp1d
4
+ from sklearn.metrics import roc_curve
5
+ from torch.nn.utils import clip_grad_norm_
6
+ from scipy.optimize import brentq
7
+ from torch import nn
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ class SpeakerEncoder(nn.Module):
13
+ def __init__(self, device, loss_device):
14
+ super().__init__()
15
+ self.loss_device = loss_device
16
+
17
+ # Network definition
18
+ self.lstm = nn.LSTM(input_size=mel_n_channels, # 40
19
+ hidden_size=model_hidden_size, # 256
20
+ num_layers=model_num_layers, # 3
21
+ batch_first=True).to(device)
22
+ self.linear = nn.Linear(in_features=model_hidden_size,
23
+ out_features=model_embedding_size).to(device)
24
+ self.relu = torch.nn.ReLU().to(device)
25
+
26
+ # Cosine similarity scaling (with fixed initial parameter values)
27
+ self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
28
+ self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
29
+
30
+ # Loss
31
+ self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
32
+
33
+ def do_gradient_ops(self):
34
+ # Gradient scale
35
+ self.similarity_weight.grad *= 0.01
36
+ self.similarity_bias.grad *= 0.01
37
+
38
+ # Gradient clipping
39
+ clip_grad_norm_(self.parameters(), 3, norm_type=2)
40
+
41
+ def forward(self, utterances, hidden_init=None):
42
+ """
43
+ Computes the embeddings of a batch of utterance spectrograms.
44
+
45
+ :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
46
+ (batch_size, n_frames, n_channels)
47
+ :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
48
+ batch_size, hidden_size). Will default to a tensor of zeros if None.
49
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size)
50
+ """
51
+ # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
52
+ # and the final cell state.
53
+ out, (hidden, cell) = self.lstm(utterances, hidden_init)
54
+
55
+ # We take only the hidden state of the last layer
56
+ embeds_raw = self.relu(self.linear(hidden[-1]))
57
+
58
+ # L2-normalize it
59
+ embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
60
+
61
+ return embeds
62
+
63
+ def similarity_matrix(self, embeds):
64
+ """
65
+ Computes the similarity matrix according to section 2.1 of GE2E.
66
+
67
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
68
+ utterances_per_speaker, embedding_size)
69
+ :return: the similarity matrix as a tensor of shape (speakers_per_batch,
70
+ utterances_per_speaker, speakers_per_batch)
71
+ """
72
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
73
+
74
+ # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
75
+ centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
76
+ centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)
77
+
78
+ # Exclusive centroids (1 per utterance)
79
+ centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
80
+ centroids_excl /= (utterances_per_speaker - 1)
81
+ centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)
82
+
83
+ # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
84
+ # product of these vectors (which is just an element-wise multiplication reduced by a sum).
85
+ # We vectorize the computation for efficiency.
86
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
87
+ speakers_per_batch).to(self.loss_device)
88
+ mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)  # np.int alias was removed in recent NumPy
89
+ for j in range(speakers_per_batch):
90
+ mask = np.where(mask_matrix[j])[0]
91
+ sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
92
+ sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
93
+
94
+ ## Even more vectorized version (slower maybe because of transpose)
95
+ # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
96
+ # ).to(self.loss_device)
97
+ # eye = np.eye(speakers_per_batch, dtype=np.int)
98
+ # mask = np.where(1 - eye)
99
+ # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
100
+ # mask = np.where(eye)
101
+ # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
102
+ # sim_matrix2 = sim_matrix2.transpose(1, 2)
103
+
104
+ sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
105
+ return sim_matrix
106
+
107
+ def loss(self, embeds):
108
+ """
109
+ Computes the softmax loss according to section 2.1 of GE2E.
110
+
111
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
112
+ utterances_per_speaker, embedding_size)
113
+ :return: the loss and the EER for this batch of embeddings.
114
+ """
115
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
116
+
117
+ # Loss
118
+ sim_matrix = self.similarity_matrix(embeds)
119
+ sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
120
+ speakers_per_batch))
121
+ ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
122
+ target = torch.from_numpy(ground_truth).long().to(self.loss_device)
123
+ loss = self.loss_fn(sim_matrix, target)
124
+
125
+ # EER (not backpropagated)
126
+ with torch.no_grad():
127
+ inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]  # np.int alias was removed in recent NumPy
128
+ labels = np.array([inv_argmax(i) for i in ground_truth])
129
+ preds = sim_matrix.detach().cpu().numpy()
130
+
131
+ # Snippet from https://yangcha.github.io/EER-ROC/
132
+ fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
133
+ eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
134
+
135
+ return loss, eer
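
A short shape-check sketch for the GE2E similarity and loss above, assuming random embeddings and CPU for both the forward and the loss device (batch sizes here are illustrative, not the training defaults):

import torch
from speaker_encoder.model import SpeakerEncoder

device = loss_device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device)

# Fake batch: 4 speakers x 5 utterances of 160 frames with 40 mel channels
utterances = torch.rand(4 * 5, 160, 40)
embeds = model(utterances)          # (20, 256), L2-normed rows
embeds = embeds.view(4, 5, -1)      # (speakers_per_batch, utterances_per_speaker, embedding_size)
loss, eer = model.loss(embeds)
print(embeds.shape, float(loss), eer)
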
speaker_encoder/params_data.py ADDED
@@ -0,0 +1,29 @@
1
+
2
+ ## Mel-filterbank
3
+ mel_window_length = 25 # In milliseconds
4
+ mel_window_step = 10 # In milliseconds
5
+ mel_n_channels = 40
6
+
7
+
8
+ ## Audio
9
+ sampling_rate = 16000
10
+ # Number of spectrogram frames in a partial utterance
11
+ partials_n_frames = 160 # 1600 ms
12
+ # Number of spectrogram frames at inference
13
+ inference_n_frames = 80 # 800 ms
14
+
15
+
16
+ ## Voice Activation Detection
17
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
18
+ # This sets the granularity of the VAD. Should not need to be changed.
19
+ vad_window_length = 30 # In milliseconds
20
+ # Number of frames to average together when performing the moving average smoothing.
21
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
22
+ vad_moving_average_width = 8
23
+ # Maximum number of consecutive silent frames a segment can have.
24
+ vad_max_silence_length = 6
25
+
26
+
27
+ ## Audio volume normalization
28
+ audio_norm_target_dBFS = -30
29
+
speaker_encoder/params_model.py ADDED
@@ -0,0 +1,11 @@
1
+
2
+ ## Model parameters
3
+ model_hidden_size = 256
4
+ model_embedding_size = 256
5
+ model_num_layers = 3
6
+
7
+
8
+ ## Training parameters
9
+ learning_rate_init = 1e-4
10
+ speakers_per_batch = 64
11
+ utterances_per_speaker = 10
speaker_encoder/preprocess.py ADDED
@@ -0,0 +1,285 @@
1
+ from multiprocess.pool import ThreadPool
2
+ from speaker_encoder.params_data import *
3
+ from speaker_encoder.config import librispeech_datasets, anglophone_nationalites
4
+ from datetime import datetime
5
+ from speaker_encoder import audio
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+ import numpy as np
9
+
10
+
11
+ class DatasetLog:
12
+ """
13
+ Registers metadata about the dataset in a text file.
14
+ """
15
+ def __init__(self, root, name):
16
+ self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
17
+ self.sample_data = dict()
18
+
19
+ start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
20
+ self.write_line("Creating dataset %s on %s" % (name, start_time))
21
+ self.write_line("-----")
22
+ self._log_params()
23
+
24
+ def _log_params(self):
25
+ from speaker_encoder import params_data
26
+ self.write_line("Parameter values:")
27
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
28
+ value = getattr(params_data, param_name)
29
+ self.write_line("\t%s: %s" % (param_name, value))
30
+ self.write_line("-----")
31
+
32
+ def write_line(self, line):
33
+ self.text_file.write("%s\n" % line)
34
+
35
+ def add_sample(self, **kwargs):
36
+ for param_name, value in kwargs.items():
37
+ if not param_name in self.sample_data:
38
+ self.sample_data[param_name] = []
39
+ self.sample_data[param_name].append(value)
40
+
41
+ def finalize(self):
42
+ self.write_line("Statistics:")
43
+ for param_name, values in self.sample_data.items():
44
+ self.write_line("\t%s:" % param_name)
45
+ self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
46
+ self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
47
+ self.write_line("-----")
48
+ end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
49
+ self.write_line("Finished on %s" % end_time)
50
+ self.text_file.close()
51
+
52
+
53
+ def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
54
+ dataset_root = datasets_root.joinpath(dataset_name)
55
+ if not dataset_root.exists():
56
+ print("Couldn\'t find %s, skipping this dataset." % dataset_root)
57
+ return None, None
58
+ return dataset_root, DatasetLog(out_dir, dataset_name)
59
+
60
+
61
+ def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
62
+ skip_existing, logger):
63
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
64
+
65
+ # Function to preprocess utterances for one speaker
66
+ def preprocess_speaker(speaker_dir: Path):
67
+ # Give a name to the speaker that includes its dataset
68
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
69
+
70
+ # Create an output directory with that name, as well as a txt file containing a
71
+ # reference to each source file.
72
+ speaker_out_dir = out_dir.joinpath(speaker_name)
73
+ speaker_out_dir.mkdir(exist_ok=True)
74
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
75
+
76
+ # There's a possibility that the preprocessing was interrupted earlier, check if
77
+ # there already is a sources file.
78
+ if sources_fpath.exists():
79
+ try:
80
+ with sources_fpath.open("r") as sources_file:
81
+ existing_fnames = {line.split(",")[0] for line in sources_file}
82
+ except:
83
+ existing_fnames = {}
84
+ else:
85
+ existing_fnames = {}
86
+
87
+ # Gather all audio files for that speaker recursively
88
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
89
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
90
+ # Check if the target output file already exists
91
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
92
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
93
+ if skip_existing and out_fname in existing_fnames:
94
+ continue
95
+
96
+ # Load and preprocess the waveform
97
+ wav = audio.preprocess_wav(in_fpath)
98
+ if len(wav) == 0:
99
+ continue
100
+
101
+ # Create the mel spectrogram, discard those that are too short
102
+ frames = audio.wav_to_mel_spectrogram(wav)
103
+ if len(frames) < partials_n_frames:
104
+ continue
105
+
106
+ out_fpath = speaker_out_dir.joinpath(out_fname)
107
+ np.save(out_fpath, frames)
108
+ logger.add_sample(duration=len(wav) / sampling_rate)
109
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
110
+
111
+ sources_file.close()
112
+
113
+ # Process the utterances for each speaker
114
+ with ThreadPool(8) as pool:
115
+ list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
116
+ unit="speakers"))
117
+ logger.finalize()
118
+ print("Done preprocessing %s.\n" % dataset_name)
119
+
120
+
121
+ # Function to preprocess utterances for one speaker
122
+ def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool):
123
+ # Give a name to the speaker that includes its dataset
124
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
125
+
126
+ # Create an output directory with that name, as well as a txt file containing a
127
+ # reference to each source file.
128
+ speaker_out_dir = out_dir.joinpath(speaker_name)
129
+ speaker_out_dir.mkdir(exist_ok=True)
130
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
131
+
132
+ # There's a possibility that the preprocessing was interrupted earlier, check if
133
+ # there already is a sources file.
134
+ # if sources_fpath.exists():
135
+ # try:
136
+ # with sources_fpath.open("r") as sources_file:
137
+ # existing_fnames = {line.split(",")[0] for line in sources_file}
138
+ # except:
139
+ # existing_fnames = {}
140
+ # else:
141
+ # existing_fnames = {}
142
+ existing_fnames = {}
143
+ # Gather all audio files for that speaker recursively
144
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
145
+
146
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
147
+ # Check if the target output file already exists
148
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
149
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
150
+ if skip_existing and out_fname in existing_fnames:
151
+ continue
152
+
153
+ # Load and preprocess the waveform
154
+ wav = audio.preprocess_wav(in_fpath)
155
+ if len(wav) == 0:
156
+ continue
157
+
158
+ # Create the mel spectrogram, discard those that are too short
159
+ frames = audio.wav_to_mel_spectrogram(wav)
160
+ if len(frames) < partials_n_frames:
161
+ continue
162
+
163
+ out_fpath = speaker_out_dir.joinpath(out_fname)
164
+ np.save(out_fpath, frames)
165
+ # logger.add_sample(duration=len(wav) / sampling_rate)
166
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
167
+
168
+ sources_file.close()
169
+ return len(wav)
170
+
171
+ def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
172
+ skip_existing, logger):
173
+ # from multiprocessing import Pool, cpu_count
174
+ from pathos.multiprocessing import ProcessingPool as Pool
175
+ # Function to preprocess utterances for one speaker
176
+ def __preprocess_speaker(speaker_dir: Path):
177
+ # Give a name to the speaker that includes its dataset
178
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
179
+
180
+ # Create an output directory with that name, as well as a txt file containing a
181
+ # reference to each source file.
182
+ speaker_out_dir = out_dir.joinpath(speaker_name)
183
+ speaker_out_dir.mkdir(exist_ok=True)
184
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
185
+
186
+ existing_fnames = {}
187
+ # Gather all audio files for that speaker recursively
188
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
189
+ wav_lens = []
190
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
191
+ # Check if the target output file already exists
192
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
193
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
194
+ if skip_existing and out_fname in existing_fnames:
195
+ continue
196
+
197
+ # Load and preprocess the waveform
198
+ wav = audio.preprocess_wav(in_fpath)
199
+ if len(wav) == 0:
200
+ continue
201
+
202
+ # Create the mel spectrogram, discard those that are too short
203
+ frames = audio.wav_to_mel_spectrogram(wav)
204
+ if len(frames) < partials_n_frames:
205
+ continue
206
+
207
+ out_fpath = speaker_out_dir.joinpath(out_fname)
208
+ np.save(out_fpath, frames)
209
+ # logger.add_sample(duration=len(wav) / sampling_rate)
210
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
211
+ wav_lens.append(len(wav))
212
+ sources_file.close()
213
+ return wav_lens
214
+
215
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
216
+ # Process the utterances for each speaker
217
+ # with ThreadPool(8) as pool:
218
+ # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
219
+ # unit="speakers"))
220
+ pool = Pool(processes=20)
221
+ for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1):
222
+ for wav_len in wav_lens:
223
+ logger.add_sample(duration=wav_len / sampling_rate)
224
+ print(f'{i}/{len(speaker_dirs)} \r')
225
+
226
+ logger.finalize()
227
+ print("Done preprocessing %s.\n" % dataset_name)
228
+
229
+
230
+ def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
231
+ for dataset_name in librispeech_datasets["train"]["other"]:
232
+ # Initialize the preprocessing
233
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
234
+ if not dataset_root:
235
+ return
236
+
237
+ # Preprocess all speakers
238
+ speaker_dirs = list(dataset_root.glob("*"))
239
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
240
+ skip_existing, logger)
241
+
242
+
243
+ def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
244
+ # Initialize the preprocessing
245
+ dataset_name = "VoxCeleb1"
246
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
247
+ if not dataset_root:
248
+ return
249
+
250
+ # Get the contents of the meta file
251
+ with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
252
+ metadata = [line.split("\t") for line in metafile][1:]
253
+
254
+ # Select the ID and the nationality, filter out non-anglophone speakers
255
+ nationalities = {line[0]: line[3] for line in metadata}
256
+ # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
257
+ # nationality.lower() in anglophone_nationalites]
258
+ keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()]
259
+ print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
260
+ (len(keep_speaker_ids), len(nationalities)))
261
+
262
+ # Get the speaker directories for anglophone speakers only
263
+ speaker_dirs = dataset_root.joinpath("wav").glob("*")
264
+ speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
265
+ speaker_dir.name in keep_speaker_ids]
266
+ print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
267
+ (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
268
+
269
+ # Preprocess all speakers
270
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
271
+ skip_existing, logger)
272
+
273
+
274
+ def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
275
+ # Initialize the preprocessing
276
+ dataset_name = "VoxCeleb2"
277
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
278
+ if not dataset_root:
279
+ return
280
+
281
+ # Get the speaker directories
282
+ # Preprocess all speakers
283
+ speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
284
+ _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
285
+ skip_existing, logger)
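
A hedged sketch of how the preprocessing entry points above are usually driven, assuming hypothetical dataset and output directories laid out as in speaker_encoder/config.py:

from pathlib import Path
from speaker_encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1

datasets_root = Path("datasets")            # hypothetical root holding LibriSpeech/, VoxCeleb1/, ...
out_dir = Path("datasets/SV2TTS/encoder")   # hypothetical output root for the .npy mel frames
out_dir.mkdir(parents=True, exist_ok=True)

# Each call writes per-speaker directories of mel frames plus a Log_*.txt into out_dir
preprocess_librispeech(datasets_root, out_dir, skip_existing=True)
preprocess_voxceleb1(datasets_root, out_dir, skip_existing=True)
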
speaker_encoder/train.py ADDED
@@ -0,0 +1,125 @@
1
+ from speaker_encoder.visualizations import Visualizations
2
+ from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
3
+ from speaker_encoder.params_model import *
4
+ from speaker_encoder.model import SpeakerEncoder
5
+ from utils.profiler import Profiler
6
+ from pathlib import Path
7
+ import torch
8
+
9
+ def sync(device: torch.device):
10
+ # FIXME
11
+ return
12
+ # For correct profiling (cuda operations are async)
13
+ if device.type == "cuda":
14
+ torch.cuda.synchronize(device)
15
+
16
+ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
17
+ backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
18
+ no_visdom: bool):
19
+ # Create a dataset and a dataloader
20
+ dataset = SpeakerVerificationDataset(clean_data_root)
21
+ loader = SpeakerVerificationDataLoader(
22
+ dataset,
23
+ speakers_per_batch, # 64
24
+ utterances_per_speaker, # 10
25
+ num_workers=8,
26
+ )
27
+
28
+ # Setup the device on which to run the forward pass and the loss. These can be different,
29
+ # because the forward pass is faster on the GPU whereas the loss is often (depending on your
30
+ # hyperparameters) faster on the CPU.
31
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32
+ # FIXME: currently, the gradient is None if loss_device is cuda
33
+ loss_device = torch.device("cpu")
34
+
35
+ # Create the model and the optimizer
36
+ model = SpeakerEncoder(device, loss_device)
37
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
38
+ init_step = 1
39
+
40
+ # Configure file path for the model
41
+ state_fpath = models_dir.joinpath(run_id + ".pt")
42
+ backup_dir = models_dir.joinpath(run_id + "_backups")
43
+
44
+ # Load any existing model
45
+ if not force_restart:
46
+ if state_fpath.exists():
47
+ print("Found existing model \"%s\", loading it and resuming training." % run_id)
48
+ checkpoint = torch.load(state_fpath)
49
+ init_step = checkpoint["step"]
50
+ model.load_state_dict(checkpoint["model_state"])
51
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
52
+ optimizer.param_groups[0]["lr"] = learning_rate_init
53
+ else:
54
+ print("No model \"%s\" found, starting training from scratch." % run_id)
55
+ else:
56
+ print("Starting the training from scratch.")
57
+ model.train()
58
+
59
+ # Initialize the visualization environment
60
+ vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
61
+ vis.log_dataset(dataset)
62
+ vis.log_params()
63
+ device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
64
+ vis.log_implementation({"Device": device_name})
65
+
66
+ # Training loop
67
+ profiler = Profiler(summarize_every=10, disabled=False)
68
+ for step, speaker_batch in enumerate(loader, init_step):
69
+ profiler.tick("Blocking, waiting for batch (threaded)")
70
+
71
+ # Forward pass
72
+ inputs = torch.from_numpy(speaker_batch.data).to(device)
73
+ sync(device)
74
+ profiler.tick("Data to %s" % device)
75
+ embeds = model(inputs)
76
+ sync(device)
77
+ profiler.tick("Forward pass")
78
+ embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
79
+ loss, eer = model.loss(embeds_loss)
80
+ sync(loss_device)
81
+ profiler.tick("Loss")
82
+
83
+ # Backward pass
84
+ model.zero_grad()
85
+ loss.backward()
86
+ profiler.tick("Backward pass")
87
+ model.do_gradient_ops()
88
+ optimizer.step()
89
+ profiler.tick("Parameter update")
90
+
91
+ # Update visualizations
92
+ # learning_rate = optimizer.param_groups[0]["lr"]
93
+ vis.update(loss.item(), eer, step)
94
+
95
+ # Draw projections and save them to the backup folder
96
+ if umap_every != 0 and step % umap_every == 0:
97
+ print("Drawing and saving projections (step %d)" % step)
98
+ backup_dir.mkdir(exist_ok=True)
99
+ projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
100
+ embeds = embeds.detach().cpu().numpy()
101
+ vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
102
+ vis.save()
103
+
104
+ # Overwrite the latest version of the model
105
+ if save_every != 0 and step % save_every == 0:
106
+ print("Saving the model (step %d)" % step)
107
+ torch.save({
108
+ "step": step + 1,
109
+ "model_state": model.state_dict(),
110
+ "optimizer_state": optimizer.state_dict(),
111
+ }, state_fpath)
112
+
113
+ # Make a backup
114
+ if backup_every != 0 and step % backup_every == 0:
115
+ print("Making a backup (step %d)" % step)
116
+ backup_dir.mkdir(exist_ok=True)
117
+ backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
118
+ torch.save({
119
+ "step": step + 1,
120
+ "model_state": model.state_dict(),
121
+ "optimizer_state": optimizer.state_dict(),
122
+ }, backup_fpath)
123
+
124
+ profiler.tick("Extras (visualizations, saving)")
125
+
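
A sketch of a typical call into the training loop above, with hypothetical run and directory names; the loop runs until interrupted, and visdom is disabled here so no server is required:

from pathlib import Path
from speaker_encoder.train import train

train(run_id="my_run",                                   # hypothetical run name
      clean_data_root=Path("datasets/SV2TTS/encoder"),   # preprocessed speakers (hypothetical path)
      models_dir=Path("saved_models"),                   # hypothetical, must exist before saving
      umap_every=100, save_every=500, backup_every=5000, vis_every=10,
      force_restart=False, visdom_server="http://localhost", no_visdom=True)
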
speaker_encoder/visualizations.py ADDED
@@ -0,0 +1,178 @@
1
+ from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
2
+ from datetime import datetime
3
+ from time import perf_counter as timer
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ # import webbrowser
7
+ import visdom
8
+ import umap
9
+
10
+ colormap = np.array([
11
+ [76, 255, 0],
12
+ [0, 127, 70],
13
+ [255, 0, 0],
14
+ [255, 217, 38],
15
+ [0, 135, 255],
16
+ [165, 0, 165],
17
+ [255, 167, 255],
18
+ [0, 255, 255],
19
+ [255, 96, 38],
20
+ [142, 76, 0],
21
+ [33, 0, 127],
22
+ [0, 0, 0],
23
+ [183, 183, 183],
24
+ ], dtype=float) / 255  # np.float alias was removed in recent NumPy
25
+
26
+
27
+ class Visualizations:
28
+ def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
29
+ # Tracking data
30
+ self.last_update_timestamp = timer()
31
+ self.update_every = update_every
32
+ self.step_times = []
33
+ self.losses = []
34
+ self.eers = []
35
+ print("Updating the visualizations every %d steps." % update_every)
36
+
37
+ # If visdom is disabled TODO: use a better paradigm for that
38
+ self.disabled = disabled
39
+ if self.disabled:
40
+ return
41
+
42
+ # Set the environment name
43
+ now = str(datetime.now().strftime("%d-%m %Hh%M"))
44
+ if env_name is None:
45
+ self.env_name = now
46
+ else:
47
+ self.env_name = "%s (%s)" % (env_name, now)
48
+
49
+ # Connect to visdom and open the corresponding window in the browser
50
+ try:
51
+ self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
52
+ except ConnectionError:
53
+ raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
54
+ "start it.")
55
+ # webbrowser.open("http://localhost:8097/env/" + self.env_name)
56
+
57
+ # Create the windows
58
+ self.loss_win = None
59
+ self.eer_win = None
60
+ # self.lr_win = None
61
+ self.implementation_win = None
62
+ self.projection_win = None
63
+ self.implementation_string = ""
64
+
65
+ def log_params(self):
66
+ if self.disabled:
67
+ return
68
+ from speaker_encoder import params_data
69
+ from speaker_encoder import params_model
70
+ param_string = "<b>Model parameters</b>:<br>"
71
+ for param_name in (p for p in dir(params_model) if not p.startswith("__")):
72
+ value = getattr(params_model, param_name)
73
+ param_string += "\t%s: %s<br>" % (param_name, value)
74
+ param_string += "<b>Data parameters</b>:<br>"
75
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
76
+ value = getattr(params_data, param_name)
77
+ param_string += "\t%s: %s<br>" % (param_name, value)
78
+ self.vis.text(param_string, opts={"title": "Parameters"})
79
+
80
+ def log_dataset(self, dataset: SpeakerVerificationDataset):
81
+ if self.disabled:
82
+ return
83
+ dataset_string = ""
84
+ dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
85
+ dataset_string += "\n" + dataset.get_logs()
86
+ dataset_string = dataset_string.replace("\n", "<br>")
87
+ self.vis.text(dataset_string, opts={"title": "Dataset"})
88
+
89
+ def log_implementation(self, params):
90
+ if self.disabled:
91
+ return
92
+ implementation_string = ""
93
+ for param, value in params.items():
94
+ implementation_string += "<b>%s</b>: %s\n" % (param, value)
95
+ implementation_string = implementation_string.replace("\n", "<br>")
96
+ self.implementation_string = implementation_string
97
+ self.implementation_win = self.vis.text(
98
+ implementation_string,
99
+ opts={"title": "Training implementation"}
100
+ )
101
+
102
+ def update(self, loss, eer, step):
103
+ # Update the tracking data
104
+ now = timer()
105
+ self.step_times.append(1000 * (now - self.last_update_timestamp))
106
+ self.last_update_timestamp = now
107
+ self.losses.append(loss)
108
+ self.eers.append(eer)
109
+ print(".", end="")
110
+
111
+ # Update the plots every <update_every> steps
112
+ if step % self.update_every != 0:
113
+ return
114
+ time_string = "Step time: mean: %5dms std: %5dms" % \
115
+ (int(np.mean(self.step_times)), int(np.std(self.step_times)))
116
+ print("\nStep %6d Loss: %.4f EER: %.4f %s" %
117
+ (step, np.mean(self.losses), np.mean(self.eers), time_string))
118
+ if not self.disabled:
119
+ self.loss_win = self.vis.line(
120
+ [np.mean(self.losses)],
121
+ [step],
122
+ win=self.loss_win,
123
+ update="append" if self.loss_win else None,
124
+ opts=dict(
125
+ legend=["Avg. loss"],
126
+ xlabel="Step",
127
+ ylabel="Loss",
128
+ title="Loss",
129
+ )
130
+ )
131
+ self.eer_win = self.vis.line(
132
+ [np.mean(self.eers)],
133
+ [step],
134
+ win=self.eer_win,
135
+ update="append" if self.eer_win else None,
136
+ opts=dict(
137
+ legend=["Avg. EER"],
138
+ xlabel="Step",
139
+ ylabel="EER",
140
+ title="Equal error rate"
141
+ )
142
+ )
143
+ if self.implementation_win is not None:
144
+ self.vis.text(
145
+ self.implementation_string + ("<b>%s</b>" % time_string),
146
+ win=self.implementation_win,
147
+ opts={"title": "Training implementation"},
148
+ )
149
+
150
+ # Reset the tracking
151
+ self.losses.clear()
152
+ self.eers.clear()
153
+ self.step_times.clear()
154
+
155
+ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
156
+ max_speakers=10):
157
+ max_speakers = min(max_speakers, len(colormap))
158
+ embeds = embeds[:max_speakers * utterances_per_speaker]
159
+
160
+ n_speakers = len(embeds) // utterances_per_speaker
161
+ ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
162
+ colors = [colormap[i] for i in ground_truth]
163
+
164
+ reducer = umap.UMAP()
165
+ projected = reducer.fit_transform(embeds)
166
+ plt.scatter(projected[:, 0], projected[:, 1], c=colors)
167
+ plt.gca().set_aspect("equal", "datalim")
168
+ plt.title("UMAP projection (step %d)" % step)
169
+ if not self.disabled:
170
+ self.projection_win = self.vis.matplot(plt, win=self.projection_win)
171
+ if out_fpath is not None:
172
+ plt.savefig(out_fpath)
173
+ plt.clf()
174
+
175
+ def save(self):
176
+ if not self.disabled:
177
+ self.vis.save([self.env_name])
178
+
speaker_encoder/voice_encoder.py ADDED
@@ -0,0 +1,173 @@
1
+ from speaker_encoder.hparams import *
2
+ from speaker_encoder import audio
3
+ from pathlib import Path
4
+ from typing import Union, List
5
+ from torch import nn
6
+ from time import perf_counter as timer
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ class SpeakerEncoder(nn.Module):
12
+ def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True):
13
+ """
14
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
15
+ If None, defaults to cuda if it is available on your machine, otherwise the model will
16
+ run on cpu. Outputs are always returned on the cpu, as numpy arrays.
17
+ """
18
+ super().__init__()
19
+
20
+ # Define the network
21
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
22
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
23
+ self.relu = nn.ReLU()
24
+
25
+ # Get the target device
26
+ if device is None:
27
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28
+ elif isinstance(device, str):
29
+ device = torch.device(device)
30
+ self.device = device
31
+
32
+ # Load the pretrained model'speaker weights
33
+ # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt")
34
+ # if not weights_fpath.exists():
35
+ # raise Exception("Couldn't find the voice encoder pretrained model at %s." %
36
+ # weights_fpath)
37
+
38
+ start = timer()
39
+ checkpoint = torch.load(weights_fpath, map_location="cpu")
40
+
41
+ self.load_state_dict(checkpoint["model_state"], strict=False)
42
+ self.to(device)
43
+
44
+ if verbose:
45
+ print("Loaded the voice encoder model on %s in %.2f seconds." %
46
+ (device.type, timer() - start))
47
+
48
+ def forward(self, mels: torch.FloatTensor):
49
+ """
50
+ Computes the embeddings of a batch of utterance spectrograms.
51
+ :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape
52
+ (batch_size, n_frames, n_channels)
53
+ :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size).
54
+ Embeddings are positive and L2-normed, thus they lay in the range [0, 1].
55
+ """
56
+ # Pass the input through the LSTM layers and retrieve the final hidden state of the last
57
+ # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings.
58
+ _, (hidden, _) = self.lstm(mels)
59
+ embeds_raw = self.relu(self.linear(hidden[-1]))
60
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
61
+
62
+ @staticmethod
63
+ def compute_partial_slices(n_samples: int, rate, min_coverage):
64
+ """
65
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to
66
+ obtain partial utterances of <partials_n_frames> each. Both the waveform and the
67
+ mel spectrogram slices are returned, so as to make each partial utterance waveform
68
+ correspond to its spectrogram.
69
+
70
+ The returned ranges may be indexing further than the length of the waveform. It is
71
+ recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
72
+
73
+ :param n_samples: the number of samples in the waveform
74
+ :param rate: how many partial utterances should occur per second. Partial utterances must
75
+ cover the span of the entire utterance, thus the rate should not be lower than the inverse
76
+ of the duration of a partial utterance. By default, partial utterances are 1.6s long and
77
+ the minimum rate is thus 0.625.
78
+ :param min_coverage: when reaching the last partial utterance, it may or may not have
79
+ enough frames. If at least <min_coverage> of <partials_n_frames> are present,
80
+ then the last partial utterance is considered, as if the audio had been zero-padded. Otherwise,
81
+ it will be discarded. If there aren't enough frames for one partial utterance,
82
+ this parameter is ignored so that the function always returns at least one slice.
83
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
84
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
85
+ utterances.
86
+ """
87
+ assert 0 < min_coverage <= 1
88
+
89
+ # Compute how many frames separate two partial utterances
90
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
91
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
92
+ frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
93
+ assert 0 < frame_step, "The rate is too high"
94
+ assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
95
+ (sampling_rate / (samples_per_frame * partials_n_frames))
96
+
97
+ # Compute the slices
98
+ wav_slices, mel_slices = [], []
99
+ steps = max(1, n_frames - partials_n_frames + frame_step + 1)
100
+ for i in range(0, steps, frame_step):
101
+ mel_range = np.array([i, i + partials_n_frames])
102
+ wav_range = mel_range * samples_per_frame
103
+ mel_slices.append(slice(*mel_range))
104
+ wav_slices.append(slice(*wav_range))
105
+
106
+ # Evaluate whether extra padding is warranted or not
107
+ last_wav_range = wav_slices[-1]
108
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
109
+ if coverage < min_coverage and len(mel_slices) > 1:
110
+ mel_slices = mel_slices[:-1]
111
+ wav_slices = wav_slices[:-1]
112
+
113
+ return wav_slices, mel_slices
114
+
115
+ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
116
+ """
117
+ Computes an embedding for a single utterance. The utterance is divided in partial
118
+ utterances and an embedding is computed for each. The complete utterance embedding is the
119
+ L2-normed average embedding of the partial utterances.
120
+
121
+ TODO: independent batched version of this function
122
+
123
+ :param wav: a preprocessed utterance waveform as a numpy array of float32
124
+ :param return_partials: if True, the partial embeddings will also be returned along with
125
+ the wav slices corresponding to each partial utterance.
126
+ :param rate: how many partial utterances should occur per second. Partial utterances must
127
+ cover the span of the entire utterance, thus the rate should not be lower than the inverse
128
+ of the duration of a partial utterance. By default, partial utterances are 1.6s long and
129
+ the minimum rate is thus 0.625.
130
+ :param min_coverage: when reaching the last partial utterance, it may or may not have
131
+ enough frames. If at least <min_coverage> of <partials_n_frames> are present,
132
+ then the last partial utterance is considered, as if the audio had been zero-padded. Otherwise,
133
+ it will be discarded. If there aren't enough frames for one partial utterance,
134
+ this parameter is ignored so that the function always returns at least one slice.
135
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
136
+ <return_partials> is True, the partial utterances as a numpy array of float32 of shape
137
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
138
+ returned.
139
+ """
140
+ # Compute where to split the utterance into partials and pad the waveform with zeros if
141
+ # the partial utterances cover a larger range.
142
+ wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
143
+ max_wave_length = wav_slices[-1].stop
144
+ if max_wave_length >= len(wav):
145
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
146
+
147
+ # Split the utterance into partials and forward them through the model
148
+ mel = audio.wav_to_mel_spectrogram(wav)
149
+ mels = np.array([mel[s] for s in mel_slices])
150
+ with torch.no_grad():
151
+ mels = torch.from_numpy(mels).to(self.device)
152
+ partial_embeds = self(mels).cpu().numpy()
153
+
154
+ # Compute the utterance embedding from the partial embeddings
155
+ raw_embed = np.mean(partial_embeds, axis=0)
156
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
157
+
158
+ if return_partials:
159
+ return embed, partial_embeds, wav_slices
160
+ return embed
161
+
162
+ def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
163
+ """
164
+ Compute the embedding of a collection of wavs (presumably from the same speaker) by
165
+ averaging their embedding and L2-normalizing it.
166
+
167
+ :param wavs: list of wavs as numpy arrays of float32.
168
+ :param kwargs: extra arguments to embed_utterance()
169
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
170
+ """
171
+ raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \
172
+ for wav in wavs], axis=0)
173
+ return raw_embed / np.linalg.norm(raw_embed, 2)
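
Note: a minimal usage sketch for the encoder above. It assumes librosa is available for loading audio and that a pretrained checkpoint exists at the path shown (both are assumptions, not something this file guarantees); the waveform is expected at the sampling_rate defined in speaker_encoder/hparams.py.

import numpy as np
import librosa

from speaker_encoder.hparams import sampling_rate
from speaker_encoder.voice_encoder import SpeakerEncoder

# Hypothetical checkpoint path -- substitute the actual pretrained weights file.
encoder = SpeakerEncoder("speaker_encoder/ckpt/pretrained.pt")

# Load an utterance as a float32 waveform at the encoder's expected sampling rate.
wav, _ = librosa.load("example.wav", sr=sampling_rate)
wav = wav.astype(np.float32)

# L2-normalized speaker embedding of shape (model_embedding_size,).
embed = encoder.embed_utterance(wav)
print(embed.shape, np.linalg.norm(embed))  # norm is ~1.0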
tts_voice.py ADDED
@@ -0,0 +1,290 @@
1
+ tts_order_voice = {'英语 (美国)-Jenny-女': 'en-US-JennyNeural',
2
+ '英语 (美国)-Guy-男': 'en-US-GuyNeural',
3
+ '英语 (美国)-Ana-女': 'en-US-AnaNeural',
4
+ '英语 (美国)-Aria-女': 'en-US-AriaNeural',
5
+ '英语 (美国)-Christopher-男': 'en-US-ChristopherNeural',
6
+ '英语 (美国)-Eric-男': 'en-US-EricNeural',
7
+ '英语 (美国)-Michelle-女': 'en-US-MichelleNeural',
8
+ '英语 (美国)-Roger-男': 'en-US-RogerNeural',
9
+ '西班牙语 (墨西哥)-Dalia-女': 'es-MX-DaliaNeural',
10
+ '西班牙语 (墨西哥)-Jorge-男': 'es-MX-JorgeNeural',
11
+ '韩语 (韩国)-Sun-Hi-女': 'ko-KR-SunHiNeural',
12
+ '韩语 (韩国)-InJoon-男': 'ko-KR-InJoonNeural',
13
+ '泰语 (泰国)-Premwadee-女': 'th-TH-PremwadeeNeural',
14
+ '泰语 (泰国)-Niwat-男': 'th-TH-NiwatNeural',
15
+ '越南语 (越南)-HoaiMy-女': 'vi-VN-HoaiMyNeural',
16
+ '越南语 (越南)-NamMinh-男': 'vi-VN-NamMinhNeural',
17
+ '日语 (日本)-Nanami-女': 'ja-JP-NanamiNeural',
18
+ '日语 (日本)-Keita-男': 'ja-JP-KeitaNeural',
19
+ '法语 (法国)-Denise-女': 'fr-FR-DeniseNeural',
20
+ '法语 (法国)-Eloise-女': 'fr-FR-EloiseNeural',
21
+ '法语 (法国)-Henri-男': 'fr-FR-HenriNeural',
22
+ '葡萄牙语 (巴西)-Francisca-女': 'pt-BR-FranciscaNeural',
23
+ '葡萄牙语 (巴西)-Antonio-男': 'pt-BR-AntonioNeural',
24
+ '印度尼西亚语 (印度尼西亚)-Ardi-男': 'id-ID-ArdiNeural',
25
+ '印度尼西亚语 (印度尼西亚)-Gadis-女': 'id-ID-GadisNeural',
26
+ '希伯来语 (以色列)-Avri-男': 'he-IL-AvriNeural',
27
+ '希伯来语 (以色列)-Hila-女': 'he-IL-HilaNeural',
28
+ '意大利语 (意大利)-Isabella-女': 'it-IT-IsabellaNeural',
29
+ '意大利语 (意大利)-Diego-男': 'it-IT-DiegoNeural',
30
+ '意大利语 (意大利)-Elsa-女': 'it-IT-ElsaNeural',
31
+ '荷兰语 (荷兰)-Colette-女': 'nl-NL-ColetteNeural',
32
+ '荷兰语 (荷兰)-Fenna-女': 'nl-NL-FennaNeural',
33
+ '荷兰语 (荷兰)-Maarten-男': 'nl-NL-MaartenNeural',
34
+ '马来语 (马来西亚)-Osman-男': 'ms-MY-OsmanNeural',
35
+ '马来语 (马来西亚)-Yasmin-女': 'ms-MY-YasminNeural',
36
+ '挪威语 (挪威)-Pernille-女': 'nb-NO-PernilleNeural',
37
+ '挪威语 (挪威)-Finn-男': 'nb-NO-FinnNeural',
38
+ '瑞典语 (瑞典)-Sofie-女': 'sv-SE-SofieNeural',
39
+ '瑞典语 (瑞典)-Mattias-男': 'sv-SE-MattiasNeural',
40
+ '阿拉伯语 (沙特阿拉伯)-Hamed-男': 'ar-SA-HamedNeural',
41
+ '阿拉伯语 (沙特阿拉伯)-Zariyah-女': 'ar-SA-ZariyahNeural',
42
+ '希腊语 (希腊)-Athina-女': 'el-GR-AthinaNeural',
43
+ '希腊语 (希腊)-Nestoras-男': 'el-GR-NestorasNeural',
44
+ '德语 (德国)-Katja-女': 'de-DE-KatjaNeural',
45
+ '德语 (德国)-Amala-女': 'de-DE-AmalaNeural',
46
+ '德语 (德国)-Conrad-男': 'de-DE-ConradNeural',
47
+ '德语 (德国)-Killian-男': 'de-DE-KillianNeural',
48
+ '阿拉伯语 (南非)-Adri-女': 'af-ZA-AdriNeural',
49
+ '阿拉伯语 (南非)-Willem-男': 'af-ZA-WillemNeural',
50
+ '阿姆哈拉语 (埃塞俄比亚)-Ameha-男': 'am-ET-AmehaNeural',
51
+ '阿姆哈拉语 (埃塞俄比亚)-Mekdes-女': 'am-ET-MekdesNeural',
52
+ '阿拉伯语 (阿拉伯联合酋长国)-Fatima-女': 'ar-AE-FatimaNeural',
53
+ '阿拉伯语 (阿拉伯联合酋长国)-Hamdan-男': 'ar-AE-HamdanNeural',
54
+ '阿拉伯语 (巴林)-Ali-男': 'ar-BH-AliNeural',
55
+ '阿拉伯语 (巴林)-Laila-女': 'ar-BH-LailaNeural',
56
+ '阿拉伯语 (阿尔及利亚)-Ismael-男': 'ar-DZ-IsmaelNeural',
57
+ '阿拉伯语 (埃及)-Salma-女': 'ar-EG-SalmaNeural',
58
+ '阿拉伯语 (埃及)-Shakir-男': 'ar-EG-ShakirNeural',
59
+ '阿拉伯语 (伊拉克)-Bassel-男': 'ar-IQ-BasselNeural',
60
+ '阿拉伯语 (伊拉克)-Rana-女': 'ar-IQ-RanaNeural',
61
+ '阿拉伯语 (约旦)-Sana-女': 'ar-JO-SanaNeural',
62
+ '阿拉伯语 (约旦)-Taim-男': 'ar-JO-TaimNeural',
63
+ '阿拉伯语 (科威特)-Fahed-男': 'ar-KW-FahedNeural',
64
+ '阿拉伯语 (科威特)-Noura-女': 'ar-KW-NouraNeural',
65
+ '阿拉伯语 (黎巴嫩)-Layla-女': 'ar-LB-LaylaNeural',
66
+ '阿拉伯语 (黎巴嫩)-Rami-男': 'ar-LB-RamiNeural',
67
+ '阿拉伯语 (利比亚)-Iman-女': 'ar-LY-ImanNeural',
68
+ '阿拉伯语 (利比亚)-Omar-男': 'ar-LY-OmarNeural',
69
+ '阿拉伯语 (摩洛哥)-Jamal-男': 'ar-MA-JamalNeural',
70
+ '阿拉伯语 (摩洛哥)-Mouna-女': 'ar-MA-MounaNeural',
71
+ '阿拉伯语 (阿曼)-Abdullah-男': 'ar-OM-AbdullahNeural',
72
+ '阿拉伯语 (阿曼)-Aysha-女': 'ar-OM-AyshaNeural',
73
+ '阿拉伯语 (卡塔尔)-Amal-女': 'ar-QA-AmalNeural',
74
+ '阿拉伯语 (卡塔尔)-Moaz-男': 'ar-QA-MoazNeural',
75
+ '阿拉伯语 (叙利亚)-Amany-女': 'ar-SY-AmanyNeural',
76
+ '阿拉伯语 (叙利亚)-Laith-男': 'ar-SY-LaithNeural',
77
+ '阿拉伯语 (突尼斯)-Hedi-男': 'ar-TN-HediNeural',
78
+ '阿拉伯语 (突尼斯)-Reem-女': 'ar-TN-ReemNeural',
79
+ '阿拉伯语 (也门)-Maryam-女': 'ar-YE-MaryamNeural',
80
+ '阿拉伯语 (也门)-Saleh-男': 'ar-YE-SalehNeural',
81
+ '阿塞拜疆语 (阿塞拜疆)-Babek-男': 'az-AZ-BabekNeural',
82
+ '阿塞拜疆语 (阿塞拜疆)-Banu-女': 'az-AZ-BanuNeural',
83
+ '保加利亚语 (保加利亚)-Borislav-男': 'bg-BG-BorislavNeural',
84
+ '保加利亚语 (保加利亚)-Kalina-女': 'bg-BG-KalinaNeural',
85
+ '孟加拉语 (孟加拉国)-Nabanita-女': 'bn-BD-NabanitaNeural',
86
+ '孟加拉语 (孟加拉国)-Pradeep-男': 'bn-BD-PradeepNeural',
87
+ '孟加拉语 (印度)-Bashkar-男': 'bn-IN-BashkarNeural',
88
+ '孟加拉语 (印度)-Tanishaa-女': 'bn-IN-TanishaaNeural',
89
+ '波斯尼亚语 (波斯尼亚和黑塞哥维那)-Goran-男': 'bs-BA-GoranNeural',
90
+ '波斯尼亚语 (波斯尼亚和黑塞哥维那)-Vesna-女': 'bs-BA-VesnaNeural',
91
+ '加泰罗尼亚语 (西班牙)-Joana-女': 'ca-ES-JoanaNeural',
92
+ '加泰罗尼亚语 (西班牙)-Enric-男': 'ca-ES-EnricNeural',
93
+ '捷克语 (捷克共和国)-Antonin-男': 'cs-CZ-AntoninNeural',
94
+ '捷克语 (捷克共和国)-Vlasta-女': 'cs-CZ-VlastaNeural',
95
+ '威尔士语 (英国)-Aled-男': 'cy-GB-AledNeural',
96
+ '威尔士语 (英国)-Nia-女': 'cy-GB-NiaNeural',
97
+ '丹麦语 (丹麦)-Christel-女': 'da-DK-ChristelNeural',
98
+ '丹麦语 (丹麦)-Jeppe-男': 'da-DK-JeppeNeural',
99
+ '德语 (奥地利)-Ingrid-女': 'de-AT-IngridNeural',
100
+ '德语 (奥地利)-Jonas-男': 'de-AT-JonasNeural',
101
+ '德语 (瑞士)-Jan-男': 'de-CH-JanNeural',
102
+ '德语 (瑞士)-Leni-女': 'de-CH-LeniNeural',
103
+ '英语 (澳大利亚)-Natasha-女': 'en-AU-NatashaNeural',
104
+ '英语 (澳大利亚)-William-男': 'en-AU-WilliamNeural',
105
+ '英语 (加拿大)-Clara-女': 'en-CA-ClaraNeural',
106
+ '英语 (加拿大)-Liam-男': 'en-CA-LiamNeural',
107
+ '英语 (英国)-Libby-女': 'en-GB-LibbyNeural',
108
+ '英语 (英国)-Maisie-女': 'en-GB-MaisieNeural',
109
+ '英语 (英国)-Ryan-男': 'en-GB-RyanNeural',
110
+ '英语 (英国)-Sonia-女': 'en-GB-SoniaNeural',
111
+ '英语 (英国)-Thomas-男': 'en-GB-ThomasNeural',
112
+ '英语 (香港)-Sam-男': 'en-HK-SamNeural',
113
+ '英语 (香港)-Yan-女': 'en-HK-YanNeural',
114
+ '英语 (爱尔兰)-Connor-男': 'en-IE-ConnorNeural',
115
+ '英语 (爱尔兰)-Emily-女': 'en-IE-EmilyNeural',
116
+ '英语 (印度)-Neerja-女': 'en-IN-NeerjaNeural',
117
+ '英语 (印度)-Prabhat-男': 'en-IN-PrabhatNeural',
118
+ '英语 (肯尼亚)-Asilia-女': 'en-KE-AsiliaNeural',
119
+ '英语 (肯尼亚)-Chilemba-男': 'en-KE-ChilembaNeural',
120
+ '英语 (尼日利亚)-Abeo-男': 'en-NG-AbeoNeural',
121
+ '英语 (尼日利亚)-Ezinne-女': 'en-NG-EzinneNeural',
122
+ '英语 (新西兰)-Mitchell-男': 'en-NZ-MitchellNeural',
123
+ '英语 (菲律宾)-James-男': 'en-PH-JamesNeural',
124
+ '英语 (菲律宾)-Rosa-女': 'en-PH-RosaNeural',
125
+ '英语 (新加坡)-Luna-女': 'en-SG-LunaNeural',
126
+ '英语 (新加坡)-Wayne-男': 'en-SG-WayneNeural',
127
+ '英语 (坦桑尼亚)-Elimu-男': 'en-TZ-ElimuNeural',
128
+ '英语 (坦桑尼亚)-Imani-女': 'en-TZ-ImaniNeural',
129
+ '英语 (南非)-Leah-女': 'en-ZA-LeahNeural',
130
+ '英语 (南非)-Luke-男': 'en-ZA-LukeNeural',
131
+ '西班牙语 (阿根廷)-Elena-女': 'es-AR-ElenaNeural',
132
+ '西班牙语 (阿根廷)-Tomas-男': 'es-AR-TomasNeural',
133
+ '西班牙语 (玻利维亚)-Marcelo-男': 'es-BO-MarceloNeural',
134
+ '西班牙语 (玻利维亚)-Sofia-女': 'es-BO-SofiaNeural',
135
+ '西班牙语 (哥伦比亚)-Gonzalo-男': 'es-CO-GonzaloNeural',
136
+ '西班牙语 (哥伦比亚)-Salome-女': 'es-CO-SalomeNeural',
137
+ '西班牙语 (哥斯达黎加)-Juan-男': 'es-CR-JuanNeural',
138
+ '西班牙语 (哥斯达黎加)-Maria-女': 'es-CR-MariaNeural',
139
+ '西班牙语 (古巴)-Belkys-女': 'es-CU-BelkysNeural',
140
+ '西班牙语 (多米尼加共和国)-Emilio-男': 'es-DO-EmilioNeural',
141
+ '西班牙语 (多米尼加共和国)-Ramona-女': 'es-DO-RamonaNeural',
142
+ '西班牙语 (厄瓜多尔)-Andrea-女': 'es-EC-AndreaNeural',
143
+ '西班牙语 (厄瓜多尔)-Luis-男': 'es-EC-LuisNeural',
144
+ '西班牙语 (西班牙)-Alvaro-男': 'es-ES-AlvaroNeural',
145
+ '西班牙语 (西班牙)-Elvira-女': 'es-ES-ElviraNeural',
146
+ '西班牙语 (赤道几内亚)-Teresa-女': 'es-GQ-TeresaNeural',
147
+ '西班牙语 (危地马拉)-Andres-男': 'es-GT-AndresNeural',
148
+ '西班牙语 (危地马拉)-Marta-女': 'es-GT-MartaNeural',
149
+ '西班牙语 (洪都拉斯)-Carlos-男': 'es-HN-CarlosNeural',
150
+ '西班牙语 (洪都拉斯)-Karla-女': 'es-HN-KarlaNeural',
151
+ '西班牙语 (尼加拉瓜)-Federico-男': 'es-NI-FedericoNeural',
152
+ '西班牙语 (尼加拉瓜)-Yolanda-女': 'es-NI-YolandaNeural',
153
+ '西班牙语 (巴拿马)-Margarita-女': 'es-PA-MargaritaNeural',
154
+ '西班牙语 (巴拿马)-Roberto-男': 'es-PA-RobertoNeural',
155
+ '西班牙语 (秘鲁)-Alex-男': 'es-PE-AlexNeural',
156
+ '西班牙语 (秘鲁)-Camila-女': 'es-PE-CamilaNeural',
157
+ '西班牙语 (波多黎各)-Karina-女': 'es-PR-KarinaNeural',
158
+ '西班牙语 (波多黎各)-Victor-男': 'es-PR-VictorNeural',
159
+ '西班牙语 (巴拉圭)-Mario-男': 'es-PY-MarioNeural',
160
+ '西班牙语 (巴拉圭)-Tania-女': 'es-PY-TaniaNeural',
161
+ '西班牙语 (萨尔瓦多)-Lorena-女': 'es-SV-LorenaNeural',
162
+ '西班牙语 (萨尔瓦多)-Rodrigo-男': 'es-SV-RodrigoNeural',
163
+ '西班牙语 (美国)-Alonso-男': 'es-US-AlonsoNeural',
164
+ '西班牙语 (美国)-Paloma-女': 'es-US-PalomaNeural',
165
+ '西班牙语 (乌拉圭)-Mateo-男': 'es-UY-MateoNeural',
166
+ '西班牙语 (乌拉圭)-Valentina-女': 'es-UY-ValentinaNeural',
167
+ '西班牙语 (委内瑞拉)-Paola-女': 'es-VE-PaolaNeural',
168
+ '西班牙语 (委内瑞拉)-Sebastian-男': 'es-VE-SebastianNeural',
169
+ '爱沙尼亚语 (爱沙尼亚)-Anu-女': 'et-EE-AnuNeural',
170
+ '爱沙尼亚语 (爱沙尼亚)-Kert-男': 'et-EE-KertNeural',
171
+ '波斯语 (伊朗)-Dilara-女': 'fa-IR-DilaraNeural',
172
+ '波斯语 (伊朗)-Farid-男': 'fa-IR-FaridNeural',
173
+ '芬兰语 (芬兰)-Harri-男': 'fi-FI-HarriNeural',
174
+ '芬兰语 (芬兰)-Noora-女': 'fi-FI-NooraNeural',
175
+ '法语 (比利时)-Charline-女': 'fr-BE-CharlineNeural',
176
+ '法语 (比利时)-Gerard-男': 'fr-BE-GerardNeural',
177
+ '法语 (加拿大)-Sylvie-女': 'fr-CA-SylvieNeural',
178
+ '法语 (加拿大)-Antoine-男': 'fr-CA-AntoineNeural',
179
+ '法语 (加拿大)-Jean-男': 'fr-CA-JeanNeural',
180
+ '法语 (瑞士)-Ariane-女': 'fr-CH-ArianeNeural',
181
+ '法语 (瑞士)-Fabrice-男': 'fr-CH-FabriceNeural',
182
+ '爱尔兰语 (爱尔兰)-Colm-男': 'ga-IE-ColmNeural',
183
+ '爱尔兰语 (爱尔兰)-Orla-女': 'ga-IE-OrlaNeural',
184
+ '加利西亚语 (西班牙)-Roi-男': 'gl-ES-RoiNeural',
185
+ '加利西亚语 (西班牙)-Sabela-女': 'gl-ES-SabelaNeural',
186
+ '古吉拉特语 (印度)-Dhwani-女': 'gu-IN-DhwaniNeural',
187
+ '古吉拉特语 (印度)-Niranjan-男': 'gu-IN-NiranjanNeural',
188
+ '印地语 (印度)-Madhur-男': 'hi-IN-MadhurNeural',
189
+ '印地语 (印度)-Swara-女': 'hi-IN-SwaraNeural',
190
+ '克罗地亚语 (克罗地亚)-Gabrijela-女': 'hr-HR-GabrijelaNeural',
191
+ '克罗地亚语 (克罗地亚)-Srecko-男': 'hr-HR-SreckoNeural',
192
+ '匈牙利语 (匈牙利)-Noemi-女': 'hu-HU-NoemiNeural',
193
+ '匈牙利语 (匈牙利)-Tamas-男': 'hu-HU-TamasNeural',
194
+ '冰岛语 (冰岛)-Gudrun-女': 'is-IS-GudrunNeural',
195
+ '冰岛语 (冰岛)-Gunnar-男': 'is-IS-GunnarNeural',
196
+ '爪哇语 (印度尼西亚)-Dimas-男': 'jv-ID-DimasNeural',
197
+ '爪哇语 (印度尼西亚)-Siti-女': 'jv-ID-SitiNeural',
198
+ '格鲁吉亚语 (格鲁吉亚)-Eka-女': 'ka-GE-EkaNeural',
199
+ '格鲁吉亚语 (格鲁吉亚)-Giorgi-男': 'ka-GE-GiorgiNeural',
200
+ '哈萨克语 (哈萨克斯坦)-Aigul-女': 'kk-KZ-AigulNeural',
201
+ '哈萨克语 (哈萨克斯坦)-Daulet-男': 'kk-KZ-DauletNeural',
202
+ '高棉语 (柬埔寨)-Piseth-男': 'km-KH-PisethNeural',
203
+ '高棉语 (柬埔寨)-Sreymom-女': 'km-KH-SreymomNeural',
204
+ '卡纳达语 (印度)-Gagan-男': 'kn-IN-GaganNeural',
205
+ '卡纳达语 (印度)-Sapna-女': 'kn-IN-SapnaNeural',
206
+ '老挝语 (老挝)-Chanthavong-男': 'lo-LA-ChanthavongNeural',
207
+ '老挝语 (老挝)-Keomany-女': 'lo-LA-KeomanyNeural',
208
+ '立陶宛语 (立陶宛)-Leonas-男': 'lt-LT-LeonasNeural',
209
+ '立陶宛语 (立陶宛)-Ona-女': 'lt-LT-OnaNeural',
210
+ '拉脱维亚语 (拉脱维亚)-Everita-女': 'lv-LV-EveritaNeural',
211
+ '拉脱维亚语 (拉脱维亚)-Nils-男': 'lv-LV-NilsNeural',
212
+ '马其顿语 (北马其顿共和国)-Aleksandar-男': 'mk-MK-AleksandarNeural',
213
+ '马其顿语 (北马其顿共和国)-Marija-女': 'mk-MK-MarijaNeural',
214
+ '马拉雅拉姆语 (印度)-Midhun-男': 'ml-IN-MidhunNeural',
215
+ '马拉雅拉姆语 (印度)-Sobhana-女': 'ml-IN-SobhanaNeural',
216
+ '蒙古语 (蒙古)-Bataa-男': 'mn-MN-BataaNeural',
217
+ '蒙古语 (蒙古)-Yesui-女': 'mn-MN-YesuiNeural',
218
+ '马拉地语 (印度)-Aarohi-女': 'mr-IN-AarohiNeural',
219
+ '马拉地语 (印度)-Manohar-男': 'mr-IN-ManoharNeural',
220
+ '马耳他语 (马耳他)-Grace-女': 'mt-MT-GraceNeural',
221
+ '马耳他语 (马耳他)-Joseph-男': 'mt-MT-JosephNeural',
222
+ '缅甸语 (缅甸)-Nilar-女': 'my-MM-NilarNeural',
223
+ '缅甸语 (缅甸)-Thiha-男': 'my-MM-ThihaNeural',
224
+ '尼泊尔语 (尼泊尔)-Hemkala-女': 'ne-NP-HemkalaNeural',
225
+ '尼泊尔语 (尼泊尔)-Sagar-男': 'ne-NP-SagarNeural',
226
+ '荷兰语 (比利时)-Arnaud-男': 'nl-BE-ArnaudNeural',
227
+ '荷兰语 (比利时)-Dena-女': 'nl-BE-DenaNeural',
228
+ '波兰语 (波兰)-Marek-男': 'pl-PL-MarekNeural',
229
+ '波兰语 (波兰)-Zofia-女': 'pl-PL-ZofiaNeural',
230
+ '普什图语 (阿富汗)-Gul Nawaz-男': 'ps-AF-GulNawazNeural',
231
+ '普什图语 (阿富汗)-Latifa-女': 'ps-AF-LatifaNeural',
232
+ '葡萄牙语 (葡萄牙)-Duarte-男': 'pt-PT-DuarteNeural',
233
+ '葡萄牙语 (葡萄牙)-Raquel-女': 'pt-PT-RaquelNeural',
234
+ '罗马尼亚语 (罗马尼亚)-Alina-女': 'ro-RO-AlinaNeural',
235
+ '罗马尼亚语 (罗马尼亚)-Emil-男': 'ro-RO-EmilNeural',
236
+ '俄语 (俄罗斯)-Svetlana-女': 'ru-RU-SvetlanaNeural',
237
+ '俄语 (俄罗斯)-Dmitry-男': 'ru-RU-DmitryNeural',
238
+ '僧伽罗语 (斯里兰卡)-Sameera-男': 'si-LK-SameeraNeural',
239
+ '僧伽罗语 (斯里兰卡)-Thilini-女': 'si-LK-ThiliniNeural',
240
+ '斯洛伐克语 (斯洛伐克)-Lukas-男': 'sk-SK-LukasNeural',
241
+ '斯洛伐克语 (斯洛伐克)-Viktoria-女': 'sk-SK-ViktoriaNeural',
242
+ '斯洛文尼亚语 (斯洛文尼亚)-Petra-女': 'sl-SI-PetraNeural',
243
+ '斯洛文尼亚语 (斯洛文尼亚)-Rok-男': 'sl-SI-RokNeural',
244
+ '索马里语 (索马里)-Muuse-男': 'so-SO-MuuseNeural',
245
+ '索马里语 (索马里)-Ubax-女': 'so-SO-UbaxNeural',
246
+ '阿尔巴尼亚语 (阿尔巴尼亚)-Anila-女': 'sq-AL-AnilaNeural',
247
+ '阿尔巴尼亚语 (阿尔巴尼亚)-Ilir-男': 'sq-AL-IlirNeural',
248
+ '塞尔维亚语 (塞尔维亚)-Nicholas-男': 'sr-RS-NicholasNeural',
249
+ '塞尔维亚语 (塞尔维亚)-Sophie-女': 'sr-RS-SophieNeural',
250
+ '巽他语 (印度尼西亚)-Jajang-男': 'su-ID-JajangNeural',
251
+ '巽他语 (印度尼西亚)-Tuti-女': 'su-ID-TutiNeural',
252
+ '斯瓦希里语 (肯尼亚)-Rafiki-男': 'sw-KE-RafikiNeural',
253
+ '斯瓦希里语 (肯尼亚)-Zuri-女': 'sw-KE-ZuriNeural',
254
+ '斯瓦希里语 (坦桑尼亚)-Daudi-男': 'sw-TZ-DaudiNeural',
255
+ '斯瓦希里语 (坦桑尼亚)-Rehema-女': 'sw-TZ-RehemaNeural',
256
+ '泰米尔语 (印度)-Pallavi-女': 'ta-IN-PallaviNeural',
257
+ '泰米尔语 (印度)-Valluvar-男': 'ta-IN-ValluvarNeural',
258
+ '泰米尔语 (斯里兰卡)-Kumar-男': 'ta-LK-KumarNeural',
259
+ '泰米尔语 (斯里兰卡)-Saranya-女': 'ta-LK-SaranyaNeural',
260
+ '泰米尔语 (马来西亚)-Kani-女': 'ta-MY-KaniNeural',
261
+ '泰米尔语 (马来西亚)-Surya-男': 'ta-MY-SuryaNeural',
262
+ '泰米尔语 (新加坡)-Anbu-男': 'ta-SG-AnbuNeural',
263
+ '泰卢固语 (印度)-Mohan-男': 'te-IN-MohanNeural',
264
+ '泰卢固语 (印度)-Shruti-女': 'te-IN-ShrutiNeural',
265
+ '土耳其语 (土耳其)-Ahmet-男': 'tr-TR-AhmetNeural',
266
+ '土耳其语 (土耳其)-Emel-女': 'tr-TR-EmelNeural',
267
+ '乌克兰语 (乌克兰)-Ostap-男': 'uk-UA-OstapNeural',
268
+ '乌克兰语 (乌克兰)-Polina-女': 'uk-UA-PolinaNeural',
269
+ '乌尔都语 (印度)-Gul-女': 'ur-IN-GulNeural',
270
+ '乌尔都语 (印度)-Salman-男': 'ur-IN-SalmanNeural',
271
+ '乌尔都语 (巴基斯坦)-Asad-男': 'ur-PK-AsadNeural',
272
+ '乌尔都语 (巴基斯坦)-Uzma-女': 'ur-PK-UzmaNeural',
273
+ '乌兹别克语 (乌兹别克斯坦)-Madina-女': 'uz-UZ-MadinaNeural',
274
+ '乌兹别克语 (乌兹别克斯坦)-Sardor-男': 'uz-UZ-SardorNeural',
275
+ '普通话 (中国大陆)-Xiaoxiao-女': 'zh-CN-XiaoxiaoNeural',
276
+ '普通话 (中国大陆)-Yunyang-男': 'zh-CN-YunyangNeural',
277
+ '普通话 (中国大陆)-Yunxi-男': 'zh-CN-YunxiNeural',
278
+ '普通话 (中国大陆)-Xiaoyi-女': 'zh-CN-XiaoyiNeural',
279
+ '普通话 (中国大陆)-Yunjian-男': 'zh-CN-YunjianNeural',
280
+ '普通话 (中国大陆)-Yunxia-男': 'zh-CN-YunxiaNeural',
281
+ '东北话 (中国大陆)-Xiaobei-女': 'zh-CN-liaoning-XiaobeiNeural',
282
+ '中原官话 (中国陕西)-Xiaoni-女': 'zh-CN-shaanxi-XiaoniNeural',
283
+ '粤语 (中国香港)-HiuMaan-女': 'zh-HK-HiuMaanNeural',
284
+ '粤语 (中国香港)-HiuGaai-女': 'zh-HK-HiuGaaiNeural',
285
+ '粤语 (中国香港)-WanLung-男': 'zh-HK-WanLungNeural',
286
+ '台湾普通话-HsiaoChen-女': 'zh-TW-HsiaoChenNeural',
287
+ '台湾普通话-HsiaoYu-女': 'zh-TW-HsiaoYuNeural',
288
+ '台湾普通话-YunJhe-男': 'zh-TW-YunJheNeural',
289
+ '祖鲁语 (南非)-Thando-女': 'zu-ZA-ThandoNeural',
290
+ '祖鲁语 (南非)-Themba-男': 'zu-ZA-ThembaNeural'}
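
Note: the mapping above pairs a human-readable label with an Azure/Edge neural voice ID. A minimal sketch of how it could be consumed with the edge-tts package (using edge-tts as the backend is an assumption; the text and output path are placeholders):

import asyncio

import edge_tts

from tts_voice import tts_order_voice


async def synthesize(text: str, voice_label: str, out_path: str = "output.mp3") -> None:
    # Resolve the display label (e.g. '英语 (美国)-Jenny-女') to the neural voice ID.
    voice_id = tts_order_voice[voice_label]
    communicate = edge_tts.Communicate(text, voice_id)
    await communicate.save(out_path)


asyncio.run(synthesize("Hello from the voice list above.", '英语 (美国)-Jenny-女'))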
utils.py ADDED
@@ -0,0 +1,305 @@
1
+ import os
2
+ import sys
3
+ import argparse
4
+ import logging
5
+ import json
6
+ import subprocess
7
+ import numpy as np
8
+ from scipy.io.wavfile import read
9
+ import torch
10
+ from torch.nn import functional as F
11
+ from commons import sequence_mask
+ import glob  # needed by latest_checkpoint_path() below, missing from the original imports
+ import torchvision  # needed by transform()/stretch() below, missing from the original imports
+ # The names WavLM, WavLMConfig and hifigan are referenced by get_cmodel()/get_vocoder() but
+ # were not imported here; they are assumed to come from the repository's local `wavlm` and
+ # `hifigan` packages (adjust the import paths if the packages are laid out differently).
+ from wavlm import WavLM, WavLMConfig
+ import hifigan
12
+
13
+ MATPLOTLIB_FLAG = False
14
+
15
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
16
+ logger = logging
17
+
18
+
19
+ def get_cmodel(rank):
20
+ checkpoint = torch.load('wavlm/WavLM-Large.pt')
21
+ cfg = WavLMConfig(checkpoint['cfg'])
22
+ cmodel = WavLM(cfg).cuda(rank)
23
+ cmodel.load_state_dict(checkpoint['model'])
24
+ cmodel.eval()
25
+ return cmodel
26
+
27
+
28
+ def get_content(cmodel, y):
29
+ with torch.no_grad():
30
+ c = cmodel.extract_features(y.squeeze(1))[0]
31
+ c = c.transpose(1, 2)
32
+ return c
33
+
34
+
35
+ def get_vocoder(rank):
36
+ with open("hifigan/config.json", "r") as f:
37
+ config = json.load(f)
38
+ config = hifigan.AttrDict(config)
39
+ vocoder = hifigan.Generator(config)
40
+ ckpt = torch.load("hifigan/generator_v1")
41
+ vocoder.load_state_dict(ckpt["generator"])
42
+ vocoder.eval()
43
+ vocoder.remove_weight_norm()
44
+ vocoder.cuda(rank)
45
+ return vocoder
46
+
47
+
48
+ def transform(mel, height): # 68-92
49
+ #r = np.random.random()
50
+ #rate = r * 0.3 + 0.85 # 0.85-1.15
51
+ #height = int(mel.size(-2) * rate)
52
+ tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
53
+ if height >= mel.size(-2):
54
+ return tgt[:, :mel.size(-2), :]
55
+ else:
56
+ silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1)
57
+ silence += torch.randn_like(silence) / 10
58
+ return torch.cat((tgt, silence), 1)
59
+
60
+
61
+ def stretch(mel, width): # 0.5-2
62
+ return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
63
+
64
+
65
+ def load_checkpoint(checkpoint_path, model, optimizer=None):
66
+ assert os.path.isfile(checkpoint_path)
67
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
68
+ iteration = checkpoint_dict['iteration']
69
+ learning_rate = checkpoint_dict['learning_rate']
70
+ if optimizer is not None:
71
+ optimizer.load_state_dict(checkpoint_dict['optimizer'])
72
+ saved_state_dict = checkpoint_dict['model']
73
+ if hasattr(model, 'module'):
74
+ state_dict = model.module.state_dict()
75
+ else:
76
+ state_dict = model.state_dict()
77
+ new_state_dict = {}
78
+ for k, v in state_dict.items():
79
+ try:
80
+ new_state_dict[k] = saved_state_dict[k]
81
+ except KeyError:
82
+ logger.info("%s is not in the checkpoint" % k)
83
+ new_state_dict[k] = v
84
+ if hasattr(model, 'module'):
85
+ model.module.load_state_dict(new_state_dict)
86
+ else:
87
+ model.load_state_dict(new_state_dict)
88
+ logger.info("Loaded checkpoint '{}' (iteration {})" .format(
89
+ checkpoint_path, iteration))
90
+ return model, optimizer, learning_rate, iteration
91
+
92
+
93
+ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
94
+ logger.info("Saving model and optimizer state at iteration {} to {}".format(
95
+ iteration, checkpoint_path))
96
+ if hasattr(model, 'module'):
97
+ state_dict = model.module.state_dict()
98
+ else:
99
+ state_dict = model.state_dict()
100
+ torch.save({'model': state_dict,
101
+ 'iteration': iteration,
102
+ 'optimizer': optimizer.state_dict(),
103
+ 'learning_rate': learning_rate}, checkpoint_path)
104
+
105
+
106
+ def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
107
+ for k, v in scalars.items():
108
+ writer.add_scalar(k, v, global_step)
109
+ for k, v in histograms.items():
110
+ writer.add_histogram(k, v, global_step)
111
+ for k, v in images.items():
112
+ writer.add_image(k, v, global_step, dataformats='HWC')
113
+ for k, v in audios.items():
114
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
115
+
116
+
117
+ def latest_checkpoint_path(dir_path, regex="G_*.pth"):
118
+ f_list = glob.glob(os.path.join(dir_path, regex))
119
+ f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
120
+ x = f_list[-1]
121
+ print(x)
122
+ return x
123
+
124
+
125
+ def plot_spectrogram_to_numpy(spectrogram):
126
+ global MATPLOTLIB_FLAG
127
+ if not MATPLOTLIB_FLAG:
128
+ import matplotlib
129
+ matplotlib.use("Agg")
130
+ MATPLOTLIB_FLAG = True
131
+ mpl_logger = logging.getLogger('matplotlib')
132
+ mpl_logger.setLevel(logging.WARNING)
133
+ import matplotlib.pylab as plt
134
+ import numpy as np
135
+
136
+ fig, ax = plt.subplots(figsize=(10,2))
137
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
138
+ interpolation='none')
139
+ plt.colorbar(im, ax=ax)
140
+ plt.xlabel("Frames")
141
+ plt.ylabel("Channels")
142
+ plt.tight_layout()
143
+
144
+ fig.canvas.draw()
145
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated for binary data
146
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
147
+ plt.close()
148
+ return data
149
+
150
+
151
+ def plot_alignment_to_numpy(alignment, info=None):
152
+ global MATPLOTLIB_FLAG
153
+ if not MATPLOTLIB_FLAG:
154
+ import matplotlib
155
+ matplotlib.use("Agg")
156
+ MATPLOTLIB_FLAG = True
157
+ mpl_logger = logging.getLogger('matplotlib')
158
+ mpl_logger.setLevel(logging.WARNING)
159
+ import matplotlib.pylab as plt
160
+ import numpy as np
161
+
162
+ fig, ax = plt.subplots(figsize=(6, 4))
163
+ im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
164
+ interpolation='none')
165
+ fig.colorbar(im, ax=ax)
166
+ xlabel = 'Decoder timestep'
167
+ if info is not None:
168
+ xlabel += '\n\n' + info
169
+ plt.xlabel(xlabel)
170
+ plt.ylabel('Encoder timestep')
171
+ plt.tight_layout()
172
+
173
+ fig.canvas.draw()
174
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated for binary data
175
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
176
+ plt.close()
177
+ return data
178
+
179
+
180
+ def load_wav_to_torch(full_path):
181
+ sampling_rate, data = read(full_path)
182
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
183
+
184
+
185
+ def load_filepaths_and_text(filename, split="|"):
186
+ with open(filename, encoding='utf-8') as f:
187
+ filepaths_and_text = [line.strip().split(split) for line in f]
188
+ return filepaths_and_text
189
+
190
+
191
+ def get_hparams(init=True):
192
+ parser = argparse.ArgumentParser()
193
+ parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
194
+ help='JSON file for configuration')
195
+ parser.add_argument('-m', '--model', type=str, required=True,
196
+ help='Model name')
197
+
198
+ args = parser.parse_args()
199
+ model_dir = os.path.join("./logs", args.model)
200
+
201
+ if not os.path.exists(model_dir):
202
+ os.makedirs(model_dir)
203
+
204
+ config_path = args.config
205
+ config_save_path = os.path.join(model_dir, "config.json")
206
+ if init:
207
+ with open(config_path, "r") as f:
208
+ data = f.read()
209
+ with open(config_save_path, "w") as f:
210
+ f.write(data)
211
+ else:
212
+ with open(config_save_path, "r") as f:
213
+ data = f.read()
214
+ config = json.loads(data)
215
+
216
+ hparams = HParams(**config)
217
+ hparams.model_dir = model_dir
218
+ return hparams
219
+
220
+
221
+ def get_hparams_from_dir(model_dir):
222
+ config_save_path = os.path.join(model_dir, "config.json")
223
+ with open(config_save_path, "r") as f:
224
+ data = f.read()
225
+ config = json.loads(data)
226
+
227
+ hparams = HParams(**config)
228
+ hparams.model_dir = model_dir
229
+ return hparams
230
+
231
+
232
+ def get_hparams_from_file(config_path):
233
+ with open(config_path, "r") as f:
234
+ data = f.read()
235
+ config = json.loads(data)
236
+
237
+ hparams = HParams(**config)
238
+ return hparams
239
+
240
+
241
+ def check_git_hash(model_dir):
242
+ source_dir = os.path.dirname(os.path.realpath(__file__))
243
+ if not os.path.exists(os.path.join(source_dir, ".git")):
244
+ logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
245
+ source_dir
246
+ ))
247
+ return
248
+
249
+ cur_hash = subprocess.getoutput("git rev-parse HEAD")
250
+
251
+ path = os.path.join(model_dir, "githash")
252
+ if os.path.exists(path):
253
+ saved_hash = open(path).read()
254
+ if saved_hash != cur_hash:
255
+ logger.warn("git hash values are different. {}(saved) != {}(current)".format(
256
+ saved_hash[:8], cur_hash[:8]))
257
+ else:
258
+ open(path, "w").write(cur_hash)
259
+
260
+
261
+ def get_logger(model_dir, filename="train.log"):
262
+ global logger
263
+ logger = logging.getLogger(os.path.basename(model_dir))
264
+ logger.setLevel(logging.DEBUG)
265
+
266
+ formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
267
+ if not os.path.exists(model_dir):
268
+ os.makedirs(model_dir)
269
+ h = logging.FileHandler(os.path.join(model_dir, filename))
270
+ h.setLevel(logging.DEBUG)
271
+ h.setFormatter(formatter)
272
+ logger.addHandler(h)
273
+ return logger
274
+
275
+
276
+ class HParams():
277
+ def __init__(self, **kwargs):
278
+ for k, v in kwargs.items():
279
+ if type(v) == dict:
280
+ v = HParams(**v)
281
+ self[k] = v
282
+
283
+ def keys(self):
284
+ return self.__dict__.keys()
285
+
286
+ def items(self):
287
+ return self.__dict__.items()
288
+
289
+ def values(self):
290
+ return self.__dict__.values()
291
+
292
+ def __len__(self):
293
+ return len(self.__dict__)
294
+
295
+ def __getitem__(self, key):
296
+ return getattr(self, key)
297
+
298
+ def __setitem__(self, key, value):
299
+ return setattr(self, key, value)
300
+
301
+ def __contains__(self, key):
302
+ return key in self.__dict__
303
+
304
+ def __repr__(self):
305
+ return self.__dict__.__repr__()
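
Note: a small sketch of how the HParams container above behaves; the JSON path in the commented lines is a placeholder, not a file guaranteed to exist in this repository.

from utils import HParams, get_hparams_from_file

# Nested dicts are converted to nested HParams, so values are reachable by attribute access.
hps = HParams(train={"batch_size": 32, "learning_rate": 2e-4}, data={"sampling_rate": 16000})
print(hps.train.batch_size)        # 32
print("data" in hps, len(hps))     # True 2

# Loading the same structure from a JSON config file:
# hps = get_hparams_from_file("configs/config.json")
# print(hps.data.sampling_rate)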