Spaces: Sleeping
Irsh Vijayvargia committed
Commit • 42a4544 • Parent(s): 3e34e2e
Add application file

Browse files
- app.py +132 -0
- config/config.yaml +40 -0
- gradio.ipynb +292 -0
- requirements.txt +8 -0
- speech_id_checkpoint/saved_02.model +3 -0
- utils/.ipynb_checkpoints/VAD_segments-checkpoint.py +153 -0
- utils/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- utils/.ipynb_checkpoints/data_load-checkpoint.py +57 -0
- utils/.ipynb_checkpoints/evaluation-checkpoint.py +192 -0
- utils/.ipynb_checkpoints/hparam-checkpoint.py +59 -0
- utils/.ipynb_checkpoints/kan-checkpoint.py +285 -0
- utils/.ipynb_checkpoints/speech_embedder_net-checkpoint.py +112 -0
- utils/.ipynb_checkpoints/utils-checkpoint.py +173 -0
- utils/VAD_segments.py +153 -0
- utils/__init__.py +0 -0
- utils/__pycache__/VAD_segments.cpython-39.pyc +0 -0
- utils/__pycache__/__init__.cpython-39.pyc +0 -0
- utils/__pycache__/data_load.cpython-39.pyc +0 -0
- utils/__pycache__/evaluation.cpython-39.pyc +0 -0
- utils/__pycache__/hparam.cpython-39.pyc +0 -0
- utils/__pycache__/kan.cpython-39.pyc +0 -0
- utils/__pycache__/speech_embedder_net.cpython-39.pyc +0 -0
- utils/__pycache__/utils.cpython-39.pyc +0 -0
- utils/data_load.py +57 -0
- utils/evaluation.py +192 -0
- utils/hparam.py +59 -0
- utils/kan.py +285 -0
- utils/speech_embedder_net.py +112 -0
- utils/utils.py +173 -0
app.py
ADDED
@@ -0,0 +1,132 @@
import torch
import librosa
import numpy as np
import os
import webrtcvad
import wave
import contextlib
import gradio as gr

from utils.VAD_segments import *
from utils.hparam import hparam as hp
from utils.speech_embedder_net import *
from utils.evaluation import *

def read_wave(audio_data):
    """Reads audio data and returns (PCM audio data, sample rate).
    Assumes the input is a tuple (sample_rate, numpy_array).
    If the sample rate is unsupported, resamples to 16000 Hz.
    """
    sample_rate, data = audio_data

    # Ensure data is in the correct shape
    assert len(data.shape) == 1, "Audio data must be a 1D array"

    # Convert to floating point if necessary
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max

    # Supported sample rates
    supported_sample_rates = (8000, 16000, 32000, 48000)

    # If sample rate is not supported, resample to 16000 Hz
    if sample_rate not in supported_sample_rates:
        data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    # Convert numpy array to PCM format
    pcm_data = (data * np.iinfo(np.int16).max).astype(np.int16).tobytes()

    return data, pcm_data


def VAD_chunk(aggressiveness, data):
    audio, byte_audio = read_wave(data)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = frame_generator(20, byte_audio, hp.data.sr)
    frames = list(frames)
    times = vad_collector(hp.data.sr, 20, 200, vad, frames)
    speech_times = []
    speech_segs = []
    for i, time in enumerate(times):
        start = np.round(time[0], decimals=2)
        end = np.round(time[1], decimals=2)
        j = start
        while j + .4 < end:
            end_j = np.round(j + .4, decimals=2)
            speech_times.append((j, end_j))
            speech_segs.append(audio[int(j*hp.data.sr):int(end_j*hp.data.sr)])
            j = end_j
        else:
            speech_times.append((j, end))
            speech_segs.append(audio[int(j*hp.data.sr):int(end*hp.data.sr)])
    return speech_times, speech_segs


def get_embedding(data, embedder_net, device, n_threshold=-1):
    times, segs = VAD_chunk(0, data)
    if not segs:
        print(f'No voice activity detected')
        return None
    concat_seg = concat_segs(times, segs)
    if not concat_seg:
        print(f'No concatenated segments')
        return None
    STFT_frames = get_STFTs(concat_seg)
    if not STFT_frames:
        #print(f'No STFT frames')
        return None
    STFT_frames = np.stack(STFT_frames, axis=2)
    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)), device=device)

    with torch.no_grad():
        embeddings = embedder_net(STFT_frames)
        embeddings = embeddings[:n_threshold, :]

    avg_embedding = torch.mean(embeddings, dim=0, keepdim=True).cpu().numpy()
    return avg_embedding


model_path = "./speech_id_checkpoint/saved_02.model"


device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

embedder_net = SpeechEmbedder().to(device)
embedder_net.load_state_dict(torch.load(model_path, map_location=device))
embedder_net.eval()

def process_audio(audio1, audio2, threshold):
    e1 = get_embedding(audio1, embedder_net, device)
    if(e1 is None):
        return "No Voice Detected in file 1"
    e2 = get_embedding(audio2, embedder_net, device)
    if(e2 is None):
        return "No Voice Detected in file 2"

    cosi = cosine_similarity(e1, e2)

    if(cosi > threshold):
        return f"Same Speaker"
    else:
        return f"Different Speaker"

# Define the Gradio interface
def gradio_interface(audio1, audio2, threshold):
    output_text = process_audio(audio1, audio2, threshold)
    return output_text

# Create the Gradio interface with microphone inputs
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Audio("microphone", type="numpy", label="Audio File 1"),
            gr.Audio("microphone", type="numpy", label="Audio File 2"),
            gr.Slider(0.0, 1.0, value=0.85, step=0.01, label="Threshold")
            ],
    outputs="text",
    title="Gujarati Text Independent Speaker Verification",
    description="Record two audio files and get the text output from the model."
)

# Launch the interface
iface.launch(share=False)
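For reference, a minimal offline check of process_audio() as defined above. This is only a sketch: it assumes the definitions from app.py are already in the current session (for example, pasted into a notebook cell before iface.launch() runs), and the two WAV paths are placeholders for real recordings. gr.Audio(type="numpy") hands the app exactly this (sample_rate, samples) tuple shape.

# Hypothetical offline check; assumes read_wave/get_embedding/process_audio and
# embedder_net/device from app.py are already defined in this session.
import librosa
import numpy as np

def as_gradio_tuple(path):
    # Mimic what gr.Audio(type="numpy") delivers: (sample_rate, 1-D samples).
    samples, sr = librosa.load(path, sr=16000, mono=True)
    return (sr, samples.astype(np.float32))

clip1 = as_gradio_tuple("speaker_a_take1.wav")  # placeholder paths
clip2 = as_gradio_tuple("speaker_a_take2.wav")
print(process_audio(clip1, clip2, threshold=0.85))  # "Same Speaker" / "Different Speaker" / "No Voice Detected ..."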
config/config.yaml
ADDED
@@ -0,0 +1,40 @@
training: !!bool "false"
device: "mps"
unprocessed_data: './DATA_DIR/*/*.wav'
---
data:
    train_path: './train_tisv'
    train_path_unprocessed: './TIMIT/TRAIN/*/*/*.wav'
    test_path: './test_tisv'
    test_path_unprocessed: './TIMIT/TEST/*/*/*.wav'
    data_preprocessed: !!bool "true"
    sr: 16000
    nfft: 512 #For mel spectrogram preprocess
    window: 0.025 #(s)
    hop: 0.01 #(s)
    nmels: 40 #Number of mel energies
    tisv_frame: 180 #Max number of time steps in input after preprocess
---
model:
    hidden: 768 #Number of LSTM hidden layer units
    num_layer: 3 #Number of LSTM layers
    proj: 256 #Embedding size
    model_path: './speech_id_checkpoint/ckpt_epoch_840_batch_id_6.pth' #Model path for testing, inference, or resuming training
---
train:
    N : 4 #Number of speakers in batch
    M : 6 #Number of utterances per speaker
    num_workers: 0 #number of workers for dataloader
    lr: 0.01
    epochs: 1000 #Max training speaker epoch
    log_interval: 30 #Epochs before printing progress
    log_file: './speech_id_checkpoint/Stats'
    checkpoint_interval: 100 #Save model after x speaker epochs
    checkpoint_dir: './speech_id_checkpoint'
    restore: !!bool "true" #Resume training from previous model path
---
test:
    N : 4 #Number of speakers in batch
    M : 6 #Number of utterances per speaker
    num_workers: 8 #number of workers for data loader
    epochs: 10 #testing speaker epochs
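The config above is a multi-document YAML stream: each "---" starts a new document, and utils/hparam.py merges all documents into one dictionary. A minimal sketch of that parsing, assuming it is run from the repository root:

# Minimal sketch of how the multi-document config is consumed (mirrors utils/hparam.py):
# every YAML document separated by "---" is merged into a single dict.
import yaml

with open("config/config.yaml") as stream:
    merged = {}
    for doc in yaml.load_all(stream, Loader=yaml.Loader):
        merged.update(doc)

print(merged["data"]["sr"])                         # 16000
print(merged["model"]["hidden"], merged["model"]["proj"])  # 768 256
print(merged["train"]["N"], merged["train"]["M"])   # 4 6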
gradio.ipynb
ADDED
@@ -0,0 +1,292 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "23237138-936a-44b4-9eb6-f16045d2c91d",
   "metadata": {},
   "source": [
    "### **Gradio Demo | LSTM Speaker Embedding Model for Gujarati Speaker Verification**\n",
    "****\n",
    "**Author:** Irsh Vijay <br>\n",
    "**Organization**: Wadhwani Institute for Artificial Intelligence <br>\n",
    "****\n",
    "This notebook has the required code to run a gradio demo."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1d2cfd8b-9498-4236-9d32-718e9e0597cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import librosa\n",
    "import numpy as np\n",
    "import os\n",
    "import webrtcvad\n",
    "import wave\n",
    "import contextlib\n",
    "\n",
    "from utils.VAD_segments import *\n",
    "from utils.hparam import hparam as hp\n",
    "from utils.speech_embedder_net import *\n",
    "from utils.evaluation import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3e9e1006-83d2-4492-a210-26b2c3717cd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_wave(audio_data):\n",
    "    \"\"\"Reads audio data and returns (PCM audio data, sample rate).\n",
    "    Assumes the input is a tuple (sample_rate, numpy_array).\n",
    "    If the sample rate is unsupported, resamples to 16000 Hz.\n",
    "    \"\"\"\n",
    "    sample_rate, data = audio_data\n",
    "\n",
    "    # Ensure data is in the correct shape\n",
    "    assert len(data.shape) == 1, \"Audio data must be a 1D array\"\n",
    "\n",
    "    # Convert to floating point if necessary\n",
    "    if not np.issubdtype(data.dtype, np.floating):\n",
    "        data = data.astype(np.float32) / np.iinfo(data.dtype).max\n",
    "\n",
    "    # Supported sample rates\n",
    "    supported_sample_rates = (8000, 16000, 32000, 48000)\n",
    "\n",
    "    # If sample rate is not supported, resample to 16000 Hz\n",
    "    if sample_rate not in supported_sample_rates:\n",
    "        data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)\n",
    "        sample_rate = 16000\n",
    "\n",
    "    # Convert numpy array to PCM format\n",
    "    pcm_data = (data * np.iinfo(np.int16).max).astype(np.int16).tobytes()\n",
    "\n",
    "    return data, pcm_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0b56a2fc-83c3-4b36-95b8-5f1b656150ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "def VAD_chunk(aggressiveness, data):\n",
    "    audio, byte_audio = read_wave(data)\n",
    "    vad = webrtcvad.Vad(int(aggressiveness))\n",
    "    frames = frame_generator(20, byte_audio, hp.data.sr)\n",
    "    frames = list(frames)\n",
    "    times = vad_collector(hp.data.sr, 20, 200, vad, frames)\n",
    "    speech_times = []\n",
    "    speech_segs = []\n",
    "    for i, time in enumerate(times):\n",
    "        start = np.round(time[0],decimals=2)\n",
    "        end = np.round(time[1],decimals=2)\n",
    "        j = start\n",
    "        while j + .4 < end:\n",
    "            end_j = np.round(j+.4,decimals=2)\n",
    "            speech_times.append((j, end_j))\n",
    "            speech_segs.append(audio[int(j*hp.data.sr):int(end_j*hp.data.sr)])\n",
    "            j = end_j\n",
    "        else:\n",
    "            speech_times.append((j, end))\n",
    "            speech_segs.append(audio[int(j*hp.data.sr):int(end*hp.data.sr)])\n",
    "    return speech_times, speech_segs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "72f257cf-7d3f-4ec5-944a-57779ba377e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_embedding(data, embedder_net, device, n_threshold=-1):\n",
    "    times, segs = VAD_chunk(0, data)\n",
    "    if not segs:\n",
    "        print(f'No voice activity detected')\n",
    "        return None\n",
    "    concat_seg = concat_segs(times, segs)\n",
    "    if not concat_seg:\n",
    "        print(f'No concatenated segments')\n",
    "        return None\n",
    "    STFT_frames = get_STFTs(concat_seg)\n",
    "    if not STFT_frames:\n",
    "        #print(f'No STFT frames')\n",
    "        return None\n",
    "    STFT_frames = np.stack(STFT_frames, axis=2)\n",
    "    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)), device=device)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        embeddings = embedder_net(STFT_frames)\n",
    "        embeddings = embeddings[:n_threshold, :]\n",
    "\n",
    "    avg_embedding = torch.mean(embeddings, dim=0, keepdim=True).cpu().numpy()\n",
    "    return avg_embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "200df766-407d-4367-b0fb-7a6118653731",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_path = \"./speech_id_checkpoint/saved_01.model\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "db7613e6-67a8-4920-a999-caca4a0de360",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SpeechEmbedder(\n",
       "  (LSTM_stack): LSTM(40, 768, num_layers=3, batch_first=True)\n",
       "  (projection): Linear(in_features=768, out_features=256, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "device = torch.device(\"mps\" if torch.backends.mps.is_available() else \"cpu\")\n",
    "\n",
    "embedder_net = SpeechEmbedder().to(device)\n",
    "embedder_net.load_state_dict(torch.load(model_path, map_location=device))\n",
    "embedder_net.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "8a7dd9bd-7b40-41f9-8e2f-d68be18f2111",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "bd6c073d-eab8-4ae6-8ba6-d90a0ec54c0e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL: http://127.0.0.1:7868\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def process_audio(audio1, audio2, threshold):\n",
    "    e1 = get_embedding(audio1, embedder_net, device)\n",
    "    if(e1 is None):\n",
    "        return \"No Voice Detected in file 1\"\n",
    "    e2 = get_embedding(audio2, embedder_net, device)\n",
    "    if(e2 is None):\n",
    "        return \"No Voice Detected in file 2\"\n",
    "\n",
    "    cosi = cosine_similarity(e1, e2)\n",
    "\n",
    "    if(cosi > threshold):\n",
    "        return f\"Same Speaker\" \n",
    "    else:\n",
    "        return f\"Different Speaker\" \n",
    "\n",
    "# Define the Gradio interface\n",
    "def gradio_interface(audio1, audio2, threshold):\n",
    "    output_text = process_audio(audio1, audio2, threshold)\n",
    "    return output_text\n",
    "\n",
    "# Create the Gradio interface with microphone inputs\n",
    "iface = gr.Interface(\n",
    "    fn=gradio_interface,\n",
    "    inputs=[gr.Audio(\"microphone\", type=\"numpy\", label=\"Audio File 1\"),\n",
    "            gr.Audio(\"microphone\", type=\"numpy\", label=\"Audio File 2\"),\n",
    "            gr.Slider(0.0, 1.0, value=0.85, step=0.01, label=\"Threshold\")\n",
    "           ],\n",
    "    outputs=\"text\",\n",
    "    title=\"LSTM Based Speaker Verification\",\n",
    "    description=\"Record two audio files and get the text output from the model.\"\n",
    ")\n",
    "\n",
    "# Launch the interface\n",
    "iface.launch(share=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a098495c-9e7b-4232-86fc-55a1890c5e27",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b99a253e-9b91-4210-b934-8bd1b6a2d912",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
librosa
numpy
webrtcvad
wave
contextlib
gradio
PyYAML
speech_id_checkpoint/saved_02.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51b96ce4d80a01ebe039ed6bc67c1a9731315742d5814fed842d4a22785c5836
size 48543874
utils/.ipynb_checkpoints/VAD_segments-checkpoint.py
ADDED
@@ -0,0 +1,153 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 18 16:22:41 2018

@author: Harry
Modified from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
"""

import collections
import contextlib
import numpy as np
import sys
import librosa
import wave

import webrtcvad

from utils.hparam import hparam as hp

def read_wave(path, sr):
    """Reads a .wav file.
    Takes the path, and returns (PCM audio data, sample rate).
    Assumes sample width == 2
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
    data, _ = librosa.load(path, sr=sr)
    assert len(data.shape) == 1
    assert sr in (8000, 16000, 32000, 48000)
    return data, pcm_data

class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.
    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    Returns: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                start = ring_buffer[0][0].timestamp
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield (start, frame.timestamp + frame.duration)
                ring_buffer.clear()
                voiced_frames = []
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield (start, frame.timestamp + frame.duration)


def VAD_chunk(aggressiveness, path):
    audio, byte_audio = read_wave(path, sr=hp.data.sr)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = frame_generator(20, byte_audio, hp.data.sr)
    frames = list(frames)
    times = vad_collector(hp.data.sr, 20, 200, vad, frames)
    speech_times = []
    speech_segs = []
    for i, time in enumerate(times):
        start = np.round(time[0], decimals=2)
        end = np.round(time[1], decimals=2)
        j = start
        while j + .4 < end:
            end_j = np.round(j + .4, decimals=2)
            speech_times.append((j, end_j))
            speech_segs.append(audio[int(j*hp.data.sr):int(end_j*hp.data.sr)])
            j = end_j
        else:
            speech_times.append((j, end))
            speech_segs.append(audio[int(j*hp.data.sr):int(end*hp.data.sr)])
    return speech_times, speech_segs

if __name__ == '__main__':
    speech_times, speech_segs = VAD_chunk(sys.argv[1], sys.argv[2])
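An illustrative run of the VAD utilities above on a single recording. A sketch only: it assumes a mono, 16-bit WAV at one of the supported sample rates sits at the placeholder path below, and that config/config.yaml is readable so hp.data.sr resolves.

# Hypothetical usage of VAD_chunk; the .wav path is a placeholder.
from utils.VAD_segments import VAD_chunk

times, segs = VAD_chunk(2, "sample_gujarati_utterance.wav")  # aggressiveness 0-3
for (start, end), seg in zip(times, segs):
    print(f"voiced {start:.2f}s-{end:.2f}s ({len(seg)} samples)")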
utils/.ipynb_checkpoints/__init__-checkpoint.py
ADDED
File without changes
utils/.ipynb_checkpoints/data_load-checkpoint.py
ADDED
@@ -0,0 +1,57 @@
"""
Mostly copied from https://github.com/HarryVolek/PyTorch_Speaker_Verification
"""
import glob
import numpy as np
import os
import random
from random import shuffle
import torch
from torch.utils.data import Dataset

from utils.hparam import hparam as hp
from utils.utils import mfccs_and_spec

class GujaratiSpeakerVerificationDataset(Dataset):

    def __init__(self, shuffle=True, utter_start=0, split='train'):
        # data path
        if split!='val':
            self.path = hp.data.train_path
            self.utter_num = hp.train.M
        else:
            self.path = hp.data.test_path
            self.utter_num = hp.test.M
        self.file_list = os.listdir(self.path)
        self.shuffle=shuffle
        self.utter_start = utter_start
        self.split = split

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):

        np_file_list = os.listdir(self.path)

        if self.shuffle:
            selected_file = random.sample(np_file_list, 1)[0]  # select random speaker
        else:
            selected_file = np_file_list[idx]

        utters = np.load(os.path.join(self.path, selected_file))

        # load utterance spectrogram of selected speaker
        if self.shuffle:
            utter_index = np.random.randint(0, utters.shape[0], self.utter_num)  # select M utterances per speaker
            utterance = utters[utter_index]
        else:
            utterance = utters[self.utter_start: self.utter_start+self.utter_num]  # utterances of a speaker [batch(M), n_mels, frames]

        utterance = utterance[:,:,:160]  # TODO implement variable length batch size

        utterance = torch.tensor(np.transpose(utterance, axes=(0,2,1)))  # transpose [batch, frames, n_mels]
        return utterance

    def __repr__(self):
        return f"{self.__class__.__name__}(split={self.split!r}, num_speakers={len(self.file_list)}, num_utterances={self.utter_num})"
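A sketch of how the dataset above would typically be batched for GE2E-style training. It assumes hp.data.train_path already contains one preprocessed .npy file per speaker with shape (num_utterances, n_mels, frames); the DataLoader settings are illustrative, not the repository's training script.

from torch.utils.data import DataLoader
from utils.data_load import GujaratiSpeakerVerificationDataset
from utils.hparam import hparam as hp

train_set = GujaratiSpeakerVerificationDataset(split='train')
train_loader = DataLoader(train_set, batch_size=hp.train.N,  # N speakers per batch
                          shuffle=True, drop_last=True)

batch = next(iter(train_loader))
print(batch.shape)  # (N, M, 160, n_mels), e.g. torch.Size([4, 6, 160, 40])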
utils/.ipynb_checkpoints/evaluation-checkpoint.py
ADDED
@@ -0,0 +1,192 @@
from torch.utils.data import Dataset
from tqdm.auto import tqdm
import os
import librosa
import numpy as np
import torch
import random
from numpy.linalg import norm

from utils.VAD_segments import VAD_chunk
from utils.hparam import hparam as hp

class GujaratiSpeakerVerificationDatasetTest(Dataset):
    def __init__(self, path, shuffle=True, utter_start=0):
        # data path
        self.path = path
        self.file_list = os.listdir(self.path)
        self.shuffle=shuffle
        self.utter_start = utter_start
        self.utter_num = 4

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):

        np_file_list = self.file_list

        selected_file = np_file_list[idx]

        utters = np.load(os.path.join(self.path, selected_file))

        # load utterance spectrogram of selected speaker
        if self.shuffle:
            utter_index = np.random.randint(0, utters.shape[0], self.utter_num)  # select M utterances per speaker
            utterance = utters[utter_index]
        else:
            utterance = utters[self.utter_start: self.utter_start+self.utter_num]  # utterances of a speaker [batch(M), n_mels, frames]

        utterance = utterance[:,:,:160]  # TODO implement variable length batch size

        utterance = torch.tensor(np.transpose(utterance, axes=(0,2,1)))  # transpose [batch, frames, n_mels]
        return utterance

def concat_segs(times, segs):
    concat_seg = []
    seg_concat = segs[0]
    for i in range(0, len(times)-1):
        if times[i][1] == times[i+1][0]:
            seg_concat = np.concatenate((seg_concat, segs[i+1]))
        else:
            concat_seg.append(seg_concat)
            seg_concat = segs[i+1]
    else:
        concat_seg.append(seg_concat)
    return concat_seg


def get_STFTs(segs):
    sr = 16000
    STFT_frames = []
    for seg in segs:
        S = librosa.core.stft(y=seg, n_fft=hp.data.nfft,
                              win_length=int(hp.data.window * sr), hop_length=int(hp.data.hop * sr))
        S = np.abs(S)**2
        mel_basis = librosa.filters.mel(sr=sr, n_fft=hp.data.nfft, n_mels=hp.data.nmels)
        S = np.log10(np.dot(mel_basis, S) + 1e-6)
        for j in range(0, S.shape[1], int(.12/hp.data.hop)):
            if j + 24 < S.shape[1]:
                STFT_frames.append(S[:, j:j+24])
            else:
                break
    return STFT_frames


def get_embedding(file_path, embedder_net, device, n_threshold=-1):
    times, segs = VAD_chunk(2, file_path)
    if not segs:
        print(f'No voice activity detected in {file_path}')
        return None
    concat_seg = concat_segs(times, segs)
    if not concat_seg:
        print(f'No concatenated segments for {file_path}')
        return None
    STFT_frames = get_STFTs(concat_seg)
    if not STFT_frames:
        #print(f'No STFT frames for {file_path}')
        return None
    STFT_frames = np.stack(STFT_frames, axis=2)
    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)), device=device)

    with torch.no_grad():
        embeddings = embedder_net(STFT_frames)
        embeddings = embeddings[:n_threshold, :]

    avg_embedding = torch.mean(embeddings, dim=0, keepdim=True).cpu().numpy()
    return avg_embedding

def get_speaker_embeddings_listdir(embedder_net, device, list_dir, k):
    speaker_embeddings = {}
    for speaker_name in tqdm(list_dir, leave = False):
        speaker_dir = speaker_name
        if os.path.isdir(speaker_dir) and speaker_dir[0] != ".DS_Store":
            speaker_embeddings[speaker_name] = []
            for i in range(10):
                embeddings = []
                audio_files = [os.path.join(speaker_dir, f) for f in os.listdir(speaker_dir) if f.endswith('.wav')]
                random.shuffle(audio_files)
                count = 0
                iter_ = 0
                while(count <= k):
                    file_path = audio_files[iter_]
                    embedding = get_embedding(file_path, embedder_net, device)
                    try:
                        _ = embedding.shape
                        embeddings.append(embedding)
                        count+=1
                        iter_+=1
                    except:
                        iter_+=1
                speaker_embeddings[speaker_name].append(np.mean(embeddings, axis=0))
    return speaker_embeddings

def create_pairs(speaker_embeddings):
    pairs = []
    labels = []
    speakers = list(speaker_embeddings.keys())

    for i in range(len(speakers)):
        for j in range(len(speakers)):
            for k1 in range(10):
                for k2 in range(10):
                    emb1 = speaker_embeddings[speakers[i]][k1]
                    emb2 = speaker_embeddings[speakers[j]][k2]
                    pairs.append((emb1, emb2))
                    if i == j and not((emb1 == emb2).all()):
                        labels.append(1)  # Same speaker
                    else:
                        labels.append(0)  # Different speakers
    return pairs, labels

class EmbeddingPairDataset(Dataset):
    def __init__(self, pairs, labels):
        self.pairs = pairs
        self.labels = labels

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        emb1, emb2 = self.pairs[idx]
        label = self.labels[idx]

        emb1, emb2 = torch.tensor(emb1, dtype=torch.float32), torch.tensor(emb2, dtype=torch.float32)

        concatenated = torch.cat((emb1, emb2), dim=1)

        return concatenated.squeeze(), torch.tensor(label, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __repr__(self):
        return f"{self.__class__.__name__}(length={self.__len__()})"


def cosine_similarity(A, B):
    A = A.flatten().astype(np.float64)
    B = B.flatten().astype(np.float64)
    cosine = np.dot(A,B)/(norm(A)*norm(B))
    return cosine


def create_subset(dataset, num_zeros):
    pairs = dataset.pairs
    labels = dataset.labels

    pairs_1 = [pairs[i] for i in range(len(pairs)) if labels[i] == 1]
    labels_1 = [labels[i] for i in range(len(labels)) if labels[i] == 1]

    pairs_0 = [pairs[i] for i in range(len(pairs)) if labels[i] == 0]
    labels_0 = [labels[i] for i in range(len(labels)) if labels[i] == 0]

    num_zeros = min(num_zeros, len(pairs_0))

    pairs_0 = pairs_0[:num_zeros]
    labels_0 = labels_0[:num_zeros]

    filtered_pairs = pairs_1 + pairs_0
    filtered_labels = labels_1 + labels_0

    return filtered_pairs, filtered_labels
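A hedged sketch of the file-based verification primitive defined above: embed two recordings with get_embedding() and threshold their cosine similarity. The checkpoint path comes from this commit (it is an LFS pointer, so the weights must be fetched first), and the two .wav paths are placeholders.

import torch
from utils.speech_embedder_net import SpeechEmbedder
from utils.evaluation import get_embedding, cosine_similarity

device = torch.device("cpu")
embedder_net = SpeechEmbedder().to(device)
embedder_net.load_state_dict(torch.load("./speech_id_checkpoint/saved_02.model", map_location=device))
embedder_net.eval()

emb_a = get_embedding("enroll_speaker.wav", embedder_net, device)  # placeholder paths
emb_b = get_embedding("test_clip.wav", embedder_net, device)
if emb_a is not None and emb_b is not None:
    score = cosine_similarity(emb_a, emb_b)  # scalar cosine similarity
    print("same speaker" if score > 0.85 else "different speaker", score)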
utils/.ipynb_checkpoints/hparam-checkpoint.py
ADDED
@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python

import yaml

def load_hparam(filename):
    stream = open(filename, 'r')
    docs = yaml.load_all(stream, Loader=yaml.Loader)
    hparam_dict = dict()
    for doc in docs:
        for k, v in doc.items():
            hparam_dict[k] = v
    return hparam_dict

def merge_dict(user, default):
    if isinstance(user, dict) and isinstance(default, dict):
        for k, v in default.items():
            if k not in user:
                user[k] = v
            else:
                user[k] = merge_dict(user[k], v)
    return user


class Dotdict(dict):
    """
    a dictionary that supports dot notation
    as well as dictionary access notation
    usage: d = DotDict() or d = DotDict({'val1':'first'})
    set attributes: d.val2 = 'second' or d['val2'] = 'second'
    get attributes: d.val2 or d['val2']
    """
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    def __init__(self, dct=None):
        dct = dict() if not dct else dct
        for key, value in dct.items():
            if hasattr(value, 'keys'):
                value = Dotdict(value)
            self[key] = value


class Hparam(Dotdict):

    def __init__(self, file='config/config.yaml'):
        super(Dotdict, self).__init__()
        hp_dict = load_hparam(file)
        hp_dotdict = Dotdict(hp_dict)
        for k, v in hp_dotdict.items():
            setattr(self, k, v)

    __getattr__ = Dotdict.__getitem__
    __setattr__ = Dotdict.__setitem__
    __delattr__ = Dotdict.__delitem__


hparam = Hparam()
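This is how the rest of the code reads configuration: the module-level Hparam() instance wraps each YAML document in a Dotdict, so nested keys are reachable with dot notation. A minimal sketch, assuming it is run from the repository root so 'config/config.yaml' resolves:

from utils.hparam import hparam as hp

print(hp.data.sr)                      # 16000
print(hp.data.nmels)                   # 40
print(hp.model.hidden, hp.model.proj)  # 768 256
print(hp.train.N, hp.train.M)          # 4 6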
utils/.ipynb_checkpoints/kan-checkpoint.py
ADDED
@@ -0,0 +1,285 @@
import torch
import torch.nn.functional as F
import math


class KANLinear(torch.nn.Module):
    def __init__(
        self,
        in_features,
        out_features,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        enable_standalone_scale_spline=True,
        base_activation=torch.nn.SiLU,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KANLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.grid_size = grid_size
        self.spline_order = spline_order

        h = (grid_range[1] - grid_range[0]) / grid_size
        grid = (
            (
                torch.arange(-spline_order, grid_size + spline_order + 1) * h
                + grid_range[0]
            )
            .expand(in_features, -1)
            .contiguous()
        )
        self.register_buffer("grid", grid)

        self.base_weight = torch.nn.Parameter(torch.Tensor(out_features, in_features))
        self.spline_weight = torch.nn.Parameter(
            torch.Tensor(out_features, in_features, grid_size + spline_order)
        )
        if enable_standalone_scale_spline:
            self.spline_scaler = torch.nn.Parameter(
                torch.Tensor(out_features, in_features)
            )

        self.scale_noise = scale_noise
        self.scale_base = scale_base
        self.scale_spline = scale_spline
        self.enable_standalone_scale_spline = enable_standalone_scale_spline
        self.base_activation = base_activation()
        self.grid_eps = grid_eps

        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.kaiming_uniform_(self.base_weight, a=math.sqrt(5) * self.scale_base)
        with torch.no_grad():
            noise = (
                (
                    torch.rand(self.grid_size + 1, self.in_features, self.out_features)
                    - 1 / 2
                )
                * self.scale_noise
                / self.grid_size
            )
            self.spline_weight.data.copy_(
                (self.scale_spline if not self.enable_standalone_scale_spline else 1.0)
                * self.curve2coeff(
                    self.grid.T[self.spline_order : -self.spline_order],
                    noise,
                )
            )
            if self.enable_standalone_scale_spline:
                # torch.nn.init.constant_(self.spline_scaler, self.scale_spline)
                torch.nn.init.kaiming_uniform_(self.spline_scaler, a=math.sqrt(5) * self.scale_spline)

    def b_splines(self, x: torch.Tensor):
        """
        Compute the B-spline bases for the given input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_features).

        Returns:
            torch.Tensor: B-spline bases tensor of shape (batch_size, in_features, grid_size + spline_order).
        """
        assert x.dim() == 2 and x.size(1) == self.in_features

        grid: torch.Tensor = (
            self.grid
        )  # (in_features, grid_size + 2 * spline_order + 1)
        x = x.unsqueeze(-1)
        bases = ((x >= grid[:, :-1]) & (x < grid[:, 1:])).to(x.dtype)
        for k in range(1, self.spline_order + 1):
            bases = (
                (x - grid[:, : -(k + 1)])
                / (grid[:, k:-1] - grid[:, : -(k + 1)])
                * bases[:, :, :-1]
            ) + (
                (grid[:, k + 1 :] - x)
                / (grid[:, k + 1 :] - grid[:, 1:(-k)])
                * bases[:, :, 1:]
            )

        assert bases.size() == (
            x.size(0),
            self.in_features,
            self.grid_size + self.spline_order,
        )
        return bases.contiguous()

    def curve2coeff(self, x: torch.Tensor, y: torch.Tensor):
        """
        Compute the coefficients of the curve that interpolates the given points.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
            y (torch.Tensor): Output tensor of shape (batch_size, in_features, out_features).

        Returns:
            torch.Tensor: Coefficients tensor of shape (out_features, in_features, grid_size + spline_order).
        """
        assert x.dim() == 2 and x.size(1) == self.in_features
        assert y.size() == (x.size(0), self.in_features, self.out_features)

        A = self.b_splines(x).transpose(
            0, 1
        )  # (in_features, batch_size, grid_size + spline_order)
        B = y.transpose(0, 1)  # (in_features, batch_size, out_features)
        solution = torch.linalg.lstsq(
            A, B
        ).solution  # (in_features, grid_size + spline_order, out_features)
        result = solution.permute(
            2, 0, 1
        )  # (out_features, in_features, grid_size + spline_order)

        assert result.size() == (
            self.out_features,
            self.in_features,
            self.grid_size + self.spline_order,
        )
        return result.contiguous()

    @property
    def scaled_spline_weight(self):
        return self.spline_weight * (
            self.spline_scaler.unsqueeze(-1)
            if self.enable_standalone_scale_spline
            else 1.0
        )

    def forward(self, x: torch.Tensor):
        assert x.size(-1) == self.in_features
        original_shape = x.shape
        x = x.view(-1, self.in_features)

        base_output = F.linear(self.base_activation(x), self.base_weight)
        spline_output = F.linear(
            self.b_splines(x).view(x.size(0), -1),
            self.scaled_spline_weight.view(self.out_features, -1),
        )
        output = base_output + spline_output

        output = output.view(*original_shape[:-1], self.out_features)
        return output

    @torch.no_grad()
    def update_grid(self, x: torch.Tensor, margin=0.01):
        assert x.dim() == 2 and x.size(1) == self.in_features
        batch = x.size(0)

        splines = self.b_splines(x)  # (batch, in, coeff)
        splines = splines.permute(1, 0, 2)  # (in, batch, coeff)
        orig_coeff = self.scaled_spline_weight  # (out, in, coeff)
        orig_coeff = orig_coeff.permute(1, 2, 0)  # (in, coeff, out)
        unreduced_spline_output = torch.bmm(splines, orig_coeff)  # (in, batch, out)
        unreduced_spline_output = unreduced_spline_output.permute(
            1, 0, 2
        )  # (batch, in, out)

        # sort each channel individually to collect data distribution
        x_sorted = torch.sort(x, dim=0)[0]
        grid_adaptive = x_sorted[
            torch.linspace(
                0, batch - 1, self.grid_size + 1, dtype=torch.int64, device=x.device
            )
        ]

        uniform_step = (x_sorted[-1] - x_sorted[0] + 2 * margin) / self.grid_size
        grid_uniform = (
            torch.arange(
                self.grid_size + 1, dtype=torch.float32, device=x.device
            ).unsqueeze(1)
            * uniform_step
            + x_sorted[0]
            - margin
        )

        grid = self.grid_eps * grid_uniform + (1 - self.grid_eps) * grid_adaptive
        grid = torch.concatenate(
            [
                grid[:1]
                - uniform_step
                * torch.arange(self.spline_order, 0, -1, device=x.device).unsqueeze(1),
                grid,
                grid[-1:]
                + uniform_step
                * torch.arange(1, self.spline_order + 1, device=x.device).unsqueeze(1),
            ],
            dim=0,
        )

        self.grid.copy_(grid.T)
        self.spline_weight.data.copy_(self.curve2coeff(x, unreduced_spline_output))

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        """
        Compute the regularization loss.

        This is a dumb simulation of the original L1 regularization as stated in the
        paper, since the original one requires computing absolutes and entropy from the
        expanded (batch, in_features, out_features) intermediate tensor, which is hidden
        behind the F.linear function if we want an memory efficient implementation.

        The L1 regularization is now computed as mean absolute value of the spline
        weights. The authors implementation also includes this term in addition to the
        sample-based regularization.
        """
        l1_fake = self.spline_weight.abs().mean(-1)
        regularization_loss_activation = l1_fake.sum()
        p = l1_fake / regularization_loss_activation
        regularization_loss_entropy = -torch.sum(p * p.log())
        return (
            regularize_activation * regularization_loss_activation
            + regularize_entropy * regularization_loss_entropy
        )


class KAN(torch.nn.Module):
    def __init__(
        self,
        layers_hidden,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        base_activation=torch.nn.SiLU,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KAN, self).__init__()
        self.grid_size = grid_size
        self.spline_order = spline_order

        self.layers = torch.nn.ModuleList()
        for in_features, out_features in zip(layers_hidden, layers_hidden[1:]):
            self.layers.append(
                KANLinear(
                    in_features,
                    out_features,
                    grid_size=grid_size,
                    spline_order=spline_order,
                    scale_noise=scale_noise,
                    scale_base=scale_base,
                    scale_spline=scale_spline,
                    base_activation=base_activation,
                    grid_eps=grid_eps,
                    grid_range=grid_range,
                )
            )

    def forward(self, x: torch.Tensor, update_grid=False):
        for layer in self.layers:
            if update_grid:
                layer.update_grid(x)
            x = layer(x)
        return x

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        return sum(
            layer.regularization_loss(regularize_activation, regularize_entropy)
            for layer in self.layers
        )
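A quick shape check of the KAN components above, independent of this repository's training setup. The layer widths below are illustrative only; in the Space, KANLinear is used as a 768-to-256 projection head.

import torch
from utils.kan import KAN, KANLinear

layer = KANLinear(in_features=768, out_features=256)
x = torch.randn(32, 768)
print(layer(x).shape)             # torch.Size([32, 256])

net = KAN([40, 64, 16])           # hidden sizes chosen for illustration
y = net(torch.randn(8, 40))
print(y.shape)                    # torch.Size([8, 16])
print(net.regularization_loss())  # scalar, usable as an optional penalty term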
utils/.ipynb_checkpoints/speech_embedder_net-checkpoint.py
ADDED
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 20:58:34 2018

@author: harry
"""

import torch
import torch.nn as nn

from utils.hparam import hparam as hp
from utils.utils import get_centroids, get_cossim, calc_loss
from utils.kan import KANLinear

class SpeechEmbedder(nn.Module):

    def __init__(self):
        super(SpeechEmbedder, self).__init__()
        self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
        for name, param in self.LSTM_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = nn.Linear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.LSTM_stack(x.float())  #(batch, frames, n_mels)
        #only use last frame
        x = x[:,x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x


class SpeechEmbedderGRU(nn.Module):
    def __init__(self):
        super(SpeechEmbedderGRU, self).__init__()
        self.GRU_stack = nn.GRU(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
        for name, param in self.GRU_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = nn.Linear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.GRU_stack(x.float())  #(batch, frames, n_mels)
        #only use last frame
        x = x[:,x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x

class SpeechEmbedderKAN(nn.Module):
    def __init__(self):
        super(SpeechEmbedderKAN, self).__init__()
        self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
        for name, param in self.LSTM_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = KANLinear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.LSTM_stack(x.float())  #(batch, frames, n_mels)
        #only use last frame
        x = x[:,x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x



class SpeechEmbedderBidirectional(nn.Module):
    def __init__(self):
        super(SpeechEmbedderBidirectional, self).__init__()
        self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True, bidirectional=True)
        for name, param in self.LSTM_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = nn.Linear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.LSTM_stack(x.float())  #(batch, frames, n_mels)
        #only use last frame
        x = x[:, :, :hp.model.hidden]

        x = x[:,x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x

class GE2ELoss(nn.Module):

    def __init__(self, device):
        super(GE2ELoss, self).__init__()
        self.w = nn.Parameter(torch.tensor(10.0).to(device), requires_grad=True)
        self.b = nn.Parameter(torch.tensor(-5.0).to(device), requires_grad=True)
        self.device = device

    def forward(self, embeddings):
        torch.clamp(self.w, 1e-6)
        centroids = get_centroids(embeddings)
        cossim = get_cossim(embeddings, centroids)
        sim_matrix = self.w*cossim.to(self.device) + self.b
        loss, _ = calc_loss(sim_matrix)
        return loss
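A minimal sanity check of the embedding network above, using random features in the shape produced by get_STFTs(): (batch, frames, n_mels) = (batch, 24, 40). It assumes config/config.yaml is readable (the layer sizes come from hp) and uses randomly initialized weights, so the embeddings are not meaningful, only the shapes and normalization are.

import torch
from utils.speech_embedder_net import SpeechEmbedder

net = SpeechEmbedder()
dummy = torch.randn(5, 24, 40)   # 5 segments, 24 frames, 40 mel bands
with torch.no_grad():
    emb = net(dummy)
print(emb.shape)                 # torch.Size([5, 256]); one embedding per segment
print(torch.norm(emb, dim=1))    # ~1.0 for every row (L2-normalized)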
utils/.ipynb_checkpoints/utils-checkpoint.py
ADDED
@@ -0,0 +1,173 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 16:56:19 2018

@author: harry
"""
import librosa
import numpy as np
import torch
import torch.autograd as grad
import torch.nn.functional as F

from utils.hparam import hparam as hp

def get_centroids_prior(embeddings):
    centroids = []
    for speaker in embeddings:
        centroid = 0
        for utterance in speaker:
            centroid = centroid + utterance
        centroid = centroid/len(speaker)
        centroids.append(centroid)
    centroids = torch.stack(centroids)
    return centroids

def get_centroids(embeddings):
    centroids = embeddings.mean(dim=1)
    return centroids

def get_centroid(embeddings, speaker_num, utterance_num):
    centroid = 0
    for utterance_id, utterance in enumerate(embeddings[speaker_num]):
        if utterance_id == utterance_num:
            continue
        centroid = centroid + utterance
    centroid = centroid/(len(embeddings[speaker_num])-1)
    return centroid

def get_utterance_centroids(embeddings):
    """
    Returns the centroids for each utterance of a speaker, where
    the utterance centroid is the speaker centroid without considering
    this utterance

    Shape of embeddings should be:
        (speaker_ct, utterance_per_speaker_ct, embedding_size)
    """
    sum_centroids = embeddings.sum(dim=1)
    # we want to subtract out each utterance, prior to calculating the
    # the utterance centroid
    sum_centroids = sum_centroids.reshape(
        sum_centroids.shape[0], 1, sum_centroids.shape[-1]
    )
    # we want the mean but not including the utterance itself, so -1
    num_utterances = embeddings.shape[1] - 1
    centroids = (sum_centroids - embeddings) / num_utterances
    return centroids

def get_cossim_prior(embeddings, centroids):
    # Calculates cosine similarity matrix. Requires (N, M, feature) input
    cossim = torch.zeros(embeddings.size(0), embeddings.size(1), centroids.size(0))
    for speaker_num, speaker in enumerate(embeddings):
        for utterance_num, utterance in enumerate(speaker):
            for centroid_num, centroid in enumerate(centroids):
                if speaker_num == centroid_num:
                    centroid = get_centroid(embeddings, speaker_num, utterance_num)
                output = F.cosine_similarity(utterance, centroid, dim=0)+1e-6
                cossim[speaker_num][utterance_num][centroid_num] = output
    return cossim

def get_cossim(embeddings, centroids):
    # number of utterances per speaker
    num_utterances = embeddings.shape[1]
    utterance_centroids = get_utterance_centroids(embeddings)

    # flatten the embeddings and utterance centroids to just utterance,
    # so we can do cosine similarity
    utterance_centroids_flat = utterance_centroids.view(
        utterance_centroids.shape[0] * utterance_centroids.shape[1],
        -1
    )
    embeddings_flat = embeddings.view(
        embeddings.shape[0] * num_utterances,
        -1
    )
    # the cosine distance between utterance and the associated centroids
    # for that utterance
    # this is each speaker's utterances against his own centroid, but each
    # comparison centroid has the current utterance removed
    cos_same = F.cosine_similarity(embeddings_flat, utterance_centroids_flat)

    # now we get the cosine distance between each utterance and the other speakers'
    # centroids
    # to do so requires comparing each utterance to each centroid. To keep the
    # operation fast, we vectorize by using matrices L (embeddings) and
    # R (centroids) where L has each utterance repeated sequentially for all
    # comparisons and R has the entire centroids frame repeated for each utterance
    centroids_expand = centroids.repeat((num_utterances * embeddings.shape[0], 1))
    embeddings_expand = embeddings_flat.unsqueeze(1).repeat(1, embeddings.shape[0], 1)
    embeddings_expand = embeddings_expand.view(
        embeddings_expand.shape[0] * embeddings_expand.shape[1],
        embeddings_expand.shape[-1]
    )
    cos_diff = F.cosine_similarity(embeddings_expand, centroids_expand)
    cos_diff = cos_diff.view(
        embeddings.size(0),
        num_utterances,
        centroids.size(0)
    )
    # assign the cosine distance for same speakers to the proper idx
    same_idx = list(range(embeddings.size(0)))
    cos_diff[same_idx, :, same_idx] = cos_same.view(embeddings.shape[0], num_utterances)
    cos_diff = cos_diff + 1e-6
    return cos_diff

def calc_loss_prior(sim_matrix):
    # Calculates loss from (N, M, K) similarity matrix
    per_embedding_loss = torch.zeros(sim_matrix.size(0), sim_matrix.size(1))
    for j in range(len(sim_matrix)):
        for i in range(sim_matrix.size(1)):
            per_embedding_loss[j][i] = -(sim_matrix[j][i][j] - ((torch.exp(sim_matrix[j][i]).sum()+1e-6).log_()))
    loss = per_embedding_loss.sum()
    return loss, per_embedding_loss

def calc_loss(sim_matrix):
    same_idx = list(range(sim_matrix.size(0)))
    pos = sim_matrix[same_idx, :, same_idx]
    neg = (torch.exp(sim_matrix).sum(dim=2) + 1e-6).log_()
    per_embedding_loss = -1 * (pos - neg)
    loss = per_embedding_loss.sum()
    return loss, per_embedding_loss

def normalize_0_1(values, max_value, min_value):
    normalized = np.clip((values - min_value) / (max_value - min_value), 0, 1)
    return normalized

def mfccs_and_spec(wav_file, wav_process=False, calc_mfccs=False, calc_mag_db=False):
    sound_file, _ = librosa.core.load(wav_file, sr=hp.data.sr)
    window_length = int(hp.data.window*hp.data.sr)
    hop_length = int(hp.data.hop*hp.data.sr)
    duration = hp.data.tisv_frame * hp.data.hop + hp.data.window

    # Cut silence and fix length
    if wav_process == True:
        sound_file, index = librosa.effects.trim(sound_file, frame_length=window_length, hop_length=hop_length)
        length = int(hp.data.sr * duration)
        sound_file = librosa.util.fix_length(sound_file, length)

    spec = librosa.stft(sound_file, n_fft=hp.data.nfft, hop_length=hop_length, win_length=window_length)
    mag_spec = np.abs(spec)

    mel_basis = librosa.filters.mel(hp.data.sr, hp.data.nfft, n_mels=hp.data.nmels)
    mel_spec = np.dot(mel_basis, mag_spec)

    mag_db = librosa.amplitude_to_db(mag_spec)
    # db mel spectrogram
    mel_db = librosa.amplitude_to_db(mel_spec).T

    mfccs = None
    if calc_mfccs:
        mfccs = np.dot(librosa.filters.dct(40, mel_db.shape[0]), mel_db).T

    return mfccs, mel_db, mag_db

if __name__ == "__main__":
    w = grad.Variable(torch.tensor(1.0))
    b = grad.Variable(torch.tensor(0.0))
    embeddings = torch.tensor([[0,1,0],[0,0,1], [0,1,0], [0,1,0], [1,0,0], [1,0,0]]).to(torch.float).reshape(3,2,3)
    centroids = get_centroids(embeddings)
    cossim = get_cossim(embeddings, centroids)
    sim_matrix = w*cossim + b
    loss, per_embedding_loss = calc_loss(sim_matrix)
utils/VAD_segments.py
ADDED
@@ -0,0 +1,153 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 18 16:22:41 2018

@author: Harry
Modified from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
"""

import collections
import contextlib
import numpy as np
import sys
import librosa
import wave

import webrtcvad

from utils.hparam import hparam as hp

def read_wave(path, sr):
    """Reads a .wav file.
    Takes the path, and returns (PCM audio data, sample rate).
    Assumes sample width == 2
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
    data, _ = librosa.load(path, sr=sr)
    assert len(data.shape) == 1
    assert sr in (8000, 16000, 32000, 48000)
    return data, pcm_data

class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.
    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    Returns: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                start = ring_buffer[0][0].timestamp
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield (start, frame.timestamp + frame.duration)
                ring_buffer.clear()
                voiced_frames = []
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield (start, frame.timestamp + frame.duration)


def VAD_chunk(aggressiveness, path):
    audio, byte_audio = read_wave(path, sr=hp.data.sr)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = frame_generator(20, byte_audio, hp.data.sr)
    frames = list(frames)
    times = vad_collector(hp.data.sr, 20, 200, vad, frames)
    speech_times = []
    speech_segs = []
    for i, time in enumerate(times):
        start = np.round(time[0], decimals=2)
        end = np.round(time[1], decimals=2)
        j = start
        while j + .4 < end:
            end_j = np.round(j+.4, decimals=2)
            speech_times.append((j, end_j))
            speech_segs.append(audio[int(j*hp.data.sr):int(end_j*hp.data.sr)])
            j = end_j
        else:
            speech_times.append((j, end))
            speech_segs.append(audio[int(j*hp.data.sr):int(end*hp.data.sr)])
    return speech_times, speech_segs

if __name__ == '__main__':
    speech_times, speech_segs = VAD_chunk(sys.argv[1], sys.argv[2])
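
A minimal usage sketch for VAD_chunk (the WAV path "sample.wav" is a placeholder, and hp.data.sr is assumed to be one of webrtcvad's supported rates, as configured in config/config.yaml):

# Sketch only: "sample.wav" is a hypothetical mono, 16-bit recording; aggressiveness ranges 0-3.
from utils.VAD_segments import VAD_chunk

speech_times, speech_segs = VAD_chunk(2, "sample.wav")
for (start, end), seg in zip(speech_times, speech_segs):
    print(f"voiced {start:.2f}s-{end:.2f}s -> {len(seg)} samples")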
utils/__init__.py
ADDED
File without changes
utils/__pycache__/VAD_segments.cpython-39.pyc
ADDED
Binary file (4.68 kB)
utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (162 Bytes)
utils/__pycache__/data_load.cpython-39.pyc
ADDED
Binary file (2.05 kB)
utils/__pycache__/evaluation.cpython-39.pyc
ADDED
Binary file (6.62 kB)
utils/__pycache__/hparam.cpython-39.pyc
ADDED
Binary file (1.98 kB)
utils/__pycache__/kan.cpython-39.pyc
ADDED
Binary file (7.57 kB)
utils/__pycache__/speech_embedder_net.cpython-39.pyc
ADDED
Binary file (4.45 kB)
utils/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (4.7 kB)
utils/data_load.py
ADDED
@@ -0,0 +1,57 @@
"""
Mostly copied from https://github.com/HarryVolek/PyTorch_Speaker_Verification
"""
import glob
import numpy as np
import os
import random
from random import shuffle
import torch
from torch.utils.data import Dataset

from utils.hparam import hparam as hp
from utils.utils import mfccs_and_spec

class GujaratiSpeakerVerificationDataset(Dataset):

    def __init__(self, shuffle=True, utter_start=0, split='train'):
        # data path
        if split != 'val':
            self.path = hp.data.train_path
            self.utter_num = hp.train.M
        else:
            self.path = hp.data.test_path
            self.utter_num = hp.test.M
        self.file_list = os.listdir(self.path)
        self.shuffle = shuffle
        self.utter_start = utter_start
        self.split = split

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):

        np_file_list = os.listdir(self.path)

        if self.shuffle:
            selected_file = random.sample(np_file_list, 1)[0]  # select random speaker
        else:
            selected_file = np_file_list[idx]

        utters = np.load(os.path.join(self.path, selected_file))

        # load utterance spectrogram of selected speaker
        if self.shuffle:
            utter_index = np.random.randint(0, utters.shape[0], self.utter_num)  # select M utterances per speaker
            utterance = utters[utter_index]
        else:
            utterance = utters[self.utter_start: self.utter_start+self.utter_num]  # utterances of a speaker [batch(M), n_mels, frames]

        utterance = utterance[:,:,:160]  # TODO implement variable length batch size

        utterance = torch.tensor(np.transpose(utterance, axes=(0,2,1)))  # transpose [batch, frames, n_mels]
        return utterance

    def __repr__(self):
        return f"{self.__class__.__name__}(split={self.split!r}, num_speakers={len(self.file_list)}, num_utterances={self.utter_num})"
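
A short sketch of wrapping this dataset in a DataLoader for GE2E-style batches. The per-speaker .npy layout under hp.data.train_path and a speakers-per-batch key hp.train.N are assumptions about config/config.yaml, not shown here:

# Sketch: assumes hp.data.train_path holds one .npy per speaker with shape
# (num_utterances, n_mels, frames) and that hp.train.N is the speakers-per-batch count.
from torch.utils.data import DataLoader
from utils.data_load import GujaratiSpeakerVerificationDataset
from utils.hparam import hparam as hp

train_dataset = GujaratiSpeakerVerificationDataset(split='train')
train_loader = DataLoader(train_dataset, batch_size=hp.train.N, shuffle=True, drop_last=True)
mel_batch = next(iter(train_loader))  # (N speakers, M utterances, frames, n_mels)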
utils/evaluation.py
ADDED
@@ -0,0 +1,192 @@
from torch.utils.data import Dataset
from tqdm.auto import tqdm
import os
import librosa
import numpy as np
import torch
import random
from numpy.linalg import norm

from utils.VAD_segments import VAD_chunk
from utils.hparam import hparam as hp

class GujaratiSpeakerVerificationDatasetTest(Dataset):
    def __init__(self, path, shuffle=True, utter_start=0):
        # data path
        self.path = path
        self.file_list = os.listdir(self.path)
        self.shuffle = shuffle
        self.utter_start = utter_start
        self.utter_num = 4

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):

        np_file_list = self.file_list

        selected_file = np_file_list[idx]

        utters = np.load(os.path.join(self.path, selected_file))

        # load utterance spectrogram of selected speaker
        if self.shuffle:
            utter_index = np.random.randint(0, utters.shape[0], self.utter_num)  # select M utterances per speaker
            utterance = utters[utter_index]
        else:
            utterance = utters[self.utter_start: self.utter_start+self.utter_num]  # utterances of a speaker [batch(M), n_mels, frames]

        utterance = utterance[:,:,:160]  # TODO implement variable length batch size

        utterance = torch.tensor(np.transpose(utterance, axes=(0,2,1)))  # transpose [batch, frames, n_mels]
        return utterance

def concat_segs(times, segs):
    concat_seg = []
    seg_concat = segs[0]
    for i in range(0, len(times)-1):
        if times[i][1] == times[i+1][0]:
            seg_concat = np.concatenate((seg_concat, segs[i+1]))
        else:
            concat_seg.append(seg_concat)
            seg_concat = segs[i+1]
    else:
        concat_seg.append(seg_concat)
    return concat_seg


def get_STFTs(segs):
    sr = 16000
    STFT_frames = []
    for seg in segs:
        S = librosa.core.stft(y=seg, n_fft=hp.data.nfft,
                              win_length=int(hp.data.window * sr), hop_length=int(hp.data.hop * sr))
        S = np.abs(S)**2
        mel_basis = librosa.filters.mel(sr=sr, n_fft=hp.data.nfft, n_mels=hp.data.nmels)
        S = np.log10(np.dot(mel_basis, S) + 1e-6)
        for j in range(0, S.shape[1], int(.12/hp.data.hop)):
            if j + 24 < S.shape[1]:
                STFT_frames.append(S[:, j:j+24])
            else:
                break
    return STFT_frames


def get_embedding(file_path, embedder_net, device, n_threshold=-1):
    times, segs = VAD_chunk(2, file_path)
    if not segs:
        print(f'No voice activity detected in {file_path}')
        return None
    concat_seg = concat_segs(times, segs)
    if not concat_seg:
        print(f'No concatenated segments for {file_path}')
        return None
    STFT_frames = get_STFTs(concat_seg)
    if not STFT_frames:
        # print(f'No STFT frames for {file_path}')
        return None
    STFT_frames = np.stack(STFT_frames, axis=2)
    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)), device=device)

    with torch.no_grad():
        embeddings = embedder_net(STFT_frames)
        embeddings = embeddings[:n_threshold, :]

    avg_embedding = torch.mean(embeddings, dim=0, keepdim=True).cpu().numpy()
    return avg_embedding

def get_speaker_embeddings_listdir(embedder_net, device, list_dir, k):
    speaker_embeddings = {}
    for speaker_name in tqdm(list_dir, leave=False):
        speaker_dir = speaker_name
        if os.path.isdir(speaker_dir) and os.path.basename(speaker_dir) != ".DS_Store":  # skip macOS metadata entries
            speaker_embeddings[speaker_name] = []
            for i in range(10):
                embeddings = []
                audio_files = [os.path.join(speaker_dir, f) for f in os.listdir(speaker_dir) if f.endswith('.wav')]
                random.shuffle(audio_files)
                count = 0
                iter_ = 0
                while(count <= k):
                    file_path = audio_files[iter_]
                    embedding = get_embedding(file_path, embedder_net, device)
                    try:
                        _ = embedding.shape
                        embeddings.append(embedding)
                        count += 1
                        iter_ += 1
                    except:
                        iter_ += 1
                speaker_embeddings[speaker_name].append(np.mean(embeddings, axis=0))
    return speaker_embeddings

def create_pairs(speaker_embeddings):
    pairs = []
    labels = []
    speakers = list(speaker_embeddings.keys())

    for i in range(len(speakers)):
        for j in range(len(speakers)):
            for k1 in range(10):
                for k2 in range(10):
                    emb1 = speaker_embeddings[speakers[i]][k1]
                    emb2 = speaker_embeddings[speakers[j]][k2]
                    pairs.append((emb1, emb2))
                    if i == j and not((emb1 == emb2).all()):
                        labels.append(1)  # Same speaker
                    else:
                        labels.append(0)  # Different speakers
    return pairs, labels

class EmbeddingPairDataset(Dataset):
    def __init__(self, pairs, labels):
        self.pairs = pairs
        self.labels = labels

    def __getitem__(self, idx):
        emb1, emb2 = self.pairs[idx]
        label = self.labels[idx]

        emb1, emb2 = torch.tensor(emb1, dtype=torch.float32), torch.tensor(emb2, dtype=torch.float32)

        concatenated = torch.cat((emb1, emb2), dim=1)

        return concatenated.squeeze(), torch.tensor(label, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __repr__(self):
        return f"{self.__class__.__name__}(length={self.__len__()})"


def cosine_similarity(A, B):
    A = A.flatten().astype(np.float64)
    B = B.flatten().astype(np.float64)
    cosine = np.dot(A, B)/(norm(A)*norm(B))
    return cosine


def create_subset(dataset, num_zeros):
    pairs = dataset.pairs
    labels = dataset.labels

    pairs_1 = [pairs[i] for i in range(len(pairs)) if labels[i] == 1]
    labels_1 = [labels[i] for i in range(len(labels)) if labels[i] == 1]

    pairs_0 = [pairs[i] for i in range(len(pairs)) if labels[i] == 0]
    labels_0 = [labels[i] for i in range(len(labels)) if labels[i] == 0]

    num_zeros = min(num_zeros, len(pairs_0))

    pairs_0 = pairs_0[:num_zeros]
    labels_0 = labels_0[:num_zeros]

    filtered_pairs = pairs_1 + pairs_0
    filtered_labels = labels_1 + labels_0

    return filtered_pairs, filtered_labels
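
A sketch of a single verification trial built from these helpers. The two WAV paths are placeholders, and loading speech_id_checkpoint/saved_02.model into the LSTM SpeechEmbedder is an assumption for illustration:

# Sketch only: "spk_a.wav" / "spk_b.wav" are hypothetical recordings.
import torch
from utils.speech_embedder_net import SpeechEmbedder
from utils.evaluation import get_embedding, cosine_similarity

device = torch.device("cpu")
embedder_net = SpeechEmbedder().to(device)
embedder_net.load_state_dict(torch.load("speech_id_checkpoint/saved_02.model", map_location=device))
embedder_net.eval()

emb_a = get_embedding("spk_a.wav", embedder_net, device)
emb_b = get_embedding("spk_b.wav", embedder_net, device)
if emb_a is not None and emb_b is not None:
    print("cosine similarity:", cosine_similarity(emb_a, emb_b))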
utils/hparam.py
ADDED
@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python

import yaml

def load_hparam(filename):
    stream = open(filename, 'r')
    docs = yaml.load_all(stream, Loader=yaml.Loader)
    hparam_dict = dict()
    for doc in docs:
        for k, v in doc.items():
            hparam_dict[k] = v
    return hparam_dict

def merge_dict(user, default):
    if isinstance(user, dict) and isinstance(default, dict):
        for k, v in default.items():
            if k not in user:
                user[k] = v
            else:
                user[k] = merge_dict(user[k], v)
    return user


class Dotdict(dict):
    """
    a dictionary that supports dot notation
    as well as dictionary access notation
    usage: d = DotDict() or d = DotDict({'val1':'first'})
    set attributes: d.val2 = 'second' or d['val2'] = 'second'
    get attributes: d.val2 or d['val2']
    """
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    def __init__(self, dct=None):
        dct = dict() if not dct else dct
        for key, value in dct.items():
            if hasattr(value, 'keys'):
                value = Dotdict(value)
            self[key] = value


class Hparam(Dotdict):

    def __init__(self, file='config/config.yaml'):
        super(Dotdict, self).__init__()
        hp_dict = load_hparam(file)
        hp_dotdict = Dotdict(hp_dict)
        for k, v in hp_dotdict.items():
            setattr(self, k, v)

    __getattr__ = Dotdict.__getitem__
    __setattr__ = Dotdict.__setitem__
    __delattr__ = Dotdict.__delitem__


hparam = Hparam()
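
Since hparam is instantiated at import time, the YAML keys in config/config.yaml become nested attributes. A small sketch (the exact keys depend on that config file; the ones below are the ones referenced by the other utils modules):

# Sketch: attribute names mirror the YAML structure loaded from config/config.yaml.
from utils.hparam import hparam as hp

print(hp.data.sr, hp.data.nmels)       # audio settings shared across the utils modules
print(hp.model.hidden, hp.model.proj)  # LSTM width and embedding size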
utils/kan.py
ADDED
@@ -0,0 +1,285 @@
import torch
import torch.nn.functional as F
import math


class KANLinear(torch.nn.Module):
    def __init__(
        self,
        in_features,
        out_features,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        enable_standalone_scale_spline=True,
        base_activation=torch.nn.SiLU,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KANLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.grid_size = grid_size
        self.spline_order = spline_order

        h = (grid_range[1] - grid_range[0]) / grid_size
        grid = (
            (
                torch.arange(-spline_order, grid_size + spline_order + 1) * h
                + grid_range[0]
            )
            .expand(in_features, -1)
            .contiguous()
        )
        self.register_buffer("grid", grid)

        self.base_weight = torch.nn.Parameter(torch.Tensor(out_features, in_features))
        self.spline_weight = torch.nn.Parameter(
            torch.Tensor(out_features, in_features, grid_size + spline_order)
        )
        if enable_standalone_scale_spline:
            self.spline_scaler = torch.nn.Parameter(
                torch.Tensor(out_features, in_features)
            )

        self.scale_noise = scale_noise
        self.scale_base = scale_base
        self.scale_spline = scale_spline
        self.enable_standalone_scale_spline = enable_standalone_scale_spline
        self.base_activation = base_activation()
        self.grid_eps = grid_eps

        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.kaiming_uniform_(self.base_weight, a=math.sqrt(5) * self.scale_base)
        with torch.no_grad():
            noise = (
                (
                    torch.rand(self.grid_size + 1, self.in_features, self.out_features)
                    - 1 / 2
                )
                * self.scale_noise
                / self.grid_size
            )
            self.spline_weight.data.copy_(
                (self.scale_spline if not self.enable_standalone_scale_spline else 1.0)
                * self.curve2coeff(
                    self.grid.T[self.spline_order : -self.spline_order],
                    noise,
                )
            )
            if self.enable_standalone_scale_spline:
                # torch.nn.init.constant_(self.spline_scaler, self.scale_spline)
                torch.nn.init.kaiming_uniform_(self.spline_scaler, a=math.sqrt(5) * self.scale_spline)

    def b_splines(self, x: torch.Tensor):
        """
        Compute the B-spline bases for the given input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_features).

        Returns:
            torch.Tensor: B-spline bases tensor of shape (batch_size, in_features, grid_size + spline_order).
        """
        assert x.dim() == 2 and x.size(1) == self.in_features

        grid: torch.Tensor = (
            self.grid
        )  # (in_features, grid_size + 2 * spline_order + 1)
        x = x.unsqueeze(-1)
        bases = ((x >= grid[:, :-1]) & (x < grid[:, 1:])).to(x.dtype)
        for k in range(1, self.spline_order + 1):
            bases = (
                (x - grid[:, : -(k + 1)])
                / (grid[:, k:-1] - grid[:, : -(k + 1)])
                * bases[:, :, :-1]
            ) + (
                (grid[:, k + 1 :] - x)
                / (grid[:, k + 1 :] - grid[:, 1:(-k)])
                * bases[:, :, 1:]
            )

        assert bases.size() == (
            x.size(0),
            self.in_features,
            self.grid_size + self.spline_order,
        )
        return bases.contiguous()

    def curve2coeff(self, x: torch.Tensor, y: torch.Tensor):
        """
        Compute the coefficients of the curve that interpolates the given points.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
            y (torch.Tensor): Output tensor of shape (batch_size, in_features, out_features).

        Returns:
            torch.Tensor: Coefficients tensor of shape (out_features, in_features, grid_size + spline_order).
        """
        assert x.dim() == 2 and x.size(1) == self.in_features
        assert y.size() == (x.size(0), self.in_features, self.out_features)

        A = self.b_splines(x).transpose(
            0, 1
        )  # (in_features, batch_size, grid_size + spline_order)
        B = y.transpose(0, 1)  # (in_features, batch_size, out_features)
        solution = torch.linalg.lstsq(
            A, B
        ).solution  # (in_features, grid_size + spline_order, out_features)
        result = solution.permute(
            2, 0, 1
        )  # (out_features, in_features, grid_size + spline_order)

        assert result.size() == (
            self.out_features,
            self.in_features,
            self.grid_size + self.spline_order,
        )
        return result.contiguous()

    @property
    def scaled_spline_weight(self):
        return self.spline_weight * (
            self.spline_scaler.unsqueeze(-1)
            if self.enable_standalone_scale_spline
            else 1.0
        )

    def forward(self, x: torch.Tensor):
        assert x.size(-1) == self.in_features
        original_shape = x.shape
        x = x.view(-1, self.in_features)

        base_output = F.linear(self.base_activation(x), self.base_weight)
        spline_output = F.linear(
            self.b_splines(x).view(x.size(0), -1),
            self.scaled_spline_weight.view(self.out_features, -1),
        )
        output = base_output + spline_output

        output = output.view(*original_shape[:-1], self.out_features)
        return output

    @torch.no_grad()
    def update_grid(self, x: torch.Tensor, margin=0.01):
        assert x.dim() == 2 and x.size(1) == self.in_features
        batch = x.size(0)

        splines = self.b_splines(x)  # (batch, in, coeff)
        splines = splines.permute(1, 0, 2)  # (in, batch, coeff)
        orig_coeff = self.scaled_spline_weight  # (out, in, coeff)
        orig_coeff = orig_coeff.permute(1, 2, 0)  # (in, coeff, out)
        unreduced_spline_output = torch.bmm(splines, orig_coeff)  # (in, batch, out)
        unreduced_spline_output = unreduced_spline_output.permute(
            1, 0, 2
        )  # (batch, in, out)

        # sort each channel individually to collect data distribution
        x_sorted = torch.sort(x, dim=0)[0]
        grid_adaptive = x_sorted[
            torch.linspace(
                0, batch - 1, self.grid_size + 1, dtype=torch.int64, device=x.device
            )
        ]

        uniform_step = (x_sorted[-1] - x_sorted[0] + 2 * margin) / self.grid_size
        grid_uniform = (
            torch.arange(
                self.grid_size + 1, dtype=torch.float32, device=x.device
            ).unsqueeze(1)
            * uniform_step
            + x_sorted[0]
            - margin
        )

        grid = self.grid_eps * grid_uniform + (1 - self.grid_eps) * grid_adaptive
        grid = torch.concatenate(
            [
                grid[:1]
                - uniform_step
                * torch.arange(self.spline_order, 0, -1, device=x.device).unsqueeze(1),
                grid,
                grid[-1:]
                + uniform_step
                * torch.arange(1, self.spline_order + 1, device=x.device).unsqueeze(1),
            ],
            dim=0,
        )

        self.grid.copy_(grid.T)
        self.spline_weight.data.copy_(self.curve2coeff(x, unreduced_spline_output))

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        """
        Compute the regularization loss.

        This is a dumb simulation of the original L1 regularization as stated in the
        paper, since the original one requires computing absolutes and entropy from the
        expanded (batch, in_features, out_features) intermediate tensor, which is hidden
        behind the F.linear function if we want a memory-efficient implementation.

        The L1 regularization is now computed as the mean absolute value of the spline
        weights. The authors' implementation also includes this term in addition to the
        sample-based regularization.
        """
        l1_fake = self.spline_weight.abs().mean(-1)
        regularization_loss_activation = l1_fake.sum()
        p = l1_fake / regularization_loss_activation
        regularization_loss_entropy = -torch.sum(p * p.log())
        return (
            regularize_activation * regularization_loss_activation
            + regularize_entropy * regularization_loss_entropy
        )


class KAN(torch.nn.Module):
    def __init__(
        self,
        layers_hidden,
        grid_size=5,
        spline_order=3,
        scale_noise=0.1,
        scale_base=1.0,
        scale_spline=1.0,
        base_activation=torch.nn.SiLU,
        grid_eps=0.02,
        grid_range=[-1, 1],
    ):
        super(KAN, self).__init__()
        self.grid_size = grid_size
        self.spline_order = spline_order

        self.layers = torch.nn.ModuleList()
        for in_features, out_features in zip(layers_hidden, layers_hidden[1:]):
            self.layers.append(
                KANLinear(
                    in_features,
                    out_features,
                    grid_size=grid_size,
                    spline_order=spline_order,
                    scale_noise=scale_noise,
                    scale_base=scale_base,
                    scale_spline=scale_spline,
                    base_activation=base_activation,
                    grid_eps=grid_eps,
                    grid_range=grid_range,
                )
            )

    def forward(self, x: torch.Tensor, update_grid=False):
        for layer in self.layers:
            if update_grid:
                layer.update_grid(x)
            x = layer(x)
        return x

    def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0):
        return sum(
            layer.regularization_loss(regularize_activation, regularize_entropy)
            for layer in self.layers
        )
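
A standalone sketch of the KAN layers on random tensors (the shapes below are illustrative only):

# Sketch: exercises KANLinear and the stacked KAN wrapper on dummy data.
import torch
from utils.kan import KAN, KANLinear

x = torch.randn(8, 64)           # (batch, in_features)
layer = KANLinear(64, 32)
print(layer(x).shape)            # torch.Size([8, 32])

net = KAN([64, 128, 10])         # a stack of KANLinear layers
logits = net(x)
reg = net.regularization_loss()  # optional penalty to add to the task loss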
utils/speech_embedder_net.py
ADDED
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 20:58:34 2018

@author: harry
"""

import torch
import torch.nn as nn

from utils.hparam import hparam as hp
from utils.utils import get_centroids, get_cossim, calc_loss
from utils.kan import KANLinear

class SpeechEmbedder(nn.Module):

    def __init__(self):
        super(SpeechEmbedder, self).__init__()
        self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
        for name, param in self.LSTM_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = nn.Linear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.LSTM_stack(x.float())  # (batch, frames, n_mels)
        # only use last frame
        x = x[:, x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x


class SpeechEmbedderGRU(nn.Module):
    def __init__(self):
        super(SpeechEmbedderGRU, self).__init__()
        self.GRU_stack = nn.GRU(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
        for name, param in self.GRU_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = nn.Linear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.GRU_stack(x.float())  # (batch, frames, n_mels)
        # only use last frame
        x = x[:, x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x


class SpeechEmbedderKAN(nn.Module):
    def __init__(self):
        super(SpeechEmbedderKAN, self).__init__()
        self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
        for name, param in self.LSTM_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = KANLinear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.LSTM_stack(x.float())  # (batch, frames, n_mels)
        # only use last frame
        x = x[:, x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x


class SpeechEmbedderBidirectional(nn.Module):
    def __init__(self):
        super(SpeechEmbedderBidirectional, self).__init__()
        self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True, bidirectional=True)
        for name, param in self.LSTM_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        self.projection = nn.Linear(hp.model.hidden, hp.model.proj)

    def forward(self, x):
        x, _ = self.LSTM_stack(x.float())  # (batch, frames, n_mels)
        # only use last frame
        x = x[:, :, :hp.model.hidden]
        x = x[:, x.size(1)-1]
        x = self.projection(x.float())
        x = x / torch.norm(x, dim=1).unsqueeze(1)
        return x


class GE2ELoss(nn.Module):

    def __init__(self, device):
        super(GE2ELoss, self).__init__()
        self.w = nn.Parameter(torch.tensor(10.0).to(device), requires_grad=True)
        self.b = nn.Parameter(torch.tensor(-5.0).to(device), requires_grad=True)
        self.device = device

    def forward(self, embeddings):
        torch.clamp(self.w, 1e-6)
        centroids = get_centroids(embeddings)
        cossim = get_cossim(embeddings, centroids)
        sim_matrix = self.w * cossim.to(self.device) + self.b
        loss, _ = calc_loss(sim_matrix)
        return loss
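
A sketch of one GE2E training step with these modules. The values N speakers, M utterances per speaker, and 160 frames are illustrative only, chosen to mirror the (N, M, ...) layout that GE2ELoss expects:

# Sketch: dummy mel batch, one forward/backward pass through SpeechEmbedder + GE2ELoss.
import torch
from utils.hparam import hparam as hp
from utils.speech_embedder_net import SpeechEmbedder, GE2ELoss

device = torch.device("cpu")
N, M, frames = 4, 5, 160
embedder_net = SpeechEmbedder().to(device)
ge2e_loss = GE2ELoss(device)

mels = torch.randn(N * M, frames, hp.data.nmels, device=device)
embeddings = embedder_net(mels).view(N, M, -1)  # (speakers, utterances, proj)
loss = ge2e_loss(embeddings)
loss.backward()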
utils/utils.py
ADDED
@@ -0,0 +1,173 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 16:56:19 2018

@author: harry
"""
import librosa
import numpy as np
import torch
import torch.autograd as grad
import torch.nn.functional as F

from utils.hparam import hparam as hp

def get_centroids_prior(embeddings):
    centroids = []
    for speaker in embeddings:
        centroid = 0
        for utterance in speaker:
            centroid = centroid + utterance
        centroid = centroid/len(speaker)
        centroids.append(centroid)
    centroids = torch.stack(centroids)
    return centroids

def get_centroids(embeddings):
    centroids = embeddings.mean(dim=1)
    return centroids

def get_centroid(embeddings, speaker_num, utterance_num):
    centroid = 0
    for utterance_id, utterance in enumerate(embeddings[speaker_num]):
        if utterance_id == utterance_num:
            continue
        centroid = centroid + utterance
    centroid = centroid/(len(embeddings[speaker_num])-1)
    return centroid

def get_utterance_centroids(embeddings):
    """
    Returns the centroids for each utterance of a speaker, where
    the utterance centroid is the speaker centroid without considering
    this utterance

    Shape of embeddings should be:
        (speaker_ct, utterance_per_speaker_ct, embedding_size)
    """
    sum_centroids = embeddings.sum(dim=1)
    # we want to subtract out each utterance, prior to calculating the
    # the utterance centroid
    sum_centroids = sum_centroids.reshape(
        sum_centroids.shape[0], 1, sum_centroids.shape[-1]
    )
    # we want the mean but not including the utterance itself, so -1
    num_utterances = embeddings.shape[1] - 1
    centroids = (sum_centroids - embeddings) / num_utterances
    return centroids

def get_cossim_prior(embeddings, centroids):
    # Calculates cosine similarity matrix. Requires (N, M, feature) input
    cossim = torch.zeros(embeddings.size(0), embeddings.size(1), centroids.size(0))
    for speaker_num, speaker in enumerate(embeddings):
        for utterance_num, utterance in enumerate(speaker):
            for centroid_num, centroid in enumerate(centroids):
                if speaker_num == centroid_num:
                    centroid = get_centroid(embeddings, speaker_num, utterance_num)
                output = F.cosine_similarity(utterance, centroid, dim=0)+1e-6
                cossim[speaker_num][utterance_num][centroid_num] = output
    return cossim

def get_cossim(embeddings, centroids):
    # number of utterances per speaker
    num_utterances = embeddings.shape[1]
    utterance_centroids = get_utterance_centroids(embeddings)

    # flatten the embeddings and utterance centroids to just utterance,
    # so we can do cosine similarity
    utterance_centroids_flat = utterance_centroids.view(
        utterance_centroids.shape[0] * utterance_centroids.shape[1],
        -1
    )
    embeddings_flat = embeddings.view(
        embeddings.shape[0] * num_utterances,
        -1
    )
    # the cosine distance between utterance and the associated centroids
    # for that utterance
    # this is each speaker's utterances against his own centroid, but each
    # comparison centroid has the current utterance removed
    cos_same = F.cosine_similarity(embeddings_flat, utterance_centroids_flat)

    # now we get the cosine distance between each utterance and the other speakers'
    # centroids
    # to do so requires comparing each utterance to each centroid. To keep the
    # operation fast, we vectorize by using matrices L (embeddings) and
    # R (centroids) where L has each utterance repeated sequentially for all
    # comparisons and R has the entire centroids frame repeated for each utterance
    centroids_expand = centroids.repeat((num_utterances * embeddings.shape[0], 1))
    embeddings_expand = embeddings_flat.unsqueeze(1).repeat(1, embeddings.shape[0], 1)
    embeddings_expand = embeddings_expand.view(
        embeddings_expand.shape[0] * embeddings_expand.shape[1],
        embeddings_expand.shape[-1]
    )
    cos_diff = F.cosine_similarity(embeddings_expand, centroids_expand)
    cos_diff = cos_diff.view(
        embeddings.size(0),
        num_utterances,
        centroids.size(0)
    )
    # assign the cosine distance for same speakers to the proper idx
    same_idx = list(range(embeddings.size(0)))
    cos_diff[same_idx, :, same_idx] = cos_same.view(embeddings.shape[0], num_utterances)
    cos_diff = cos_diff + 1e-6
    return cos_diff

def calc_loss_prior(sim_matrix):
    # Calculates loss from (N, M, K) similarity matrix
    per_embedding_loss = torch.zeros(sim_matrix.size(0), sim_matrix.size(1))
    for j in range(len(sim_matrix)):
        for i in range(sim_matrix.size(1)):
            per_embedding_loss[j][i] = -(sim_matrix[j][i][j] - ((torch.exp(sim_matrix[j][i]).sum()+1e-6).log_()))
    loss = per_embedding_loss.sum()
    return loss, per_embedding_loss

def calc_loss(sim_matrix):
    same_idx = list(range(sim_matrix.size(0)))
    pos = sim_matrix[same_idx, :, same_idx]
    neg = (torch.exp(sim_matrix).sum(dim=2) + 1e-6).log_()
    per_embedding_loss = -1 * (pos - neg)
    loss = per_embedding_loss.sum()
    return loss, per_embedding_loss

def normalize_0_1(values, max_value, min_value):
    normalized = np.clip((values - min_value) / (max_value - min_value), 0, 1)
    return normalized

def mfccs_and_spec(wav_file, wav_process=False, calc_mfccs=False, calc_mag_db=False):
    sound_file, _ = librosa.core.load(wav_file, sr=hp.data.sr)
    window_length = int(hp.data.window*hp.data.sr)
    hop_length = int(hp.data.hop*hp.data.sr)
    duration = hp.data.tisv_frame * hp.data.hop + hp.data.window

    # Cut silence and fix length
    if wav_process == True:
        sound_file, index = librosa.effects.trim(sound_file, frame_length=window_length, hop_length=hop_length)
        length = int(hp.data.sr * duration)
        sound_file = librosa.util.fix_length(sound_file, length)

    spec = librosa.stft(sound_file, n_fft=hp.data.nfft, hop_length=hop_length, win_length=window_length)
    mag_spec = np.abs(spec)

    mel_basis = librosa.filters.mel(hp.data.sr, hp.data.nfft, n_mels=hp.data.nmels)
    mel_spec = np.dot(mel_basis, mag_spec)

    mag_db = librosa.amplitude_to_db(mag_spec)
    # db mel spectrogram
    mel_db = librosa.amplitude_to_db(mel_spec).T

    mfccs = None
    if calc_mfccs:
        mfccs = np.dot(librosa.filters.dct(40, mel_db.shape[0]), mel_db).T

    return mfccs, mel_db, mag_db

if __name__ == "__main__":
    w = grad.Variable(torch.tensor(1.0))
    b = grad.Variable(torch.tensor(0.0))
    embeddings = torch.tensor([[0,1,0],[0,0,1], [0,1,0], [0,1,0], [1,0,0], [1,0,0]]).to(torch.float).reshape(3,2,3)
    centroids = get_centroids(embeddings)
    cossim = get_cossim(embeddings, centroids)
    sim_matrix = w*cossim + b
    loss, per_embedding_loss = calc_loss(sim_matrix)