maskgct

Running

File size: 6,198 Bytes

c968fc3

# Source: https://github.com/microsoft/DNS-Challenge/tree/master/DNSMOS
#
# Copyright (c) 2022 Microsoft
#
# This code is licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0) license.
# The full license text is available at the root of the source repository.
#
# Note: This code has been modified to fit the context of this repository.
#       This code is included in an MIT-licensed repository.
#       The repository's MIT license does not apply to this code.

import os
import librosa
import numpy as np
import onnxruntime as ort
import pandas as pd
import tqdm
import warnings


warnings.filterwarnings("ignore")

SAMPLING_RATE = 16000
INPUT_LENGTH = 9.01


class ComputeScore:
    """
    ComputeScore class for evaluating DNSMOS.
    """

    def __init__(self, primary_model_path, device="cpu") -> None:
        """
        Initialize the ComputeScore object.

        Args:
            primary_model_path (str): Path to the primary model.
            device (str): Device to run the models on ('cpu' or 'cuda').

        Returns:
            None

        Raises:
            RuntimeError: If the device is not supported.
        """
        if device == "cuda":
            self.onnx_sess = ort.InferenceSession(
                primary_model_path, providers=["CUDAExecutionProvider"]
            )
            print("Using CUDA:", self.onnx_sess.get_providers())
        else:
            self.onnx_sess = ort.InferenceSession(primary_model_path)

    def audio_melspec(
        self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True
    ):
        """
        Compute the mel spectrogram of an audio signal.

        Args:
            audio (np.ndarray): Input audio signal.
            n_mels (int): Number of mel bands.
            frame_size (int): Size of the FFT window.
            hop_length (int): Number of samples between successive frames.
            sr (int): Sampling rate.
            to_db (bool): Whether to convert the power spectrogram to decibel units.

        Returns:
            np.ndarray: Mel spectrogram.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels
        )
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """
        Apply polynomial fitting to MOS scores.

        Args:
            sig (float): Signal MOS score.
            bak (float): Background MOS score.
            ovr (float): Overall MOS score.
            is_personalized_MOS (bool): Flag for personalized MOS.

        Returns:
            tuple: Tuple containing the adjusted signal, background, and overall MOS scores.
        """
        if is_personalized_MOS:
            p_ovr = np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046])
            p_sig = np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726])
            p_bak = np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132])
        else:
            p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
            p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
            p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

        sig_poly = p_sig(sig)
        bak_poly = p_bak(bak)
        ovr_poly = p_ovr(ovr)

        return sig_poly, bak_poly, ovr_poly

    def __call__(self, audio, sampling_rate, is_personalized_MOS):
        """
        Compute DNSMOS scores for an audio signal.

        Args:
            audio (np.ndarray or str): Input audio signal or path to audio file.
            sampling_rate (int): Sampling rate of the input audio.
            is_personalized_MOS (bool): Flag for personalized MOS.

        Returns:
            dict: Dictionary containing MOS scores.

        Raises:
            ValueError: If the input audio is not valid.
        """
        fs = SAMPLING_RATE
        if isinstance(audio, str):
            audio, _ = librosa.load(audio, sr=fs)
        elif sampling_rate != fs:
            # resample audio
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=fs)

        actual_audio_len = len(audio)

        len_samples = int(INPUT_LENGTH * fs)
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs
        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []

        for idx in range(num_hops):
            audio_seg = audio[
                int(idx * hop_len_samples) : int((idx + INPUT_LENGTH) * hop_len_samples)
            ]
            if len(audio_seg) < len_samples:
                continue
            input_features = np.array(audio_seg).astype("float32")[np.newaxis, :]
            oi = {"input_1": input_features}
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS
            )
            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)

        clip_dict = {
            "filename": "audio_clip",
            "len_in_sec": actual_audio_len / fs,
            "sr": fs,
            "num_hops": num_hops,
            "OVRL_raw": np.mean(predicted_mos_ovr_seg_raw),
            "SIG_raw": np.mean(predicted_mos_sig_seg_raw),
            "BAK_raw": np.mean(predicted_mos_bak_seg_raw),
            "OVRL": np.mean(predicted_mos_ovr_seg),
            "SIG": np.mean(predicted_mos_sig_seg),
            "BAK": np.mean(predicted_mos_bak_seg),
        }
        return clip_dict