Spaces:

amphion
/

maskgct

Running on Zero

App Files Files Community

maskgct / evaluation /metrics /energy /energy_pearson_coefficients.py

Hecheng0625

Upload 167 files

8c92a11 verified 15 days ago

raw

history blame

3.17 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import math
	import librosa
	import torch

	import numpy as np
	from numpy import linalg as LA

	from torchmetrics import PearsonCorrCoef


	def _frame_energy(audio, n_fft, hop_length, win_length, db_scale):
	    """Return the per-frame spectral energy of a waveform as a 1-D array.

	    The energy of a frame is the L2 norm of its STFT magnitude spectrum.
	    When ``db_scale`` is truthy the result is ``20 * log10(energy)``.
	    """
	    spec = librosa.stft(
	        y=audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length
	    )
	    # After the transpose each row is one frame, so the norm over axis 1
	    # collapses the frequency bins and leaves one value per frame.
	    energy = LA.norm(np.abs(spec).T, axis=1)

	    if db_scale:
	        # Floor the energy before taking the log: an all-zero frame would
	        # otherwise become -inf and turn the alignment cost and the final
	        # correlation into NaN.
	        floor = np.finfo(energy.dtype).eps
	        energy = 20 * np.log10(np.maximum(energy, floor))

	    return energy


	def extract_energy_pearson_coeffcients(
	    audio_ref,
	    audio_deg,
	    n_fft=1024,
	    hop_length=256,
	    win_length=1024,
	    **kwargs,
	):
	    """Compute the Pearson correlation between the frame energies of two audios.

	    Args:
	        audio_ref: path to the ground truth audio.
	        audio_deg: path to the predicted audio.
	        n_fft: fft size.
	        hop_length: hop length.
	        win_length: window length.
	        **kwargs: evaluation options. They are read from the dict passed as
	            the ``kwargs`` keyword (``kwargs={"fs": ..., ...}``) when it is
	            present, otherwise from the keyword arguments themselves.
	            Required keys:

	            fs: sampling rate used to load both audios; if None, the
	                ``librosa.load`` default is used.
	            method: "dtw" aligns the two energy sequences with the dtw
	                algorithm; "cut" truncates both to the length of the
	                shorter one. Any other value performs no alignment.
	            db_scale: if truthy, both energy sequences are converted to
	                dB before alignment and correlation.

	    Returns:
	        The Pearson correlation coefficient computed by torchmetrics'
	        ``PearsonCorrCoef``, converted to a plain Python value.

	    Raises:
	        KeyError: if ``fs``, ``method`` or ``db_scale`` is missing.
	        ValueError: if the two energy sequences still differ in length
	            after the alignment step (e.g. unsupported ``method``).
	    """
	    # Load hyperparameters. Existing callers wrap them in a single dict
	    # under the "kwargs" keyword; flat keyword options are accepted as well.
	    options = kwargs.get("kwargs", kwargs)
	    fs = options["fs"]
	    method = options["method"]
	    db_scale = options["db_scale"]

	    # Load audio. `sr` is forwarded only when a rate was requested: omitting
	    # it keeps librosa's default rate, which is not the same as passing
	    # sr=None explicitly.
	    load_kwargs = {} if fs is None else {"sr": fs}
	    audio_ref, _ = librosa.load(audio_ref, **load_kwargs)
	    audio_deg, _ = librosa.load(audio_deg, **load_kwargs)

	    # Per-frame energy contours (optionally in dB)
	    energy_ref = _frame_energy(audio_ref, n_fft, hop_length, win_length, db_scale)
	    energy_deg = _frame_energy(audio_deg, n_fft, hop_length, win_length, db_scale)

	    # Audio length alignment
	    if method == "cut":
	        length = min(len(energy_ref), len(energy_deg))
	        energy_ref = energy_ref[:length]
	        energy_deg = energy_deg[:length]
	    elif method == "dtw":
	        _, wp = librosa.sequence.dtw(energy_ref, energy_deg, backtrack=True)
	        # Each row of the warping path is a (ref_index, deg_index) pair;
	        # gather both sequences along the path in one indexing step.
	        energy_ref = energy_ref[wp[:, 0]]
	        energy_deg = energy_deg[wp[:, 1]]

	    # Raised explicitly (instead of asserted) so the check survives `-O`.
	    if len(energy_ref) != len(energy_deg):
	        raise ValueError(
	            "Energy sequences differ in length "
	            f"({len(energy_ref)} vs {len(energy_deg)}) after alignment with "
	            f"method={method!r}; use 'cut' or 'dtw'."
	        )

	    # Convert to tensor and run the metric on the GPU when one is available
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    energy_ref = torch.from_numpy(energy_ref).to(device)
	    energy_deg = torch.from_numpy(energy_deg).to(device)
	    pearson = PearsonCorrCoef().to(device)

	    return pearson(energy_ref, energy_deg).detach().cpu().numpy().tolist()