|
import librosa |
|
import numpy as np |
|
from typing import Dict, Tuple |
|
|
|
class AudioProcessor:
    """Load an audio file and derive summary MFCC, pitch and energy features.

    Attributes are plain configuration values read by the extraction methods:
    ``sample_rate`` (Hz the audio is loaded at and analysed with), ``n_mfcc``
    (number of MFCC coefficients) and ``n_mels`` (stored for callers; not
    used by the methods in this class).
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        n_mfcc: int = 13,
        n_mels: int = 128,
    ):
        """Store the analysis configuration.

        Args:
            sample_rate: Sampling rate passed to ``librosa.load`` and to the
                feature extractors.
            n_mfcc: Number of MFCC coefficients to compute.
            n_mels: Number of mel bands; kept as an attribute only.
        """
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels

    def process_audio(self, audio_path: str) -> Tuple[np.ndarray, Dict]:
        """Load ``audio_path`` and compute all features for it.

        Args:
            audio_path: Path of the audio file handed to ``librosa.load``.

        Returns:
            A ``(waveform, features)`` tuple where ``features`` has the keys
            ``'mfcc'``, ``'pitch'`` and ``'energy'``.
        """
        # The rate returned by librosa is the one we asked for, so it is
        # not needed again.
        waveform, _ = librosa.load(audio_path, sr=self.sample_rate)

        features = {
            'mfcc': self._extract_mfcc(waveform),
            'pitch': self._extract_pitch(waveform),
            'energy': self._extract_energy(waveform),
        }
        return waveform, features

    def _extract_mfcc(self, waveform: np.ndarray) -> np.ndarray:
        """Return the MFCC matrix of ``waveform`` averaged along axis 1."""
        mfccs = librosa.feature.mfcc(
            y=waveform,
            sr=self.sample_rate,
            n_mfcc=self.n_mfcc,
        )
        return mfccs.mean(axis=1)

    def _extract_pitch(self, waveform: np.ndarray) -> Dict:
        """Estimate f0 with ``librosa.pyin`` (C2..C7) and summarise it.

        Returns:
            Dict with ``'mean'``, ``'std'``, ``'max'`` and ``'min'`` of the
            f0 track, ignoring NaN entries. All four values are NaN when the
            track contains no non-NaN frame.
        """
        # Only the f0 track is summarised; the voicing outputs are unused.
        f0, _, _ = librosa.pyin(
            waveform,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate,
        )
        return self._summarize_pitch(f0)

    @staticmethod
    def _summarize_pitch(f0: np.ndarray) -> Dict:
        """Return NaN-ignoring mean/std/max/min of an f0 track as floats.

        An empty or all-NaN track yields NaN for every statistic instead of
        triggering NumPy's "All-NaN slice" warnings (or the ValueError that
        ``np.nanmax`` raises on an empty array).
        """
        # ``all()`` of an empty array is True, so this also covers size 0.
        if np.isnan(f0).all():
            nan = float('nan')
            return {'mean': nan, 'std': nan, 'max': nan, 'min': nan}

        return {
            'mean': float(np.nanmean(f0)),
            'std': float(np.nanstd(f0)),
            'max': float(np.nanmax(f0)),
            'min': float(np.nanmin(f0)),
        }

    def _extract_energy(self, waveform: np.ndarray) -> Dict:
        """Return mean/std/max/min of the RMS energy of ``waveform``."""
        # librosa returns a 2-D array; row 0 holds the RMS values.
        rms = librosa.feature.rms(y=waveform)[0]

        return {
            'mean': float(np.mean(rms)),
            'std': float(np.std(rms)),
            'max': float(np.max(rms)),
            'min': float(np.min(rms)),
        }