from typing import Dict, Tuple

import librosa
import numpy as np


class AudioProcessor:
    """Load an audio file and reduce it to a few summary acoustic features.

    The processor resamples audio to ``sample_rate`` on load and computes
    time-averaged MFCCs, fundamental-frequency (pitch) statistics, and RMS
    energy statistics.

    Args:
        sample_rate: Rate (Hz) the audio is resampled to on load and that is
            passed to the feature extractors.
        n_mfcc: Number of MFCC coefficients to compute.
        n_mels: Number of mel bands used by the mel spectrogram underlying
            the MFCC computation.
    """

    # Keys shared by the pitch and energy statistic dictionaries.
    _STAT_KEYS = ('mean', 'std', 'max', 'min')

    def __init__(self, sample_rate: int = 16000, n_mfcc: int = 13,
                 n_mels: int = 128):
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels

    def process_audio(self, audio_path: str) -> Tuple[np.ndarray, Dict]:
        """Load ``audio_path`` and extract its features.

        Args:
            audio_path: Path of the audio file, handed to ``librosa.load``.

        Returns:
            A ``(waveform, features)`` tuple. ``features`` has the keys
            ``'mfcc'`` (array of per-coefficient means), ``'pitch'`` and
            ``'energy'`` (each a dict with ``'mean'``, ``'std'``, ``'max'``
            and ``'min'`` floats).
        """
        # librosa.load also returns the effective sample rate; it equals
        # self.sample_rate because we request resampling, so it is discarded.
        waveform, _ = librosa.load(audio_path, sr=self.sample_rate)

        features = {
            'mfcc': self._extract_mfcc(waveform),
            'pitch': self._extract_pitch(waveform),
            'energy': self._extract_energy(waveform),
        }
        return waveform, features

    def _extract_mfcc(self, waveform: np.ndarray) -> np.ndarray:
        """Return the MFCCs of ``waveform`` averaged along axis 1.

        ``n_mels`` is forwarded to the underlying mel spectrogram so that the
        configured value actually takes effect; the default of 128 matches
        librosa's own default, so default behaviour is unchanged.
        """
        mfccs = librosa.feature.mfcc(
            y=waveform,
            sr=self.sample_rate,
            n_mfcc=self.n_mfcc,
            n_mels=self.n_mels,
        )
        return mfccs.mean(axis=1)

    def _extract_pitch(self, waveform: np.ndarray) -> Dict:
        """Return mean/std/max/min of the pYIN pitch track of ``waveform``.

        Frames for which pYIN reports no pitch are NaN and are ignored by the
        NaN-aware reductions. When no frame has a pitch at all, every
        statistic is NaN; this case is handled explicitly so numpy does not
        emit "All-NaN slice" / "Mean of empty slice" RuntimeWarnings.
        """
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            y=waveform,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate,
        )

        # Also true for an empty track, where np.nanmax would raise.
        if np.isnan(f0).all():
            return dict.fromkeys(self._STAT_KEYS, float('nan'))

        return {
            'mean': float(np.nanmean(f0)),
            'std': float(np.nanstd(f0)),
            'max': float(np.nanmax(f0)),
            'min': float(np.nanmin(f0)),
        }

    def _extract_energy(self, waveform: np.ndarray) -> Dict:
        """Return mean/std/max/min of the frame-wise RMS energy."""
        # librosa.feature.rms returns a 2-D array; the first row is the
        # per-frame RMS series.
        rms = librosa.feature.rms(y=waveform)[0]
        return {
            'mean': float(np.mean(rms)),
            'std': float(np.std(rms)),
            'max': float(np.max(rms)),
            'min': float(np.min(rms)),
        }