"""Audio helpers: convert between pydub AudioSegments and NumPy arrays, and
apply prosody adjustments (rate, volume, pitch) and peak normalization."""

import sys
from io import BytesIO

import numpy as np
import soundfile as sf
from pydub import AudioSegment, effects

# Note: pyrubberband shells out to the Rubber Band command-line tool, so the
# `rubberband` binary must be installed and on PATH for time/pitch changes.
import pyrubberband as pyrb

INT16_MAX = np.iinfo(np.int16).max

def audio_to_int16(audio_data: np.ndarray) -> np.ndarray:
    """Convert floating-point audio in [-1.0, 1.0] to int16 PCM."""
    # np.issubdtype matches every float dtype (float16/32/64, plus float128
    # where the platform provides it); referencing np.float128 directly raises
    # AttributeError on platforms that lack it, such as Windows.
    if np.issubdtype(audio_data.dtype, np.floating):
        # Clip first so out-of-range samples saturate instead of wrapping.
        audio_data = (np.clip(audio_data, -1.0, 1.0) * INT16_MAX).astype(np.int16)
    return audio_data

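# Example (sketch): quantize a synthetic float32 sine for a 16-bit consumer.
#
#     tone = np.sin(np.linspace(0, 2 * np.pi * 440, 24000, dtype=np.float32))
#     pcm16 = audio_to_int16(tone)  # dtype int16, same length
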
def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
    """
    Converts a pydub AudioSegment into an np.float32 array of shape
    [n_samples, channels] for multi-channel audio, or [n_samples] for mono,
    where each value is in the range [-1.0, 1.0].

    Returns the tuple (sample_rate, audio_np_array).
    """
    nd_array = np.array(audio.get_array_of_samples(), dtype=np.float32)
    if audio.channels != 1:
        nd_array = nd_array.reshape((-1, audio.channels))
    # Scale by the maximum magnitude of the source sample width
    # (e.g. 2**15 for 16-bit audio) to normalize into [-1.0, 1.0].
    nd_array = nd_array / (1 << (8 * audio.sample_width - 1))

    return (
        audio.frame_rate,
        nd_array,
    )

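# Example (sketch, "input.wav" is a hypothetical path):
#
#     sr, samples = pydub_to_np(AudioSegment.from_wav("input.wav"))
#     print(sr, samples.dtype)  # e.g. 44100 float32
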
def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
    """
    Converts a pydub AudioSegment into a flat (1-D) np.float32 array, as
    librosa-style code expects, where each value is in the range [-1.0, 1.0].
    Channels are interleaved if the segment is not mono.
    """
    channel_sounds = audiosegment.split_to_mono()
    samples = [s.get_array_of_samples() for s in channel_sounds]

    # Stack channels as columns, normalize by the integer type's max value,
    # then flatten to a single interleaved 1-D array.
    fp_arr = np.array(samples).T.astype(np.float32)
    fp_arr /= np.iinfo(samples[0].typecode).max
    fp_arr = fp_arr.reshape(-1)

    return fp_arr

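# Example (sketch, "input.wav" is a hypothetical path):
#
#     y = audiosegment_to_librosawav(AudioSegment.from_wav("input.wav"))
#     y.ndim  # 1
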
def ndarray_to_segment(
    ndarray: np.ndarray,
    frame_rate: int,
    sample_width: int | None = None,
    channels: int | None = None,
) -> AudioSegment:
    # Serialize the array as 16-bit PCM WAV in memory, then reload with pydub.
    buffer = BytesIO()
    sf.write(buffer, ndarray, frame_rate, format="wav", subtype="PCM_16")
    buffer.seek(0)
    sound: AudioSegment = AudioSegment.from_wav(buffer)

    if sample_width is None:
        sample_width = sound.sample_width
    if channels is None:
        channels = sound.channels

    return (
        sound.set_frame_rate(frame_rate)
        .set_sample_width(sample_width)
        .set_channels(channels)
    )

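# Example (sketch): round-trip a float array back into a pydub segment.
#
#     sr, samples = pydub_to_np(seg)
#     seg2 = ndarray_to_segment(samples, sr)
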
def apply_prosody_to_audio_segment(
    audio_segment: AudioSegment,
    rate: float = 1,
    volume: float = 0,
    pitch: int = 0,
    sr: int = 24000,
) -> AudioSegment:
    # Convert to a float array, apply the prosody changes, then convert back,
    # preserving the original sample width and channel count. Note that
    # audiosegment_to_librosawav flattens channels, so multi-channel input is
    # effectively processed as interleaved mono here.
    audio_data = audiosegment_to_librosawav(audio_segment)

    audio_data = apply_prosody_to_audio_data(audio_data, rate, volume, pitch, sr)

    return ndarray_to_segment(
        audio_data, sr, audio_segment.sample_width, audio_segment.channels
    )

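# Example (sketch, "input.mp3" is a hypothetical path): 20% faster and two
# semitones higher.
#
#     seg = AudioSegment.from_mp3("input.mp3")
#     out = apply_prosody_to_audio_segment(seg, rate=1.2, pitch=2)
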
def apply_prosody_to_audio_data(
    audio_data: np.ndarray,
    rate: float = 1,
    volume: float = 0,
    pitch: float = 0,
    sr: int = 24000,
) -> np.ndarray:
    # rate: time-stretch factor (>1 is faster); 1 leaves timing unchanged.
    if rate != 1:
        audio_data = pyrb.time_stretch(audio_data, sr=sr, rate=rate)

    # volume: linear gain multiplier; the default 0 is treated as "unchanged"
    # rather than silence, so gain is only applied when volume != 0.
    if volume != 0:
        audio_data = audio_data * volume

    # pitch: shift in semitones; 0 leaves pitch unchanged.
    if pitch != 0:
        audio_data = pyrb.pitch_shift(audio_data, sr=sr, n_steps=pitch)

    return audio_data

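# Example (sketch): slow a mono float array down to half speed.
#
#     slow = apply_prosody_to_audio_data(y, rate=0.5, sr=sr)
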
def apply_normalize(
    audio_data: np.ndarray,
    headroom: float = 1,
    sr: int = 24000,
) -> tuple[int, np.ndarray]:
    # Peak-normalize via pydub, leaving `headroom` dB below full scale,
    # and return the result as (sample_rate, float32 array).
    segment = ndarray_to_segment(audio_data, sr)
    segment = effects.normalize(seg=segment, headroom=headroom)

    return pydub_to_np(segment)

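# Example (sketch): normalize to 1 dB of headroom.
#
#     sr, normalized = apply_normalize(samples, headroom=1.0, sr=sr)
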
if __name__ == "__main__":
    # Smoke test: render time-stretched and pitch-shifted variants of an input
    # MP3 next to the original. Usage: python <this file> input.mp3
    input_file = sys.argv[1]

    time_stretch_factors = [0.5, 0.75, 1.5, 1.0]
    pitch_shift_factors = [-12, -5, 0, 5, 12]

    input_sound = AudioSegment.from_mp3(input_file)

    for time_factor in time_stretch_factors:
        output_wav = f"{input_file}_time_{time_factor}.wav"
        output_sound = apply_prosody_to_audio_segment(input_sound, rate=time_factor)
        output_sound.export(output_wav, format="wav")

    for pitch_factor in pitch_shift_factors:
        output_wav = f"{input_file}_pitch_{pitch_factor}.wav"
        output_sound = apply_prosody_to_audio_segment(input_sound, pitch=pitch_factor)
        output_sound.export(output_wav, format="wav")
|