import json
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple

import ffmpy
import numpy as np
import torch


def r128stats(filepath: str, quiet: bool):
    """Takes a path to an audio file, returns a dict with the loudness
    stats computed by the ffmpeg ebur128 filter.

    Parameters
    ----------
    filepath : str
        Path to compute loudness stats on.
    quiet : bool
        Whether to show FFMPEG output during computation.

    Returns
    -------
    dict
        Dictionary containing loudness stats.
    """
    ffargs = [
        "ffmpeg",
        "-nostats",
        "-i",
        filepath,
        "-filter_complex",
        "ebur128",
        "-f",
        "null",
        "-",
    ]
    if quiet:
        ffargs += ["-hide_banner"]
    proc = subprocess.Popen(ffargs, stderr=subprocess.PIPE, universal_newlines=True)
    stats = proc.communicate()[1]

    # ffmpeg prints the ebur128 report to stderr; parse the final "Summary:" block.
    summary_index = stats.rfind("Summary:")
    summary_list = stats[summary_index:].split()
    i_lufs = float(summary_list[summary_list.index("I:") + 1])
    i_thresh = float(summary_list[summary_list.index("I:") + 4])
    lra = float(summary_list[summary_list.index("LRA:") + 1])
    lra_thresh = float(summary_list[summary_list.index("LRA:") + 4])
    lra_low = float(summary_list[summary_list.index("low:") + 1])
    lra_high = float(summary_list[summary_list.index("high:") + 1])

    stats_dict = {
        "I": i_lufs,
        "I Threshold": i_thresh,
        "LRA": lra,
        "LRA Threshold": lra_thresh,
        "LRA Low": lra_low,
        "LRA High": lra_high,
    }
    return stats_dict
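
# Usage sketch (illustrative only; "speech.wav" is a hypothetical file and the
# ffmpeg binary must be on PATH):
#
#     stats = r128stats("speech.wav", quiet=True)
#     print(stats["I"], stats["LRA"])  # integrated loudness (LUFS), loudness range (LU)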


def ffprobe_offset_and_codec(path: str) -> Tuple[float, str]:
    """Given a path to a file, returns the start time offset and codec of
    the first audio stream.
    """
    ff = ffmpy.FFprobe(
        inputs={path: None},
        global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
    )
    streams = json.loads(ff.run(stdout=subprocess.PIPE)[0])["streams"]

    seconds_offset = 0.0
    codec = None

    # Get the offset and codec of the first audio stream we find
    # and return its start time, if it has one.
    for stream in streams:
        if stream["codec_type"] == "audio":
            seconds_offset = stream.get("start_time", 0.0)
            codec = stream.get("codec_name")
            break

    return float(seconds_offset), codec
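
# Usage sketch (illustrative; "clip.mp4" is a hypothetical media file and the
# ffprobe binary must be on PATH):
#
#     offset, codec = ffprobe_offset_and_codec("clip.mp4")
#     print(offset, codec)  # start-time offset (s) of the first audio stream and its codec name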


class FFMPEGMixin:
    _loudness = None

    def ffmpeg_loudness(self, quiet: bool = True):
        """Computes loudness of audio file using FFMPEG.

        Parameters
        ----------
        quiet : bool, optional
            Whether to show FFMPEG output during computation,
            by default True

        Returns
        -------
        torch.Tensor
            Loudness of every item in the batch, computed via
            FFMPEG.
        """
        loudness = []
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            # Write each item in the batch to a temporary wav file and measure
            # its integrated loudness with the ebur128 filter.
            for i in range(self.batch_size):
                self[i].write(f.name)
                loudness_stats = r128stats(f.name, quiet=quiet)
                loudness.append(loudness_stats["I"])

        self._loudness = torch.from_numpy(np.array(loudness)).float()
        return self.loudness()
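
    # Usage sketch (illustrative; assumes this mixin is part of an AudioSignal
    # class and that "speech.wav" is a hypothetical file):
    #
    #     signal = AudioSignal("speech.wav")
    #     lufs = signal.ffmpeg_loudness()  # tensor with one loudness value per batch item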

    def ffmpeg_resample(self, sample_rate: int, quiet: bool = True):
        """Resamples AudioSignal using FFMPEG. More memory-efficient
        than using julius.resample for long audio files.

        Parameters
        ----------
        sample_rate : int
            Sample rate to resample to.
        quiet : bool, optional
            Whether to show FFMPEG output during computation,
            by default True

        Returns
        -------
        AudioSignal
            Resampled AudioSignal.
        """
        from audiotools import AudioSignal

        if sample_rate == self.sample_rate:
            return self

        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            self.write(f.name)
            f_out = f.name.replace("wav", "rs.wav")
            command = f"ffmpeg -i {f.name} -ar {sample_rate} {f_out}"
            if quiet:
                command += " -hide_banner -loglevel error"

            subprocess.check_call(shlex.split(command))

            resampled = AudioSignal(f_out)
            Path.unlink(Path(f_out))
        return resampled
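
    # Usage sketch (illustrative; 16000 Hz is an arbitrary target rate):
    #
    #     resampled = AudioSignal("speech.wav").ffmpeg_resample(16000)
    #     assert resampled.sample_rate == 16000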

    @classmethod
    def load_from_file_with_ffmpeg(cls, audio_path: str, quiet: bool = True, **kwargs):
        """Loads AudioSignal object after decoding it to a wav file using FFMPEG.
        Useful for loading audio that isn't covered by librosa's loading mechanism. Also
        useful for loading mp3 files, without any offset.

        Parameters
        ----------
        audio_path : str
            Path to load AudioSignal from.
        quiet : bool, optional
            Whether to show FFMPEG output during computation,
            by default True

        Returns
        -------
        AudioSignal
            AudioSignal loaded from file with FFMPEG.
        """
        audio_path = str(audio_path)
        with tempfile.TemporaryDirectory() as d:
            wav_file = str(Path(d) / "extracted.wav")
            padded_wav = str(Path(d) / "padded.wav")

            global_options = "-y"
            if quiet:
                global_options += " -loglevel error"

            ff = ffmpy.FFmpeg(
                inputs={audio_path: None},
                outputs={wav_file: None},
                global_options=global_options,
            )
            ff.run()

            # We pad the file using the start time offset in case it's an audio
            # stream starting at some offset in a video container.
            pad, codec = ffprobe_offset_and_codec(audio_path)
            # For mp3s, don't pad files with discrepancies less than 0.027s -
            # it's likely due to codec latency. The latency introduced by mp3
            # encoding is 1152 samples, which is about 0.0261s at 44.1 kHz, so
            # we set the threshold here slightly above that.
            # Source: https://lame.sourceforge.io/tech-FAQ.txt.
            if codec == "mp3" and pad < 0.027:
                pad = 0.0

            ff = ffmpy.FFmpeg(
                inputs={wav_file: None},
                outputs={padded_wav: f"-af 'adelay={pad*1000}:all=true'"},
                global_options=global_options,
            )
            ff.run()

            signal = cls(padded_wav, **kwargs)

        return signal
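
    # Usage sketch (illustrative; "video.mp4" is a hypothetical container holding
    # an audio stream, and AudioSignal is assumed to inherit this mixin):
    #
    #     signal = AudioSignal.load_from_file_with_ffmpeg("video.mp4")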