"""Gradio app for 48 kHz audio super-resolution with AudioSR."""

import os
import tempfile
from typing import List

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import spaces
from audiosr import super_resolution
from audiosr_utils import load_audiosr

audiosr_model = load_audiosr()


def split_audio_to_chunks(y, sr=48000, chunk_duration=5.12) -> List[str]:
    """Split audio into fixed-length chunks and write each to a temporary WAV file."""
    # Number of samples per chunk
    chunk_samples = int(chunk_duration * sr)
    # Split the audio into chunks (the last chunk may be shorter)
    chunks = [y[i : i + chunk_samples] for i in range(0, len(y), chunk_samples)]
    # Save each chunk to its own temporary file
    temp_files = []
    for chunk in chunks:
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_file.close()  # close the handle; soundfile reopens the path itself
        sf.write(temp_file.name, chunk, sr)
        temp_files.append(temp_file.name)
    return temp_files


@spaces.GPU(duration=180)
def run_audiosr(
    chunks: List[str], guidance_scale: float, ddim_steps: int
) -> np.ndarray:
    """Super-resolve each chunk with AudioSR and concatenate the results."""
    waveforms = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}")
        waveform = super_resolution(
            audiosr_model,
            chunk,
            guidance_scale=guidance_scale,
            ddim_steps=ddim_steps,
        )
        waveforms.append(waveform)
    waveform = np.concatenate(waveforms, axis=-1)  # (1, 1, N)
    return waveform.squeeze()


def audiosr_infer(audio: str) -> str:
    """Run AudioSR on an audio file and return the path to an MP3 of the result."""
    guidance_scale = 3.5
    ddim_steps = 100
    y, sr = librosa.load(audio, sr=48000)
    if len(y) > 60 * sr:
        y = y[: 60 * sr]
        gr.Info("Audio is too long; only the first 60 seconds will be processed")
    chunk_files = split_audio_to_chunks(y, sr=sr, chunk_duration=5.12)
    print(f"Split audio chunks: {chunk_files}")
    waveform = run_audiosr(chunk_files, guidance_scale, ddim_steps)
    sr = 48000  # AudioSR outputs 48 kHz audio
    for chunk_file in chunk_files:
        os.remove(chunk_file)
    out_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    out_file.close()
    sf.write(out_file.name, waveform, sr)
    return out_file.name


models = {
    "AudioSR": audiosr_infer,
}


def infer(audio: str, model: str, sr: int) -> str:
    """Optionally resample the input, then dispatch to the selected model."""
    if sr > 0:
        # Resample the audio and save it to a temporary WAV file
        y, _ = librosa.load(audio, sr=sr)
        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_file.close()
        sf.write(temp_file.name, y, sr)
        return models[model](temp_file.name)
    return models[model](audio)


with gr.Blocks() as app:
    with open(os.path.join(os.path.dirname(__file__), "README.md"), "r") as f:
        README = f.read()
    # Remove the YAML front matter (the text between the first two "---" markers)
    if README.startswith("---"):
        blocks = README.split("---")
        README = "---".join(blocks[2:])
    gr.Markdown(README)
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Upload an audio file")
            audio = gr.Audio(label="Upload an audio file", type="filepath")
            sr = gr.Slider(
                value=0,
                label="Resample audio to this rate before inference (0 = no resampling)",
                minimum=0,
                maximum=48000,
                step=1000,
            )
            with gr.Row():
                model = gr.Radio(
                    label="Select a model",
                    choices=list(models.keys()),
                    value="AudioSR",
                )
                btn = gr.Button("Infer")
    with gr.Row():
        with gr.Column():
            out = gr.Audio(
                label="Output", format="mp3", type="filepath", interactive=False
            )
    btn.click(
        fn=infer,
        inputs=[audio, model, sr],
        outputs=[out],
        api_name="infer",
    )

app.launch(show_error=True)
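
# A minimal client-side usage sketch (not part of this app, only an illustration):
# assuming the app is running locally on Gradio's default port and a recent
# `gradio_client` package is installed, the `/infer` endpoint exposed above via
# `api_name="infer"` could be called like this ("input.wav" is a placeholder path):
#
#     from gradio_client import Client, handle_file
#
#     client = Client("http://127.0.0.1:7860/")
#     result = client.predict(handle_file("input.wav"), "AudioSR", 0, api_name="/infer")
#     print(result)  # path to the super-resolved MP3 returned by the app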