"""Gradio demo for mastering style transfer with optional inference-time optimization (ITO)."""

import gradio as gr
import numpy as np
import pyloudnorm as pyln  # used by loudness_normalize(); was referenced but never imported
import soundfile as sf
import torch
import yaml

from config import args
from inference import MasteringStyleTransfer
from utils import download_youtube_audio

mastering_transfer = MasteringStyleTransfer(args)


def denormalize_audio(audio, dtype=np.int16):
    """
    Denormalize the audio from the range [-1, 1] to the full range of the specified dtype.

    Args:
        audio: NumPy array of samples, nominally in [-1, 1].
        dtype: Target dtype, either ``np.int16`` or ``np.float32``.

    Returns:
        For ``np.int16``: samples clipped to [-1, 1] and scaled by 32767.
        For ``np.float32``: the samples cast to float32 (no clipping or scaling).

    Raises:
        ValueError: If ``dtype`` is neither ``np.int16`` nor ``np.float32``.
    """
    if dtype == np.int16:
        audio = np.clip(audio, -1, 1)  # Ensure the input is in the range [-1, 1]
        return (audio * 32767).astype(np.int16)
    elif dtype == np.float32:
        return audio.astype(np.float32)
    else:
        raise ValueError("Unsupported dtype. Use np.int16 or np.float32.")


def loudness_normalize(audio, sample_rate, target_loudness=-12.0):
    """
    Normalize ``audio`` to ``target_loudness`` using a pyloudnorm BS.1770 meter.

    Args:
        audio: NumPy array of samples. A 1-D (mono) array is reshaped to
            ``(samples, 1)``; other arrays are passed to the meter as-is.
        sample_rate: Sampling rate used to construct the meter.
        target_loudness: Loudness passed to ``pyln.normalize.loudness`` as the target.

    Returns:
        The loudness-normalized float32 array, or the (float32) input unchanged
        when the measured loudness is not finite.
    """
    # Ensure audio is float32
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)

    # If audio is mono, reshape to (samples, 1)
    if audio.ndim == 1:
        audio = audio.reshape(-1, 1)

    meter = pyln.Meter(sample_rate)  # create BS.1770 meter
    loudness = meter.integrated_loudness(audio)

    # A non-finite measurement (e.g. -inf for silent input) would turn into an
    # infinite gain and NaN samples, so leave such audio untouched.
    if not np.isfinite(loudness):
        return audio

    return pyln.normalize.loudness(audio, loudness, target_loudness)


def _to_gradio_audio(audio, sample_rate):
    """
    Convert model output into a loudness-normalized int16 ``(samples, channels)`` array.

    The layout is fixed *before* loudness normalization because
    ``loudness_normalize`` hands the array directly to the loudness meter.

    Args:
        audio: ``torch.Tensor`` or NumPy array holding the rendered audio.
        sample_rate: Sampling rate forwarded to ``loudness_normalize``.

    Returns:
        2-D int16 NumPy array.

    Raises:
        ValueError: If the audio cannot be reduced to one or two dimensions.
    """
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().numpy()

    # Drop singleton axes (e.g. a leading batch axis) from higher-rank outputs.
    if audio.ndim > 2:
        audio = audio.squeeze()

    if audio.ndim == 1:
        # Mono input, or a squeeze that collapsed everything but the sample axis.
        audio = audio.reshape(-1, 1)
    elif audio.ndim != 2:
        raise ValueError(f"Cannot convert audio with shape {audio.shape} to (samples, channels).")
    elif audio.shape[1] > audio.shape[0]:
        # Ensure the audio is in the correct shape (samples, channels)
        audio = audio.transpose(1, 0)

    audio = loudness_normalize(audio, sample_rate)
    return denormalize_audio(audio, dtype=np.int16)


def process_audio(input_audio, reference_audio):
    """
    Run mastering style transfer (without ITO) and format the result for the UI.

    Args:
        input_audio: Value of the "Input Audio" component.
        reference_audio: Value of the "Reference Audio" component.

    Returns:
        ``((sample_rate, int16_audio), param_output)`` for the output audio
        component and the predicted-parameters textbox.
    """
    output_audio, predicted_params, _, _, _, sr = mastering_transfer.process_audio(
        input_audio, reference_audio, reference_audio, {}, False
    )

    param_output = mastering_transfer.get_param_output_string(predicted_params)
    output_audio = _to_gradio_audio(output_audio, sr)

    print(output_audio.shape)
    print(param_output)

    return (sr, output_audio), param_output


def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
    """
    Run inference-time optimization, yielding the intermediate result of every step.

    Args:
        input_audio: Value of the "Input Audio" component.
        reference_audio: Value of the "Reference Audio" component.
        ito_reference_audio: Optional separate ITO reference; falls back to
            ``reference_audio`` when ``None``.
        num_steps: Number of optimization steps.
        optimizer: Optimizer name placed in the ITO config.
        learning_rate: Learning rate placed in the ITO config.
        af_weights: Audio-feature loss weights placed in the ITO config.

    Yields:
        ``((sample_rate, int16_audio), param_string, step, accumulated_log)``.
    """
    if ito_reference_audio is None:
        ito_reference_audio = reference_audio

    ito_config = {
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'num_steps': num_steps,
        'af_weights': af_weights,
        'sample_rate': args.sample_rate
    }

    input_tensor = mastering_transfer.preprocess_audio(input_audio, args.sample_rate)
    reference_tensor = mastering_transfer.preprocess_audio(reference_audio, args.sample_rate)
    ito_reference_tensor = mastering_transfer.preprocess_audio(ito_reference_audio, args.sample_rate)

    initial_reference_feature = mastering_transfer.get_reference_embedding(reference_tensor)

    ito_log = ""
    for log_entry, current_output, current_params, step in mastering_transfer.inference_time_optimization(
        input_tensor, ito_reference_tensor, ito_config, initial_reference_feature
    ):
        ito_log += log_entry
        ito_param_output = mastering_transfer.get_param_output_string(current_params)
        current_output = _to_gradio_audio(current_output, args.sample_rate)

        yield (args.sample_rate, current_output), ito_param_output, step, ito_log


with gr.Blocks() as demo:
    gr.Markdown("# Mastering Style Transfer Demo")

    with gr.Tab("Upload Audio"):
        with gr.Row():
            input_audio = gr.Audio(label="Input Audio")
            reference_audio = gr.Audio(label="Reference Audio")

        process_button = gr.Button("Process Mastering Style Transfer")

        with gr.Row():
            output_audio = gr.Audio(label="Output Audio", type='numpy')
            param_output = gr.Textbox(label="Predicted Parameters", lines=10)

        process_button.click(
            process_audio,
            inputs=[input_audio, reference_audio],
            outputs=[output_audio, param_output]
        )

        gr.Markdown("## Inference Time Optimization (ITO)")

        with gr.Row():
            with gr.Column(scale=2):
                ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
                num_steps = gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Steps")
                optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
                learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
                af_weights = gr.Textbox(label="AudioFeatureLoss Weights (comma-separated)", value="0.1,0.001,1.0,1.0,0.1")
                ito_button = gr.Button("Perform ITO")

                ito_output_audio = gr.Audio(label="ITO Output Audio")
                ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
                ito_steps_taken = gr.Number(label="ITO Steps Taken")

            with gr.Column(scale=1):
                ito_log = gr.Textbox(label="ITO Log", lines=30)

        def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
            """Parse the weight string, run ITO to completion and return only the final step's results."""
            af_weights = [float(w.strip()) for w in af_weights.split(',')]

            ito_generator = perform_ito(
                input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights
            )

            # Initialize variables to store the final results
            final_audio = None
            final_params = None
            final_steps = 0
            final_log = ""

            # Iterate through the generator to get the final results
            for audio, params, steps, log in ito_generator:
                final_audio = audio
                final_params = params
                final_steps = steps
                final_log = log

            return final_audio, final_params, final_steps, final_log

        ito_button.click(
            run_ito,
            inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
            outputs=[ito_output_audio, ito_param_output, ito_steps_taken, ito_log]
        )

demo.launch()


# ---------------------------------------------------------------------------
# Commented-out alternative version of this app (single "Process" button with
# an ITO checkbox, plus a "YouTube URLs" tab). Not executed.
# ---------------------------------------------------------------------------

# import gradio as gr
# import torch
# import soundfile as sf
# import numpy as np
# import yaml
# from inference import MasteringStyleTransfer
# from utils import download_youtube_audio
# from config import args

# mastering_transfer = MasteringStyleTransfer(args)

# def process_audio(input_audio, reference_audio, perform_ito, ito_reference_audio=None):
#     # Process the audio files
#     output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
#         input_audio, reference_audio, ito_reference_audio if ito_reference_audio else reference_audio, {}, perform_ito
#     )
#
#     # Generate parameter output strings
#     param_output = mastering_transfer.get_param_output_string(predicted_params)
#     ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
#
#     # Generate top 10 differences if ITO was performed
#     top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
#
#     return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log

# def process_with_ito(input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio):
#     ito_ref = reference_audio if use_same_reference else ito_reference_audio
#     return process_audio(input_audio, reference_audio, perform_ito, ito_ref)

# def process_youtube_with_ito(input_url, reference_url, perform_ito, use_same_reference, ito_reference_url):
#     input_audio = download_youtube_audio(input_url)
#     reference_audio = download_youtube_audio(reference_url)
#     ito_ref = reference_audio if use_same_reference else download_youtube_audio(ito_reference_url)
#
#     output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
#         input_audio, reference_audio, ito_ref, {}, perform_ito, log_ito=True
#     )
#
#     param_output = mastering_transfer.get_param_output_string(predicted_params)
#     ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
#     top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
#
#     return "output_mastered_yt.wav", "ito_output_mastered_yt.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log

# with gr.Blocks() as demo:
#     gr.Markdown("# Mastering Style Transfer Demo")
#
#     with gr.Tab("Upload Audio"):
#         input_audio = gr.Audio(label="Input Audio")
#         reference_audio = gr.Audio(label="Reference Audio")
#         perform_ito = gr.Checkbox(label="Perform ITO")
#
#         with gr.Column(visible=False) as ito_options:
#             use_same_reference = gr.Checkbox(label="Use same reference audio for ITO", value=True)
#             ito_reference_audio = gr.Audio(label="ITO Reference Audio", visible=False)
#
#         def update_ito_options(perform_ito):
#             return gr.Column.update(visible=perform_ito)
#
#         def update_ito_reference(use_same):
#             return gr.Audio.update(visible=not use_same)
#
#         perform_ito.change(fn=update_ito_options, inputs=perform_ito, outputs=ito_options)
#         use_same_reference.change(fn=update_ito_reference, inputs=use_same_reference, outputs=ito_reference_audio)
#
#         submit_button = gr.Button("Process")
#         output_audio = gr.Audio(label="Output Audio")
#         ito_output_audio = gr.Audio(label="ITO Output Audio")
#         param_output = gr.Textbox(label="Predicted Parameters", lines=10)
#         ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
#         top_10_diff = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
#         ito_log = gr.Textbox(label="ITO Log", lines=20)
#
#         submit_button.click(
#             process_with_ito,
#             inputs=[input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio],
#             outputs=[output_audio, ito_output_audio, param_output, ito_param_output, top_10_diff, ito_log]
#         )
#
#     with gr.Tab("YouTube URLs"):
#         input_url = gr.Textbox(label="Input YouTube URL")
#         reference_url = gr.Textbox(label="Reference YouTube URL")
#         perform_ito_yt = gr.Checkbox(label="Perform ITO")
#
#         with gr.Column(visible=False) as ito_options_yt:
#             use_same_reference_yt = gr.Checkbox(label="Use same reference audio for ITO", value=True)
#             ito_reference_url = gr.Textbox(label="ITO Reference YouTube URL", visible=False)
#
#         def update_ito_options_yt(perform_ito):
#             return gr.Column.update(visible=perform_ito)
#
#         def update_ito_reference_yt(use_same):
#             return gr.Textbox.update(visible=not use_same)
#
#         perform_ito_yt.change(fn=update_ito_options_yt, inputs=perform_ito_yt, outputs=ito_options_yt)
#         use_same_reference_yt.change(fn=update_ito_reference_yt, inputs=use_same_reference_yt, outputs=ito_reference_url)
#
#         submit_button_yt = gr.Button("Process")
#         output_audio_yt = gr.Audio(label="Output Audio")
#         ito_output_audio_yt = gr.Audio(label="ITO Output Audio")
#         param_output_yt = gr.Textbox(label="Predicted Parameters", lines=10)
#         ito_param_output_yt = gr.Textbox(label="ITO Predicted Parameters", lines=10)
#         top_10_diff_yt = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
#         ito_log_yt = gr.Textbox(label="ITO Log", lines=20)
#
#         submit_button_yt.click(
#             process_youtube_with_ito,
#             inputs=[input_url, reference_url, perform_ito_yt, use_same_reference_yt, ito_reference_url],
#             outputs=[output_audio_yt, ito_output_audio_yt, param_output_yt, ito_param_output_yt, top_10_diff_yt, ito_log_yt]
#         )
#
# demo.launch()