jhtonyKoo committed on
Commit
6d6c0d5
1 Parent(s): bc92652

modify app

Browse files
Files changed (2) hide show
  1. app.py +166 -74
  2. inference.py +56 -49
app.py CHANGED
@@ -9,39 +9,40 @@ from config import args
9
 
10
  mastering_transfer = MasteringStyleTransfer(args)
11
 
12
- def process_audio(input_audio, reference_audio, perform_ito, ito_reference_audio=None):
13
- # Process the audio files
14
- output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
15
- input_audio, reference_audio, ito_reference_audio if ito_reference_audio else reference_audio, {}, perform_ito
16
  )
17
 
18
- # Generate parameter output strings
19
  param_output = mastering_transfer.get_param_output_string(predicted_params)
20
- ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
21
 
22
- # Generate top 10 differences if ITO was performed
23
- top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
24
-
25
- return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
26
 
27
- def process_with_ito(input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio):
28
- ito_ref = reference_audio if use_same_reference else ito_reference_audio
29
- return process_audio(input_audio, reference_audio, perform_ito, ito_ref)
30
 
31
- def process_youtube_with_ito(input_url, reference_url, perform_ito, use_same_reference, ito_reference_url):
32
- input_audio = download_youtube_audio(input_url)
33
- reference_audio = download_youtube_audio(reference_url)
34
- ito_ref = reference_audio if use_same_reference else download_youtube_audio(ito_reference_url)
35
-
36
- output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
37
- input_audio, reference_audio, ito_ref, {}, perform_ito, log_ito=True
 
 
 
 
 
 
 
 
 
38
  )
 
 
39
 
40
- param_output = mastering_transfer.get_param_output_string(predicted_params)
41
- ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
42
- top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
43
-
44
- return "output_mastered_yt.wav", "ito_output_mastered_yt.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
45
 
46
 
47
  with gr.Blocks() as demo:
@@ -50,63 +51,154 @@ with gr.Blocks() as demo:
50
  with gr.Tab("Upload Audio"):
51
  input_audio = gr.Audio(label="Input Audio")
52
  reference_audio = gr.Audio(label="Reference Audio")
53
- perform_ito = gr.Checkbox(label="Perform ITO")
54
- with gr.Column(visible=False) as ito_options:
55
- use_same_reference = gr.Checkbox(label="Use same reference audio for ITO", value=True)
56
- ito_reference_audio = gr.Audio(label="ITO Reference Audio", visible=False)
57
-
58
- def update_ito_options(perform_ito):
59
- return gr.Column.update(visible=perform_ito)
60
-
61
- def update_ito_reference(use_same):
62
- return gr.Audio.update(visible=not use_same)
63
-
64
- perform_ito.change(fn=update_ito_options, inputs=perform_ito, outputs=ito_options)
65
- use_same_reference.change(fn=update_ito_reference, inputs=use_same_reference, outputs=ito_reference_audio)
66
-
67
- submit_button = gr.Button("Process")
68
  output_audio = gr.Audio(label="Output Audio")
69
- ito_output_audio = gr.Audio(label="ITO Output Audio")
70
  param_output = gr.Textbox(label="Predicted Parameters", lines=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
72
- top_10_diff = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
73
  ito_log = gr.Textbox(label="ITO Log", lines=20)
74
-
75
- submit_button.click(
76
- process_with_ito,
77
- inputs=[input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio],
78
- outputs=[output_audio, ito_output_audio, param_output, ito_param_output, top_10_diff, ito_log]
 
 
 
 
 
 
 
79
  )
80
 
81
- with gr.Tab("YouTube URLs"):
82
- input_url = gr.Textbox(label="Input YouTube URL")
83
- reference_url = gr.Textbox(label="Reference YouTube URL")
84
- perform_ito_yt = gr.Checkbox(label="Perform ITO")
85
- with gr.Column(visible=False) as ito_options_yt:
86
- use_same_reference_yt = gr.Checkbox(label="Use same reference audio for ITO", value=True)
87
- ito_reference_url = gr.Textbox(label="ITO Reference YouTube URL", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- def update_ito_options_yt(perform_ito):
90
- return gr.Column.update(visible=perform_ito)
91
 
92
- def update_ito_reference_yt(use_same):
93
- return gr.Textbox.update(visible=not use_same)
94
 
95
- perform_ito_yt.change(fn=update_ito_options_yt, inputs=perform_ito_yt, outputs=ito_options_yt)
96
- use_same_reference_yt.change(fn=update_ito_reference_yt, inputs=use_same_reference_yt, outputs=ito_reference_url)
97
 
98
- submit_button_yt = gr.Button("Process")
99
- output_audio_yt = gr.Audio(label="Output Audio")
100
- ito_output_audio_yt = gr.Audio(label="ITO Output Audio")
101
- param_output_yt = gr.Textbox(label="Predicted Parameters", lines=10)
102
- ito_param_output_yt = gr.Textbox(label="ITO Predicted Parameters", lines=10)
103
- top_10_diff_yt = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
104
- ito_log_yt = gr.Textbox(label="ITO Log", lines=20)
105
-
106
- submit_button_yt.click(
107
- process_youtube_with_ito,
108
- inputs=[input_url, reference_url, perform_ito_yt, use_same_reference_yt, ito_reference_url],
109
- outputs=[output_audio_yt, ito_output_audio_yt, param_output_yt, ito_param_output_yt, top_10_diff_yt, ito_log_yt]
110
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- demo.launch()
 
9
 
10
  mastering_transfer = MasteringStyleTransfer(args)
11
 
12
def process_audio(input_audio, reference_audio):
    """Run mastering style transfer on an input/reference pair.

    ``reference_audio`` is passed twice to ``mastering_transfer.process_audio``,
    followed by an empty dict and ``False`` (presumably the ITO config and the
    perform-ITO flag -- confirm against ``MasteringStyleTransfer.process_audio``).

    Returns:
        A 2-tuple: the literal path ``"output_mastered.wav"`` and the text
        produced by ``mastering_transfer.get_param_output_string`` for the
        predicted parameters.
    """
    # Only the second element (predicted parameters) is used, but the call must
    # still yield exactly six values or the unpacking raises ValueError.
    # NOTE(review): the returned path is hard-coded; presumably
    # ``mastering_transfer.process_audio`` writes that file -- confirm.
    _audio, params, _ito_audio, _ito_params, _ito_log, _sr = (
        mastering_transfer.process_audio(
            input_audio,
            reference_audio,
            reference_audio,
            {},
            False,
        )
    )
    return "output_mastered.wav", mastering_transfer.get_param_output_string(params)
 
 
 
20
 
21
def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
    """Run inference-time optimization (ITO) and summarise the result.

    Args:
        input_audio: Audio to process; handed to ``mastering_transfer.preprocess_audio``.
        reference_audio: Reference clip; its embedding (from
            ``get_reference_embedding``) is the starting point of the optimization.
        ito_reference_audio: Clip the optimization is run against; when ``None``
            ``reference_audio`` is used instead.
        num_steps: Stored under ``'num_steps'`` in the ITO config.
        optimizer: Stored under ``'optimizer'`` in the ITO config.
        learning_rate: Stored under ``'learning_rate'`` in the ITO config.
        af_weights: Stored under ``'af_weights'`` in the ITO config.

    Returns:
        A 4-tuple: the literal path ``"ito_output_mastered.wav"``, the parameter
        summary string, the step count reported by the optimizer routine, and
        the ITO log it returned.
    """
    # Fall back to the main reference when no dedicated ITO target was supplied.
    target_audio = reference_audio if ito_reference_audio is None else ito_reference_audio

    settings = dict(
        optimizer=optimizer,
        learning_rate=learning_rate,
        num_steps=num_steps,
        af_weights=af_weights,
        sample_rate=args.sample_rate,
    )

    # All three clips go through the same preprocessing at the configured rate.
    source_tensor, reference_tensor, target_tensor = (
        mastering_transfer.preprocess_audio(clip, args.sample_rate)
        for clip in (input_audio, reference_audio, target_audio)
    )

    seed_embedding = mastering_transfer.get_reference_embedding(reference_tensor)

    # NOTE(review): the optimized audio and embedding are discarded here and a
    # fixed path is returned; nothing in this function writes that file --
    # confirm it is produced elsewhere.
    _audio, best_params, _embedding, steps_taken, ito_log = mastering_transfer.inference_time_optimization(
        source_tensor, target_tensor, settings, seed_embedding
    )

    return (
        "ito_output_mastered.wav",
        mastering_transfer.get_param_output_string(best_params),
        steps_taken,
        ito_log,
    )
 
 
 
 
46
 
47
 
48
  with gr.Blocks() as demo:
 
51
  with gr.Tab("Upload Audio"):
52
  input_audio = gr.Audio(label="Input Audio")
53
  reference_audio = gr.Audio(label="Reference Audio")
54
+ process_button = gr.Button("Process")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  output_audio = gr.Audio(label="Output Audio")
 
56
  param_output = gr.Textbox(label="Predicted Parameters", lines=10)
57
+
58
+ process_button.click(
59
+ process_audio,
60
+ inputs=[input_audio, reference_audio],
61
+ outputs=[output_audio, param_output]
62
+ )
63
+
64
+ gr.Markdown("## Inference Time Optimization (ITO)")
65
+ ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
66
+ num_steps = gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Steps")
67
+ optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
68
+ learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
69
+ af_weights = gr.Textbox(label="AudioFeatureLoss Weights (comma-separated)", value="0.1,0.001,1.0,1.0,0.1")
70
+
71
+ ito_button = gr.Button("Perform ITO")
72
+ ito_output_audio = gr.Audio(label="ITO Output Audio")
73
  ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
74
+ ito_steps_taken = gr.Number(label="ITO Steps Taken")
75
  ito_log = gr.Textbox(label="ITO Log", lines=20)
76
+
77
def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
    """Click handler for the "Perform ITO" button.

    Parses the comma-separated ``af_weights`` text into a list of floats and
    delegates to ``perform_ito``.

    Returns:
        The four values ``perform_ito`` produces (output audio path, parameter
        summary, steps taken, ITO log), one per component in the click
        handler's ``outputs`` list.

    Raises:
        ValueError: if any comma-separated token is not a valid float.
    """
    weights = [float(token.strip()) for token in af_weights.split(',')]
    # perform_ito returns four values; unpacking only three raised ValueError
    # and left the ITO log output without a value.
    ito_output, ito_params, steps_taken, ito_log = perform_ito(
        input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, weights
    )
    return ito_output, ito_params, steps_taken, ito_log
83
+
84
+ ito_button.click(
85
+ run_ito,
86
+ inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
87
+ outputs=[ito_output_audio, ito_param_output, ito_steps_taken, ito_log]
88
  )
89
 
90
+ demo.launch()
91
+
92
+
93
+ # import gradio as gr
94
+ # import torch
95
+ # import soundfile as sf
96
+ # import numpy as np
97
+ # import yaml
98
+ # from inference import MasteringStyleTransfer
99
+ # from utils import download_youtube_audio
100
+ # from config import args
101
+
102
+ # mastering_transfer = MasteringStyleTransfer(args)
103
+
104
+ # def process_audio(input_audio, reference_audio, perform_ito, ito_reference_audio=None):
105
+ # # Process the audio files
106
+ # output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
107
+ # input_audio, reference_audio, ito_reference_audio if ito_reference_audio else reference_audio, {}, perform_ito
108
+ # )
109
+
110
+ # # Generate parameter output strings
111
+ # param_output = mastering_transfer.get_param_output_string(predicted_params)
112
+ # ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
113
+
114
+ # # Generate top 10 differences if ITO was performed
115
+ # top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
116
+
117
+ # return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
118
+
119
+ # def process_with_ito(input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio):
120
+ # ito_ref = reference_audio if use_same_reference else ito_reference_audio
121
+ # return process_audio(input_audio, reference_audio, perform_ito, ito_ref)
122
+
123
+ # def process_youtube_with_ito(input_url, reference_url, perform_ito, use_same_reference, ito_reference_url):
124
+ # input_audio = download_youtube_audio(input_url)
125
+ # reference_audio = download_youtube_audio(reference_url)
126
+ # ito_ref = reference_audio if use_same_reference else download_youtube_audio(ito_reference_url)
127
+
128
+ # output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
129
+ # input_audio, reference_audio, ito_ref, {}, perform_ito, log_ito=True
130
+ # )
131
+
132
+ # param_output = mastering_transfer.get_param_output_string(predicted_params)
133
+ # ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
134
+ # top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
135
+
136
+ # return "output_mastered_yt.wav", "ito_output_mastered_yt.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
137
+
138
+
139
+ # with gr.Blocks() as demo:
140
+ # gr.Markdown("# Mastering Style Transfer Demo")
141
+
142
+ # with gr.Tab("Upload Audio"):
143
+ # input_audio = gr.Audio(label="Input Audio")
144
+ # reference_audio = gr.Audio(label="Reference Audio")
145
+ # perform_ito = gr.Checkbox(label="Perform ITO")
146
+ # with gr.Column(visible=False) as ito_options:
147
+ # use_same_reference = gr.Checkbox(label="Use same reference audio for ITO", value=True)
148
+ # ito_reference_audio = gr.Audio(label="ITO Reference Audio", visible=False)
149
 
150
+ # def update_ito_options(perform_ito):
151
+ # return gr.Column.update(visible=perform_ito)
152
 
153
+ # def update_ito_reference(use_same):
154
+ # return gr.Audio.update(visible=not use_same)
155
 
156
+ # perform_ito.change(fn=update_ito_options, inputs=perform_ito, outputs=ito_options)
157
+ # use_same_reference.change(fn=update_ito_reference, inputs=use_same_reference, outputs=ito_reference_audio)
158
 
159
+ # submit_button = gr.Button("Process")
160
+ # output_audio = gr.Audio(label="Output Audio")
161
+ # ito_output_audio = gr.Audio(label="ITO Output Audio")
162
+ # param_output = gr.Textbox(label="Predicted Parameters", lines=10)
163
+ # ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
164
+ # top_10_diff = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
165
+ # ito_log = gr.Textbox(label="ITO Log", lines=20)
166
+
167
+ # submit_button.click(
168
+ # process_with_ito,
169
+ # inputs=[input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio],
170
+ # outputs=[output_audio, ito_output_audio, param_output, ito_param_output, top_10_diff, ito_log]
171
+ # )
172
+
173
+ # with gr.Tab("YouTube URLs"):
174
+ # input_url = gr.Textbox(label="Input YouTube URL")
175
+ # reference_url = gr.Textbox(label="Reference YouTube URL")
176
+ # perform_ito_yt = gr.Checkbox(label="Perform ITO")
177
+ # with gr.Column(visible=False) as ito_options_yt:
178
+ # use_same_reference_yt = gr.Checkbox(label="Use same reference audio for ITO", value=True)
179
+ # ito_reference_url = gr.Textbox(label="ITO Reference YouTube URL", visible=False)
180
+
181
+ # def update_ito_options_yt(perform_ito):
182
+ # return gr.Column.update(visible=perform_ito)
183
+
184
+ # def update_ito_reference_yt(use_same):
185
+ # return gr.Textbox.update(visible=not use_same)
186
+
187
+ # perform_ito_yt.change(fn=update_ito_options_yt, inputs=perform_ito_yt, outputs=ito_options_yt)
188
+ # use_same_reference_yt.change(fn=update_ito_reference_yt, inputs=use_same_reference_yt, outputs=ito_reference_url)
189
+
190
+ # submit_button_yt = gr.Button("Process")
191
+ # output_audio_yt = gr.Audio(label="Output Audio")
192
+ # ito_output_audio_yt = gr.Audio(label="ITO Output Audio")
193
+ # param_output_yt = gr.Textbox(label="Predicted Parameters", lines=10)
194
+ # ito_param_output_yt = gr.Textbox(label="ITO Predicted Parameters", lines=10)
195
+ # top_10_diff_yt = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
196
+ # ito_log_yt = gr.Textbox(label="ITO Log", lines=20)
197
+
198
+ # submit_button_yt.click(
199
+ # process_youtube_with_ito,
200
+ # inputs=[input_url, reference_url, perform_ito_yt, use_same_reference_yt, ito_reference_url],
201
+ # outputs=[output_audio_yt, ito_output_audio_yt, param_output_yt, ito_param_output_yt, top_10_diff_yt, ito_log_yt]
202
+ # )
203
 
204
+ # demo.launch()
inference.py CHANGED
@@ -60,59 +60,66 @@ class MasteringStyleTransfer:
60
  predicted_params = self.mastering_converter.get_last_predicted_params()
61
  return output_audio, predicted_params
62
 
63
- def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
64
- fit_embedding = torch.nn.Parameter(initial_reference_feature)
65
- optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
66
-
67
- af_loss = AudioFeatureLoss(
68
- weights=ito_config['af_weights'],
69
- sample_rate=ito_config['sample_rate'],
70
- stem_separation=False,
71
- use_clap=False
72
- )
73
-
74
- min_loss = float('inf')
75
- min_loss_step = 0
76
- min_loss_output = None
77
- min_loss_params = None
78
- min_loss_embedding = None
79
-
80
- loss_history = []
81
- divergence_counter = 0
82
-
83
- for step in range(ito_config['num_steps']):
84
- optimizer.zero_grad()
85
-
86
- output_audio = self.mastering_converter(input_tensor, fit_embedding)
87
-
88
- losses = af_loss(output_audio, reference_tensor)
89
- total_loss = sum(losses.values())
90
-
91
- loss_history.append(total_loss.item())
92
-
93
- if total_loss < min_loss:
94
- min_loss = total_loss.item()
95
- min_loss_step = step
96
- min_loss_output = output_audio.detach()
97
- min_loss_params = self.mastering_converter.get_last_predicted_params()
98
- min_loss_embedding = fit_embedding.detach().clone()
99
-
100
- # Check for divergence
101
- if len(loss_history) > 10 and total_loss > loss_history[-11]:
102
- divergence_counter += 1
103
- else:
104
- divergence_counter = 0
 
 
105
 
106
- print(total_loss, min_loss)
 
 
 
 
 
107
 
108
- if divergence_counter >= 10:
109
- print(f"Optimization stopped early due to divergence at step {step}")
110
- break
111
 
112
- total_loss.backward()
113
- optimizer.step()
114
 
115
- return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
116
 
117
  def preprocess_audio(self, audio, target_sample_rate=44100):
118
  sample_rate, data = audio
 
60
  predicted_params = self.mastering_converter.get_last_predicted_params()
61
  return output_audio, predicted_params
62
 
63
def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
    """Optimize a reference embedding so the converter output matches a reference.

    Starting from ``initial_reference_feature``, repeatedly runs
    ``self.mastering_converter`` on ``input_tensor``, scores the output against
    ``reference_tensor`` with ``AudioFeatureLoss``, and steps the optimizer on
    the embedding. The best (lowest-loss) step seen is what gets returned.

    Args:
        input_tensor: Input passed to ``self.mastering_converter`` on every step.
        reference_tensor: Target passed to the loss on every step.
        ito_config: Mapping with keys ``'optimizer'`` (name of a class in
            ``torch.optim``), ``'learning_rate'``, ``'num_steps'``,
            ``'af_weights'`` and ``'sample_rate'``.
        initial_reference_feature: Starting embedding; it is copied, so the
            caller's tensor is left untouched.

    Returns:
        A 5-tuple ``(output, params, embedding, step, log)``: the detached
        converter output, predicted parameters and embedding copy from the
        lowest-loss step, that step's 1-based index, and the per-step log
        joined with newlines. If the loop never runs, the first three are
        ``None``, the index is 1 and the log is empty.
    """
    # Optimize a private copy: nn.Parameter shares storage with the tensor it
    # wraps, so without the clone the optimizer's in-place updates would
    # overwrite the caller's embedding.
    fit_embedding = torch.nn.Parameter(initial_reference_feature.detach().clone())
    optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])

    af_loss = AudioFeatureLoss(
        weights=ito_config['af_weights'],
        sample_rate=ito_config['sample_rate'],
        stem_separation=False,
        use_clap=False
    )

    min_loss = float('inf')
    min_loss_step = 0
    min_loss_output = None
    min_loss_params = None
    min_loss_embedding = None

    loss_history = []
    divergence_counter = 0
    ito_log = []
    initial_params = None

    for step in range(ito_config['num_steps']):
        optimizer.zero_grad()

        output_audio = self.mastering_converter(input_tensor, fit_embedding)
        # NOTE(review): params are kept by reference; if the converter mutates
        # this object on later calls the saved best/initial values would
        # drift -- confirm get_last_predicted_params returns a fresh object.
        current_params = self.mastering_converter.get_last_predicted_params()

        losses = af_loss(output_audio, reference_tensor)
        total_loss = sum(losses.values())

        # Read the scalar once per step instead of calling .item() repeatedly.
        loss_value = total_loss.item()
        loss_history.append(loss_value)

        if loss_value < min_loss:
            min_loss = loss_value
            min_loss_step = step
            min_loss_output = output_audio.detach()
            min_loss_params = current_params
            min_loss_embedding = fit_embedding.detach().clone()

        # Check for divergence: the loss is worse than it was ten steps ago.
        if len(loss_history) > 10 and loss_value > loss_history[-11]:
            divergence_counter += 1
        else:
            divergence_counter = 0

        # Log top 10 parameter differences relative to the first step.
        if step == 0:
            initial_params = current_params
        top_10_diff = self.get_top_10_diff_string(initial_params, current_params)
        ito_log.append(f"Step {step + 1}, Loss: {loss_value:.4f}\n{top_10_diff}\n")

        # Stop after ten consecutive steps that are worse than ten steps back.
        if divergence_counter >= 10:
            print(f"Optimization stopped early due to divergence at step {step}")
            break

        total_loss.backward()
        optimizer.step()

    return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1, "\n".join(ito_log)
123
 
124
  def preprocess_audio(self, audio, target_sample_rate=44100):
125
  sample_rate, data = audio