jhtonyKoo committed on
Commit
a990e23
1 Parent(s): 6d6c0d5

modify app

Browse files
Files changed (2) hide show
  1. app.py +29 -18
  2. inference.py +56 -56
app.py CHANGED
@@ -10,8 +10,10 @@ from config import args
10
  mastering_transfer = MasteringStyleTransfer(args)
11
 
12
  def process_audio(input_audio, reference_audio):
 
 
13
  output_audio, predicted_params, _, _, _, sr = mastering_transfer.process_audio(
14
- input_audio, reference_audio, reference_audio, {}, False
15
  )
16
 
17
  param_output = mastering_transfer.get_param_output_string(predicted_params)
@@ -44,16 +46,19 @@ def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, op
44
 
45
  return "ito_output_mastered.wav", ito_param_output, steps_taken, ito_log
46
 
47
-
48
  with gr.Blocks() as demo:
49
  gr.Markdown("# Mastering Style Transfer Demo")
50
 
51
  with gr.Tab("Upload Audio"):
52
- input_audio = gr.Audio(label="Input Audio")
53
- reference_audio = gr.Audio(label="Reference Audio")
 
 
54
  process_button = gr.Button("Process")
55
- output_audio = gr.Audio(label="Output Audio")
56
- param_output = gr.Textbox(label="Predicted Parameters", lines=10)
 
 
57
 
58
  process_button.click(
59
  process_audio,
@@ -62,24 +67,30 @@ with gr.Blocks() as demo:
62
  )
63
 
64
  gr.Markdown("## Inference Time Optimization (ITO)")
65
- ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
66
- num_steps = gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Steps")
67
- optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
68
- learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
69
- af_weights = gr.Textbox(label="AudioFeatureLoss Weights (comma-separated)", value="0.1,0.001,1.0,1.0,0.1")
70
 
71
- ito_button = gr.Button("Perform ITO")
72
- ito_output_audio = gr.Audio(label="ITO Output Audio")
73
- ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
74
- ito_steps_taken = gr.Number(label="ITO Steps Taken")
75
- ito_log = gr.Textbox(label="ITO Log", lines=20)
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
78
  af_weights = [float(w.strip()) for w in af_weights.split(',')]
79
- ito_output, ito_params, steps_taken = perform_ito(
80
  input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights
81
  )
82
- return ito_output, ito_params, steps_taken
83
 
84
  ito_button.click(
85
  run_ito,
 
10
  mastering_transfer = MasteringStyleTransfer(args)
11
 
12
  def process_audio(input_audio, reference_audio):
13
+ input_tensor = mastering_transfer.preprocess_audio(input_audio, args.sample_rate)
14
+ reference_tensor = mastering_transfer.preprocess_audio(reference_audio, args.sample_rate)
15
  output_audio, predicted_params, _, _, _, sr = mastering_transfer.process_audio(
16
+ input_tensor, reference_tensor, reference_tensor, {}, False
17
  )
18
 
19
  param_output = mastering_transfer.get_param_output_string(predicted_params)
 
46
 
47
  return "ito_output_mastered.wav", ito_param_output, steps_taken, ito_log
48
 
 
49
  with gr.Blocks() as demo:
50
  gr.Markdown("# Mastering Style Transfer Demo")
51
 
52
  with gr.Tab("Upload Audio"):
53
+ with gr.Row():
54
+ input_audio = gr.Audio(label="Input Audio")
55
+ reference_audio = gr.Audio(label="Reference Audio")
56
+
57
  process_button = gr.Button("Process")
58
+
59
+ with gr.Row():
60
+ output_audio = gr.Audio(label="Output Audio")
61
+ param_output = gr.Textbox(label="Predicted Parameters", lines=10)
62
 
63
  process_button.click(
64
  process_audio,
 
67
  )
68
 
69
  gr.Markdown("## Inference Time Optimization (ITO)")
 
 
 
 
 
70
 
71
+ with gr.Row():
72
+ with gr.Column(scale=2):
73
+ ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
74
+ num_steps = gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Steps")
75
+ optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
76
+ learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
77
+ af_weights = gr.Textbox(label="AudioFeatureLoss Weights (comma-separated)", value="0.1,0.001,1.0,1.0,0.1")
78
+
79
+ ito_button = gr.Button("Perform ITO")
80
+
81
+ ito_output_audio = gr.Audio(label="ITO Output Audio")
82
+ ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
83
+ ito_steps_taken = gr.Number(label="ITO Steps Taken")
84
+
85
+ with gr.Column(scale=1):
86
+ ito_log = gr.Textbox(label="ITO Log", lines=30)
87
 
88
  def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
89
  af_weights = [float(w.strip()) for w in af_weights.split(',')]
90
+ ito_output, ito_params, steps_taken, log = perform_ito(
91
  input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights
92
  )
93
+ return ito_output, ito_params, steps_taken, log
94
 
95
  ito_button.click(
96
  run_ito,
inference.py CHANGED
@@ -60,66 +60,66 @@ class MasteringStyleTransfer:
60
  predicted_params = self.mastering_converter.get_last_predicted_params()
61
  return output_audio, predicted_params
62
 
63
- def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
64
- fit_embedding = torch.nn.Parameter(initial_reference_feature)
65
- optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
66
-
67
- af_loss = AudioFeatureLoss(
68
- weights=ito_config['af_weights'],
69
- sample_rate=ito_config['sample_rate'],
70
- stem_separation=False,
71
- use_clap=False
72
- )
73
-
74
- min_loss = float('inf')
75
- min_loss_step = 0
76
- min_loss_output = None
77
- min_loss_params = None
78
- min_loss_embedding = None
79
-
80
- loss_history = []
81
- divergence_counter = 0
82
- ito_log = []
83
-
84
- for step in range(ito_config['num_steps']):
85
- optimizer.zero_grad()
86
-
87
- output_audio = self.mastering_converter(input_tensor, fit_embedding)
88
- current_params = self.mastering_converter.get_last_predicted_params()
89
-
90
- losses = af_loss(output_audio, reference_tensor)
91
- total_loss = sum(losses.values())
92
-
93
- loss_history.append(total_loss.item())
94
-
95
- if total_loss < min_loss:
96
- min_loss = total_loss.item()
97
- min_loss_step = step
98
- min_loss_output = output_audio.detach()
99
- min_loss_params = current_params
100
- min_loss_embedding = fit_embedding.detach().clone()
101
-
102
- # Check for divergence
103
- if len(loss_history) > 10 and total_loss > loss_history[-11]:
104
- divergence_counter += 1
105
- else:
106
- divergence_counter = 0
107
 
108
- # Log top 10 parameter differences
109
- if step == 0:
110
- initial_params = current_params
111
- top_10_diff = self.get_top_10_diff_string(initial_params, current_params)
112
- log_entry = f"Step {step + 1}, Loss: {total_loss.item():.4f}\n{top_10_diff}\n"
113
- ito_log.append(log_entry)
114
 
115
- if divergence_counter >= 10:
116
- print(f"Optimization stopped early due to divergence at step {step}")
117
- break
118
 
119
- total_loss.backward()
120
- optimizer.step()
121
 
122
- return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1, "\n".join(ito_log)
123
 
124
  def preprocess_audio(self, audio, target_sample_rate=44100):
125
  sample_rate, data = audio
 
60
  predicted_params = self.mastering_converter.get_last_predicted_params()
61
  return output_audio, predicted_params
62
 
63
+ def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
64
+ fit_embedding = torch.nn.Parameter(initial_reference_feature)
65
+ optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
66
+
67
+ af_loss = AudioFeatureLoss(
68
+ weights=ito_config['af_weights'],
69
+ sample_rate=ito_config['sample_rate'],
70
+ stem_separation=False,
71
+ use_clap=False
72
+ )
73
+
74
+ min_loss = float('inf')
75
+ min_loss_step = 0
76
+ min_loss_output = None
77
+ min_loss_params = None
78
+ min_loss_embedding = None
79
+
80
+ loss_history = []
81
+ divergence_counter = 0
82
+ ito_log = []
83
+
84
+ for step in range(ito_config['num_steps']):
85
+ optimizer.zero_grad()
86
+
87
+ output_audio = self.mastering_converter(input_tensor, fit_embedding)
88
+ current_params = self.mastering_converter.get_last_predicted_params()
89
+
90
+ losses = af_loss(output_audio, reference_tensor)
91
+ total_loss = sum(losses.values())
92
+
93
+ loss_history.append(total_loss.item())
94
+
95
+ if total_loss < min_loss:
96
+ min_loss = total_loss.item()
97
+ min_loss_step = step
98
+ min_loss_output = output_audio.detach()
99
+ min_loss_params = current_params
100
+ min_loss_embedding = fit_embedding.detach().clone()
101
+
102
+ # Check for divergence
103
+ if len(loss_history) > 10 and total_loss > loss_history[-11]:
104
+ divergence_counter += 1
105
+ else:
106
+ divergence_counter = 0
107
 
108
+ # Log top 10 parameter differences
109
+ if step == 0:
110
+ initial_params = current_params
111
+ top_10_diff = self.get_top_10_diff_string(initial_params, current_params)
112
+ log_entry = f"Step {step + 1}, Loss: {total_loss.item():.4f}\n{top_10_diff}\n"
113
+ ito_log.append(log_entry)
114
 
115
+ if divergence_counter >= 10:
116
+ print(f"Optimization stopped early due to divergence at step {step}")
117
+ break
118
 
119
+ total_loss.backward()
120
+ optimizer.step()
121
 
122
+ return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1, "\n".join(ito_log)
123
 
124
  def preprocess_audio(self, audio, target_sample_rate=44100):
125
  sample_rate, data = audio