Hugo Flores Garcia committed on
Commit
3815be3
1 Parent(s): ac059f4

eval, demo

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. demo.py +67 -28
  3. scripts/exp/eval.py +3 -0
  4. scripts/utils/vamp_folder.py +88 -42
.gitignore CHANGED
@@ -172,4 +172,5 @@ scratch/
172
 
173
  runs-archive
174
  lyrebird-audiotools
175
- lyrebird-audio-codec
 
 
172
 
173
  runs-archive
174
  lyrebird-audiotools
175
+ lyrebird-audio-codec
176
+ samples-*/**
demo.py CHANGED
@@ -48,21 +48,64 @@ def load_random_audio():
48
  sr = sig.sample_rate
49
  return sr, audio.T
50
 
51
- def mask_audio(
52
- prefix_s, suffix_s, rand_mask_intensity,
53
- mask_periodic_amt, beat_unmask_dur,
54
- mask_dwn_chk, dwn_factor,
55
- mask_up_chk, up_factor
56
- ):
57
- pass
58
 
59
  def vamp(
60
  input_audio, prefix_s, suffix_s, rand_mask_intensity,
61
  mask_periodic_amt, beat_unmask_dur,
62
  mask_dwn_chk, dwn_factor,
63
- mask_up_chk, up_factor
 
64
  ):
65
- print(input_audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
  with gr.Blocks() as demo:
@@ -180,6 +223,17 @@ with gr.Blocks() as demo:
180
  # process and output
181
  with gr.Row():
182
  with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
183
  vamp_button = gr.Button("vamp")
184
 
185
  output_audio = gr.Audio(
@@ -187,22 +241,6 @@ with gr.Blocks() as demo:
187
  interactive=False,
188
  visible=False
189
  )
190
- output_audio_viz = gr.Video(
191
- label="output audio",
192
- interactive=False
193
- )
194
-
195
- # connect widgets
196
- compute_mask_button.click(
197
- fn=mask_audio,
198
- inputs=[
199
- prefix_s, suffix_s, rand_mask_intensity,
200
- mask_periodic_amt, beat_unmask_dur,
201
- mask_dwn_chk, dwn_factor,
202
- mask_up_chk, up_factor
203
- ],
204
- outputs=[mask_output, mask_output_viz]
205
- )
206
 
207
  # connect widgets
208
  vamp_button.click(
@@ -211,10 +249,11 @@ with gr.Blocks() as demo:
211
  prefix_s, suffix_s, rand_mask_intensity,
212
  mask_periodic_amt, beat_unmask_dur,
213
  mask_dwn_chk, dwn_factor,
214
- mask_up_chk, up_factor
 
215
  ],
216
- outputs=[output_audio, output_audio_viz]
217
  )
218
 
219
 
220
- demo.launch(share=True)
 
48
  sr = sig.sample_rate
49
  return sr, audio.T
50
 
 
 
 
 
 
 
 
51
 
def vamp(
    input_audio, prefix_s, suffix_s, rand_mask_intensity,
    mask_periodic_amt, beat_unmask_dur,
    mask_dwn_chk, dwn_factor,
    mask_up_chk, up_factor,
    num_vamps, mode
):
    """Generate a vamp of the input audio and return it for playback.

    Args:
        input_audio: pair indexed as ``(sample_rate, samples)``; element 1 is
            handed to ``at.AudioSignal`` with element 0 as its sample rate.
        prefix_s: forwarded as ``prefix_dur_s``.
        suffix_s: forwarded as ``suffix_dur_s``.
        rand_mask_intensity: forwarded as ``intensity``.
        mask_periodic_amt: forwarded as ``downsample_factor``.
        beat_unmask_dur: forwarded as ``after_beat_s`` of the beat mask; no
            beat mask is built unless this is greater than 0.
        mask_dwn_chk: forwarded as ``mask_downbeats``.
        dwn_factor: forwarded as ``downbeat_downsample_factor``.
        mask_up_chk: forwarded as ``mask_upbeats``.
        up_factor: forwarded as ``beat_downsample_factor``.
        num_vamps: forwarded as ``num_vamps`` ("standard" mode) or
            ``num_loops`` ("loop" mode).
        mode: either ``"standard"`` or ``"loop"``.

    Returns:
        ``(sample_rate, samples)`` taken from the generated signal, where
        ``samples`` is ``sig.samples[0].T``.

    Raises:
        gr.Error: wraps any exception raised while processing, including an
            unrecognised ``mode``.
    """
    try:
        print(input_audio)

        sig = at.AudioSignal(
            input_audio[1],
            sample_rate=input_audio[0]
        )

        if beat_unmask_dur > 0.0:
            beat_mask = interface.make_beat_mask(
                sig,
                before_beat_s=0.01,
                after_beat_s=beat_unmask_dur,
                mask_downbeats=mask_dwn_chk,
                mask_upbeats=mask_up_chk,
                downbeat_downsample_factor=dwn_factor,
                beat_downsample_factor=up_factor,
                dropout=0.7,
                invert=True
            )
        else:
            beat_mask = None

        if mode == "standard":
            zv = interface.coarse_vamp_v2(
                sig,
                prefix_dur_s=prefix_s,
                suffix_dur_s=suffix_s,
                num_vamps=num_vamps,
                downsample_factor=mask_periodic_amt,
                intensity=rand_mask_intensity,
                ext_mask=beat_mask
            )
        elif mode == "loop":
            # The input signal is what gets looped; `zv` is not bound yet in
            # this branch, so passing it here raised before any work was done.
            zv = interface.loop(
                sig,
                prefix_dur_s=prefix_s,
                suffix_dur_s=suffix_s,
                num_loops=num_vamps,
                downsample_factor=mask_periodic_amt,
                intensity=rand_mask_intensity,
                ext_mask=beat_mask
            )
        else:
            # Fail with a readable message instead of an unbound-variable error.
            raise ValueError(f"unknown mode: {mode!r}")

        zv = interface.coarse_to_fine(zv)
        sig = interface.to_signal(zv)
        return sig.sample_rate, sig.samples[0].T
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback.
        raise gr.Error(f"failed with error: {e}") from e
108
+
109
 
110
 
111
  with gr.Blocks() as demo:
 
223
  # process and output
224
  with gr.Row():
225
  with gr.Column():
226
+ gr.Markdown("**NOTE**: for loop mode, both prefix and suffix must be greater than 0.")
227
+ mode = gr.Radio(
228
+ label="mode",
229
+ choices=["standard", "loop"],
230
+ value="standard"
231
+ )
232
+ num_vamps = gr.Number(
233
+ label="number of vamps",
234
+ value=1,
235
+ precision=0
236
+ )
237
  vamp_button = gr.Button("vamp")
238
 
239
  output_audio = gr.Audio(
 
241
  interactive=False,
242
  visible=False
243
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
  # connect widgets
246
  vamp_button.click(
 
249
  prefix_s, suffix_s, rand_mask_intensity,
250
  mask_periodic_amt, beat_unmask_dur,
251
  mask_dwn_chk, dwn_factor,
252
+ mask_up_chk, up_factor,
253
+ num_vamps, mode
254
  ],
255
+ outputs=[output_audio]
256
  )
257
 
258
 
259
+ demo.launch(share=True, server_name="0.0.0.0")
scripts/exp/eval.py CHANGED
@@ -65,6 +65,9 @@ def eval(
65
  baseline_sig = AudioSignal(str(baseline_file))
66
  cond_sig = AudioSignal(str(cond_file))
67
 
 
 
 
68
  # compute the metrics
69
  # try:
70
  # vsq = visqol(baseline_sig, cond_sig)
 
65
  baseline_sig = AudioSignal(str(baseline_file))
66
  cond_sig = AudioSignal(str(cond_file))
67
 
68
+ cond_sig.resample(baseline_sig.sample_rate)
69
+ cond_sig.truncate_samples(baseline_sig.length)
70
+
71
  # compute the metrics
72
  # try:
73
  # vsq = visqol(baseline_sig, cond_sig)
scripts/utils/vamp_folder.py CHANGED
@@ -1,6 +1,8 @@
1
  from pathlib import Path
2
  import random
3
  from typing import List
 
 
4
 
5
  import argbind
6
  from tqdm import tqdm
@@ -9,28 +11,26 @@ import argbind
9
  from vampnet.interface import Interface
10
  import audiotools as at
11
 
12
- Interface = argbind.bind(Interface)
13
 
14
- # condition wrapper for printing
15
- def condition(cond):
16
- def wrapper(sig, interface):
17
- # print(f"Condition: {cond.__name__}")
18
- sig = cond(sig, interface)
19
- # print(f"Condition: {cond.__name__} (done)\n")
20
- return sig
21
- return wrapper
 
22
 
23
- @condition
24
  def baseline(sig, interface):
25
  return interface.preprocess(sig)
26
 
27
- @condition
28
  def reconstructed(sig, interface):
29
  return interface.to_signal(
30
  interface.encode(sig)
31
  )
32
 
33
- @condition
34
  def coarse2fine(sig, interface):
35
  z = interface.encode(sig)
36
  z = z[:, :interface.c2f.n_conditioning_codebooks, :]
@@ -38,7 +38,6 @@ def coarse2fine(sig, interface):
38
  z = interface.coarse_to_fine(z)
39
  return interface.to_signal(z)
40
 
41
- @condition
42
  def coarse2fine_argmax(sig, interface):
43
  z = interface.encode(sig)
44
  z = z[:, :interface.c2f.n_conditioning_codebooks, :]
@@ -49,46 +48,85 @@ def coarse2fine_argmax(sig, interface):
49
  )
50
  return interface.to_signal(z)
51
 
52
- @condition
53
- def one_codebook(sig, interface):
54
- zv = interface.coarse_vamp_v2(
55
- sig, n_conditioning_codebooks=1
56
- )
57
- zv = interface.coarse_to_fine(zv)
58
 
59
- return interface.to_signal(zv)
60
 
61
- @condition
62
- def two_codebooks_downsampled_4x(sig, interface):
63
- zv = interface.coarse_vamp_v2(
64
- sig, n_conditioning_codebooks=2,
65
- downsample_factor=4
66
- )
67
- zv = interface.coarse_to_fine(zv)
68
 
69
- return interface.to_signal(zv)
 
 
 
 
 
70
 
 
 
71
 
72
- def four_codebooks_downsampled(sig, interface, x=12):
73
- zv = interface.coarse_vamp_v2(
74
- sig, downsample_factor=12
75
- )
76
- zv = interface.coarse_to_fine(zv)
77
- return interface.to_signal(zv)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
 
80
  COARSE_SAMPLE_CONDS ={
81
  "baseline": baseline,
82
  "reconstructed": reconstructed,
83
  "coarse2fine": coarse2fine,
84
- "one_codebook": one_codebook,
85
- "two_codebooks_downsampled_4x": two_codebooks_downsampled_4x,
86
- # four codebooks at different downsample factors
87
  **{
88
- f"four_codebooks_downsampled_{x}x": lambda sig, interface: four_codebooks_downsampled(sig, interface, x=x)
89
- for x in [4, 8, 12, 16, 20, 24]
90
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
 
 
 
92
  }
93
 
94
  C2F_SAMPLE_CONDS = {
@@ -124,7 +162,16 @@ def main(
124
  without_replacement=True,
125
  )
126
 
127
- SAMPLE_CONDS = COARSE_SAMPLE_CONDS if exp_type == "coarse" else C2F_SAMPLE_CONDS
 
 
 
 
 
 
 
 
 
128
 
129
 
130
  indices = list(range(max_excerpts))
@@ -139,7 +186,6 @@ def main(
139
  # continue
140
 
141
  sig = dataset[i]["signal"]
142
-
143
  results = {
144
  name: cond(sig, interface).cpu()
145
  for name, cond in SAMPLE_CONDS.items()
 
1
  from pathlib import Path
2
  import random
3
  from typing import List
4
+ import tempfile
5
+ import subprocess
6
 
7
  import argbind
8
  from tqdm import tqdm
 
11
  from vampnet.interface import Interface
12
  import audiotools as at
13
 
14
+ Interface: Interface = argbind.bind(Interface)
15
 
16
def calculate_bitrate(
    interface, num_codebooks,
    downsample_factor
):
    """Compute the token bitrate for a codebook / downsampling configuration.

    The result is ``(sample_rate / hop_size)`` multiplied by
    ``(10 * num_codebooks) / downsample_factor``.

    Args:
        interface: object exposing ``codec.sample_rate`` and ``codec.hop_size``.
        num_codebooks: number of codebooks counted towards the rate.
        downsample_factor: divisor applied to the per-frame bit count.
            ``None`` is treated as 1 (no downsampling).

    Returns:
        The computed rate as a float (presumably bits per second when
        ``sample_rate`` is in Hz; confirm against the codec).
    """
    # Hard-coded bits per codebook entry.
    # NOTE(review): presumably log2 of a 1024-entry codebook; confirm.
    bits_per_code = 10

    if downsample_factor is None:
        downsample_factor = 1

    codec = interface.codec
    frames_per_second = codec.sample_rate / codec.hop_size
    bits_per_frame = (bits_per_code * num_codebooks) / downsample_factor
    return frames_per_second * bits_per_frame
25
 
 
def baseline(sig, interface):
    """Baseline condition: return ``sig`` after ``interface.preprocess``."""
    preprocessed = interface.preprocess(sig)
    return preprocessed
28
 
 
def reconstructed(sig, interface):
    """Reconstruction condition: encode ``sig`` and convert the codes back to a signal."""
    codes = interface.encode(sig)
    return interface.to_signal(codes)
33
 
 
34
  def coarse2fine(sig, interface):
35
  z = interface.encode(sig)
36
  z = z[:, :interface.c2f.n_conditioning_codebooks, :]
 
38
  z = interface.coarse_to_fine(z)
39
  return interface.to_signal(z)
40
 
 
41
  def coarse2fine_argmax(sig, interface):
42
  z = interface.encode(sig)
43
  z = z[:, :interface.c2f.n_conditioning_codebooks, :]
 
48
  )
49
  return interface.to_signal(z)
50
 
 
 
 
 
 
 
51
 
52
class CoarseCond:
    """Callable sampling condition: coarse vamp followed by coarse-to-fine.

    Instances are called as ``cond(sig, interface)`` and return the result of
    ``interface.to_signal``.
    """

    def __init__(self, num_codebooks, downsample_factor):
        # Stored as given; both are read again on every call.
        self.downsample_factor = downsample_factor
        self.num_codebooks = num_codebooks

    def __call__(self, sig, interface):
        # Conditioning count is the coarse model's codebook total minus
        # the configured ``num_codebooks``.
        conditioning = interface.coarse.n_codebooks - self.num_codebooks
        coarse_tokens = interface.coarse_vamp_v2(
            sig,
            n_conditioning_codebooks=conditioning,
            downsample_factor=self.downsample_factor,
        )
        return interface.to_signal(interface.coarse_to_fine(coarse_tokens))
67
 
68
+
69
def opus(sig, interface, bitrate=128):
    """Round-trip a signal through the Opus codec using ffmpeg.

    The signal is preprocessed with ``interface.preprocess``, written to a
    WAV file, encoded with ffmpeg's ``libopus`` encoder, decoded back to WAV
    and loaded into a new ``at.AudioSignal``.

    Args:
        sig: signal to degrade; passed to ``interface.preprocess`` first.
        interface: object providing ``preprocess``.
        bitrate: value handed verbatim to ffmpeg's ``-b:a`` option.
            NOTE(review): ffmpeg reads a bare number as bits per second;
            confirm that the default of 128 is what is intended.

    Returns:
        The decoded ``at.AudioSignal``, constructed with the preprocessed
        signal's ``sample_rate``.

    Raises:
        subprocess.CalledProcessError: if either ffmpeg invocation fails.
    """
    sig = interface.preprocess(sig)

    # Keep every intermediate in one scratch directory under distinct names,
    # so the .opus file is cleaned up and ffmpeg never overwrites its input.
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        wav_in = scratch_dir / "input.wav"
        opus_file = scratch_dir / "encoded.opus"
        wav_out = scratch_dir / "decoded.wav"

        sig.write(str(wav_in))

        # convert to opus
        encode_cmd = [
            "ffmpeg", "-y", "-i", str(wav_in),
            "-c:a", "libopus",
            "-b:a", f"{bitrate}",
            str(opus_file),
        ]
        subprocess.run(encode_cmd, check=True)

        # convert back to wav
        decode_cmd = [
            "ffmpeg", "-y", "-i", str(opus_file),
            str(wav_out),
        ]
        subprocess.run(decode_cmd, check=True)

        # Load while the scratch directory still exists.
        decoded = at.AudioSignal(
            wav_out,
            sample_rate=sig.sample_rate
        )

    return decoded
99
 
100
 
101
  COARSE_SAMPLE_CONDS ={
102
  "baseline": baseline,
103
  "reconstructed": reconstructed,
104
  "coarse2fine": coarse2fine,
 
 
 
105
  **{
106
+ f"{n}_codebooks_downsampled_{x}x": CoarseCond(num_codebooks=n, downsample_factor=x)
107
+ for (n, x) in (
108
+ (4, 2), # 4 codebooks, downsampled 2x,
109
+ (2, 2), # 2 codebooks, downsampled 2x
110
+ (1, None), # 1 codebook, no downsampling
111
+ (4, 4), # 4 codebooks, downsampled 4x
112
+ (1, 2), # 1 codebook, downsampled 2x,
113
+ (4, 6), # 4 codebooks, downsampled 6x
114
+ (4, 8), # 4 codebooks, downsampled 8x
115
+ (4, 16), # 4 codebooks, downsampled 16x
116
+ (4, 32), # 4 codebooks, downsampled 32x
117
+ )
118
+ },
119
+
120
+ }
121
+
122
# Opus conditions selected by exp_type == "opus-jazzpop": one entry per bitrate.
OPUS_JAZZPOP_SAMPLE_CONDS = {
    # `bitrate=bitrate` freezes the loop value as a default argument; a plain
    # closure would make every entry use the last bitrate in the list.
    f"opus_{bitrate}": (
        lambda sig, interface, bitrate=bitrate: opus(sig, interface, bitrate=bitrate)
    )
    for bitrate in [5620, 1875, 1250, 625]
}
126
 
127
# Opus conditions selected by exp_type == "opus-spotdl": one entry per bitrate.
OPUS_SPOTDL_SAMPLE_CONDS = {
    # `bitrate=bitrate` freezes the loop value as a default argument; a plain
    # closure would make every entry use the last bitrate in the list.
    f"opus_{bitrate}": (
        lambda sig, interface, bitrate=bitrate: opus(sig, interface, bitrate=bitrate)
    )
    for bitrate in [8036, 2296, 1148, 574]
}
131
 
132
  C2F_SAMPLE_CONDS = {
 
162
  without_replacement=True,
163
  )
164
 
165
+ if exp_type == "opus-jazzpop":
166
+ SAMPLE_CONDS = OPUS_JAZZPOP_SAMPLE_CONDS
167
+ elif exp_type == "opus-spotdl":
168
+ SAMPLE_CONDS = OPUS_SPOTDL_SAMPLE_CONDS
169
+ elif exp_type == "coarse":
170
+ SAMPLE_CONDS = COARSE_SAMPLE_CONDS
171
+ elif exp_type == "c2f":
172
+ SAMPLE_CONDS = C2F_SAMPLE_CONDS
173
+ else:
174
+ raise ValueError(f"Unknown exp_type {exp_type}")
175
 
176
 
177
  indices = list(range(max_excerpts))
 
186
  # continue
187
 
188
  sig = dataset[i]["signal"]
 
189
  results = {
190
  name: cond(sig, interface).cpu()
191
  for name, cond in SAMPLE_CONDS.items()