nakas committed on
Commit c6ef486
1 Parent(s): f9316f5

Update app.py

Files changed (1)
  app.py +392 -65
app.py CHANGED
@@ -1,71 +1,398 @@
  import os
- import gradio as gr
- from scipy.io.wavfile import write
- import subprocess
- import torch
  import typing as tp

  from audiocraft.data.audio_utils import convert_audio

- # Import the necessary MusicGen code here
-
- def load_model():
-     # Load the MusicGen model here
-
- def music_gen_and_separation(text, audio):
-     # Perform music generation with the loaded MusicGen model
-     texts = [text]  # Use the provided text for music generation
-     melodies = [(audio[1], audio[0])]  # Convert audio to melody format for MusicGen
-
-     # Perform music generation using the loaded MusicGen model
-     generated_music = predict_full(model, texts, melodies, duration, topk, topp, temperature, cfg_coef)
-
-     # Perform source separation using Demucs
-     # Save the generated music to a temporary file
-     temp_file = "generated_music.wav"
-     write(temp_file, generated_music, 32000)
-
-     # Run Demucs for source separation
-     command = "python3 -m demucs.separate -n mdx_extra_q -d cpu " + temp_file + " -o out"
-     process = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-     print("Demucs script output:", process.stdout.decode())
-
-     # Check if files exist before returning
-     files = ["./out/mdx_extra_q/test/vocals.wav",
-              "./out/mdx_extra_q/test/bass.wav",
-              "./out/mdx_extra_q/test/drums.wav",
-              "./out/mdx_extra_q/test/other.wav"]
-     for file in files:
-         if not os.path.isfile(file):
-             print(f"File not found: {file}")
          else:
-             print(f"File exists: {file}")
-
-     # Convert the separated audio files to numpy arrays
-     separated_audio = []
-     for file in files:
-         _, audio = read(file)
-         separated_audio.append(audio)
-
-     return separated_audio
-
-
- title = "MusicGen with Demucs"
- description = "Combine MusicGen with Demucs for music generation and source separation."
- article = "<p>Article content goes here.</p>"
-
- input_text = gr.inputs.Textbox(label="Input Text")
- input_audio = gr.inputs.Audio(label="Input Audio")
- output_vocals = gr.outputs.Audio(label="Vocals")
- output_bass = gr.outputs.Audio(label="Bass")
- output_drums = gr.outputs.Audio(label="Drums")
- output_other = gr.outputs.Audio(label="Other")
-
- gr.Interface(
-     music_gen_and_separation,
-     [input_text, input_audio],
-     [output_vocals, output_bass, output_drums, output_other],
-     title=title,
-     description=description,
-     article=article
- ).launch()
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
+ # also released under the MIT license.
+
+ import argparse
+ from concurrent.futures import ProcessPoolExecutor
  import os
+ from pathlib import Path
+ import subprocess as sp
+ from tempfile import NamedTemporaryFile
+ import time
  import typing as tp
+ import warnings
+
+ import torch
+ import gradio as gr

  from audiocraft.data.audio_utils import convert_audio
+ from audiocraft.data.audio import audio_write
+ from audiocraft.models import MusicGen
+
+
+ MODEL = None  # Last used model
+ IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
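+ # When running inside the public facebook/MusicGen Space (detected via the
+ # SPACE_ID environment variable), the app switches to the queued, batched UI
+ # defined at the bottom of this file.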
+ MAX_BATCH_SIZE = 6
+ BATCHED_DURATION = 15
+ INTERRUPTING = False
+ # We wrap subprocess calls to clean up the logs a bit when using gr.make_waveform.
+ _old_call = sp.call
+
+
+ def _call_nostderr(*args, **kwargs):
+     # Avoid ffmpeg vomiting on the logs.
+     kwargs['stderr'] = sp.DEVNULL
+     kwargs['stdout'] = sp.DEVNULL
+     _old_call(*args, **kwargs)
+
+
+ sp.call = _call_nostderr
+ # Preallocating the pool of processes.
+ pool = ProcessPoolExecutor(3)
+ pool.__enter__()
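+ # The pool renders gr.make_waveform videos in background processes (see
+ # _do_predictions below), so the worker is not blocked on ffmpeg while the
+ # next batch is being generated.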
+
+
+ def interrupt():
+     global INTERRUPTING
+     INTERRUPTING = True
+
+
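+ # FileCleaner tracks the temporary files handed to Gradio and deletes any that
+ # are older than `file_lifetime` seconds. Entries are appended in chronological
+ # order, so _cleanup can stop at the first file that is still young enough.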
+ class FileCleaner:
+     def __init__(self, file_lifetime: float = 3600):
+         self.file_lifetime = file_lifetime
+         self.files = []
+
+     def add(self, path: tp.Union[str, Path]):
+         self._cleanup()
+         self.files.append((time.time(), Path(path)))
+
+     def _cleanup(self):
+         now = time.time()
+         for time_added, path in list(self.files):
+             if now - time_added > self.file_lifetime:
+                 if path.exists():
+                     path.unlink()
+                 self.files.pop(0)
              else:
+                 break
+
+
+ file_cleaner = FileCleaner()
+
+
+ def make_waveform(*args, **kwargs):
+     # Further remove some warnings.
+     be = time.time()
+     with warnings.catch_warnings():
+         warnings.simplefilter('ignore')
+         out = gr.make_waveform(*args, **kwargs)
+         print("Make a video took", time.time() - be)
+         return out
+
+
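+ # The model is cached in the MODEL global and only reloaded when a different
+ # version is requested, avoiding a checkpoint reload on every request.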
+ def load_model(version='melody'):
+     global MODEL
+     print("Loading model", version)
+     if MODEL is None or MODEL.name != version:
+         MODEL = MusicGen.get_pretrained(version)
+
+
+ def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
+     MODEL.set_generation_params(duration=duration, **gen_kwargs)
+     print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
+     be = time.time()
+     processed_melodies = []
+     target_sr = 32000
+     target_ac = 1
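+     # Gradio supplies audio as (sample_rate, numpy array); transpose it to
+     # (channels, time), trim it to `duration` seconds, then resample and
+     # downmix to the 32 kHz mono format the melody conditioning expects.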
+     for melody in melodies:
+         if melody is None:
+             processed_melodies.append(None)
+         else:
+             sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
+             if melody.dim() == 1:
+                 melody = melody[None]
+             melody = melody[..., :int(sr * duration)]
+             melody = convert_audio(melody, sr, target_sr, target_ac)
+             processed_melodies.append(melody)
+
+     if any(m is not None for m in processed_melodies):
+         outputs = MODEL.generate_with_chroma(
+             descriptions=texts,
+             melody_wavs=processed_melodies,
+             melody_sample_rate=target_sr,
+             progress=progress,
+         )
+     else:
+         outputs = MODEL.generate(texts, progress=progress)
+
+     outputs = outputs.detach().cpu().float()
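+     # Each generated sample is written to a temp .wav with loudness
+     # normalization, then handed to the process pool to be rendered as a
+     # waveform video; both files are registered for later cleanup.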
+     out_files = []
+     for output in outputs:
+         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+             audio_write(
+                 file.name, output, MODEL.sample_rate, strategy="loudness",
+                 loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
+             out_files.append(pool.submit(make_waveform, file.name))
+             file_cleaner.add(file.name)
+     res = [out_file.result() for out_file in out_files]
+     for file in res:
+         file_cleaner.add(file)
+     print("batch finished", len(texts), time.time() - be)
+     print("Tempfiles currently stored: ", len(file_cleaner.files))
+     return res
+
+
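+ # predict_batched serves the public, queued demo: prompts are truncated to
+ # 512 characters and every request runs on the `melody` model for a fixed
+ # BATCHED_DURATION seconds.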
+ def predict_batched(texts, melodies):
+     max_text_length = 512
+     texts = [text[:max_text_length] for text in texts]
+     load_model('melody')
+     res = _do_predictions(texts, melodies, BATCHED_DURATION)
+     return [res]
+
+
+ def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
+     global INTERRUPTING
+     INTERRUPTING = False
+     if temperature < 0:
+         raise gr.Error("Temperature must be >= 0.")
+     if topk < 0:
+         raise gr.Error("Topk must be non-negative.")
+     if topp < 0:
+         raise gr.Error("Topp must be non-negative.")
+
+     topk = int(topk)
+     load_model(model)
+
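+     # The progress callback runs inside the generation loop, so raising a
+     # gr.Error here is what actually aborts generation once "Interrupt" is
+     # clicked and INTERRUPTING has been set.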
+     def _progress(generated, to_generate):
+         progress((generated, to_generate))
+         if INTERRUPTING:
+             raise gr.Error("Interrupted.")
+     MODEL.set_custom_progress_callback(_progress)
+
+     outs = _do_predictions(
+         [text], [melody], duration, progress=True,
+         top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
+     return outs[0]
+
+
+ def toggle_audio_src(choice):
+     if choice == "mic":
+         return gr.update(source="microphone", value=None, label="Microphone")
+     else:
+         return gr.update(source="upload", value=None, label="File")
+
+
+ def ui_full(launch_kwargs):
+     with gr.Blocks() as interface:
+         gr.Markdown(
+             """
+             # MusicGen
+             This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
+             a simple and controllable model for music generation
+             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
+             """
+         )
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     text = gr.Text(label="Input Text", interactive=True)
+                     with gr.Column():
+                         radio = gr.Radio(["file", "mic"], value="file",
+                                          label="Condition on a melody (optional) File or Mic")
+                         melody = gr.Audio(source="upload", type="numpy", label="File",
+                                           interactive=True, elem_id="melody-input")
+                 with gr.Row():
+                     submit = gr.Button("Submit")
+                     # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
+                     _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
+                 with gr.Row():
+                     model = gr.Radio(["melody", "medium", "small", "large"],
+                                      label="Model", value="melody", interactive=True)
+                 with gr.Row():
+                     duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
+                 with gr.Row():
+                     topk = gr.Number(label="Top-k", value=250, interactive=True)
+                     topp = gr.Number(label="Top-p", value=0, interactive=True)
+                     temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
+                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
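+                     # Top-k/top-p/temperature control sampling; cfg_coef sets the
+                     # strength of classifier-free guidance toward the text prompt.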
+             with gr.Column():
+                 output = gr.Video(label="Generated Music")
+         submit.click(predict_full,
+                      inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef],
+                      outputs=[output])
+         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
+         gr.Examples(
+             fn=predict_full,
+             examples=[
+                 [
+                     "An 80s driving pop song with heavy drums and synth pads in the background",
+                     "./assets/bach.mp3",
+                     "melody"
+                 ],
+                 [
+                     "A cheerful country song with acoustic guitars",
+                     "./assets/bolero_ravel.mp3",
+                     "melody"
+                 ],
+                 [
+                     "90s rock song with electric guitar and heavy drums",
+                     None,
+                     "medium"
+                 ],
+                 [
+                     "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
+                     "./assets/bach.mp3",
+                     "melody"
+                 ],
+                 [
+                     "lofi slow bpm electro chill with organic samples",
+                     None,
+                     "medium",
+                 ],
+             ],
+             inputs=[text, melody, model],
+             outputs=[output]
+         )
+         gr.Markdown(
+             """
+             ### More details
+             The model will generate a short music extract based on the description you provided.
+             The model can generate up to 30 seconds of audio in one pass. It is now possible
+             to extend the generation by feeding back the end of the previous chunk of audio.
+             This can take a long time, and the model might lose consistency. The model might also
+             decide at arbitrary positions that the song ends.
+             **WARNING:** Choosing long durations will take a long time to generate (2 min might take ~10 min).
+             An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
+             are generated each time.
+             We present 4 model variations:
+             1. Melody -- a music generation model capable of generating music conditioned
+                on text and melody inputs. **Note**, you can also use text only.
+             2. Small -- a 300M transformer decoder conditioned on text only.
+             3. Medium -- a 1.5B transformer decoder conditioned on text only.
+             4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences).
+             When using `melody`, you can optionally provide a reference audio from
+             which a broad melody will be extracted. The model will then try to follow both
+             the description and melody provided.
+             You can also use your own GPU or a Google Colab by following the instructions on our repo.
+             See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+             for more details.
+             """
+         )
+
+     interface.queue().launch(**launch_kwargs)
+
+
+ def ui_batched(launch_kwargs):
+     with gr.Blocks() as demo:
+         gr.Markdown(
+             """
+             # MusicGen
+             This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
+             a simple and controllable model for music generation
+             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
+             <br/>
+             <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
+                 style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+                 <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+                     src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+             for longer sequences, more control and no queue.
+             """
+         )
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     text = gr.Text(label="Describe your music", lines=2, interactive=True)
+                     with gr.Column():
+                         radio = gr.Radio(["file", "mic"], value="file",
+                                          label="Condition on a melody (optional) File or Mic")
+                         melody = gr.Audio(source="upload", type="numpy", label="File",
+                                           interactive=True, elem_id="melody-input")
+                 with gr.Row():
+                     submit = gr.Button("Generate")
+             with gr.Column():
+                 output = gr.Video(label="Generated Music")
+         submit.click(predict_batched, inputs=[text, melody],
+                      outputs=[output], batch=True, max_batch_size=MAX_BATCH_SIZE)
+         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
+         gr.Examples(
+             fn=predict_batched,
+             examples=[
+                 [
+                     "An 80s driving pop song with heavy drums and synth pads in the background",
+                     "./assets/bach.mp3",
+                 ],
+                 [
+                     "A cheerful country song with acoustic guitars",
+                     "./assets/bolero_ravel.mp3",
+                 ],
+                 [
+                     "90s rock song with electric guitar and heavy drums",
+                     None,
+                 ],
+                 [
+                     "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
+                     "./assets/bach.mp3",
+                 ],
+                 [
+                     "lofi slow bpm electro chill with organic samples",
+                     None,
+                 ],
+             ],
+             inputs=[text, melody],
+             outputs=[output]
+         )
+         gr.Markdown("""
+         ### More details
+         The model will generate 15 seconds of audio based on the description you provided.
+         You can optionally provide a reference audio from which a broad melody will be extracted.
+         The model will then try to follow both the description and melody provided.
+         All samples are generated with the `melody` model.
+         You can also use your own GPU or a Google Colab by following the instructions on our repo.
+         See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+         for more details.
+         """)
+
+     demo.queue(max_size=8 * 4).launch(**launch_kwargs)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         '--listen',
+         type=str,
+         default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
+         help='IP to listen on for connections to Gradio',
+     )
+     parser.add_argument(
+         '--username', type=str, default='', help='Username for authentication'
+     )
+     parser.add_argument(
+         '--password', type=str, default='', help='Password for authentication'
+     )
+     parser.add_argument(
+         '--server_port',
+         type=int,
+         default=0,
+         help='Port to run the server listener on',
+     )
+     parser.add_argument(
+         '--inbrowser', action='store_true', help='Open in browser'
+     )
+     parser.add_argument(
+         '--share', action='store_true', help='Share the gradio UI'
+     )
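+     # Example invocation using the flags defined above:
+     #   python app.py --listen 127.0.0.1 --server_port 7860 --share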
+
+     args = parser.parse_args()
+
+     launch_kwargs = {}
+     launch_kwargs['server_name'] = args.listen
+
+     if args.username and args.password:
+         launch_kwargs['auth'] = (args.username, args.password)
+     if args.server_port:
+         launch_kwargs['server_port'] = args.server_port
+     if args.inbrowser:
+         launch_kwargs['inbrowser'] = args.inbrowser
+     if args.share:
+         launch_kwargs['share'] = args.share
+
+     # Show the interface.
+     if IS_BATCHED:
+         ui_batched(launch_kwargs)
+     else:
+         ui_full(launch_kwargs)