File size: 11,359 Bytes
3e4e311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd93602
 
3e4e311
 
 
 
 
 
 
 
 
 
49f36fd
 
 
 
 
 
 
3e4e311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49f36fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e4e311
 
f06459f
 
 
 
 
 
 
3e4e311
54bd1e6
 
6366f6f
 
b95dc71
6366f6f
54bd1e6
 
 
 
 
1aa5fa0
6366f6f
 
54bd1e6
 
 
6366f6f
1aa5fa0
6366f6f
 
54bd1e6
 
 
 
e17ae6e
54bd1e6
 
 
6366f6f
54bd1e6
 
 
 
 
 
6366f6f
 
 
 
dd93602
 
 
 
 
 
 
 
6366f6f
 
54bd1e6
6366f6f
54bd1e6
6366f6f
54bd1e6
 
3e4e311
f06459f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import gradio as gr
from musiclang_predict import MusicLangPredictor
import random
import subprocess
import os
import torchaudio
import torch
import numpy as np
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
from pydub import AudioSegment

# Utility Functions
def peak_normalize(y, target_peak=0.97):
    """Scale ``y`` so that its largest absolute sample equals ``target_peak``.

    Args:
        y: NumPy array of audio samples.
        target_peak: Absolute amplitude the loudest sample should have
            after scaling.

    Returns:
        The rescaled array. Empty or all-zero (silent) input has no peak to
        scale against, so an unscaled copy is returned instead of dividing
        by zero and producing NaNs.
    """
    peak = np.max(np.abs(y)) if np.size(y) else 0.0
    if peak == 0:
        # Nothing to normalise against; avoid 0/0 -> NaN.
        return np.array(y, copy=True)
    return target_peak * (y / peak)

def rms_normalize(y, target_rms=0.05):
    """Scale ``y`` so that its root-mean-square level equals ``target_rms``.

    Args:
        y: NumPy array of audio samples.
        target_rms: RMS level the output should have.

    Returns:
        The rescaled array. Empty or all-zero (silent) input has an RMS of
        zero, so an unscaled copy is returned instead of multiplying by
        ``inf`` and producing NaNs.
    """
    rms = np.sqrt(np.mean(y**2)) if np.size(y) else 0.0
    if rms == 0:
        # Silent signal: any gain leaves it silent, and target/0 is undefined.
        return np.array(y, copy=True)
    return y * (target_rms / rms)

def preprocess_audio(waveform):
    """Return ``waveform`` on the module-level ``device`` with a new leading axis.

    The tensor is moved to the CPU, stripped of all size-1 dimensions,
    round-tripped through NumPy, given a new leading dimension and finally
    transferred to ``device``. No loudness normalisation is applied here at
    present.

    Args:
        waveform: Torch tensor holding the audio samples.

    Returns:
        A torch tensor on ``device`` whose first dimension has size 1.
    """
    # NumPy can only view host memory, so bring the samples to the CPU first.
    host_samples = waveform.cpu().squeeze().numpy()
    restored = torch.from_numpy(host_samples)
    batched = restored.unsqueeze(0)
    return batched.to(device)

def create_slices(song, sr, slice_duration, bpm, num_slices=5):
    """Cut ``num_slices`` excerpts of ``slice_duration`` seconds out of ``song``.

    The first excerpt always starts at sample 0. Each further excerpt starts
    at a randomly chosen offset of ``slice_duration`` seconds plus a whole
    number of 4-beat bars at ``bpm``. An excerpt that would run past the end
    of the song wraps around to the beginning. If the song is too short to
    offer any such offset, the first excerpt is duplicated instead.

    Args:
        song: Torch tensor whose last dimension is time in samples; any
            leading dimensions (e.g. channels) are preserved.
        sr: Sample rate of ``song`` in Hz.
        slice_duration: Length of each excerpt in seconds.
        bpm: Tempo used to place candidate start offsets on bar boundaries.
        num_slices: Total number of excerpts to return, the first included.

    Returns:
        A list of ``num_slices`` tensors.
    """
    total_samples = song.shape[-1]
    slice_samples = int(slice_duration * sr)
    # Length of one 4-beat bar in samples; at least 1 so range() gets a valid step.
    bar_samples = max(1, int(4 * 60 / bpm * sr))

    # Ensure the first slice is from the beginning of the song
    first_slice_waveform = song[..., :slice_samples]
    slices = [first_slice_waveform]

    # The candidate offsets do not change between iterations, so build them once.
    candidate_starts = range(slice_samples, total_samples, bar_samples)

    for _ in range(1, num_slices):
        if not candidate_starts:
            # If there are no valid start indices, duplicate the first slice
            slices.append(first_slice_waveform)
            continue

        start = random.choice(candidate_starts)
        end = start + slice_samples

        if end > total_samples:
            # Wrap around to the beginning of the song
            overflow = end - total_samples
            slice_waveform = torch.cat([song[..., start:], song[..., :overflow]], dim=-1)
        else:
            slice_waveform = song[..., start:end]

        # Measure along the time axis. len() of the squeezed tensor would be
        # the channel count for multi-channel audio, not the sample count.
        missing = slice_samples - slice_waveform.shape[-1]
        if missing > 0:
            slice_waveform = torch.cat([slice_waveform, song[..., :missing]], dim=-1)

        slices.append(slice_waveform)

    return slices

def calculate_duration(bpm, min_duration=29, max_duration=30):
    """Return a duration in seconds that is a whole number of 4-beat bars.

    The bar count is raised until the duration reaches ``min_duration`` and
    then lowered again while it exceeds ``max_duration`` (never below one
    bar). When no whole number of bars fits between the two bounds, the
    result therefore ends up below ``min_duration`` (e.g. 28.0 s at 60 bpm
    with the defaults).

    Args:
        bpm: Tempo in beats per minute; must be positive.
        min_duration: Preferred lower bound in seconds.
        max_duration: Upper bound in seconds.

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If ``bpm`` is zero or negative. A negative tempo would
            otherwise make the bar-counting loop run forever.
    """
    if bpm <= 0:
        raise ValueError(f"bpm must be positive, got {bpm!r}")

    single_bar_duration = 4 * 60 / bpm
    bars = max(min_duration // single_bar_duration, 1)

    # Floor division can land one bar short of the minimum; top it up.
    while single_bar_duration * bars < min_duration:
        bars += 1

    # Step back if topping up overshot the maximum.
    while single_bar_duration * bars > max_duration and bars > 1:
        bars -= 1

    return single_bar_duration * bars

def generate_music(seed, use_chords, chord_progression, prompt_duration, musicgen_model, num_iterations, bpm):
    """Generate a MIDI sketch, render it to audio and let MusicGen continue it.

    Each attempt runs these steps:
      1. MusicLang predicts a score (from the chord progression when
         ``use_chords`` is set and the progression is non-blank, otherwise
         unconditionally) and writes it to ``output_<seed>.mid``.
      2. ``fluidsynth`` renders the MIDI file with ``font.sf2`` to a WAV file.
      3. Five 35-second slices are cut from the rendered audio.
      4. For every iteration, the first ``prompt_duration`` seconds of a
         slice are fed to ``MusicGen.generate_continuation``.
      5. All continuations are concatenated and exported as an MP3.

    An ``IndexError`` anywhere in an attempt triggers a retry with a fresh
    random seed. Temporary files are removed after every attempt, whether it
    succeeded or not.

    Args:
        seed: Seed as text or number; blank or non-numeric values are
            replaced by a random integer in [1, 10000].
        use_chords: Whether to condition MusicLang on ``chord_progression``.
        chord_progression: Chord string handed to ``predict_chords``.
        prompt_duration: Seconds of each slice used as the MusicGen prompt.
        musicgen_model: Model label; the text before the first space is
            passed to ``MusicGen.get_pretrained``.
        num_iterations: Number of continuations to generate and join.
        bpm: Tempo used for the MIDI export, slicing grid and duration.

    Returns:
        Filename of the exported MP3 (``combined_audio_<seed>.mp3``).

    Raises:
        RuntimeError: If fluidsynth did not produce the WAV file, or if every
            attempt failed with ``IndexError``.
    """
    # Bound the retry loop so a persistent IndexError surfaces instead of
    # spinning (and reloading models) forever.
    max_attempts = 10

    # Blank or non-numeric seeds fall back to a random one.
    try:
        seed = int(seed)
    except (TypeError, ValueError):
        seed = random.randint(1, 10000)

    # The UI may hand over a float; range() below needs an int.
    num_iterations = int(num_iterations)

    nb_tokens = 1024
    temperature = 0.9
    top_p = 1.0

    last_error = None
    for _ in range(max_attempts):
        midi_filename = f"output_{seed}.mid"
        wav_filename = midi_filename.replace(".mid", ".wav")
        # Everything listed here is deleted in the ``finally`` clause below.
        temp_files = [midi_filename, wav_filename]

        try:
            ml = MusicLangPredictor('musiclang/musiclang-v2')

            if use_chords and chord_progression.strip():
                score = ml.predict_chords(
                    chord_progression,
                    time_signature=(4, 4),
                    temperature=temperature,
                    topp=top_p,
                    rng_seed=seed
                )
            else:
                score = ml.predict(
                    nb_tokens=nb_tokens,
                    temperature=temperature,
                    topp=top_p,
                    rng_seed=seed
                )

            score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))

            subprocess.run(["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"])
            if not os.path.exists(wav_filename):
                # Fail with a clear message rather than an opaque loader error.
                raise RuntimeError(
                    f"fluidsynth did not produce {wav_filename}; "
                    "check that fluidsynth is installed and font.sf2 is present"
                )

            # Load the generated audio
            song, sr = torchaudio.load(wav_filename)
            song = song.to(device)

            # Use the user-provided BPM value for duration calculation
            duration = calculate_duration(bpm)

            # Create slices from the song using the user-provided BPM value
            slices = create_slices(song, sr, 35, bpm, num_slices=5)

            # Load the model
            model_name = musicgen_model.split(" ")[0]
            model_continue = MusicGen.get_pretrained(model_name)

            # Setting generation parameters
            model_continue.set_generation_params(
                use_sampling=True,
                top_k=250,
                top_p=0.0,
                temperature=1.0,
                duration=duration,
                cfg_coef=3
            )

            all_audio_files = []

            for i in range(num_iterations):
                slice_idx = i % len(slices)

                print(f"Running iteration {i + 1} using slice {slice_idx}...")

                prompt_waveform = slices[slice_idx][..., :int(prompt_duration * sr)]
                prompt_waveform = preprocess_audio(prompt_waveform)

                output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
                output = output.cpu()  # Move the output tensor back to CPU

                # Make sure the output tensor has at most 2 dimensions
                if len(output.size()) > 2:
                    output = output.squeeze()

                # audio_write appends its own suffix to the stem; use the path
                # it returns instead of guessing the final filename.
                written_path = str(audio_write(
                    f'continue_{i}.wav',
                    output,
                    model_continue.sample_rate,
                    strategy="loudness",
                    loudness_compressor=True,
                ))
                all_audio_files.append(written_path)
                temp_files.append(written_path)

            # Combine all audio files
            combined_audio = AudioSegment.empty()
            for filename in all_audio_files:
                combined_audio += AudioSegment.from_wav(filename)

            combined_audio_filename = f"combined_audio_{seed}.mp3"
            combined_audio.export(combined_audio_filename, format="mp3")

            return combined_audio_filename
        except IndexError as err:
            # Retry with a new random seed if an IndexError is raised
            last_error = err
            seed = random.randint(1, 10000)
        finally:
            # Clean up temporary files, including after failures and retries.
            for path in temp_files:
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass

    raise RuntimeError(
        f"music generation failed with IndexError on all {max_attempts} attempts"
    ) from last_error

# Choose the torch backend used throughout this module: prefer CUDA, then
# Apple's MPS backend, and fall back to the CPU when neither is available.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# Markdown blurbs rendered inside the "more info" accordion of the UI below.
# They are user-facing text, shown verbatim by gr.Markdown.

# Short description of MusicLang, with links to its GitHub repo and HF space.
musiclang_blurb = """

## musiclang

musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.

[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)

[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)

"""

# Short description of MusicGen and its continuation feature, with example links.
musicgen_blurb = """

## musicgen

musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.

[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)

visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.

see also https://youtube.com/@thecollabagepatch 

"""

# Credits for the community fine-tuned MusicGen models offered in the model dropdown.
finetunes_blurb = """

## fine-tuned models

the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.

[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)

[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)

"""

# Create the Gradio interface
# Components are laid out in the order they are constructed inside each
# context manager, so the statement order below defines the page layout.
with gr.Blocks() as iface:
    # Page title, tagline and usage instructions.
    gr.Markdown("# the-slot-machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model continue, semi-randomly, from different sections of the midi track. the slot machine combines em all at the end into something very bizarre. pick a number for the seed between 1 and 10k, or leave it blank to unlock the full rnjesus powers. if you wanna be lame, you can control the chord progression, prompt duration, musicgen model, number of iterations, and BPM.")

    # Collapsed-by-default section holding the three background blurbs.
    with gr.Accordion("more info", open=False):
        gr.Markdown(musiclang_blurb)
        gr.Markdown(musicgen_blurb)
        gr.Markdown(finetunes_blurb)

    with gr.Row():
        # Left column: generation controls.
        with gr.Column():
            seed = gr.Textbox(label="seed (leave blank for random)", value="")
            use_chords = gr.Checkbox(label="control chord progression", value=False)
            chord_progression = gr.Textbox(label="chord progression (e.g., Am CM Dm E7 Am)", visible=True)
            prompt_duration = gr.Dropdown(label="prompt duration (seconds)", choices=list(range(1, 11)), value=7)
            # Each label is "<model id> (<size>)"; generate_music keeps only the
            # text before the first space when loading the model.
            musicgen_models = [
                "thepatch/vanya_ai_dnb_0.1 (small)",
                "thepatch/budots_remix (small)",
                "thepatch/PhonkV2 (small)",
                "thepatch/bleeps-medium (medium)",
                "thepatch/hoenn_lofi (large)"
            ]

            musicgen_model = gr.Dropdown(label="musicGen model", choices=musicgen_models, value=musicgen_models[0])
            num_iterations = gr.Slider(label="number of iterations", minimum=1, maximum=10, step=1, value=3)
            bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=140)
            generate_button = gr.Button("generate music")
        # Right column: player for the generated track.
        with gr.Column():
            output_audio = gr.Audio(label="your track")

    # The inputs list must match the positional parameter order of generate_music.
    generate_button.click(generate_music, inputs=[seed, use_chords, chord_progression, prompt_duration, musicgen_model, num_iterations, bpm], outputs=output_audio)

# Start the web server as soon as this script is executed.
iface.launch()