Spaces:

thepatch
/

zero-gpu-slot-machine

Running on Zero

File size: 16,002 Bytes

b6ff5af
637c678
b6ff5af
 
 
 
 
 
 
 
 
 
637c678
ff54f69
3c1e68c
ed7d0fe
b6ff5af
8bdf8d9
 
b6ff5af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bdf8d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90d8e0f
140426b
8bdf8d9
ee282eb
140426b
 
bd9caa9
717ff8a
8bdf8d9
 
 
 
717ff8a
8bdf8d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717ff8a
8bdf8d9
 
717ff8a
8bdf8d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee282eb
40a916f
58fc3d4
 
 
ed7d0fe
3c1e68c
58fc3d4
 
 
 
 
 
 
 
 
ed7d0fe
3c1e68c
ed7d0fe
 
3c1e68c
 
58fc3d4
ed7d0fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58fc3d4
3c1e68c
ed7d0fe
58fc3d4
 
 
 
 
3c1e68c
 
58fc3d4
ed7d0fe
58fc3d4
ed7d0fe
 
 
 
58fc3d4
 
ed7d0fe
58fc3d4
3c1e68c
 
 
58fc3d4
 
 
3c1e68c
 
b6ff5af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46b3885
 
 
d102a12
46b3885
 
d102a12
46b3885
 
 
 
 
d102a12
46b3885
 
 
99fe505
 
 
46b3885
 
b6ff5af
 
 
 
9ebd8e6
b6ff5af
 
 
 
 
 
ed7d0fe
 
46b3885
b6ff5af
 
140426b
 
5098605
140426b
8bdf8d9
5098605
8bdf8d9
 
140426b
 
b6ff5af
 
 
 
76c951d
4cd41c1
140426b
ed7d0fe
8bdf8d9
3c1e68c
58fc3d4
3c1e68c
b6ff5af
5098605
 
 
58fc3d4
b6ff5af

import gradio as gr
import spaces
from musiclang_predict import MusicLangPredictor
import random
import subprocess
import os
import torchaudio
import torch
import numpy as np
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
from pydub import AudioSegment

import tempfile
from pydub import AudioSegment
import io

# Select the compute device once at import time: the GPU when CUDA is usable,
# otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Utility Functions
def peak_normalize(y, target_peak=0.97):
    """Scale ``y`` so that its largest absolute sample equals ``target_peak``.

    Args:
        y: NumPy array of audio samples.
        target_peak: Desired maximum absolute amplitude after scaling.

    Returns:
        A rescaled array. Empty or all-zero (silent) input has no peak to
        scale from, so an unscaled copy is returned instead of dividing by
        zero and producing NaNs.
    """
    if np.size(y) == 0:
        return np.array(y, copy=True)

    peak = np.max(np.abs(y))
    if peak == 0:
        # Silent signal: nothing to normalize against.
        return np.array(y, copy=True)

    return target_peak * (y / peak)

def rms_normalize(y, target_rms=0.05):
    """Scale ``y`` so that its root-mean-square level equals ``target_rms``.

    Args:
        y: NumPy array of audio samples.
        target_rms: Desired RMS level after scaling.

    Returns:
        A rescaled array. Empty or all-zero (silent) input has an RMS of
        zero, so an unscaled copy is returned instead of dividing by zero
        and producing NaNs.
    """
    if np.size(y) == 0:
        return np.array(y, copy=True)

    rms = np.sqrt(np.mean(y ** 2))
    if rms == 0:
        # Silent signal: nothing to normalize against.
        return np.array(y, copy=True)

    return y * (target_rms / rms)

def preprocess_audio(waveform):
    """Round-trip ``waveform`` through NumPy and hand it back on ``device``.

    The tensor is moved to the CPU, stripped of its size-1 dimensions,
    converted to a NumPy array, turned back into a tensor with a new leading
    dimension and finally placed on the module-level ``device``. No
    loudness normalization is applied.
    """
    # NumPy conversion is only possible for CPU tensors.
    host_samples = waveform.cpu().squeeze().numpy()
    with_leading_dim = torch.from_numpy(host_samples).unsqueeze(0)
    return with_leading_dim.to(device)

def create_slices(song, sr, slice_duration, bpm, num_slices=5):
    """Cut ``num_slices`` equally long excerpts out of ``song``.

    The first excerpt always starts at sample 0. Every further excerpt starts
    at a randomly chosen offset located a whole number of 4-beat bars (at
    ``bpm``) after the end of the first excerpt, and wraps around to the
    beginning of the song when it would run past the end.

    Args:
        song: Tensor whose last dimension is time (samples); any leading
            dimensions (e.g. channels) are preserved.
        sr: Sample rate of ``song`` in Hz.
        slice_duration: Length of each excerpt in seconds.
        bpm: Tempo used to derive the bar length for the start offsets.
        num_slices: Number of excerpts to return.

    Returns:
        A list of ``num_slices`` tensors. When the song is too short to offer
        any start offset after the first excerpt, the first excerpt is
        repeated.
    """
    slice_samples = int(slice_duration * sr)
    # Measure length on the time axis directly; len() of a squeezed
    # multi-channel tensor would report the channel count instead.
    total_samples = song.shape[-1]
    # Guard against a zero step for extremely high tempos.
    bar_samples = max(1, int(4 * 60 / bpm * sr))

    # Ensure the first slice is from the beginning of the song
    first_slice_waveform = song[..., :slice_samples]
    slices = [first_slice_waveform]

    # The candidate offsets do not change between iterations.
    possible_start_indices = range(slice_samples, total_samples, bar_samples)

    for _ in range(1, num_slices):
        if not possible_start_indices:
            # If there are no valid start indices, duplicate the first slice
            slices.append(first_slice_waveform)
            continue

        random_start = random.choice(possible_start_indices)
        slice_end = random_start + slice_samples

        if slice_end > total_samples:
            # Wrap around to the beginning of the song. Every candidate start
            # is >= slice_samples, so the wrapped part is always shorter than
            # the song and the excerpt comes out at exactly slice_samples.
            remaining_samples = slice_end - total_samples
            slice_waveform = torch.cat(
                [song[..., random_start:], song[..., :remaining_samples]],
                dim=-1,
            )
        else:
            slice_waveform = song[..., random_start:slice_end]

        slices.append(slice_waveform)

    return slices

def calculate_duration(bpm, min_duration=29, max_duration=30):
    """Return a generation length in seconds that spans whole 4-beat bars.

    Starting from the number of bars that fit into ``min_duration``, bars are
    added until the length reaches ``min_duration`` and then removed again
    while the length exceeds ``max_duration``. At least one bar is always
    kept, so a single bar longer than ``max_duration`` is returned as is.
    """
    bar_seconds = 4 * 60 / bpm
    bar_count = max(min_duration // bar_seconds, 1)

    # Grow until the lower bound is met.
    while bar_seconds * bar_count < min_duration:
        bar_count += 1

    # Shrink back below the upper bound, never dropping under one bar.
    while bar_count > 1 and bar_seconds * bar_count > max_duration:
        bar_count -= 1

    return bar_seconds * bar_count

@spaces.GPU(duration=60)
def generate_midi(seed, use_chords, chord_progression, bpm):
    """Generate a MIDI score with musiclang and render it to a WAV file.

    Args:
        seed: Seed for the generator. Anything that cannot be converted to an
            integer (blank text, ``None``, non-numeric text) is replaced by a
            random seed between 1 and 10000.
        use_chords: When true and ``chord_progression`` is non-blank, the
            score is generated from that chord progression.
        chord_progression: Chord progression text passed to the predictor.
        bpm: Tempo written into the MIDI file.

    Returns:
        Path of the rendered WAV file (``output_<seed>.wav``), produced by
        fluidsynth with ``font.sf2`` at 44100 Hz.

    Raises:
        subprocess.CalledProcessError: If fluidsynth exits with an error.
        RuntimeError: If fluidsynth finished without producing the WAV file.
    """
    try:
        seed = int(seed)
    except (TypeError, ValueError):
        seed = random.randint(1, 10000)

    ml = MusicLangPredictor('musiclang/musiclang-v2')

    nb_tokens = 1024
    temperature = 0.9
    top_p = 1.0

    if use_chords and chord_progression and chord_progression.strip():
        score = ml.predict_chords(
            chord_progression,
            time_signature=(4, 4),
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )
    else:
        score = ml.predict(
            nb_tokens=nb_tokens,
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )

    midi_filename = f"output_{seed}.mid"
    wav_filename = f"output_{seed}.wav"

    try:
        score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
        # check=True surfaces a failed render instead of returning a path to
        # a file that was never written.
        subprocess.run(
            ["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"],
            check=True,
        )
    finally:
        # Clean up temporary MIDI file, even when rendering failed
        if os.path.exists(midi_filename):
            os.remove(midi_filename)

    if not os.path.exists(wav_filename):
        raise RuntimeError(f"fluidsynth did not produce {wav_filename}")

    return wav_filename

@spaces.GPU(duration=120)
def generate_music(wav_filename, prompt_duration, musicgen_model, num_iterations, bpm):
    """Continue slices of a rendered WAV file with a MusicGen model.

    The input is cut into five 35-second slices; for each iteration the first
    ``prompt_duration`` seconds of one slice are used as the prompt for a
    MusicGen continuation. All continuations are concatenated and exported
    as a single MP3.

    Args:
        wav_filename: Path of the audio file to use as prompt material.
        prompt_duration: Length of the prompt in seconds.
        musicgen_model: Model choice; the text before the first space is the
            model name passed to ``MusicGen.get_pretrained``.
        num_iterations: Number of continuations to generate.
        bpm: Tempo used to choose slice offsets and the generation length.

    Returns:
        Path of the combined MP3 file.

    Raises:
        gr.Error: If no input audio was provided.
    """
    if not wav_filename:
        raise gr.Error("No input audio: generate the MIDI audio first.")

    # Load the audio from the passed file path
    song, sr = torchaudio.load(wav_filename)
    song = song.to(device)
    # Use the user-provided BPM value for duration calculation
    duration = calculate_duration(bpm)

    # Create slices from the song using the user-provided BPM value
    slices = create_slices(song, sr, 35, bpm, num_slices=5)

    # Load the model
    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)

    # Setting generation parameters
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=duration,
        cfg_coef=3
    )

    prompt_samples = int(prompt_duration * sr)
    combined_audio = AudioSegment.empty()

    # Intermediate WAVs live in a private directory so concurrent requests
    # cannot overwrite each other's files, and they are removed even when
    # generation fails part-way through.
    with tempfile.TemporaryDirectory() as work_dir:
        # Slider values may arrive as floats; range() needs an int.
        for i in range(int(num_iterations)):
            slice_idx = i % len(slices)

            print(f"Running iteration {i + 1} using slice {slice_idx}...")

            prompt_waveform = slices[slice_idx][..., :prompt_samples]
            prompt_waveform = preprocess_audio(prompt_waveform)

            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU

            # Make sure the output tensor has at most 2 dimensions
            if output.dim() > 2:
                output = output.squeeze()

            # audio_write appends the file suffix itself and returns the
            # final path, so use that instead of guessing the name.
            written_path = audio_write(
                os.path.join(work_dir, f'continue_{i}'),
                output,
                model_continue.sample_rate,
                strategy="loudness",
                loudness_compressor=True,
            )
            combined_audio += AudioSegment.from_wav(str(written_path))

    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
    # export() hands back an open file object; close it right away.
    combined_audio.export(combined_audio_filename, format="mp3").close()

    return combined_audio_filename

@spaces.GPU(duration=120)
def continue_music(input_audio_path, prompt_duration, musicgen_model, num_iterations, bpm):
    """Extend an audio file by continuing from its tail with MusicGen.

    For every iteration the last ``prompt_duration`` seconds of the current
    audio are used as the prompt; the generated continuation then replaces
    that tail. The result is exported as an MP3.

    Args:
        input_audio_path: Path of the audio file to extend.
        prompt_duration: Length of the prompt in seconds.
        musicgen_model: Model choice; the text before the first space is the
            model name passed to ``MusicGen.get_pretrained``.
        num_iterations: Number of continuation rounds.
        bpm: Tempo used to derive the generation length.

    Returns:
        Path of the exported MP3 file.

    Raises:
        gr.Error: If no input audio was provided.
        ValueError: If the current audio is shorter than ``prompt_duration``.
    """
    if not input_audio_path:
        raise gr.Error("No input audio: generate or upload music first.")

    # Load the model and set generation parameters
    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=calculate_duration(bpm),
        cfg_coef=3
    )

    # from_file lets the decoder detect the container, so the input is not
    # required to be an MP3.
    current_audio = AudioSegment.from_file(input_audio_path)
    prompt_ms = int(prompt_duration * 1000)

    # Intermediate WAVs live in a private directory so concurrent requests
    # cannot overwrite each other's files, and they are removed even when
    # generation fails part-way through.
    with tempfile.TemporaryDirectory() as work_dir:
        # Slider values may arrive as floats; range() needs an int.
        for i in range(int(num_iterations)):
            # len() of an AudioSegment is its duration in milliseconds.
            if len(current_audio) < prompt_ms:
                raise ValueError("The prompt_duration is longer than the current audio length.")

            # Take the prompt from the end of the current audio
            start_time = len(current_audio) - prompt_ms
            prompt_audio = current_audio[start_time:]

            # Convert the prompt audio to a PyTorch tensor via an in-memory
            # WAV, keeping the prompt's own sample rate: it can differ from
            # the input file's rate once generated audio has been appended.
            prompt_buffer = io.BytesIO()
            prompt_audio.export(prompt_buffer, format="wav")
            prompt_buffer.seek(0)
            prompt_waveform, prompt_sr = torchaudio.load(prompt_buffer)

            # Prepare the audio slice for generation (moves it to the device)
            prompt_waveform = preprocess_audio(prompt_waveform)

            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=prompt_sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU

            if output.dim() > 2:
                output = output.squeeze()

            # audio_write appends the file suffix itself and returns the
            # final path, so use that instead of guessing the name.
            written_path = audio_write(
                os.path.join(work_dir, f'continue_{i}'),
                output,
                model_continue.sample_rate,
                strategy="loudness",
                loudness_compressor=True,
            )
            generated_audio_segment = AudioSegment.from_wav(str(written_path))

            # Replace the prompt portion with the generated audio
            current_audio = current_audio[:start_time] + generated_audio_segment

    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
    # export() hands back an open file object; close it right away.
    current_audio.export(combined_audio_filename, format="mp3").close()

    return combined_audio_filename



# Define the expandable sections
# Markdown text describing musiclang, with links to its GitHub repository and
# its Hugging Face Space. Rendered inside the "more info" accordion of the UI.
musiclang_blurb = """

## musiclang

musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.

[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)

[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)

"""

# Markdown text describing musicgen and its continuation feature, with links
# to the audiocraft repository and example recordings. Rendered inside the
# "more info" accordion of the UI.
musicgen_blurb = """

## musicgen

musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.

[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)

visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.

see also https://youtube.com/@thecollabagepatch 

"""

# Markdown text crediting the community behind the fine-tuned models, with
# links to the Discord server and a fine-tuning Colab notebook. Rendered
# inside the "more info" accordion of the UI.
finetunes_blurb = """

## fine-tuned models

the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.

[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)

[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)

"""

# Define the fine-tunes blurb for each model
# Markdown text with one "##" section per fine-tuned model, naming who
# trained it and describing how it tends to behave. Rendered inside the
# "fine-tunes info" accordion of the UI.
fine_tunes_info = """

## thepatch/vanya_ai_dnb_0.1

thepatch/vanya_ai_dnb_0.1 was trained by vanya. [vanya's Twitter](https://twitter.com/@veryVANYA) 🔗 - it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)



## thepatch/bleeps-medium

thepatch/bleeps-medium was trained by kevin and lyra [lyra's Twitter](https://twitter.com/@_lyraaaa_) 🔗 - it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.



## thepatch/budots_remix

thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.



## thepatch/hoenn_lofi

thepatch/hoenn_lofi is a large fine-tune by hoenn. [hoenn's Twitter](https://twitter.com/@eschatolocation) 🔗 - this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.



## thepatch/PhonkV2

thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.



## foureyednymph/musicgen-sza-sos-small

foureyednymph/musicgen-sza-sos-small was just trained by foureyednymph. We're all about to find out if it does continuations well.

"""

# Build the Gradio UI: intro text, two info accordions, a MIDI column and a
# MusicGen column, then wire the three buttons to their handlers.
_MODEL_CHOICES = [
    "thepatch/vanya_ai_dnb_0.1 (small)",
    "thepatch/budots_remix (small)",
    "thepatch/PhonkV2 (small)",
    "thepatch/bleeps-medium (medium)",
    "thepatch/hoenn_lofi (large)",
    "foureyednymph/musicgen-sza-sos-small (small)",
]

with gr.Blocks() as iface:
    # Page title and usage notes.
    for _intro_text in (
        "# the-slot-machine",
        "two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.",
        "this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model. trim it so that you like the beginning of the output, and choose the prompt duration. Then we give it to musicgen to continue for 30 seconds. We can then choose a new model and prompt duration, trim it, and give it to musicgen to continue from the end of the output. Re-upload, trim again and repeat with a new musicgen model and different prompt duration if you want. ",
    ):
        gr.Markdown(_intro_text)

    # Collapsed background information.
    with gr.Accordion("more info", open=False):
        for _blurb in (musiclang_blurb, musicgen_blurb, finetunes_blurb):
            gr.Markdown(_blurb)

    with gr.Accordion("fine-tunes info", open=False):
        gr.Markdown(fine_tunes_info)

    with gr.Row():
        # Left column: MIDI generation controls and the rendered result.
        with gr.Column():
            seed = gr.Textbox(
                label="Seed (leave blank for random)",
                value="",
            )
            use_chords = gr.Checkbox(
                label="Control Chord Progression",
                value=False,
            )
            chord_progression = gr.Textbox(
                label="Chord Progression (e.g., Am CM Dm E7 Am)",
                visible=True,
            )
            bpm = gr.Slider(
                label="BPM",
                minimum=60,
                maximum=200,
                step=1,
                value=120,
            )
            generate_midi_button = gr.Button("Generate MIDI")
            # The handlers exchange audio as file paths.
            midi_audio = gr.Audio(
                label="Generated MIDI Audio",
                type="filepath",
            )

        # Right column: MusicGen controls and both audio outputs.
        with gr.Column():
            prompt_duration = gr.Dropdown(
                label="Prompt Duration (seconds)",
                choices=[*range(1, 11)],
                value=5,
            )
            musicgen_model = gr.Dropdown(
                label="MusicGen Model",
                choices=_MODEL_CHOICES,
                value=_MODEL_CHOICES[0],
            )
            num_iterations = gr.Slider(
                label="this does nothing rn",
                minimum=1,
                maximum=1,
                step=1,
                value=1,
            )
            generate_music_button = gr.Button("Generate Music")
            output_audio = gr.Audio(
                label="Generated Music",
                type="filepath",
            )
            continue_button = gr.Button("Continue Generating Music")
            continue_output_audio = gr.Audio(
                label="Continued Music Output",
                type="filepath",
            )

    # Event wiring. Both MusicGen handlers take the same trailing controls
    # and differ only in which audio component feeds them.
    _musicgen_controls = [prompt_duration, musicgen_model, num_iterations, bpm]

    generate_midi_button.click(
        generate_midi,
        inputs=[seed, use_chords, chord_progression, bpm],
        outputs=[midi_audio],
    )
    generate_music_button.click(
        generate_music,
        inputs=[midi_audio, *_musicgen_controls],
        outputs=[output_audio],
    )
    continue_button.click(
        continue_music,
        inputs=[output_audio, *_musicgen_controls],
        outputs=continue_output_audio,
    )

iface.launch()