Spaces:

thepatch
/

zero-gpu-slot-machine

Running on Zero

App Files Files Community

zero-gpu-slot-machine / app.py

thecollabagepatch

gary on gary

3c1e68c 7 months ago

raw

history blame

13.9 kB

	import gradio as gr
	from musiclang_predict import MusicLangPredictor
	import random
	import subprocess
	import os
	import torchaudio
	import torch
	import numpy as np
	from audiocraft.models import MusicGen
	from audiocraft.data.audio import audio_write
	from pydub import AudioSegment
	import spaces
	import tempfile
	from pydub import AudioSegment

	# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	# Utility Functions
	def peak_normalize(y, target_peak=0.97):
	    """Scale ``y`` so that its largest absolute sample equals ``target_peak``.

	    Args:
	        y: NumPy array of audio samples.
	        target_peak: Absolute peak value of the returned signal.

	    Returns:
	        The rescaled array. Empty or all-zero (silent) input is returned
	        unchanged, because there is no peak to scale from and dividing by
	        it would only produce NaNs.
	    """
	    if np.size(y) == 0:
	        return y
	    peak = np.max(np.abs(y))
	    if peak == 0:
	        return y
	    return target_peak * (y / peak)

	def rms_normalize(y, target_rms=0.05):
	    """Scale ``y`` so that its root-mean-square level equals ``target_rms``.

	    Args:
	        y: NumPy array of audio samples.
	        target_rms: RMS level of the returned signal.

	    Returns:
	        The rescaled array. Empty or all-zero (silent) input is returned
	        unchanged, because its RMS is zero and cannot be scaled.
	    """
	    if np.size(y) == 0:
	        return y
	    rms = np.sqrt(np.mean(y ** 2))
	    if rms == 0:
	        return y
	    return y * (target_rms / rms)

	def preprocess_audio(waveform):
	    """Reshape a prompt waveform and place it on the module-level ``device``.

	    Singleton dimensions are dropped and one new leading dimension is added,
	    so e.g. a ``(1, T)`` tensor stays ``(1, T)`` and a ``(2, T)`` tensor
	    becomes ``(1, 2, T)``. Sample values are not modified; no loudness
	    normalization is applied here.

	    Args:
	        waveform: Torch tensor holding the audio prompt.

	    Returns:
	        The reshaped tensor on ``device``.
	    """
	    # The reshaping is done on the CPU with tensor ops directly; going
	    # through NumPy added nothing and failed on tensors that track gradients.
	    return waveform.detach().cpu().squeeze().unsqueeze(0).to(device)

	def create_slices(song, sr, slice_duration, bpm, num_slices=5):
	    """Cut ``num_slices`` windows of ``slice_duration`` seconds out of ``song``.

	    The first window always starts at sample 0. Every further window starts
	    at a randomly chosen offset of ``slice_duration`` seconds plus a whole
	    number of 4/4 bars at ``bpm``, and wraps around to the beginning of the
	    song when it would run past the end.

	    Args:
	        song: Tensor whose last dimension indexes samples, e.g.
	            ``(channels, samples)``.
	        sr: Sample rate of ``song`` in Hz.
	        slice_duration: Window length in seconds.
	        bpm: Tempo used to space the candidate start offsets one bar apart.
	        num_slices: Number of windows to return.

	    Returns:
	        A list of ``num_slices`` tensors. When the song is not longer than
	        one window there is no valid random start, and every entry is the
	        first window (which may then be shorter than requested).

	    Raises:
	        ValueError: If ``bpm`` is not positive.
	    """
	    if bpm <= 0:
	        raise ValueError("bpm must be a positive number.")

	    total_samples = song.shape[-1]
	    window_len = int(slice_duration * sr)

	    first_window = song[..., :window_len]
	    windows = [first_window]

	    # One 4/4 bar in samples; clamped to 1 so an extreme BPM cannot yield a
	    # zero range step.
	    bar_len = max(int(4 * 60 / bpm * sr), 1)
	    # Loop-invariant, so it is built once instead of on every iteration.
	    candidate_starts = range(window_len, total_samples, bar_len)

	    for _ in range(1, num_slices):
	        if not candidate_starts:
	            # No valid start offset: duplicate the first window.
	            windows.append(first_window)
	            continue

	        start = random.choice(candidate_starts)
	        end = start + window_len

	        if end <= total_samples:
	            window = song[..., start:end]
	        else:
	            # Wrap around to the beginning of the song. A candidate start
	            # only exists when the song is longer than one window, so a
	            # single wrap always yields exactly ``window_len`` samples for
	            # any channel count.
	            window = torch.cat(
	                [song[..., start:], song[..., :end - total_samples]], dim=-1
	            )

	        windows.append(window)

	    return windows

	def calculate_duration(bpm, min_duration=29, max_duration=30):
	    """Return a generation length in seconds that is a whole number of 4/4 bars.

	    The bar count is first raised until the length reaches ``min_duration``
	    and then lowered again while the length exceeds ``max_duration``, so the
	    result can end up below ``min_duration`` when no bar multiple fits the
	    window. At least one bar is always kept, so the result can exceed
	    ``max_duration`` when a single bar is already longer than that.

	    Args:
	        bpm: Tempo in beats per minute; must be positive.
	        min_duration: Lower target for the length, in seconds.
	        max_duration: Upper target for the length, in seconds.

	    Returns:
	        The duration in seconds as a float.

	    Raises:
	        ValueError: If ``bpm`` is not positive. A negative tempo would
	            otherwise make the bar-counting loop run forever.
	    """
	    if bpm <= 0:
	        raise ValueError("bpm must be a positive number.")

	    single_bar_duration = 4 * 60 / bpm
	    bars = max(min_duration // single_bar_duration, 1)

	    # Round up to the first bar count that reaches min_duration.
	    while single_bar_duration * bars < min_duration:
	        bars += 1

	    duration = single_bar_duration * bars

	    # Back off one bar at a time while the result overshoots max_duration.
	    while duration > max_duration and bars > 1:
	        bars -= 1
	        duration = single_bar_duration * bars

	    return duration

	@spaces.GPU(duration=60)
	def generate_midi(seed, use_chords, chord_progression, bpm):
	    """Generate a MIDI score with MusicLang and render it to a WAV file.

	    Args:
	        seed: Seed for the predictor. A blank, missing or non-numeric value
	            is replaced by a random integer between 1 and 10000.
	        use_chords: When true and ``chord_progression`` is non-blank, the
	            score is predicted from that chord progression.
	        chord_progression: Chord progression text passed to the predictor.
	        bpm: Tempo written into the MIDI file.

	    Returns:
	        Path of the rendered WAV file, ``output_<seed>.wav``.

	    Raises:
	        RuntimeError: If fluidsynth did not produce the WAV file.
	    """
	    # int("") raises ValueError and int(None) raises TypeError, so a single
	    # handler covers every "no usable seed" case.
	    try:
	        seed = int(seed)
	    except (TypeError, ValueError):
	        seed = random.randint(1, 10000)

	    ml = MusicLangPredictor('musiclang/musiclang-v2')

	    nb_tokens = 1024
	    temperature = 0.9
	    top_p = 1.0

	    if use_chords and chord_progression and chord_progression.strip():
	        score = ml.predict_chords(
	            chord_progression,
	            time_signature=(4, 4),
	            temperature=temperature,
	            topp=top_p,
	            rng_seed=seed
	        )
	    else:
	        score = ml.predict(
	            nb_tokens=nb_tokens,
	            temperature=temperature,
	            topp=top_p,
	            rng_seed=seed
	        )

	    midi_filename = f"output_{seed}.mid"
	    wav_filename = f"output_{seed}.wav"

	    try:
	        score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
	        # Render the MIDI to audio at a fixed 44100 Hz sample rate.
	        result = subprocess.run(["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"])
	    finally:
	        # Clean up the temporary MIDI file even when rendering fails.
	        if os.path.exists(midi_filename):
	            os.remove(midi_filename)

	    if not os.path.isfile(wav_filename):
	        raise RuntimeError(
	            f"fluidsynth did not produce {wav_filename} (exit code {result.returncode})."
	        )

	    return wav_filename

	@spaces.GPU(duration=60)
	def generate_music(wav_filename, prompt_duration, musicgen_model, num_iterations, bpm):
	    """Run MusicGen continuations on slices of a WAV file and join the results.

	    The input is cut into five 35-second slices with ``create_slices``.
	    Iteration ``i`` uses the first ``prompt_duration`` seconds of slice
	    ``i % 5`` as the prompt, and all generated clips are concatenated in order.

	    Args:
	        wav_filename: Path of the audio file to take prompts from.
	        prompt_duration: Prompt length in seconds.
	        musicgen_model: Model choice; the text before the first space is
	            used as the pretrained model name.
	        num_iterations: Number of continuations to generate.
	        bpm: Tempo used for slicing and for ``calculate_duration``.

	    Returns:
	        Path of the combined MP3, ``combined_audio_<random>.mp3``.
	    """
	    # Load the audio from the passed file path
	    song, sr = torchaudio.load(wav_filename)
	    song = song.to(device)

	    # Use the user-provided BPM value for duration calculation and slicing
	    duration = calculate_duration(bpm)
	    slices = create_slices(song, sr, 35, bpm, num_slices=5)

	    # Load the model
	    model_name = musicgen_model.split(" ")[0]
	    model_continue = MusicGen.get_pretrained(model_name)

	    # Setting generation parameters
	    model_continue.set_generation_params(
	        use_sampling=True,
	        top_k=250,
	        top_p=0.0,
	        temperature=1.0,
	        duration=duration,
	        cfg_coef=3
	    )

	    prompt_samples = int(prompt_duration * sr)
	    combined_audio = AudioSegment.empty()

	    # Intermediate WAVs live in a private directory so that concurrent
	    # requests cannot overwrite each other's files and nothing is left
	    # behind when an iteration fails.
	    with tempfile.TemporaryDirectory() as work_dir:
	        # int() because a UI slider may deliver the count as a float.
	        for i in range(int(num_iterations)):
	            slice_idx = i % len(slices)

	            print(f"Running iteration {i + 1} using slice {slice_idx}...")

	            prompt_waveform = preprocess_audio(slices[slice_idx][..., :prompt_samples])

	            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
	            output = output.cpu()  # Move the output tensor back to CPU

	            # Make sure the output tensor has at most 2 dimensions
	            if len(output.size()) > 2:
	                output = output.squeeze()

	            # audio_write appends the format suffix to the stem and returns
	            # the path it wrote, so that path is used instead of a guess.
	            written_path = audio_write(
	                os.path.join(work_dir, f'continue_{i}'),
	                output,
	                model_continue.sample_rate,
	                strategy="loudness",
	                loudness_compressor=True,
	            )
	            combined_audio += AudioSegment.from_wav(str(written_path))

	    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
	    combined_audio.export(combined_audio_filename, format="mp3")

	    return combined_audio_filename

	@spaces.GPU(duration=60)
	def continue_music(input_audio_path, prompt_duration, musicgen_model, num_iterations, bpm):
	    """Append MusicGen continuations to an existing audio file.

	    The last ``prompt_duration`` seconds of the input are used as the prompt.
	    Every iteration is prompted with that same tail, and each generated clip
	    is appended after the original audio in order.

	    Args:
	        input_audio_path: Path of the audio file to continue.
	        prompt_duration: Prompt length in seconds.
	        musicgen_model: Model choice; the text before the first space is
	            used as the pretrained model name.
	        num_iterations: Number of continuations to generate.
	        bpm: Tempo passed to ``calculate_duration`` for the clip length.

	    Returns:
	        Path of the combined MP3, ``combined_audio_<random>.mp3``.

	    Raises:
	        ValueError: If the input is shorter than ``prompt_duration``.
	    """
	    # Load the audio from the given file path
	    song, sr = torchaudio.load(input_audio_path)
	    song = song.to(device)

	    # Take the prompt from the end of the song based on prompt_duration
	    num_samples = int(prompt_duration * sr)
	    if song.shape[-1] < num_samples:
	        raise ValueError("The prompt_duration is longer than the audio length.")
	    prompt_waveform = preprocess_audio(song[..., song.shape[-1] - num_samples:])

	    # Load the model and set generation parameters
	    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
	    model_continue.set_generation_params(
	        use_sampling=True,
	        top_k=250,
	        top_p=0.0,
	        temperature=1.0,
	        duration=calculate_duration(bpm),
	        cfg_coef=3
	    )

	    # Start with the original audio. from_file detects the container, so the
	    # input does not have to be an MP3.
	    combined_audio = AudioSegment.from_file(input_audio_path)

	    # Intermediate WAVs live in a private directory so that concurrent
	    # requests cannot overwrite each other's files and nothing is left
	    # behind when an iteration fails.
	    with tempfile.TemporaryDirectory() as work_dir:
	        # int() because a UI slider may deliver the count as a float.
	        for i in range(int(num_iterations)):
	            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
	            output = output.cpu()  # Move the output tensor back to CPU
	            if len(output.size()) > 2:
	                output = output.squeeze()

	            # audio_write appends the format suffix to the stem and returns
	            # the path it wrote, so that path is used instead of a guess.
	            written_path = audio_write(
	                os.path.join(work_dir, f'continue_{i}'),
	                output,
	                model_continue.sample_rate,
	                strategy="loudness",
	                loudness_compressor=True,
	            )
	            combined_audio += AudioSegment.from_wav(str(written_path))

	    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
	    combined_audio.export(combined_audio_filename, format="mp3")

	    return combined_audio_filename



	# Markdown blurbs rendered inside the collapsible "more info" accordion of the UI:
	# one each for musiclang, musicgen and the community fine-tuned models.
	musiclang_blurb = """
	## musiclang
	musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.
	[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)
	[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)
	"""

	musicgen_blurb = """
	## musicgen
	musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
	[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
	visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
	see also https://youtube.com/@thecollabagepatch
	"""

	finetunes_blurb = """
	## fine-tuned models
	the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
	[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
	[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
	"""

	# Create the Gradio interface: a header, a collapsible info section, and two
	# columns (MIDI generation on the left, MusicGen controls and outputs on the right).
	with gr.Blocks() as iface:
	    gr.Markdown("# the-slot-machine")
	    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
	    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model continue, semi-randomly, from different sections of the midi track. the slot machine combines em all at the end into something very bizarre. pick a number for the seed between 1 and 10k, or leave it blank to unlock the full rnjesus powers. if you wanna be lame, you can control the chord progression, prompt duration, musicgen model, number of iterations, and BPM.")

	    with gr.Accordion("more info", open=False):
	        gr.Markdown(musiclang_blurb)
	        gr.Markdown(musicgen_blurb)
	        gr.Markdown(finetunes_blurb)

	    with gr.Row():
	        # Left column: inputs for generate_midi and its rendered audio.
	        with gr.Column():
	            seed = gr.Textbox(label="Seed (leave blank for random)", value="")
	            use_chords = gr.Checkbox(label="Control Chord Progression", value=False)
	            chord_progression = gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True)
	            bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=120)
	            generate_midi_button = gr.Button("Generate MIDI")
	            # type="filepath" so the handlers receive a path, not raw samples.
	            midi_audio = gr.Audio(label="Generated MIDI Audio", type="filepath")

	        # Right column: inputs and outputs for generate_music / continue_music.
	        with gr.Column():
	            prompt_duration = gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=5)
	            # Handlers use the text before the first space as the model name.
	            musicgen_model = gr.Dropdown(label="MusicGen Model", choices=[
	                "thepatch/vanya_ai_dnb_0.1 (small)",
	                "thepatch/budots_remix (small)",
	                "thepatch/PhonkV2 (small)",
	                "thepatch/bleeps-medium (medium)",
	                "thepatch/hoenn_lofi (large)"
	            ], value="thepatch/vanya_ai_dnb_0.1 (small)")
	            num_iterations = gr.Slider(label="Number of Iterations", minimum=1, maximum=3, step=1, value=3)
	            generate_music_button = gr.Button("Generate Music")
	            output_audio = gr.Audio(label="Generated Music", type="filepath")
	            continue_button = gr.Button("Continue Generating Music")
	            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")

	    # Connecting the components: each stage feeds the next one's input
	    # (MIDI audio -> generated music -> continued music).
	    generate_midi_button.click(generate_midi, inputs=[seed, use_chords, chord_progression, bpm], outputs=[midi_audio])
	    generate_music_button.click(generate_music, inputs=[midi_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=[output_audio])
	    continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=[continue_output_audio])

	iface.launch()