import gradio as gr
from musiclang_predict import MusicLangPredictor
import random
import subprocess
import os
import torchaudio
import torch
import numpy as np
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
from pydub import AudioSegment
import spaces
import tempfile

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Utility Functions
def peak_normalize(y, target_peak=0.97):
    return target_peak * (y / np.max(np.abs(y)))


def rms_normalize(y, target_rms=0.05):
    return y * (target_rms / np.sqrt(np.mean(y**2)))


def preprocess_audio(waveform):
    waveform_np = waveform.cpu().squeeze().numpy()  # Move to CPU before converting to NumPy
    # processed_waveform_np = rms_normalize(peak_normalize(waveform_np))
    return torch.from_numpy(waveform_np).unsqueeze(0).to(device)


def create_slices(song, sr, slice_duration, bpm, num_slices=5):
    song_length = song.shape[-1] / sr
    slices = []

    # Ensure the first slice is from the beginning of the song
    first_slice_waveform = song[..., :int(slice_duration * sr)]
    slices.append(first_slice_waveform)

    for i in range(1, num_slices):
        # Candidate start points fall on bar boundaries (4 beats per bar at the given bpm)
        possible_start_indices = list(range(int(slice_duration * sr), int(song_length * sr), int(4 * 60 / bpm * sr)))
        if not possible_start_indices:
            # If there are no valid start indices, duplicate the first slice
            slices.append(first_slice_waveform)
            continue

        random_start = random.choice(possible_start_indices)
        slice_end = random_start + int(slice_duration * sr)

        if slice_end > song_length * sr:
            # Wrap around to the beginning of the song
            remaining_samples = int(slice_end - song_length * sr)
            slice_waveform = torch.cat([song[..., random_start:], song[..., :remaining_samples]], dim=-1)
        else:
            slice_waveform = song[..., random_start:slice_end]

        if len(slice_waveform.squeeze()) < int(slice_duration * sr):
            # Pad short slices with audio from the start of the song
            additional_samples_needed = int(slice_duration * sr) - len(slice_waveform.squeeze())
            slice_waveform = torch.cat([slice_waveform, song[..., :additional_samples_needed]], dim=-1)

        slices.append(slice_waveform)

    return slices


def calculate_duration(bpm, min_duration=29, max_duration=30):
    # Snap the generation length to a whole number of 4/4 bars within [min_duration, max_duration]
    single_bar_duration = 4 * 60 / bpm
    bars = max(min_duration // single_bar_duration, 1)

    while single_bar_duration * bars < min_duration:
        bars += 1

    duration = single_bar_duration * bars

    while duration > max_duration and bars > 1:
        bars -= 1
        duration = single_bar_duration * bars

    return duration


@spaces.GPU(duration=60)
def generate_midi(seed, use_chords, chord_progression, bpm):
    if seed == "":
        seed = random.randint(1, 10000)

    ml = MusicLangPredictor('musiclang/musiclang-v2')

    try:
        seed = int(seed)
    except ValueError:
        seed = random.randint(1, 10000)

    nb_tokens = 1024
    temperature = 0.9
    top_p = 1.0

    if use_chords and chord_progression.strip():
        score = ml.predict_chords(
            chord_progression,
            time_signature=(4, 4),
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )
    else:
        score = ml.predict(
            nb_tokens=nb_tokens,
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )

    midi_filename = f"output_{seed}.mid"
    wav_filename = midi_filename.replace(".mid", ".wav")
    score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))

    # Render the MIDI to audio with fluidsynth (expects the font.sf2 soundfont next to this script)
    subprocess.run(["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"])

    # Clean up temporary MIDI file
    os.remove(midi_filename)

    sample_rate = 44100  # Assuming fixed sample rate from fluidsynth command
    return wav_filename
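# Worked example (illustrative comment only, nothing here is executed): at 120 bpm a 4/4 bar
# lasts 4 * 60 / 120 = 2.0 seconds, so calculate_duration(120) rounds the 29-second minimum up
# to 15 whole bars and returns 30.0 seconds, the longest whole-bar length that still fits the
# default 29-30 second window handed to MusicGen.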
@spaces.GPU(duration=90)
def generate_music(wav_filename, prompt_duration, musicgen_model, num_iterations, bpm):
    # Load the audio from the passed file path
    song, sr = torchaudio.load(wav_filename)
    song = song.to(device)

    # Use the user-provided BPM value for duration calculation
    duration = calculate_duration(bpm)

    # Create slices from the song using the user-provided BPM value
    slices = create_slices(song, sr, 35, bpm, num_slices=5)

    # Load the model (the dropdown label looks like "thepatch/... (small)", so keep only the hub id)
    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)

    # Setting generation parameters
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=duration,
        cfg_coef=3
    )

    all_audio_files = []

    for i in range(num_iterations):
        slice_idx = i % len(slices)
        print(f"Running iteration {i + 1} using slice {slice_idx}...")

        prompt_waveform = slices[slice_idx][..., :int(prompt_duration * sr)]
        prompt_waveform = preprocess_audio(prompt_waveform)

        output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
        output = output.cpu()  # Move the output tensor back to CPU

        # Make sure the output tensor has at most 2 dimensions
        if len(output.size()) > 2:
            output = output.squeeze()

        # audio_write takes a stem name and appends the ".wav" suffix itself
        filename_without_extension = f'continue_{i}'
        audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
        all_audio_files.append(f'{filename_without_extension}.wav')

    # Combine all audio files
    combined_audio = AudioSegment.empty()
    for filename in all_audio_files:
        combined_audio += AudioSegment.from_wav(filename)

    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
    combined_audio.export(combined_audio_filename, format="mp3")

    # Clean up temporary files
    for filename in all_audio_files:
        os.remove(filename)

    return combined_audio_filename
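# Usage sketch (kept as a comment so the Space only launches the UI; assumes fluidsynth,
# font.sf2 and the chosen checkpoint are available):
#
#   midi_wav = generate_midi("", False, "", 120)
#   mix = generate_music(midi_wav, prompt_duration=5,
#                        musicgen_model="thepatch/vanya_ai_dnb_0.1 (small)",
#                        num_iterations=2, bpm=120)
#   longer_mix = continue_music(mix, 5, "thepatch/vanya_ai_dnb_0.1 (small)", 1, 120)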
@spaces.GPU(duration=90)
def continue_music(input_audio_path, prompt_duration, musicgen_model, num_iterations, bpm):
    # Load the audio from the given file path
    song, sr = torchaudio.load(input_audio_path)
    song = song.to(device)

    # Take the prompt slice from the end of the song, based on prompt_duration
    num_samples = int(prompt_duration * sr)
    if song.shape[-1] < num_samples:
        raise ValueError("The prompt_duration is longer than the audio length.")

    start_idx = song.shape[-1] - num_samples
    prompt_waveform = song[..., start_idx:]

    # Prepare the audio slice for generation
    prompt_waveform = preprocess_audio(prompt_waveform)

    # Load the model and set generation parameters
    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=calculate_duration(bpm),
        cfg_coef=3
    )

    original_audio = AudioSegment.from_mp3(input_audio_path)
    all_audio_files = [original_audio]  # Start with the original audio
    file_paths_for_cleanup = []  # Track generated file paths for cleanup

    for i in range(num_iterations):
        output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
        output = output.cpu()  # Move the output tensor back to CPU

        if len(output.size()) > 2:
            output = output.squeeze()

        # audio_write takes a stem name and appends the ".wav" suffix itself
        filename_without_extension = f'continue_{i}'
        audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)

        generated_path = f'{filename_without_extension}.wav'
        new_audio_segment = AudioSegment.from_wav(generated_path)
        all_audio_files.append(new_audio_segment)
        file_paths_for_cleanup.append(generated_path)  # Add to cleanup list

    # Combine all audio segments into one continuous piece
    combined_audio = sum(all_audio_files)
    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
    combined_audio.export(combined_audio_filename, format="mp3")

    # Clean up temporary files using the list of file paths
    for file_path in file_paths_for_cleanup:
        os.remove(file_path)

    return combined_audio_filename


# Define the expandable sections
musiclang_blurb = """
## musiclang
musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.
[musiclang github](https://github.com/MusicLang/musiclang_predict)
[musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)
"""

musicgen_blurb = """
## musicgen
musicgen is a transformer-based music model that generates audio. it can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
[audiocraft github](https://github.com/facebookresearch/audiocraft)
visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
see also https://youtube.com/@thecollabagepatch
"""

finetunes_blurb = """
## fine-tuned models
the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
[musicgen discord](https://discord.gg/93kX8rGZ)
[fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
"""

# Define the fine-tunes blurb for each model
fine_tunes_info = """
## thepatch/vanya_ai_dnb_0.1
thepatch/vanya_ai_dnb_0.1 was trained by vanya [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@veryVANYA). it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well).

## thepatch/bleeps-medium
thepatch/bleeps-medium was trained by kevin and lyra [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@_lyraaaa_). it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. it can be amazing at turning your chords into pads, and is a good percussionist.

## thepatch/budots_remix
thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.

## thepatch/hoenn_lofi
thepatch/hoenn_lofi is a large fine-tune by hoenn [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@eschatolocation). this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.

## thepatch/PhonkV2
thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
"""
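# Note on the dropdown choices below: each label takes the form "<hub id> (size)"; the generation
# functions keep only the hub id via musicgen_model.split(" ")[0] before calling MusicGen.get_pretrained.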
# Create the Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# the-slot-machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model continue from the beginning of the midi model's generation. then, musicgen can continue from the end of its own output. re-upload, trim and repeat with a different fine-tune and prompt duration for the coolest outputs.")

    with gr.Accordion("more info", open=False):
        gr.Markdown(musiclang_blurb)
        gr.Markdown(musicgen_blurb)
        gr.Markdown(finetunes_blurb)

    with gr.Accordion("fine-tunes info", open=False):
        gr.Markdown(fine_tunes_info)

    with gr.Row():
        with gr.Column():
            seed = gr.Textbox(label="Seed (leave blank for random)", value="")
            use_chords = gr.Checkbox(label="Control Chord Progression", value=False)
            chord_progression = gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True)
            bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=120)
            generate_midi_button = gr.Button("Generate MIDI")
            midi_audio = gr.Audio(label="Generated MIDI Audio", type="filepath")  # Ensure this is set to handle file paths
        with gr.Column():
            prompt_duration = gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=5)
            musicgen_model = gr.Dropdown(label="MusicGen Model", choices=[
                "thepatch/vanya_ai_dnb_0.1 (small)",
                "thepatch/budots_remix (small)",
                "thepatch/PhonkV2 (small)",
                "thepatch/bleeps-medium (medium)",
                "thepatch/hoenn_lofi (large)"
            ], value="thepatch/vanya_ai_dnb_0.1 (small)")
            # num_iterations is required by the click handlers below but was never defined;
            # the range and default here are assumed
            num_iterations = gr.Slider(label="Number of Iterations", minimum=1, maximum=10, step=1, value=3)
            generate_music_button = gr.Button("Generate Music")
            output_audio = gr.Audio(label="Generated Music", type="filepath")
            continue_button = gr.Button("Continue Generating Music")
            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")

    # Connecting the components
    generate_midi_button.click(generate_midi, inputs=[seed, use_chords, chord_progression, bpm], outputs=[midi_audio])
    generate_music_button.click(generate_music, inputs=[midi_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=[output_audio])
    continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=continue_output_audio)

iface.launch()