import tempfile

import gradio as gr
import soundfile as sf
import spaces
import torch
from diffusers import StableAudioPipeline


# Load the StableAudio pipeline with float16 weights and place it on the GPU
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    torch_dtype=torch.float16,
).to("cuda")

# Define the function to generate the sound based on a text prompt
@spaces.GPU
def generate_sound(prompt, negative_prompt, seed, inference_steps, duration, waveforms):
    """Generate audio from a text prompt and return the path of the WAV file.

    Args:
        prompt: Text describing the sound to generate.
        negative_prompt: Text describing what the sound should not contain.
        seed: Seed for the CUDA random generator; coerced to ``int``.
        inference_steps: Forwarded as ``num_inference_steps``; coerced to ``int``.
        duration: Forwarded as ``audio_end_in_s`` (seconds); coerced to ``float``.
        waveforms: Forwarded as ``num_waveforms_per_prompt``; coerced to ``int``.

    Returns:
        Path of a freshly created ``.wav`` file containing the first generated
        waveform.
    """
    # UI sliders and API clients may deliver whole numbers as floats (e.g. 42.0);
    # ``manual_seed`` only accepts integers, so normalise the numeric inputs first.
    seed = int(seed)
    inference_steps = int(inference_steps)
    waveforms = int(waveforms)
    duration = float(duration)

    # Set the seed for reproducibility
    generator = torch.Generator("cuda").manual_seed(seed)

    # Run the audio generation
    audio = pipe(
        prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=inference_steps,
        audio_end_in_s=duration,
        num_waveforms_per_prompt=waveforms,
        generator=generator,
    ).audios

    # NOTE: only the first waveform is kept, even when ``waveforms`` > 1.
    # The transpose presumably turns (channels, frames) into the
    # (frames, channels) layout soundfile writes — TODO confirm against the
    # pipeline's output shape.
    output = audio[0].T.float().cpu().numpy()

    # Write to a unique file per call: a fixed file name would be overwritten by
    # concurrent requests, letting one user receive another user's audio.
    with tempfile.NamedTemporaryFile(
        prefix="generated_sound_", suffix=".wav", delete=False
    ) as tmp:
        output_path = tmp.name
    sf.write(output_path, output, pipe.vae.sampling_rate)

    return output_path

# Input widgets, in the same order as the parameters of generate_sound
_input_widgets = [
    gr.Textbox(
        label="Text Prompt",
        placeholder="Describe the sound you'd like to generate...",
    ),
    gr.Textbox(
        label="Negative Prompt",
        placeholder="Describe what you don't want in the sound...",
    ),
    gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=0),
    gr.Slider(label="Inference Steps", minimum=50, maximum=500, step=10, value=200),
    gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=10.0),
    gr.Slider(label="Number of Waveforms", minimum=1, maximum=5, step=1, value=1),
]

# Audio player that receives the file path returned by generate_sound
_output_widget = gr.Audio(label="Generated Sound")

# Assemble the Gradio interface around the generation function
app = gr.Interface(
    fn=generate_sound,
    inputs=_input_widgets,
    outputs=_output_widget,
    title="StableAudio Text-to-Audio Generator",
    description="Generate high-quality audio from text using StableAudio.",
)

# Start serving the app
app.launch()