"""Gradio app that turns a text prompt into audio with the StableAudio pipeline."""

import tempfile

import torch
import soundfile as sf
from diffusers import StableAudioPipeline
import gradio as gr
import spaces

# Load the StableAudio pipeline once, at import time, in half precision and
# move it to the GPU so every request reuses the same weights.
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")


@spaces.GPU
def generate_sound(prompt, negative_prompt, seed, inference_steps, duration, waveforms):
    """Generate audio for a text prompt and return the path of a WAV file.

    Args:
        prompt: Text description of the sound to generate.
        negative_prompt: Text description of what the sound should not contain.
        seed: Seed for the CUDA random generator; coerced to ``int``.
        inference_steps: Value passed as ``num_inference_steps``; coerced to
            ``int``.
        duration: Value passed as ``audio_end_in_s``; coerced to ``float``.
        waveforms: Value passed as ``num_waveforms_per_prompt``; coerced to
            ``int``.

    Returns:
        Path (``str``) of a newly created ``.wav`` file holding the first
        generated waveform.
    """
    # UI sliders may hand numbers over as floats (e.g. 200.0); the generator
    # seed and the step/waveform counts must be integers.
    generator = torch.Generator("cuda").manual_seed(int(seed))

    audio = pipe(
        prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=int(inference_steps),
        audio_end_in_s=float(duration),
        num_waveforms_per_prompt=int(waveforms),
        generator=generator,
    ).audios

    # NOTE(review): only the first waveform is returned, even when more than
    # one is requested via `waveforms`; the rest are discarded.
    # The transpose presumably converts (channels, samples) into the
    # (frames, channels) layout that soundfile.write expects -- confirm.
    output = audio[0].T.float().cpu().numpy()

    # Write to a unique temporary file rather than a fixed name in the working
    # directory, so concurrent requests cannot overwrite each other's result.
    # delete=False keeps the file on disk after the handle is closed so the
    # caller can still read it; nothing here removes it afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, output, pipe.vae.sampling_rate)
    return output_path


# Define the Gradio interface; the input order matches the positional
# parameters of generate_sound.
app = gr.Interface(
    fn=generate_sound,
    inputs=[
        gr.Textbox(label="Text Prompt", placeholder="Describe the sound you'd like to generate..."),
        gr.Textbox(label="Negative Prompt", placeholder="Describe what you don't want in the sound..."),
        gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=0),
        gr.Slider(label="Inference Steps", minimum=50, maximum=500, step=10, value=200),
        gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=10.0),
        gr.Slider(label="Number of Waveforms", minimum=1, maximum=5, step=1, value=1),
    ],
    outputs=gr.Audio(label="Generated Sound"),
    title="StableAudio Text-to-Audio Generator",
    description="Generate high-quality audio from text using StableAudio.",
)

# Launched at module level, as in the original script.
app.launch()