ZeyuXie committed
Commit cb0c99a
1 Parent(s): db42a2b

Upload 5 files

Files changed (4):
  1. app.py +4 -2
  2. inference.py +18 -7
  3. llm_preprocess.py +1 -1
  4. requirements.txt +29 -29
app.py CHANGED
@@ -52,13 +52,15 @@ with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("## PicoAudio")
     with gr.Row():
-        description_text = f"18 events: {', '.join(event_list)}"
+        description_text = f"Support 18 events: {', '.join(event_list)}"
         gr.Markdown(description_text)
 
     with gr.Row():
         gr.Markdown("## Step1")
     with gr.Row():
-        preprocess_description_text = "preprocess: free-text to timestamp caption via LLM"
+        preprocess_description_text = "Preprocess: free-text to timestamp caption via LLM. " + \
+            "This demo uses Gemini as the preprocessor. If any errors occur, please try a few more times. " + \
+            "We also provide the GPT version consistent with the paper in 'llm_preprocess.py'; set your own api_key there and run it for local inference."
         gr.Markdown(preprocess_description_text)
     with gr.Row():
         with gr.Column():
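
As a usage note for the two-step flow the new description advertises, here is a minimal sketch of how Step 1 could be wired into a Gradio UI. It is an illustration under assumptions: the handler run_step1 and the component labels are hypothetical; only preprocess_gemini comes from this repo's llm_preprocess.py.

import gradio as gr
from llm_preprocess import preprocess_gemini  # Gemini-based preprocessor from this repo

def run_step1(free_text):
    # Step 1: turn a free-text caption into a timestamp caption via the LLM.
    return preprocess_gemini(free_text)

with gr.Blocks() as demo:
    free_text = gr.Textbox(label="Free-text caption")
    timestamp = gr.Textbox(label="Timestamp caption")
    gr.Button("Preprocess").click(run_step1, inputs=free_text, outputs=timestamp)

demo.launch()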
inference.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 import torch
 from diffusers import DDPMScheduler
 from pico_model import PicoDiffusion, build_pretrained_models
-
+from llm_preprocess import get_event, preprocess_gemini, preprocess_gpt
 class dotdict(dict):
     """dot.notation access to dictionary attributes"""
     __getattr__ = dict.get
@@ -19,8 +19,14 @@ class dotdict(dict):
 def parse_args():
     parser = argparse.ArgumentParser(description="Inference for text to audio generation task.")
     parser.add_argument(
-        "--text", '-t', type=str, default="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",
-        help="Path for experiment."
+        "--text", '-t', type=str, default="spraying two times then gunshot three times.",
+        help="Free-text caption."
+    )
+    parser.add_argument(
+        "--timestamp_caption", '-c', type=str,
+        default=None,
+        # e.g. "spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."
+        help="Timestamp caption, formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'."
     )
     parser.add_argument(
         "--exp_path", '-exp', type=str, default="/hpc_stor03/sjtu_home/zeyu.xie/workspace/controllable_audio_generation/huggingface/ckpts/pico_model",
@@ -43,7 +49,7 @@ def parse_args():
 def main():
     args = parse_args()
     train_args = dotdict(json.loads(open(args.original_args).readlines()[0]))
-
+
     seed = args.seed
     random.seed(seed)
     np.random.seed(seed)
@@ -52,6 +58,11 @@ def main():
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
+    # Step 1: preprocess the free-text caption into a timestamp caption via the LLM.
+    if args.timestamp_caption is None:
+        #args.timestamp_caption = preprocess_gpt(args.text)
+        args.timestamp_caption = preprocess_gemini(args.text)
+
     # Load Models #
     print("------Load model")
     name = "audioldm-s-full"
@@ -74,11 +85,11 @@ def main():
 
     print("------Diffusion begin!")
     with torch.no_grad():
-        latents = model.demo_inference(args.text, scheduler, num_steps, guidance, num_samples, disable_progress=True)
+        latents = model.demo_inference(args.timestamp_caption, scheduler, num_steps, guidance, num_samples, disable_progress=True)
         mel = vae.decode_first_stage(latents)
         wave = vae.decode_to_waveform(mel)
-        sf.write(f"{output_dir}/{args.text}.wav", wave[0][:audio_len], samplerate=16000, subtype='PCM_16')
-        print(f"------Write to files to {output_dir}/{args.text}.wav")
+        sf.write(f"{output_dir}/{args.timestamp_caption}.wav", wave[0][:audio_len], samplerate=16000, subtype='PCM_16')
+        print(f"------Wrote output to {output_dir}/{args.timestamp_caption}.wav")
 
 if __name__ == "__main__":
     main()
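
The --timestamp_caption help string above fixes the grammar the model consumes: events joined by " and ", each followed by "at" and underscore-separated onset-offset pairs. A hypothetical helper (not part of the repo) that parses this format, shown only to make it concrete:

def parse_timestamp_caption(caption):
    # "spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729."
    # -> {"spraying": [(0.38, 1.176), (3.06, 3.856)], "gunshot": [(1.729, 3.729)]}
    events = {}
    for clause in caption.rstrip(".").split(" and "):
        name, _, spans = clause.partition(" at ")
        events[name] = [tuple(map(float, span.split("-"))) for span in spans.split("_")]
    return events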
llm_preprocess.py CHANGED
@@ -85,7 +85,7 @@ def preprocess_gemini(free_text_caption):
 def preprocess_gpt(free_text_caption):
     preffix_prompt = get_prompt()
     from openai import OpenAI
-    client = OpenAI(api_key="sk-[REDACTED]")
+    client = OpenAI(api_key="")
     completion_start = client.chat.completions.create(
         model="gpt-4-1106-preview",
         messages=[{
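
The fix above removes the hard-coded key; a safer pattern is to load it from the environment. A minimal sketch, assuming the conventional OPENAI_API_KEY variable (which the openai client also reads automatically when api_key is omitted):

import os
from openai import OpenAI

# Keep the key out of the repository; export OPENAI_API_KEY before running.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])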
requirements.txt CHANGED
@@ -1,30 +1,30 @@
-torch==2.0.1
-torchaudio==2.0.2
-torchvision==0.15.2
-transformers==4.37.2
-accelerate==0.26.1
-datasets==2.16.1
-diffusers==0.18.2
-einops==0.7.0
-h5py==3.10.0
-huggingface_hub==0.20.3
-importlib_metadata==7.0.1
-librosa==0.10.1
-matplotlib==3.8.2
-numpy==1.23.5
-omegaconf==2.0.6
-packaging==23.2
-pandas==2.2.0
-progressbar33==2.4
-protobuf==3.20.*
-resampy==0.4.2
-scikit_image==0.22.0
-scikit_learn==1.4.0
-scipy==1.12.0
-soundfile==0.12.1
-ssr_eval==0.0.7
-torchlibrosa==0.1.0
-tqdm==4.63.1
-laion-clap==1.1.4
-gradio
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
+transformers==4.37.2
+accelerate==0.26.1
+datasets==2.16.1
+diffusers==0.18.2
+einops==0.7.0
+h5py==3.10.0
+huggingface_hub==0.20.3
+importlib_metadata==7.0.1
+librosa==0.10.1
+matplotlib==3.8.2
+numpy==1.23.5
+omegaconf==2.0.6
+packaging==23.2
+pandas==2.2.0
+progressbar33==2.4
+protobuf==3.20.*
+resampy==0.4.2
+scikit_image==0.22.0
+scikit_learn==1.4.0
+scipy==1.12.0
+soundfile==0.12.1
+ssr_eval==0.0.7
+torchlibrosa==0.1.0
+tqdm==4.63.1
+laion-clap==1.1.4
+gradio
 google-generativeai