lmzjms committed
Commit 4d54c87
Parent: 8ab0e50

Update audio_foundation_models.py

Files changed (1):
  audio_foundation_models.py  +81 -23
audio_foundation_models.py CHANGED
@@ -42,6 +42,13 @@ from utils.os_utils import move_file
 import scipy.io.wavfile as wavfile
 
 
+def prompts(name, description):
+    def decorator(func):
+        func.name = name
+        func.description = description
+        return func
+
+    return decorator
 
 def initialize_model(config, ckpt, device):
     config = OmegaConf.load(config)
@@ -64,7 +71,7 @@ def initialize_model_inpaint(config, ckpt):
     sampler = DDIMSampler(model)
     return sampler
 def select_best_audio(prompt,wav_list):
-    clap_model = CLAPWrapper('text_to_audio/Make_An_Audio/useful_ckpts/CLAP/CLAP_weights_2022.pth','text_to_audio/Make_An_Audio/useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
+    clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
     text_embeddings = clap_model.get_text_embeddings([prompt])
     score_list = []
     for data in wav_list:
@@ -87,6 +94,11 @@ class T2I:
         self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, tokenizer=self.text_refine_tokenizer, device=self.device)
         self.pipe.to(device)
 
+    @prompts(name="Generate Image From User Input Text",
+             description="useful when you want to generate an image from a user input text and save it to a file. "
+                         "like: generate an image of an object or something, or generate an image that includes some objects. "
+                         "The input to this tool should be a string, representing the text used to generate image. ")
+
     def inference(self, text):
         image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
         refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"]
@@ -103,6 +115,13 @@ class ImageCaptioning:
         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
         self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)
 
+
+    @prompts(name="Remove Something From The Photo",
+             description="useful when you want to remove and object or something from the photo "
+                         "from its description or location. "
+                         "The input to this tool should be a comma separated string of two, "
+                         "representing the image_path and the object need to be removed. ")
+
     def inference(self, image_path):
         inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device)
         out = self.model.generate(**inputs)
@@ -113,9 +132,15 @@ class T2A:
     def __init__(self, device):
         print("Initializing Make-An-Audio to %s" % device)
         self.device = device
-        self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/text-to-audio/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
+        self.sampler = initialize_model('configs/text-to-audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt', device=device)
         self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
 
+    @prompts(name="Generate Audio From User Input Text",
+             description="useful for when you want to generate an audio "
+                         "from a user input text and it saved it to a file."
+                         "The input to this tool should be a string, "
+                         "representing the text used to generate audio.")
+
     def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples = 3, W = 624, H = 80):
         SAMPLE_RATE = 16000
         prng = np.random.RandomState(seed)
@@ -160,8 +185,15 @@ class I2A:
     def __init__(self, device):
         print("Initializing Make-An-Audio-Image to %s" % device)
         self.device = device
-        self.sampler = initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
-        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
+        self.sampler = initialize_model('text_to_audio/Make_An_Audio_img/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio_img/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
+        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio_img/vocoder/logs/bigv16k53w',device=device)
+
+    @prompts(name="Generate Audio From The Image",
+             description="useful for when you want to generate an audio "
+                         "based on an image. "
+                         "The input to this tool should be a string, "
+                         "representing the image_path. ")
+
     def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W = 624, H = 80):
         SAMPLE_RATE = 16000
         n_samples = 1 # only support 1 sample
@@ -205,18 +237,6 @@ class I2A:
         print(f"Processed I2a.run, image_filename: {image}, audio_filename: {audio_filename}")
         return audio_filename
 
-class TTS:
-    def __init__(self, device=None):
-        self.inferencer = TTSInference(device)
-
-    def inference(self, text):
-        global temp_audio_filename
-        inp = {"text": text}
-        out = self.inferencer.infer_once(inp)
-        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        soundfile.write(audio_filename, out, samplerate = 22050)
-        return audio_filename
-
 class T2S:
     def __init__(self, device= None):
         if device is None:
@@ -233,6 +253,15 @@ class T2S:
             'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
         }
 
+    @prompts(name="Generate Singing Voice From User Input Text, Note and Duration Sequence",
+             description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) "
+                         "and save it to a file. "
+                         "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence. "
+                         "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
+                         "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
+                         "The input to this tool should be a comma seperated string of three, "
+                         "representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided. ")
+
     def set_model_hparams(self):
         set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
         self.hp = hp
@@ -241,11 +270,13 @@ class T2S:
         self.set_model_hparams()
         val = inputs.split(",")
         key = ['text', 'notes', 'notes_duration']
-        if inputs == '' or len(val) < len(key):
+        try:
+            inp = {k: v for k, v in zip(key, val)}
+            wav = self.pipe.infer_once(inp)
+        except:
+            print('Error occurs. Generate default audio sample.\n')
             inp = self.default_inp
-        else:
-            inp = {k:v for k,v in zip(key,val)}
-        wav = self.pipe.infer_once(inp)
+            wav = self.pipe.infer_once(inp)
         wav *= 32767
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         wavfile.write(audio_filename, self.hp['audio_sample_rate'], wav.astype(np.int16))
@@ -263,6 +294,13 @@ class TTS_OOD:
         self.set_model_hparams()
         self.pipe = GenerSpeechInfer(self.hp, device)
 
+    @prompts(name="Style Transfer",
+             description="useful for when you want to generate speech samples with styles "
+                         "(e.g., timbre, emotion, and prosody) derived from a reference custom voice. "
+                         "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
+                         "The input to this tool should be a comma seperated string of two, "
+                         "representing reference audio path and input text. " )
+
     def set_model_hparams(self):
         set_hparams(config=self.config, exp_name=self.exp_name, print_hparams=False)
         f0_stats_fn = f'{hp["binary_data_dir"]}/train_f0s_mean_std.npy'
@@ -278,7 +316,6 @@ class TTS_OOD:
         key = ['ref_audio', 'text']
         val = inputs.split(",")
         inp = {k: v for k, v in zip(key, val)}
-        print(inp)
         wav = self.pipe.infer_once(inp)
         wav *= 32767
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
@@ -291,9 +328,16 @@ class Inpaint:
     def __init__(self, device):
         print("Initializing Make-An-Audio-inpaint to %s" % device)
        self.device = device
-        self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt')
-        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
+        self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
+        self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
         self.cmap_transform = matplotlib.cm.viridis
+
+    @prompts(name="Audio Inpainting",
+             description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, "
+                         "this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
     def make_batch_sd(self, mel, mask, num_samples=1):
 
         mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
@@ -424,6 +468,13 @@ class ASR:
         print("Initializing Whisper to %s" % device)
         self.device = device
         self.model = whisper.load_model("base", device=device)
+
+    @prompts(name="Transcribe speech",
+             description="useful for when you want to know the text corresponding to a human speech, "
+                         "receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
     def inference(self, audio_path):
         audio = whisper.load_audio(audio_path)
         audio = whisper.pad_or_trim(audio)
@@ -438,6 +489,13 @@ class A2T:
         print("Initializing Audio-To-Text Model to %s" % device)
         self.device = device
         self.model = AudioCapModel("audio_to_text/audiocaps_cntrstv_cnn14rnn_trm")
+
+    @prompts(name="Generate Text From The Audio",
+             description="useful for when you want to describe an audio in text, "
+                         "receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
     def inference(self, audio_path):
         audio = whisper.load_audio(audio_path)
         caption_text = self.model(audio)
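
Note on the new `prompts` decorator: as added in this commit, it only attaches `name` and `description` attributes to the decorated tool methods; the diff does not show where those attributes are read. The sketch below illustrates one way such metadata could be collected into a tool registry. The `ToolSpec` container and `collect_tools` helper are hypothetical illustrations for this commit page, not code from the repository.

import inspect
from dataclasses import dataclass
from typing import Callable, List

@dataclass
class ToolSpec:
    # Hypothetical container; the project may wrap tools differently
    # (for example as LangChain-style Tool objects).
    name: str
    description: str
    func: Callable

def collect_tools(instance) -> List[ToolSpec]:
    # Gather every bound method that the @prompts decorator has annotated
    # with .name and .description attributes.
    tools = []
    for _, method in inspect.getmembers(instance, predicate=inspect.ismethod):
        if hasattr(method, "name") and hasattr(method, "description"):
            tools.append(ToolSpec(method.name, method.description, method))
    return tools

# Example usage (assumes a T2A instance as defined in the diff above):
#   tools = collect_tools(T2A(device="cuda"))
#   print([t.name for t in tools])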