lmzjms committed on
Commit b84694b
1 Parent(s): 810d95e

Update app.py

Files changed (1)
  1. app.py +168 -56
app.py CHANGED
@@ -17,7 +17,6 @@ AudioGPT can not directly read audios, but it has a list of tools to finish diff
17
  AudioGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
18
  Human may provide new audios to AudioGPT with a description. The description helps AudioGPT to understand this audio, but AudioGPT should use tools to finish following tasks, rather than directly imagine from the description.
19
  Overall, AudioGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
20
-
21
  TOOLS:
22
  ------
23
  AudioGPT has access to the following tools:"""
@@ -58,6 +57,18 @@ def cut_dialogue_history(history_memory, keep_last_n_words = 500):
58
  paragraphs = paragraphs[1:]
59
  return '\n' + '\n'.join(paragraphs)
60
61
  class ConversationBot:
62
  def __init__(self, load_dict):
63
  print("Initializing AudioGPT")
@@ -66,11 +77,6 @@ class ConversationBot:
66
  self.models = dict()
67
  for class_name, device in load_dict.items():
68
  self.models[class_name] = globals()[class_name](device=device)
69
- for class_name, instance in self.models.items():
70
- for e in dir(instance):
71
- if e.startswith('inference'):
72
- func = getattr(instance, e)
73
- self.tools.append(Tool(name=func.name, description=func.description, func=func))
74
 
75
  def run_text(self, text, state):
76
  print("===============Running run_text =============")
@@ -83,7 +89,7 @@ class ConversationBot:
83
  response = res['output']
84
  state = state + [(text, response)]
85
  print("Outputs:", state)
86
- return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
87
  else:
88
  tool = res['intermediate_steps'][0][0].tool
89
  if tool == "Generate Image From User Input Text":
@@ -92,14 +98,14 @@ class ConversationBot:
92
  state = state + [(text, response)]
93
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
94
  f"Current Memory: {self.agent.memory.buffer}")
95
- return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
96
  elif tool == "Detect The Sound Event From The Audio":
97
  image_filename = res['intermediate_steps'][0][1]
98
  response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
99
  state = state + [(text, response)]
100
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
101
  f"Current Memory: {self.agent.memory.buffer}")
102
- return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
103
  elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
104
  print("======>Current memory:\n %s" % self.agent.memory)
105
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
@@ -107,22 +113,21 @@ class ConversationBot:
107
  #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
108
  state = state + [(text, response)]
109
  print("Outputs:", state)
110
- return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
111
  elif tool == "Audio Inpainting":
112
  audio_filename = res['intermediate_steps'][0][0].tool_input
113
  image_filename = res['intermediate_steps'][0][1]
114
  print("======>Current memory:\n %s" % self.agent.memory)
115
- print(res)
116
  response = res['output']
117
  state = state + [(text, response)]
118
  print("Outputs:", state)
119
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
120
  print("======>Current memory:\n %s" % self.agent.memory)
121
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
122
  audio_filename = res['intermediate_steps'][0][1]
123
  state = state + [(text, response)]
124
  print("Outputs:", state)
125
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
126
 
127
  def run_image_or_audio(self, file, state, txt):
128
  file_type = file.name[-3:]
@@ -144,7 +149,7 @@ class ConversationBot:
144
  #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
145
  state = state + [(f"*{audio_filename}*", AI_prompt)]
146
  print("Outputs:", state)
147
- return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
148
  else:
149
  # print("===============Running run_image =============")
150
  # print("Inputs:", file, state)
@@ -170,13 +175,69 @@ class ConversationBot:
170
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
171
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
172
  f"Current Memory: {self.agent.memory.buffer}")
173
- return state, state, txt + f'{txt} {image_filename} ', gr.Audio.update(visible=False)
174
 
175
  def inpainting(self, state, audio_filename, image_filename):
176
  print("===============Running inpainting =============")
177
  print("Inputs:", state)
178
  print("======>Previous memory:\n %s" % self.agent.memory)
179
- # inpaint = Inpaint(device="cpu")
180
  new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
181
  AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
182
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
@@ -186,21 +247,50 @@ class ConversationBot:
186
  return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
187
  def clear_audio(self):
188
  return gr.Audio.update(value=None, visible=False)
189
  def clear_image(self):
190
  return gr.Image.update(value=None, visible=False)
191
  def clear_button(self):
192
  return gr.Button.update(visible=False)
193
- def init_agent(self, openai_api_key):
194
- self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
195
- self.agent = initialize_agent(
196
- self.tools,
197
- self.llm,
198
- agent="conversational-react-description",
199
- verbose=True,
200
- memory=self.memory,
201
- return_intermediate_steps=True,
202
- agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
203
- return gr.update(visible = True)
204
 
205
 
206
 
@@ -218,37 +308,50 @@ if __name__ == '__main__':
218
  'SoundExtraction': 'cuda:0',
219
  'TargetSoundDetection': 'cuda:0'
220
  })
221
- with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
222
- gr.Markdown(_DESCRIPTION)
223
-
224
  with gr.Row():
225
  openai_api_key_textbox = gr.Textbox(
226
  placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
227
  show_label=False,
228
  lines=1,
229
  type="password",
230
  )
231
-
232
- chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
233
- state = gr.State([])
234
- with gr.Row(visible = False) as input_raws:
235
  with gr.Column(scale=0.7):
236
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
237
  with gr.Column(scale=0.1, min_width=0):
238
  run = gr.Button("🏃‍♂️Run")
239
  with gr.Column(scale=0.1, min_width=0):
240
- clear = gr.Button("🔄Clear️")
241
  with gr.Column(scale=0.1, min_width=0):
242
  btn = gr.UploadButton("🖼️Upload", file_types=["image","audio"])
243
- with gr.Row():
244
- with gr.Column():
245
- outaudio = gr.Audio(visible=False)
246
- with gr.Row():
247
- with gr.Column():
248
- show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
249
- with gr.Row():
250
- with gr.Column():
251
- run_button = gr.Button("Predict Masked Place",visible=False)
252
  gr.Examples(
253
  examples=["Generate a speech with text 'here we go'",
254
  "Transcribe this speech",
@@ -265,18 +368,27 @@ if __name__ == '__main__':
265
  inputs=txt
266
  )
267
 
268
- openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
269
- txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
270
  txt.submit(lambda: "", None, txt)
271
- run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
272
  run.click(lambda: "", None, txt)
273
- btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
274
- run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
275
- clear.click(bot.memory.clear)
276
- clear.click(lambda: [], None, chatbot)
277
- clear.click(lambda: [], None, state)
278
- clear.click(lambda:None, None, txt)
279
- clear.click(bot.clear_button, None, run_button)
280
- clear.click(bot.clear_image, None, show_mel)
281
- clear.click(bot.clear_audio, None, outaudio)
282
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
17
  AudioGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
18
  Human may provide new audios to AudioGPT with a description. The description helps AudioGPT to understand this audio, but AudioGPT should use tools to finish following tasks, rather than directly imagine from the description.
19
  Overall, AudioGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
20
  TOOLS:
21
  ------
22
  AudioGPT has access to the following tools:"""
 
57
  paragraphs = paragraphs[1:]
58
  return '\n' + '\n'.join(paragraphs)
59
 
60
+ def merge_audio(audio_path_1, audio_path_2):
61
+ merged_signal = []
62
+ sr_1, signal_1 = wavfile.read(audio_path_1)
63
+ sr_2, signal_2 = wavfile.read(audio_path_2)
64
+ merged_signal.append(signal_1)
65
+ merged_signal.append(signal_2)
66
+ merged_signal = np.hstack(merged_signal)
67
+ merged_signal = np.asarray(merged_signal, dtype=np.int16)
68
+ audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
69
+ wavfile.write(audio_filename, sr_2, merged_signal)
70
+ return audio_filename
71
+
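The added merge_audio helper concatenates two WAV signals with numpy.hstack and writes the result at the second clip's sample rate into the audio/ directory. A minimal usage sketch, assuming both clips are 16-bit PCM with the same sample rate and channel layout, and that wavfile, numpy, os, and uuid are already imported in app.py (the paths below are hypothetical):

# hypothetical example paths; merge_audio returns the path of the merged file
combined_path = merge_audio("audio/tts_intro.wav", "audio/generated_sound.wav")
print(combined_path)  # e.g. "audio/3f2a9c1d.wav"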
72
  class ConversationBot:
73
  def __init__(self, load_dict):
74
  print("Initializing AudioGPT")
 
77
  self.models = dict()
78
  for class_name, device in load_dict.items():
79
  self.models[class_name] = globals()[class_name](device=device)
80
 
81
  def run_text(self, text, state):
82
  print("===============Running run_text =============")
 
89
  response = res['output']
90
  state = state + [(text, response)]
91
  print("Outputs:", state)
92
+ return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
93
  else:
94
  tool = res['intermediate_steps'][0][0].tool
95
  if tool == "Generate Image From User Input Text":
 
98
  state = state + [(text, response)]
99
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
100
  f"Current Memory: {self.agent.memory.buffer}")
101
+ return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
102
  elif tool == "Detect The Sound Event From The Audio":
103
  image_filename = res['intermediate_steps'][0][1]
104
  response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
105
  state = state + [(text, response)]
106
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
107
  f"Current Memory: {self.agent.memory.buffer}")
108
+ return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
109
  elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
110
  print("======>Current memory:\n %s" % self.agent.memory)
111
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
 
113
  #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
114
  state = state + [(text, response)]
115
  print("Outputs:", state)
116
+ return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
117
  elif tool == "Audio Inpainting":
118
  audio_filename = res['intermediate_steps'][0][0].tool_input
119
  image_filename = res['intermediate_steps'][0][1]
120
  print("======>Current memory:\n %s" % self.agent.memory)
121
  response = res['output']
122
  state = state + [(text, response)]
123
  print("Outputs:", state)
124
+ return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
125
  print("======>Current memory:\n %s" % self.agent.memory)
126
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
127
  audio_filename = res['intermediate_steps'][0][1]
128
  state = state + [(text, response)]
129
  print("Outputs:", state)
130
+ return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
131
 
132
  def run_image_or_audio(self, file, state, txt):
133
  file_type = file.name[-3:]
 
149
  #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
150
  state = state + [(f"*{audio_filename}*", AI_prompt)]
151
  print("Outputs:", state)
152
+ return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False)
153
  else:
154
  # print("===============Running run_image =============")
155
  # print("Inputs:", file, state)
 
175
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
176
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
177
  f"Current Memory: {self.agent.memory.buffer}")
178
+ return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False)
179
+
180
+ def speech(self, speech_input, state):
181
+ input_audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
182
+ text = self.models['ASR'].translate_english(speech_input)
183
+ print("Inputs:", text, state)
184
+ print("======>Previous memory:\n %s" % self.agent.memory)
185
+ self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
186
+ res = self.agent({"input": text})
187
+ if res['intermediate_steps'] == []:
188
+ print("======>Current memory:\n %s" % self.agent.memory)
189
+ response = res['output']
190
+ output_audio_filename = self.models['TTS'].inference(response)
191
+ state = state + [(text, response)]
192
+ print("Outputs:", state)
193
+ return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
194
+ else:
195
+ tool = res['intermediate_steps'][0][0].tool
196
+ if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Target Sound Detection":
197
+ print("======>Current memory:\n %s" % self.agent.memory)
198
+ response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
199
+ output_audio_filename = self.models['TTS'].inference(res['output'])
200
+ state = state + [(text, response)]
201
+ print("Outputs:", state)
202
+ return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
203
+ elif tool == "Transcribe Speech":
204
+ print("======>Current memory:\n %s" % self.agent.memory)
205
+ output_audio_filename = self.models['TTS'].inference(res['output'])
206
+ response = res['output']
207
+ state = state + [(text, response)]
208
+ print("Outputs:", state)
209
+ return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
210
+ elif tool == "Detect The Sound Event From The Audio":
211
+ print("======>Current memory:\n %s" % self.agent.memory)
212
+ image_filename = res['intermediate_steps'][0][1]
213
+ output_audio_filename = self.models['TTS'].inference(res['output'])
214
+ response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
215
+ state = state + [(text, response)]
216
+ print("Outputs:", state)
217
+ return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
218
+ elif tool == "Generate a talking human portrait video given a input Audio":
219
+ video_filename = res['intermediate_steps'][0][1]
220
+ print("======>Current memory:\n %s" % self.agent.memory)
221
+ response = res['output']
222
+ output_audio_filename = self.models['TTS'].inference(res['output'])
223
+ state = state + [(text, response)]
224
+ print("Outputs:", state)
225
+ return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(value=video_filename,visible=True)
226
+ print("======>Current memory:\n %s" % self.agent.memory)
227
+ response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
228
+ audio_filename = res['intermediate_steps'][0][1]
229
+ Res = "The audio file has been generated and the audio is "
230
+ output_audio_filename = merge_audio(self.models['TTS'].inference(Res), audio_filename)
231
+ print(output_audio_filename)
232
+ state = state + [(text, response)]
233
+ response = res['output']
234
+ print("Outputs:", state)
235
+ return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
236
 
237
  def inpainting(self, state, audio_filename, image_filename):
238
  print("===============Running inpainting =============")
239
  print("Inputs:", state)
240
  print("======>Previous memory:\n %s" % self.agent.memory)
 
241
  new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
242
  AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
243
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
 
247
  return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
248
  def clear_audio(self):
249
  return gr.Audio.update(value=None, visible=False)
250
+ def clear_input_audio(self):
251
+ return gr.Audio.update(value=None)
252
  def clear_image(self):
253
  return gr.Image.update(value=None, visible=False)
254
+ def clear_video(self):
255
+ return gr.Video.update(value=None, visible=False)
256
  def clear_button(self):
257
  return gr.Button.update(visible=False)
258
+
259
+ def init_agent(self, openai_api_key, interaction_type):
260
+ if interaction_type == "text":
261
+ for class_name, instance in self.models.items():
262
+ for e in dir(instance):
263
+ if e.startswith('inference'):
264
+ func = getattr(instance, e)
265
+ self.tools.append(Tool(name=func.name, description=func.description, func=func))
266
+ self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
267
+ self.agent = initialize_agent(
268
+ self.tools,
269
+ self.llm,
270
+ agent="conversational-react-description",
271
+ verbose=True,
272
+ memory=self.memory,
273
+ return_intermediate_steps=True,
274
+ agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
275
+ return gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)
276
+ else:
277
+ for class_name, instance in self.models.items():
278
+ if class_name != 'T2A' and class_name != 'I2A' and class_name != 'Inpaint' and class_name != 'ASR' and class_name != 'SoundDetection':
279
+ for e in dir(instance):
280
+ if e.startswith('inference'):
281
+ func = getattr(instance, e)
282
+ self.tools.append(Tool(name=func.name, description=func.description, func=func))
283
+
284
+ self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
285
+ self.agent = initialize_agent(
286
+ self.tools,
287
+ self.llm,
288
+ agent="conversational-react-description",
289
+ verbose=True,
290
+ memory=self.memory,
291
+ return_intermediate_steps=True,
292
+ agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
293
+ return gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)
294
 
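Note that Tool(name=func.name, description=func.description, func=func) in init_agent above expects every inference* method on a model wrapper to carry name and description attributes. A minimal sketch of a wrapper that satisfies that contract (the class and strings below are hypothetical, not part of AudioGPT):

# hypothetical model wrapper satisfying the contract init_agent relies on
class ExampleModel:
    def __init__(self, device):
        self.device = device

    def inference_echo(self, audio_path):
        # placeholder tool body: return the input path unchanged
        return audio_path

# attributes read by Tool(...) during init_agent's registration loop
ExampleModel.inference_echo.name = "Echo Audio"
ExampleModel.inference_echo.description = "useful when you want to return the input audio unchanged"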
295
 
296
 
 
308
  'SoundExtraction': 'cuda:0',
309
  'TargetSoundDetection': 'cuda:0'
310
  })
311
+ with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
312
  with gr.Row():
313
+ gr.Markdown("## AudioGPT")
314
+ chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT", visible=False)
315
+ state = gr.State([])
316
+
317
+ with gr.Row() as select_raws:
318
+ with gr.Column(scale=0.7):
319
+ interaction_type = gr.Radio(choices=['text', 'speech'], value='text', label='Interaction Type')
320
  openai_api_key_textbox = gr.Textbox(
321
  placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
322
  show_label=False,
323
  lines=1,
324
  type="password",
325
  )
326
+ with gr.Row(visible=False) as text_input_raws:
327
  with gr.Column(scale=0.7):
328
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
329
  with gr.Column(scale=0.1, min_width=0):
330
  run = gr.Button("🏃‍♂️Run")
331
  with gr.Column(scale=0.1, min_width=0):
332
+ clear_txt = gr.Button("🔄Clear️")
333
  with gr.Column(scale=0.1, min_width=0):
334
  btn = gr.UploadButton("🖼️Upload", file_types=["image","audio"])
335
+
336
+ with gr.Row():
337
+ outaudio = gr.Audio(visible=False)
338
+ with gr.Row():
339
+ with gr.Column(scale=0.3, min_width=0):
340
+ outvideo = gr.Video(visible=False)
341
+ with gr.Row():
342
+ show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
343
+ with gr.Row():
344
+ run_button = gr.Button("Predict Masked Place",visible=False)
345
+
346
+ with gr.Row(visible=False) as speech_input_raws:
347
+ with gr.Column(scale=0.7):
348
+ speech_input = gr.Audio(source="microphone", type="filepath", label="Input")
349
+ with gr.Column(scale=0.15, min_width=0):
350
+ submit_btn = gr.Button("🏃‍♂️Submit")
351
+ with gr.Column(scale=0.15, min_width=0):
352
+ clear_speech = gr.Button("🔄Clear️")
353
+ with gr.Row():
354
+ speech_output = gr.Audio(label="Output",visible=False)
355
  gr.Examples(
356
  examples=["Generate a speech with text 'here we go'",
357
  "Transcribe this speech",
 
368
  inputs=txt
369
  )
370
 
371
+ openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox, interaction_type], [select_raws, chatbot, text_input_raws, speech_input_raws])
372
+
373
+ txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
374
  txt.submit(lambda: "", None, txt)
375
+ run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
376
  run.click(lambda: "", None, txt)
377
+ btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, outaudio, outvideo])
378
+ run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, outvideo, run_button])
379
+ clear_txt.click(bot.memory.clear)
380
+ clear_txt.click(lambda: [], None, chatbot)
381
+ clear_txt.click(lambda: [], None, state)
382
+ clear_txt.click(lambda:None, None, txt)
383
+ clear_txt.click(bot.clear_button, None, run_button)
384
+ clear_txt.click(bot.clear_image, None, show_mel)
385
+ clear_txt.click(bot.clear_audio, None, outaudio)
386
+ clear_txt.click(bot.clear_video, None, outvideo)
387
+
388
+ submit_btn.click(bot.speech, [speech_input, state], [speech_input, speech_output, state, outvideo])
389
+ clear_speech.click(bot.clear_input_audio, None, speech_input)
390
+ clear_speech.click(bot.clear_audio, None, speech_output)
391
+ clear_speech.click(lambda: [], None, state)
392
+ clear_speech.click(bot.clear_video, None, outvideo)
393
+
394
  demo.launch(server_name="0.0.0.0", server_port=7860)
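A general Gradio contract used throughout the event wiring above: each handler returns one value per component in its outputs list, in the same order (run_text, for example, now returns six values to match [chatbot, state, outaudio, outvideo, show_mel, run_button]). A minimal self-contained sketch of that pattern, with hypothetical component and handler names:

import gradio as gr

def toy_run_text(text, state):
    state = state + [(text, "ok")]
    # one return value per wired output component, in order
    return state, state, gr.Audio.update(visible=False)

with gr.Blocks() as toy_demo:
    toy_chatbot = gr.Chatbot()
    toy_state = gr.State([])
    toy_txt = gr.Textbox()
    toy_outaudio = gr.Audio(visible=False)
    toy_txt.submit(toy_run_text, [toy_txt, toy_state], [toy_chatbot, toy_state, toy_outaudio])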