lmzjms committed
Commit 9ada147
1 Parent(s): 4735175

Update app.py

Files changed (1): app.py (+22 -18)
app.py CHANGED
@@ -100,11 +100,13 @@ class ConversationBot:
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         audio_load = whisper.load_audio(file.name)
         soundfile.write(audio_filename, audio_load, samplerate = 16000)
-        description = self.a2t.inference(audio_filename)
-        Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
-                       "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
+        # description = self.a2t.inference(audio_filename)
+        # Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
+        #                "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
+        # AI_prompt = "Received. "
+        # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
         AI_prompt = "Received. "
-        self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
+        self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
         print("======>Current memory:\n %s" % self.agent.memory)
         #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
         state = state + [(f"*{audio_filename}*", AI_prompt)]
@@ -124,11 +126,13 @@ class ConversationBot:
         img = img.convert('RGB')
         img.save(image_filename, "PNG")
         print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
-        description = self.i2t.inference(image_filename)
-        Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
-                       "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
+        # description = self.i2t.inference(image_filename)
+        # Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
+        #                "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
+        # AI_prompt = "Received. "
+        # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
         AI_prompt = "Received. "
-        self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
+        self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
         print("======>Current memory:\n %s" % self.agent.memory)
         state = state + [(f"![]({image_filename})*{image_filename}*", AI_prompt)]
         print("Outputs:", state)
@@ -159,10 +163,10 @@ class ConversationBot:
         self.t2a = T2A(device="cpu")
         self.tts = TTS(device="cpu")
         # self.t2s = T2S(device="cuda:0")
-        # self.i2a = I2A(device="cuda:0")
+        self.i2a = I2A(device="cuda:0")
         self.a2t = A2T(device="cpu")
         # self.asr = ASR(device="cuda:0")
-        # self.inpaint = Inpaint(device="cuda:0")
+        self.inpaint = Inpaint(device="cuda:0")
         #self.tts_ood = TTS_OOD(device="cuda:0")
         self.tools = [
             # Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -188,15 +192,15 @@ class ConversationBot:
             Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
                  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
                              "The input to this tool should be a string, representing the text used to be converted to speech."),
-            # Tool(name="Generate Audio From The Image", func=self.i2a.inference,
-            #      description="useful for when you want to generate an audio based on an image."
-            #                  "The input to this tool should be a string, representing the image_path. "),
+            Tool(name="Generate Audio From The Image", func=self.i2a.inference,
+                 description="useful for when you want to generate an audio based on an image."
+                             "The input to this tool should be a string, representing the image_path. "),
             Tool(name="Generate Text From The Audio", func=self.a2t.inference,
                  description="useful for when you want to describe an audio in text, receives audio_path as input."
+                             "The input to this tool should be a string, representing the audio_path.")
+            Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
+                 description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
                              "The input to this tool should be a string, representing the audio_path.")]
-            # Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
-            #      description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
-            #                  "The input to this tool should be a string, representing the audio_path."),
             # Tool(name="Transcribe speech", func=self.asr.inference,
             #      description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
             #                  "The input to this tool should be a string, representing the audio_path.")]
@@ -218,7 +222,7 @@ if __name__ == '__main__':
    with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
        with gr.Row():
            openai_api_key_textbox = gr.Textbox(
-                placeholder="Paste your OpenAI API key here to start Visual ChatGPT(sk-...) and press Enter ↵️",
+                placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
                show_label=False,
                lines=1,
                type="password",
@@ -228,7 +232,7 @@ if __name__ == '__main__':
        chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
        state = gr.State([])
        with gr.Row(visible = False) as input_raws:
-            with gr.Column(scale=0.7):
+            with gr.Column(scale=0.9):
                txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
            with gr.Column(scale=0.15, min_width=0):
                clear = gr.Button("Clear️")
 