lmzjms committed
Commit 9e43f21
1 Parent(s): 4d54c87

Update app.py

Files changed (1)
  1. app.py +21 -50
app.py CHANGED
@@ -50,10 +50,19 @@ def cut_dialogue_history(history_memory, keep_last_n_words = 500):
     return '\n' + '\n'.join(paragraphs)

 class ConversationBot:
-    def __init__(self):
+    def __init__(self, load_dict):
         print("Initializing AudioGPT")
         self.tools = []
         self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
+        self.models = dict()
+        for class_name, device in load_dict.items():
+            self.models[class_name] = globals()[class_name](device=device)
+        for class_name, instance in self.models.items():
+            for e in dir(instance):
+                if e.startswith('inference'):
+                    func = getattr(instance, e)
+                    self.tools.append(Tool(name=func.name, description=func.description, func=func))
+
     def run_text(self, text, state):
         print("===============Running run_text =============")
         print("Inputs:", text, state)
@@ -147,7 +156,7 @@ class ConversationBot:
         print("===============Running inpainting =============")
         print("Inputs:", state)
         print("======>Previous memory:\n %s" % self.agent.memory)
-        inpaint = Inpaint(device="cuda:0")
+        inpaint = Inpaint(device="cpu")
         new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
         AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
         self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
@@ -163,52 +172,6 @@ class ConversationBot:
         return gr.Button.update(visible=False)
     def init_agent(self, openai_api_key):
         self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
-        self.t2i = T2I(device="cuda:0")
-        # self.i2t = ImageCaptioning(device="cuda:0")
-        self.t2a = T2A(device="cuda:0")
-        self.tts = TTS(device="cpu")
-        self.t2s = T2S(device="cpu")
-        self.i2a = I2A(device="cuda:0")
-        self.a2t = A2T(device="cpu")
-        self.asr = ASR(device="cuda:0")
-        self.inpaint = Inpaint(device="cuda:0")
-        # self.tts_ood = TTS_OOD(device="cpu")
-        self.tools = [
-            Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
-                 description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
-                             "The input to this tool should be a string, representing the text used to generate image. "),
-            # Tool(name="Get Photo Description", func=self.i2t.inference,
-            #      description="useful for when you want to know what is inside the photo. receives image_path as input. "
-            #                  "The input to this tool should be a string, representing the image_path. "),
-            Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
-                 description="useful for when you want to generate an audio from a user input text and it saved it to a file."
-                             "The input to this tool should be a string, representing the text used to generate audio."),
-            # Tool(
-            #     name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
-            #     description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
-            #                 "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
-            #                 "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
-            Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
-                 description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
-                             "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
-                             "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
-                             "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
-                             "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
-            Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
-                 description="useful for when you want to convert a user input text into speech audio it saved it to a file."
-                             "The input to this tool should be a string, representing the text used to be converted to speech."),
-            Tool(name="Generate Audio From The Image", func=self.i2a.inference,
-                 description="useful for when you want to generate an audio based on an image."
-                             "The input to this tool should be a string, representing the image_path. "),
-            Tool(name="Generate Text From The Audio", func=self.a2t.inference,
-                 description="useful for when you want to describe an audio in text, receives audio_path as input."
-                             "The input to this tool should be a string, representing the audio_path."),
-            Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
-                 description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
-                             "The input to this tool should be a string, representing the audio_path."),
-            Tool(name="Transcribe speech", func=self.asr.inference,
-                 description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
-                             "The input to this tool should be a string, representing the audio_path.")]
         self.agent = initialize_agent(
             self.tools,
             self.llm,
@@ -221,8 +184,16 @@ class ConversationBot:



-if __name__ == '__main__':
-    bot = ConversationBot()
+if __name__ == '__main__':
+    bot = ConversationBot({'T2I': 'cuda:0',
+                           'T2A': 'cuda:0',
+                           'I2A': 'cuda:0',
+                           'TTS': 'cpu',
+                           'T2S': 'cpu',
+                           'Inpaint': 'cpu',
+                           'ASR': 'cuda:0',
+                           'A2T': 'cpu',
+                           })
     with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
         with gr.Row():
             openai_api_key_textbox = gr.Textbox(
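
The rewritten `__init__` replaces the hard-coded tool list with a convention: each class named in `load_dict` is instantiated by name through `globals()`, and every method whose name starts with `inference` is expected to already carry `name` and `description` attributes that the `Tool` wrapper reads. A minimal, dependency-free sketch of that convention follows; the `prompts` decorator and the `DummyASR` class are illustrative assumptions, not code from this commit.

# Sketch only: the attribute convention the new ConversationBot.__init__ relies on.
# The decorator name "prompts" and the DummyASR class are hypothetical.

def prompts(name, description):
    """Attach the metadata that the registration loop reads off each method."""
    def decorator(func):
        func.name = name
        func.description = description
        return func
    return decorator

class DummyASR:
    def __init__(self, device):
        self.device = device  # where the real model would be loaded

    @prompts(name="Transcribe speech",
             description="Receives an audio_path string and returns the transcript.")
    def inference(self, audio_path):
        return f"(transcript of {audio_path})"

# Same discovery loop as in the new __init__, minus the langchain Tool wrapper:
load_dict = {'DummyASR': 'cpu'}
models = {name: globals()[name](device=device) for name, device in load_dict.items()}
tools = []
for instance in models.values():
    for attr in dir(instance):
        if attr.startswith('inference'):
            func = getattr(instance, attr)  # bound method; .name/.description come from the decorator
            tools.append((func.name, func.description, func))

for name, description, func in tools:
    print(name, '->', func('sample.wav'))

Because the loop keys only off the `inference` prefix and the attached metadata, adding a new tool no longer requires editing `init_agent`; defining the class and listing it in `load_dict` is enough.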