lmzjms committed
Commit 85c8fb1
1 Parent(s): 71e76ce

Update app.py

Files changed (1)
  1. app.py +70 -176
app.py CHANGED
@@ -6,7 +6,7 @@ from audio_foundation_models import *
6
  import gradio as gr
7
 
8
  _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
9
- _DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'
10
  _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
11
  if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
12
  _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
@@ -43,23 +43,19 @@ Previous conversation history:
43
  New input: {input}
44
  Thought: Do I need to use a tool? {agent_scratchpad}"""
45
 
46
-
47
- def cut_dialogue_history(history_memory, keep_last_n_words=400):
48
- if history_memory is None or len(history_memory) == 0:
49
- return history_memory
50
  tokens = history_memory.split()
51
  n_tokens = len(tokens)
52
  print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
53
  if n_tokens < keep_last_n_words:
54
  return history_memory
55
- paragraphs = history_memory.split('\n')
56
- last_n_tokens = n_tokens
57
- while last_n_tokens >= keep_last_n_words:
58
- last_n_tokens -= len(paragraphs[0].split(' '))
59
- paragraphs = paragraphs[1:]
60
- return '\n' + '\n'.join(paragraphs)
61
-
62
-
63
 
64
  class ConversationBot:
65
  def __init__(self, load_dict):
@@ -69,6 +65,11 @@ class ConversationBot:
69
  self.models = dict()
70
  for class_name, device in load_dict.items():
71
  self.models[class_name] = globals()[class_name](device=device)
72
 
73
  def run_text(self, text, state):
74
  print("===============Running run_text =============")
@@ -81,7 +82,7 @@ class ConversationBot:
81
  response = res['output']
82
  state = state + [(text, response)]
83
  print("Outputs:", state)
84
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
85
  else:
86
  tool = res['intermediate_steps'][0][0].tool
87
  if tool == "Generate Image From User Input Text":
@@ -90,14 +91,14 @@ class ConversationBot:
90
  state = state + [(text, response)]
91
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
92
  f"Current Memory: {self.agent.memory.buffer}")
93
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
94
  elif tool == "Detect The Sound Event From The Audio":
95
  image_filename = res['intermediate_steps'][0][1]
96
  response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
97
  state = state + [(text, response)]
98
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
99
  f"Current Memory: {self.agent.memory.buffer}")
100
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
101
  elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
102
  print("======>Current memory:\n %s" % self.agent.memory)
103
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
@@ -105,21 +106,22 @@ class ConversationBot:
105
  #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
106
  state = state + [(text, response)]
107
  print("Outputs:", state)
108
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
109
  elif tool == "Audio Inpainting":
110
  audio_filename = res['intermediate_steps'][0][0].tool_input
111
  image_filename = res['intermediate_steps'][0][1]
112
  print("======>Current memory:\n %s" % self.agent.memory)
 
113
  response = res['output']
114
  state = state + [(text, response)]
115
  print("Outputs:", state)
116
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
117
  print("======>Current memory:\n %s" % self.agent.memory)
118
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
119
  audio_filename = res['intermediate_steps'][0][1]
120
  state = state + [(text, response)]
121
  print("Outputs:", state)
122
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
123
 
124
  def run_image_or_audio(self, file, state, txt):
125
  file_type = file.name[-3:]
@@ -128,9 +130,8 @@ class ConversationBot:
128
  print("Inputs:", file, state)
129
  print("======>Previous memory:\n %s" % self.agent.memory)
130
  audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
131
- # audio_load = whisper.load_audio(file.name)
132
- audio_load, sr = soundfile.read(file.name)
133
- soundfile.write(audio_filename, audio_load, samplerate = sr)
134
  description = self.models['A2T'].inference(audio_filename)
135
  Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
136
  "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
@@ -142,7 +143,7 @@ class ConversationBot:
142
  #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
143
  state = state + [(f"*{audio_filename}*", AI_prompt)]
144
  print("Outputs:", state)
145
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False)
146
  else:
147
  # print("===============Running run_image =============")
148
  # print("Inputs:", file, state)
@@ -168,69 +169,13 @@ class ConversationBot:
168
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
169
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
170
  f"Current Memory: {self.agent.memory.buffer}")
171
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False)
172
-
173
- def speech(self, speech_input, state):
174
- input_audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
175
- text = self.models['ASR'].translate_english(speech_input)
176
- print("Inputs:", text, state)
177
- print("======>Previous memory:\n %s" % self.agent.memory)
178
- self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
179
- res = self.agent({"input": text})
180
- if res['intermediate_steps'] == []:
181
- print("======>Current memory:\n %s" % self.agent.memory)
182
- response = res['output']
183
- output_audio_filename = self.models['TTS'].inference(response)
184
- state = state + [(text, response)]
185
- print("Outputs:", state)
186
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
187
- else:
188
- tool = res['intermediate_steps'][0][0].tool
189
- if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Target Sound Detection":
190
- print("======>Current memory:\n %s" % self.agent.memory)
191
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
192
- output_audio_filename = self.models['TTS'].inference(res['output'])
193
- state = state + [(text, response)]
194
- print("Outputs:", state)
195
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
196
- elif tool == "Transcribe Speech":
197
- print("======>Current memory:\n %s" % self.agent.memory)
198
- output_audio_filename = self.models['TTS'].inference(res['output'])
199
- response = res['output']
200
- state = state + [(text, response)]
201
- print("Outputs:", state)
202
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
203
- elif tool == "Detect The Sound Event From The Audio":
204
- print("======>Current memory:\n %s" % self.agent.memory)
205
- image_filename = res['intermediate_steps'][0][1]
206
- output_audio_filename = self.models['TTS'].inference(res['output'])
207
- response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
208
- state = state + [(text, response)]
209
- print("Outputs:", state)
210
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
211
- elif tool == "Generate a talking human portrait video given a input Audio":
212
- video_filename = res['intermediate_steps'][0][1]
213
- print("======>Current memory:\n %s" % self.agent.memory)
214
- response = res['output']
215
- output_audio_filename = self.models['TTS'].inference(res['output'])
216
- state = state + [(text, response)]
217
- print("Outputs:", state)
218
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(value=video_filename,visible=True)
219
- print("======>Current memory:\n %s" % self.agent.memory)
220
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
221
- audio_filename = res['intermediate_steps'][0][1]
222
- Res = "The audio file has been generated and the audio is "
223
- output_audio_filename = merge_audio(self.models['TTS'].inference(Res), audio_filename)
224
- print(output_audio_filename)
225
- state = state + [(text, response)]
226
- response = res['output']
227
- print("Outputs:", state)
228
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
229
 
230
  def inpainting(self, state, audio_filename, image_filename):
231
  print("===============Running inpainting =============")
232
  print("Inputs:", state)
233
  print("======>Previous memory:\n %s" % self.agent.memory)
 
234
  new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
235
  AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
236
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
@@ -240,62 +185,33 @@ class ConversationBot:
240
  return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
241
  def clear_audio(self):
242
  return gr.Audio.update(value=None, visible=False)
243
- def clear_input_audio(self):
244
- return gr.Audio.update(value=None)
245
  def clear_image(self):
246
  return gr.Image.update(value=None, visible=False)
247
- def clear_video(self):
248
- return gr.Video.update(value=None, visible=False)
249
  def clear_button(self):
250
  return gr.Button.update(visible=False)
251
-
252
- def init_agent(self, openai_api_key, interaction_type):
253
- if interaction_type == "text":
254
- for class_name, instance in self.models.items():
255
- for e in dir(instance):
256
- if e.startswith('inference'):
257
- func = getattr(instance, e)
258
- self.tools.append(Tool(name=func.name, description=func.description, func=func))
259
- self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
260
- self.agent = initialize_agent(
261
- self.tools,
262
- self.llm,
263
- agent="conversational-react-description",
264
- verbose=True,
265
- memory=self.memory,
266
- return_intermediate_steps=True,
267
- agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
268
- return gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)
269
- else:
270
- for class_name, instance in self.models.items():
271
- if class_name != 'T2A' and class_name != 'I2A' and class_name != 'Inpaint' and class_name != 'ASR' and class_name != 'SoundDetection' and class_name != 'Speech_Enh_SC' and class_name != 'Speech_SS':
272
- for e in dir(instance):
273
- if e.startswith('inference'):
274
- func = getattr(instance, e)
275
- self.tools.append(Tool(name=func.name, description=func.description, func=func))
276
-
277
- self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
278
- self.agent = initialize_agent(
279
- self.tools,
280
- self.llm,
281
- agent="conversational-react-description",
282
- verbose=True,
283
- memory=self.memory,
284
- return_intermediate_steps=True,
285
- agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
286
- return gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)
287
 
288
 
289
 
290
  if __name__ == '__main__':
291
  bot = ConversationBot({'ImageCaptioning': 'cuda:0',
292
- #'T2A': 'cuda:0',
293
- #'I2A': 'cuda:0',
294
  'TTS': 'cpu',
295
  'T2S': 'cpu',
296
  'ASR': 'cuda:0',
297
  'A2T': 'cpu',
298
- #'Inpaint': 'cuda:0',
299
  'SoundDetection': 'cpu',
300
  'Binaural': 'cuda:0',
301
  'SoundExtraction': 'cuda:0',
@@ -303,50 +219,37 @@ if __name__ == '__main__':
303
  'Speech_Enh_SC': 'cuda:0',
304
  'Speech_SS': 'cuda:0'
305
  })
306
- with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
307
- with gr.Row():
308
- gr.Markdown("## Audio ChatGPT")
309
- chatbot = gr.Chatbot(elem_id="chatbot", label="Audio ChatGPT", visible=False)
310
- state = gr.State([])
311
 
312
- with gr.Row() as select_raws:
313
- with gr.Column(scale=0.7):
314
- interaction_type = gr.Radio(choices=['text', 'speech'], value='text', label='Interaction Type')
315
  openai_api_key_textbox = gr.Textbox(
316
- placeholder="Paste your OpenAI API key here to start Audio ChatGPT(sk-...) and press Enter ↵️",
317
  show_label=False,
318
  lines=1,
319
  type="password",
320
  )
321
- with gr.Row(visible=False) as text_input_raws:
322
  with gr.Column(scale=0.7):
323
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
324
  with gr.Column(scale=0.1, min_width=0):
325
  run = gr.Button("🏃‍♂️Run")
326
  with gr.Column(scale=0.1, min_width=0):
327
- clear_txt = gr.Button("🔄Clear️")
328
  with gr.Column(scale=0.1, min_width=0):
329
  btn = gr.UploadButton("🖼️/🎙️ Upload", file_types=["image","audio"])
330
-
331
- with gr.Row():
332
- outaudio = gr.Audio(visible=False)
333
- with gr.Row():
334
- with gr.Column(scale=0.3, min_width=0):
335
- outvideo = gr.Video(visible=False)
336
- with gr.Row():
337
- show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
338
- with gr.Row():
339
- run_button = gr.Button("Predict Masked Place",visible=False)
340
-
341
- with gr.Row(visible=False) as speech_input_raws:
342
- with gr.Column(scale=0.7):
343
- speech_input = gr.Audio(source="microphone", type="filepath", label="Input")
344
- with gr.Column(scale=0.15, min_width=0):
345
- submit_btn = gr.Button("🏃‍♂️submit")
346
- with gr.Column(scale=0.15, min_width=0):
347
- clear_speech = gr.Button("🔄Clear️")
348
- with gr.Row():
349
- speech_output = gr.Audio(label="Output",visible=False)
350
  gr.Examples(
351
  examples=["Generate a speech with text 'here we go'",
352
  "Transcribe this speech",
@@ -363,27 +266,18 @@ if __name__ == '__main__':
363
  inputs=txt
364
  )
365
 
366
- openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox, interaction_type], [select_raws, chatbot, text_input_raws, speech_input_raws])
367
-
368
- txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
369
  txt.submit(lambda: "", None, txt)
370
- run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
371
  run.click(lambda: "", None, txt)
372
- btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, outaudio, outvideo])
373
- run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, outvideo, run_button])
374
- clear_txt.click(bot.memory.clear)
375
- clear_txt.click(lambda: [], None, chatbot)
376
- clear_txt.click(lambda: [], None, state)
377
- clear_txt.click(lambda:None, None, txt)
378
- clear_txt.click(bot.clear_button, None, run_button)
379
- clear_txt.click(bot.clear_image, None, show_mel)
380
- clear_txt.click(bot.clear_audio, None, outaudio)
381
- clear_txt.click(bot.clear_video, None, outvideo)
382
-
383
- submit_btn.click(bot.speech, [speech_input, state], [speech_input, speech_output, state, outvideo])
384
- clear_speech.click(bot.clear_input_audio, None, speech_input)
385
- clear_speech.click(bot.clear_audio, None, speech_output)
386
- clear_speech.click(lambda: [], None, state)
387
- clear_speech.click(bot.clear_video, None, outvideo)
388
-
389
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
6
  import gradio as gr
7
 
8
  _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
9
+ _DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head</a>. </p>'
10
  _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
11
  if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
12
  _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 
43
  New input: {input}
44
  Thought: Do I need to use a tool? {agent_scratchpad}"""
45
 
46
+ def cut_dialogue_history(history_memory, keep_last_n_words = 500):
47
  tokens = history_memory.split()
48
  n_tokens = len(tokens)
49
  print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
50
  if n_tokens < keep_last_n_words:
51
  return history_memory
52
+ else:
53
+ paragraphs = history_memory.split('\n')
54
+ last_n_tokens = n_tokens
55
+ while last_n_tokens >= keep_last_n_words:
56
+ last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
57
+ paragraphs = paragraphs[1:]
58
+ return '\n' + '\n'.join(paragraphs)
 
59
 
60
  class ConversationBot:
61
  def __init__(self, load_dict):
 
65
  self.models = dict()
66
  for class_name, device in load_dict.items():
67
  self.models[class_name] = globals()[class_name](device=device)
68
+ for class_name, instance in self.models.items():
69
+ for e in dir(instance):
70
+ if e.startswith('inference'):
71
+ func = getattr(instance, e)
72
+ self.tools.append(Tool(name=func.name, description=func.description, func=func))
73
 
74
  def run_text(self, text, state):
75
  print("===============Running run_text =============")
 
82
  response = res['output']
83
  state = state + [(text, response)]
84
  print("Outputs:", state)
85
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
86
  else:
87
  tool = res['intermediate_steps'][0][0].tool
88
  if tool == "Generate Image From User Input Text":
 
91
  state = state + [(text, response)]
92
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
93
  f"Current Memory: {self.agent.memory.buffer}")
94
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
95
  elif tool == "Detect The Sound Event From The Audio":
96
  image_filename = res['intermediate_steps'][0][1]
97
  response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
98
  state = state + [(text, response)]
99
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
100
  f"Current Memory: {self.agent.memory.buffer}")
101
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
102
  elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
103
  print("======>Current memory:\n %s" % self.agent.memory)
104
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
 
106
  #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
107
  state = state + [(text, response)]
108
  print("Outputs:", state)
109
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
110
  elif tool == "Audio Inpainting":
111
  audio_filename = res['intermediate_steps'][0][0].tool_input
112
  image_filename = res['intermediate_steps'][0][1]
113
  print("======>Current memory:\n %s" % self.agent.memory)
114
+ print(res)
115
  response = res['output']
116
  state = state + [(text, response)]
117
  print("Outputs:", state)
118
+ return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
119
  print("======>Current memory:\n %s" % self.agent.memory)
120
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
121
  audio_filename = res['intermediate_steps'][0][1]
122
  state = state + [(text, response)]
123
  print("Outputs:", state)
124
+ return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
125
 
126
  def run_image_or_audio(self, file, state, txt):
127
  file_type = file.name[-3:]
 
130
  print("Inputs:", file, state)
131
  print("======>Previous memory:\n %s" % self.agent.memory)
132
  audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
133
+ audio_load = whisper.load_audio(file.name)
134
+ soundfile.write(audio_filename, audio_load, samplerate = 16000)
 
135
  description = self.models['A2T'].inference(audio_filename)
136
  Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
137
  "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
 
143
  #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
144
  state = state + [(f"*{audio_filename}*", AI_prompt)]
145
  print("Outputs:", state)
146
+ return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
147
  else:
148
  # print("===============Running run_image =============")
149
  # print("Inputs:", file, state)
 
169
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
170
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
171
  f"Current Memory: {self.agent.memory.buffer}")
172
+ return state, state, txt + f'{txt} {image_filename} ', gr.Audio.update(visible=False)
173
 
174
  def inpainting(self, state, audio_filename, image_filename):
175
  print("===============Running inpainting =============")
176
  print("Inputs:", state)
177
  print("======>Previous memory:\n %s" % self.agent.memory)
178
+ # inpaint = Inpaint(device="cpu")
179
  new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
180
  AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
181
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
 
185
  return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
186
  def clear_audio(self):
187
  return gr.Audio.update(value=None, visible=False)
188
  def clear_image(self):
189
  return gr.Image.update(value=None, visible=False)
190
  def clear_button(self):
191
  return gr.Button.update(visible=False)
192
+ def init_agent(self, openai_api_key):
193
+ self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
194
+ self.agent = initialize_agent(
195
+ self.tools,
196
+ self.llm,
197
+ agent="conversational-react-description",
198
+ verbose=True,
199
+ memory=self.memory,
200
+ return_intermediate_steps=True,
201
+ agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
202
+ return gr.update(visible = True)
203
 
204
 
205
 
206
  if __name__ == '__main__':
207
  bot = ConversationBot({'ImageCaptioning': 'cuda:0',
208
+ 'T2A': 'cuda:0',
209
+ 'I2A': 'cuda:0',
210
  'TTS': 'cpu',
211
  'T2S': 'cpu',
212
  'ASR': 'cuda:0',
213
  'A2T': 'cpu',
214
+ 'Inpaint': 'cuda:0',
215
  'SoundDetection': 'cpu',
216
  'Binaural': 'cuda:0',
217
  'SoundExtraction': 'cuda:0',
 
219
  'Speech_Enh_SC': 'cuda:0',
220
  'Speech_SS': 'cuda:0'
221
  })
222
+ with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
223
+ gr.Markdown(_DESCRIPTION)
224
 
225
+ with gr.Row():
226
  openai_api_key_textbox = gr.Textbox(
227
+ placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
228
  show_label=False,
229
  lines=1,
230
  type="password",
231
  )
232
+
233
+ chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
234
+ state = gr.State([])
235
+ with gr.Row(visible = False) as input_raws:
236
  with gr.Column(scale=0.7):
237
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
238
  with gr.Column(scale=0.1, min_width=0):
239
  run = gr.Button("🏃‍♂️Run")
240
  with gr.Column(scale=0.1, min_width=0):
241
+ clear = gr.Button("🔄Clear️")
242
  with gr.Column(scale=0.1, min_width=0):
243
  btn = gr.UploadButton("🖼️/🎙️ Upload", file_types=["image","audio"])
244
+ with gr.Row():
245
+ with gr.Column():
246
+ outaudio = gr.Audio(visible=False)
247
+ with gr.Row():
248
+ with gr.Column():
249
+ show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
250
+ with gr.Row():
251
+ with gr.Column():
252
+ run_button = gr.Button("Predict Masked Place",visible=False)
253
  gr.Examples(
254
  examples=["Generate a speech with text 'here we go'",
255
  "Transcribe this speech",
 
266
  inputs=txt
267
  )
268
 
269
+ openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
270
+ txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
 
271
  txt.submit(lambda: "", None, txt)
272
+ run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
273
  run.click(lambda: "", None, txt)
274
+ btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
275
+ run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
276
+ clear.click(bot.memory.clear)
277
+ clear.click(lambda: [], None, chatbot)
278
+ clear.click(lambda: [], None, state)
279
+ clear.click(lambda:None, None, txt)
280
+ clear.click(bot.clear_button, None, run_button)
281
+ clear.click(bot.clear_image, None, show_mel)
282
+ clear.click(bot.clear_audio, None, outaudio)
283
  demo.launch(server_name="0.0.0.0", server_port=7860)
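
A note on the history-trimming helper above: cut_dialogue_history keeps roughly the last keep_last_n_words whitespace-separated words of the agent's memory buffer by dropping whole newline-separated paragraphs from the front; it counts words, not model tokens. A minimal standalone sketch of that logic (the sample buffer is illustrative only):

def cut_dialogue_history(history_memory, keep_last_n_words=500):
    # Count whitespace-separated words in the whole buffer.
    tokens = history_memory.split()
    n_tokens = len(tokens)
    if n_tokens < keep_last_n_words:
        return history_memory
    # Drop the oldest newline-separated paragraphs until the rest fits the budget.
    paragraphs = history_memory.split('\n')
    last_n_tokens = n_tokens
    while last_n_tokens >= keep_last_n_words:
        last_n_tokens -= len(paragraphs[0].split(' '))
        paragraphs = paragraphs[1:]
    return '\n' + '\n'.join(paragraphs)

# Example: with a 10-word budget the two oldest lines are dropped first.
buffer = "Human: hi\nAI: Received.\nHuman: generate a speech with text 'here we go'"
print(cut_dialogue_history(buffer, keep_last_n_words=10))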
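The tool list is now assembled once in ConversationBot.__init__: every method on a loaded model whose name starts with 'inference' is wrapped as a langchain Tool, and init_agent only needs the OpenAI key to build the conversational-react-description agent around those tools. A rough sketch of that discovery pattern, using a hypothetical DummyT2S class and a prompts helper in place of the real audio_foundation_models classes (which attach .name/.description to their inference methods in a similar way):

def prompts(name, description):
    # Illustrative stand-in for the decorator that tags each inference
    # method with the tool name/description the agent will see.
    def decorator(func):
        func.name = name
        func.description = description
        return func
    return decorator

class DummyT2S:
    def __init__(self, device="cpu"):
        self.device = device

    @prompts(name="Generate Singing Voice From User Input Text",
             description="useful when you want to generate a singing voice from text")
    def inference(self, text):
        return "audio/demo.wav"

models = {"T2S": DummyT2S(device="cpu")}
tools = []
for class_name, instance in models.items():
    for e in dir(instance):
        if e.startswith("inference"):
            func = getattr(instance, e)
            # app.py wraps these as Tool(name=func.name, description=func.description, func=func)
            tools.append((func.name, func.description, func))

print([name for name, _, _ in tools])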