lmzjms committed
Commit 4d6d83d
1 Parent(s): 90e0038

Update app.py

Files changed (1)
  1. app.py +66 -179
app.py CHANGED
@@ -149,7 +149,7 @@ from audio_foundation_models import *
149
  import gradio as gr
150
 
151
  _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
152
- _DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'
153
  _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
154
  if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
155
  _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
@@ -187,22 +187,18 @@ New input: {input}
187
  Thought: Do I need to use a tool? {agent_scratchpad}"""
188
 
189
  def cut_dialogue_history(history_memory, keep_last_n_words = 500):
190
- if history_memory is None or len(history_memory) == 0:
191
- print("history memory is none.")
192
- return history_memory
193
  tokens = history_memory.split()
194
  n_tokens = len(tokens)
195
  print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
196
  if n_tokens < keep_last_n_words:
197
  return history_memory
198
- paragraphs = history_memory.split('\n')
199
- last_n_tokens = n_tokens
200
- while last_n_tokens >= keep_last_n_words:
201
- last_n_tokens -= len(paragraphs[0].split(' '))
202
- paragraphs = paragraphs[1:]
203
- return '\n' + '\n'.join(paragraphs)
204
-
205
-
206
 
207
  class ConversationBot:
208
  def __init__(self, load_dict):
@@ -212,24 +208,24 @@ class ConversationBot:
212
  self.models = dict()
213
  for class_name, device in load_dict.items():
214
  self.models[class_name] = globals()[class_name](device=device)
215
 
216
  def run_text(self, text, state):
217
  print("===============Running run_text =============")
218
  print("Inputs:", text, state)
219
  print("======>Previous memory:\n %s" % self.agent.memory)
220
  self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
221
- print("###memory###")
222
- print(self.agent.memory)
223
- print("###buffer###")
224
- print(self.agent.memory.buffer)
225
-
226
  res = self.agent({"input": text})
227
  if res['intermediate_steps'] == []:
228
  print("======>Current memory:\n %s" % self.agent.memory)
229
  response = res['output']
230
  state = state + [(text, response)]
231
  print("Outputs:", state)
232
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
233
  else:
234
  tool = res['intermediate_steps'][0][0].tool
235
  if tool == "Generate Image From User Input Text":
@@ -238,14 +234,14 @@ class ConversationBot:
238
  state = state + [(text, response)]
239
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
240
  f"Current Memory: {self.agent.memory.buffer}")
241
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
242
  elif tool == "Detect The Sound Event From The Audio":
243
  image_filename = res['intermediate_steps'][0][1]
244
  response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
245
  state = state + [(text, response)]
246
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
247
  f"Current Memory: {self.agent.memory.buffer}")
248
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
249
  elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
250
  print("======>Current memory:\n %s" % self.agent.memory)
251
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
@@ -253,21 +249,22 @@ class ConversationBot:
253
  #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
254
  state = state + [(text, response)]
255
  print("Outputs:", state)
256
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
257
  elif tool == "Audio Inpainting":
258
  audio_filename = res['intermediate_steps'][0][0].tool_input
259
  image_filename = res['intermediate_steps'][0][1]
260
  print("======>Current memory:\n %s" % self.agent.memory)
 
261
  response = res['output']
262
  state = state + [(text, response)]
263
  print("Outputs:", state)
264
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
265
  print("======>Current memory:\n %s" % self.agent.memory)
266
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
267
  audio_filename = res['intermediate_steps'][0][1]
268
  state = state + [(text, response)]
269
  print("Outputs:", state)
270
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
271
 
272
  def run_image_or_audio(self, file, state, txt):
273
  file_type = file.name[-3:]
@@ -276,9 +273,8 @@ class ConversationBot:
276
  print("Inputs:", file, state)
277
  print("======>Previous memory:\n %s" % self.agent.memory)
278
  audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
279
- # audio_load = whisper.load_audio(file.name)
280
- audio_load, sr = soundfile.read(file.name)
281
- soundfile.write(audio_filename, audio_load, samplerate = sr)
282
  description = self.models['A2T'].inference(audio_filename)
283
  Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
284
  "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
@@ -290,7 +286,7 @@ class ConversationBot:
290
  #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
291
  state = state + [(f"*{audio_filename}*", AI_prompt)]
292
  print("Outputs:", state)
293
- return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False)
294
  else:
295
  # print("===============Running run_image =============")
296
  # print("Inputs:", file, state)
@@ -316,69 +312,13 @@ class ConversationBot:
316
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
317
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
318
  f"Current Memory: {self.agent.memory.buffer}")
319
- return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False)
320
-
321
- def speech(self, speech_input, state):
322
- input_audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
323
- text = self.models['ASR'].translate_english(speech_input)
324
- print("Inputs:", text, state)
325
- print("======>Previous memory:\n %s" % self.agent.memory)
326
- self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
327
- res = self.agent({"input": text})
328
- if res['intermediate_steps'] == []:
329
- print("======>Current memory:\n %s" % self.agent.memory)
330
- response = res['output']
331
- output_audio_filename = self.models['TTS'].inference(response)
332
- state = state + [(text, response)]
333
- print("Outputs:", state)
334
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
335
- else:
336
- tool = res['intermediate_steps'][0][0].tool
337
- if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Target Sound Detection":
338
- print("======>Current memory:\n %s" % self.agent.memory)
339
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
340
- output_audio_filename = self.models['TTS'].inference(res['output'])
341
- state = state + [(text, response)]
342
- print("Outputs:", state)
343
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
344
- elif tool == "Transcribe Speech":
345
- print("======>Current memory:\n %s" % self.agent.memory)
346
- output_audio_filename = self.models['TTS'].inference(res['output'])
347
- response = res['output']
348
- state = state + [(text, response)]
349
- print("Outputs:", state)
350
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
351
- elif tool == "Detect The Sound Event From The Audio":
352
- print("======>Current memory:\n %s" % self.agent.memory)
353
- image_filename = res['intermediate_steps'][0][1]
354
- output_audio_filename = self.models['TTS'].inference(res['output'])
355
- response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
356
- state = state + [(text, response)]
357
- print("Outputs:", state)
358
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
359
- elif tool == "Generate a talking human portrait video given a input Audio":
360
- video_filename = res['intermediate_steps'][0][1]
361
- print("======>Current memory:\n %s" % self.agent.memory)
362
- response = res['output']
363
- output_audio_filename = self.models['TTS'].inference(res['output'])
364
- state = state + [(text, response)]
365
- print("Outputs:", state)
366
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(value=video_filename,visible=True)
367
- print("======>Current memory:\n %s" % self.agent.memory)
368
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
369
- audio_filename = res['intermediate_steps'][0][1]
370
- Res = "The audio file has been generated and the audio is "
371
- output_audio_filename = merge_audio(self.models['TTS'].inference(Res), audio_filename)
372
- print(output_audio_filename)
373
- state = state + [(text, response)]
374
- response = res['output']
375
- print("Outputs:", state)
376
- return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
377
 
378
  def inpainting(self, state, audio_filename, image_filename):
379
  print("===============Running inpainting =============")
380
  print("Inputs:", state)
381
  print("======>Previous memory:\n %s" % self.agent.memory)
 
382
  new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
383
  AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
384
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
@@ -388,52 +328,21 @@ class ConversationBot:
388
  return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
389
  def clear_audio(self):
390
  return gr.Audio.update(value=None, visible=False)
391
- def clear_input_audio(self):
392
- return gr.Audio.update(value=None)
393
  def clear_image(self):
394
  return gr.Image.update(value=None, visible=False)
395
- def clear_video(self):
396
- return gr.Video.update(value=None, visible=False)
397
  def clear_button(self):
398
  return gr.Button.update(visible=False)
399
-
400
- def init_agent(self, openai_api_key, interaction_type):
401
- if interaction_type == "text":
402
- for class_name, instance in self.models.items():
403
- for e in dir(instance):
404
- if e.startswith('inference'):
405
- func = getattr(instance, e)
406
- self.tools.append(Tool(name=func.name, description=func.description, func=func))
407
- self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
408
- self.agent = initialize_agent(
409
- self.tools,
410
- self.llm,
411
- agent="conversational-react-description",
412
- verbose=True,
413
- memory=self.memory,
414
- return_intermediate_steps=True,
415
- agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
416
- return gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)
417
- else:
418
- for class_name, instance in self.models.items():
419
- if class_name != 'T2A' and class_name != 'I2A' and class_name != 'Inpaint' and class_name != 'ASR' and class_name != 'SoundDetection' and class_name != 'Speech_Enh_SC' and class_name != 'Speech_SS':
420
- for e in dir(instance):
421
- if e.startswith('inference'):
422
- func = getattr(instance, e)
423
- self.tools.append(Tool(name=func.name, description=func.description, func=func))
424
-
425
- self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
426
- self.agent = initialize_agent(
427
- self.tools,
428
- self.llm,
429
- agent="conversational-react-description",
430
- verbose=True,
431
- memory=self.memory,
432
- return_intermediate_steps=True,
433
- agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
434
- self.agent.agent.llm_chain.verbose = True
435
- self.agent.llm_chain.verbose = True
436
- return gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)
437
 
438
 
439
 
@@ -453,50 +362,37 @@ if __name__ == '__main__':
453
  'Speech_Enh_SC': 'cuda:0',
454
  'Speech_SS': 'cuda:0'
455
  })
456
- with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
457
- with gr.Row():
458
- gr.Markdown("## AudioGPT")
459
- chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT", visible=False)
460
- state = gr.State([])
461
 
462
- with gr.Row() as select_raws:
463
- with gr.Column(scale=0.7):
464
- interaction_type = gr.Radio(choices=['text', 'speech'], value='text', label='Interaction Type')
465
  openai_api_key_textbox = gr.Textbox(
466
  placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
467
  show_label=False,
468
  lines=1,
469
  type="password",
470
  )
471
- with gr.Row(visible=False) as text_input_raws:
472
  with gr.Column(scale=0.7):
473
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
474
  with gr.Column(scale=0.1, min_width=0):
475
  run = gr.Button("🏃‍♂️Run")
476
  with gr.Column(scale=0.1, min_width=0):
477
- clear_txt = gr.Button("🔄Clear️")
478
  with gr.Column(scale=0.1, min_width=0):
479
- btn = gr.UploadButton("🖼️Upload", file_types=["image","audio"])
480
-
481
- with gr.Row():
482
- outaudio = gr.Audio(visible=False)
483
- with gr.Row():
484
- with gr.Column(scale=0.3, min_width=0):
485
- outvideo = gr.Video(visible=False)
486
- with gr.Row():
487
- show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
488
- with gr.Row():
489
- run_button = gr.Button("Predict Masked Place",visible=False)
490
-
491
- with gr.Row(visible=False) as speech_input_raws:
492
- with gr.Column(scale=0.7):
493
- speech_input = gr.Audio(source="microphone", type="filepath", label="Input")
494
- with gr.Column(scale=0.15, min_width=0):
495
- submit_btn = gr.Button("🏃‍♂️Submit")
496
- with gr.Column(scale=0.15, min_width=0):
497
- clear_speech = gr.Button("🔄Clear️")
498
- with gr.Row():
499
- speech_output = gr.Audio(label="Output",visible=False)
500
  gr.Examples(
501
  examples=["Generate a speech with text 'here we go'",
502
  "Transcribe this speech",
@@ -513,27 +409,18 @@ if __name__ == '__main__':
513
  inputs=txt
514
  )
515
 
516
- openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox, interaction_type], [select_raws, chatbot, text_input_raws, speech_input_raws])
517
-
518
- txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
519
  txt.submit(lambda: "", None, txt)
520
- run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
521
  run.click(lambda: "", None, txt)
522
- btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, outaudio, outvideo])
523
- run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, outvideo, run_button])
524
- clear_txt.click(bot.memory.clear)
525
- clear_txt.click(lambda: [], None, chatbot)
526
- clear_txt.click(lambda: [], None, state)
527
- clear_txt.click(lambda:None, None, txt)
528
- clear_txt.click(bot.clear_button, None, run_button)
529
- clear_txt.click(bot.clear_image, None, show_mel)
530
- clear_txt.click(bot.clear_audio, None, outaudio)
531
- clear_txt.click(bot.clear_video, None, outvideo)
532
-
533
- submit_btn.click(bot.speech, [speech_input, state], [speech_input, speech_output, state, outvideo])
534
- clear_speech.click(bot.clear_input_audio, None, speech_input)
535
- clear_speech.click(bot.clear_audio, None, speech_output)
536
- clear_speech.click(lambda: [], None, state)
537
- clear_speech.click(bot.clear_video, None, outvideo)
538
-
539
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
149
  import gradio as gr
150
 
151
  _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
152
+ _DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head</a>. </p>'
153
  _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
154
  if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
155
  _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 
187
  Thought: Do I need to use a tool? {agent_scratchpad}"""
188
 
189
  def cut_dialogue_history(history_memory, keep_last_n_words = 500):
190
  tokens = history_memory.split()
191
  n_tokens = len(tokens)
192
  print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
193
  if n_tokens < keep_last_n_words:
194
  return history_memory
195
+ else:
196
+ paragraphs = history_memory.split('\n')
197
+ last_n_tokens = n_tokens
198
+ while last_n_tokens >= keep_last_n_words:
199
+ last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
200
+ paragraphs = paragraphs[1:]
201
+ return '\n' + '\n'.join(paragraphs)
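(For reference: a minimal standalone version of the trimming helper rebuilt above, assuming the same whitespace-based token counting as in the diff, so keep_last_n_words bounds words rather than model tokens; whole leading lines are dropped until the remainder fits.)

def cut_dialogue_history(history_memory, keep_last_n_words=500):
    # Bound the agent's memory buffer: count whitespace-separated tokens and
    # drop whole leading lines until fewer than keep_last_n_words remain.
    tokens = history_memory.split()
    if len(tokens) < keep_last_n_words:
        return history_memory
    paragraphs = history_memory.split('\n')
    last_n_tokens = len(tokens)
    while last_n_tokens >= keep_last_n_words:
        last_n_tokens -= len(paragraphs[0].split(' '))
        paragraphs = paragraphs[1:]
    return '\n' + '\n'.join(paragraphs)

# Example: with keep_last_n_words=4 the first line is trimmed away.
print(cut_dialogue_history("Human: hi there\nAI: hello friend", keep_last_n_words=4))
# -> "\nAI: hello friend"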
 
202
 
203
  class ConversationBot:
204
  def __init__(self, load_dict):
 
208
  self.models = dict()
209
  for class_name, device in load_dict.items():
210
  self.models[class_name] = globals()[class_name](device=device)
211
+ for class_name, instance in self.models.items():
212
+ for e in dir(instance):
213
+ if e.startswith('inference'):
214
+ func = getattr(instance, e)
215
+ self.tools.append(Tool(name=func.name, description=func.description, func=func))
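(For reference: a hedged sketch of the registration loop added above. In the app the inference methods' name/description attributes are typically attached by a decorator in audio_foundation_models; the dummy model below is hypothetical and only illustrates the pattern.)

from langchain.agents import Tool

class DummyT2A:
    # Hypothetical stand-in for one of the audio foundation model wrappers.
    def inference(self, text):
        return f"audio/{text[:8]}.wav"
    # In the real app these attributes come from a decorator; set by hand here.
    inference.name = "Generate Audio From User Input Text"
    inference.description = "useful when you want to generate an audio file from a text prompt"

models = {"DummyT2A": DummyT2A()}
tools = []
for class_name, instance in models.items():
    for e in dir(instance):
        if e.startswith('inference'):
            func = getattr(instance, e)
            tools.append(Tool(name=func.name, description=func.description, func=func))
# `tools` now holds one langchain Tool per inference* method, ready to pass to initialize_agent.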
216
 
217
  def run_text(self, text, state):
218
  print("===============Running run_text =============")
219
  print("Inputs:", text, state)
220
  print("======>Previous memory:\n %s" % self.agent.memory)
221
  self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
222
  res = self.agent({"input": text})
223
  if res['intermediate_steps'] == []:
224
  print("======>Current memory:\n %s" % self.agent.memory)
225
  response = res['output']
226
  state = state + [(text, response)]
227
  print("Outputs:", state)
228
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
229
  else:
230
  tool = res['intermediate_steps'][0][0].tool
231
  if tool == "Generate Image From User Input Text":
 
234
  state = state + [(text, response)]
235
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
236
  f"Current Memory: {self.agent.memory.buffer}")
237
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
238
  elif tool == "Detect The Sound Event From The Audio":
239
  image_filename = res['intermediate_steps'][0][1]
240
  response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
241
  state = state + [(text, response)]
242
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
243
  f"Current Memory: {self.agent.memory.buffer}")
244
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
245
  elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
246
  print("======>Current memory:\n %s" % self.agent.memory)
247
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
 
249
  #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
250
  state = state + [(text, response)]
251
  print("Outputs:", state)
252
+ return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
253
  elif tool == "Audio Inpainting":
254
  audio_filename = res['intermediate_steps'][0][0].tool_input
255
  image_filename = res['intermediate_steps'][0][1]
256
  print("======>Current memory:\n %s" % self.agent.memory)
257
+ print(res)
258
  response = res['output']
259
  state = state + [(text, response)]
260
  print("Outputs:", state)
261
+ return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
262
  print("======>Current memory:\n %s" % self.agent.memory)
263
  response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
264
  audio_filename = res['intermediate_steps'][0][1]
265
  state = state + [(text, response)]
266
  print("Outputs:", state)
267
+ return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
268
 
269
  def run_image_or_audio(self, file, state, txt):
270
  file_type = file.name[-3:]
 
273
  print("Inputs:", file, state)
274
  print("======>Previous memory:\n %s" % self.agent.memory)
275
  audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
276
+ audio_load = whisper.load_audio(file.name)
277
+ soundfile.write(audio_filename, audio_load, samplerate = 16000)
 
278
  description = self.models['A2T'].inference(audio_filename)
279
  Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
280
  "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
 
286
  #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
287
  state = state + [(f"*{audio_filename}*", AI_prompt)]
288
  print("Outputs:", state)
289
+ return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
290
  else:
291
  # print("===============Running run_image =============")
292
  # print("Inputs:", file, state)
 
312
  state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
313
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
314
  f"Current Memory: {self.agent.memory.buffer}")
315
+ return state, state, txt + f'{txt} {image_filename} ', gr.Audio.update(visible=False)
316
 
317
  def inpainting(self, state, audio_filename, image_filename):
318
  print("===============Running inpainting =============")
319
  print("Inputs:", state)
320
  print("======>Previous memory:\n %s" % self.agent.memory)
321
+ # inpaint = Inpaint(device="cpu")
322
  new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
323
  AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
324
  self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
 
328
  return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
329
  def clear_audio(self):
330
  return gr.Audio.update(value=None, visible=False)
331
  def clear_image(self):
332
  return gr.Image.update(value=None, visible=False)
333
  def clear_button(self):
334
  return gr.Button.update(visible=False)
335
+ def init_agent(self, openai_api_key):
336
+ self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
337
+ self.agent = initialize_agent(
338
+ self.tools,
339
+ self.llm,
340
+ agent="conversational-react-description",
341
+ verbose=True,
342
+ memory=self.memory,
343
+ return_intermediate_steps=True,
344
+ agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
345
+ return gr.update(visible = True)
346
 
347
 
348
 
 
362
  'Speech_Enh_SC': 'cuda:0',
363
  'Speech_SS': 'cuda:0'
364
  })
365
+ with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
366
+ gr.Markdown(_DESCRIPTION)
367
 
368
+ with gr.Row():
369
  openai_api_key_textbox = gr.Textbox(
370
  placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
371
  show_label=False,
372
  lines=1,
373
  type="password",
374
  )
375
+
376
+ chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
377
+ state = gr.State([])
378
+ with gr.Row(visible = False) as input_raws:
379
  with gr.Column(scale=0.7):
380
  txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
381
  with gr.Column(scale=0.1, min_width=0):
382
  run = gr.Button("🏃‍♂️Run")
383
  with gr.Column(scale=0.1, min_width=0):
384
+ clear = gr.Button("🔄Clear️")
385
  with gr.Column(scale=0.1, min_width=0):
386
+ btn = gr.UploadButton("🖼️/🎙️ Upload", file_types=["image","audio"])
387
+ with gr.Row():
388
+ with gr.Column():
389
+ outaudio = gr.Audio(visible=False)
390
+ with gr.Row():
391
+ with gr.Column():
392
+ show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
393
+ with gr.Row():
394
+ with gr.Column():
395
+ run_button = gr.Button("Predict Masked Place",visible=False)
396
  gr.Examples(
397
  examples=["Generate a speech with text 'here we go'",
398
  "Transcribe this speech",
 
409
  inputs=txt
410
  )
411
 
412
+ openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
413
+ txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
 
414
  txt.submit(lambda: "", None, txt)
415
+ run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
416
  run.click(lambda: "", None, txt)
417
+ btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
418
+ run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
419
+ clear.click(bot.memory.clear)
420
+ clear.click(lambda: [], None, chatbot)
421
+ clear.click(lambda: [], None, state)
422
+ clear.click(lambda:None, None, txt)
423
+ clear.click(bot.clear_button, None, run_button)
424
+ clear.click(bot.clear_image, None, show_mel)
425
+ clear.click(bot.clear_audio, None, outaudio)
426
  demo.launch(server_name="0.0.0.0", server_port=7860)
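(For reference: a minimal sketch of the reveal-on-key pattern the new layout relies on, assuming the gradio 3.x API this Space targets; component names below are illustrative, and the real init_agent also builds the LangChain agent before returning.)

import gradio as gr

def init_agent(openai_api_key):
    # The real handler constructs the LangChain agent from the key here,
    # then unhides the chat input row.
    return gr.update(visible=True)

with gr.Blocks() as demo:
    key_box = gr.Textbox(show_label=False, type="password",
                         placeholder="Paste your OpenAI API key and press Enter")
    with gr.Row(visible=False) as input_row:  # hidden until a key is submitted
        txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter")
    key_box.submit(init_agent, [key_box], [input_row])

# demo.launch()  # uncomment to serve locally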