Spaces:

AIGC-Audio
/

Make_An_Audio

Sleeping

App Files Files Community

lmzjms committed on May 22, 2023

Commit

85c78a5

1 Parent(s): ce764e5

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -213

app.py CHANGED Viewed

@@ -1,146 +1,3 @@
-# import torch
-# import numpy as np
-# import gradio as gr
-# from PIL import Image
-# from omegaconf import OmegaConf
-# from pathlib import Path
-# from vocoder.bigvgan.models import VocoderBigVGAN
-# from ldm.models.diffusion.ddim import DDIMSampler
-# from ldm.util import instantiate_from_config
-# from wav_evaluation.models.CLAPWrapper import CLAPWrapper
-# SAMPLE_RATE = 16000
-# torch.set_grad_enabled(False)
-# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-# def initialize_model(config, ckpt):
-#     config = OmegaConf.load(config)
-#     model = instantiate_from_config(config.model)
-#     model.load_state_dict(torch.load(ckpt,map_location='cpu')["state_dict"], strict=False)
-#     model = model.to(device)
-#     model.cond_stage_model.to(model.device)
-#     model.cond_stage_model.device = model.device
-#     print(model.device,device,model.cond_stage_model.device)
-#     sampler = DDIMSampler(model)
-#     return sampler
-# sampler = initialize_model('configs/text_to_audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt')
-# vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w',device=device)
-# clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
-# def select_best_audio(prompt,wav_list):
-#     text_embeddings = clap_model.get_text_embeddings([prompt])
-#     score_list = []
-#     for data in wav_list:
-#         sr,wav = data
-#         audio_embeddings = clap_model.get_audio_embeddings([(torch.FloatTensor(wav),sr)], resample=True)
-#         score = clap_model.compute_similarity(audio_embeddings, text_embeddings,use_logit_scale=False).squeeze().cpu().numpy()
-#         score_list.append(score)
-#     max_index = np.array(score_list).argmax()
-#     print(score_list,max_index)
-#     return wav_list[max_index]
-# def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=624, H=80):
-#     prng = np.random.RandomState(seed)
-#     start_code = prng.randn(n_samples, sampler.model.first_stage_model.embed_dim, H // 8, W // 8)
-#     start_code = torch.from_numpy(start_code).to(device=device, dtype=torch.float32)
-#     uc = None
-#     if scale != 1.0:
-#         uc = sampler.model.get_learned_conditioning(n_samples * [""])
-#     c = sampler.model.get_learned_conditioning(n_samples * [prompt])# shape:[1,77,1280],即还没有变成句子embedding，仍是每个单词的embedding
-#     shape = [sampler.model.first_stage_model.embed_dim, H//8, W//8]  # (z_dim, 80//2^x, 848//2^x)
-#     samples_ddim, _ = sampler.sample(S=ddim_steps,
-#                                         conditioning=c,
-#                                         batch_size=n_samples,
-#                                         shape=shape,
-#                                         verbose=False,
-#                                         unconditional_guidance_scale=scale,
-#                                         unconditional_conditioning=uc,
-#                                         x_T=start_code)
-#     x_samples_ddim = sampler.model.decode_first_stage(samples_ddim)
-#     x_samples_ddim = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0) # [0, 1]
-#     wav_list = []
-#     for idx,spec in enumerate(x_samples_ddim):
-#         wav = vocoder.vocode(spec)
-#         wav_list.append((SAMPLE_RATE,wav))
-#     best_wav = select_best_audio(prompt,wav_list)
-#     return best_wav
-# def predict(prompt, ddim_steps, num_samples, scale, seed):# 经过试验，这个input_image需要是256x256、512x512的大小效果才正常，实际应该resize一下，输出再resize回去，但是他们使用的是pad，不知道为什么
-#     melbins,mel_len = 80,624
-#     with torch.no_grad():
-#         result = txt2audio(
-#             sampler=sampler,
-#             vocoder=vocoder,
-#             prompt=prompt,
-#             seed=seed,
-#             scale=scale,
-#             ddim_steps=ddim_steps,
-#             n_samples=num_samples,
-#             H=melbins, W=mel_len
-#         )
-#     return result
-# with gr.Blocks() as demo:
-#     with gr.Row():
-#         gr.Markdown("## Make-An-Audio: Text-to-Audio Generation")
-#     with gr.Row():
-#         with gr.Column():
-#             prompt = gr.Textbox(label="Prompt: Input your text here.        ")
-#             run_button = gr.Button(label="Run")
-#             with gr.Accordion("Advanced options", open=False):
-#                 num_samples = gr.Slider(
-#                     label="Select from audios num.This number control the number of candidates \
-#                         (e.g., generate three audios and choose the best to show you). A Larger value usually lead to \
-#                         better quality with heavier computation", minimum=1, maximum=10, value=3, step=1)
-#                 # num_samples = 1
-#                 ddim_steps = gr.Slider(label="Steps", minimum=1,
-#                                        maximum=150, value=100, step=1)
-#                 scale = gr.Slider(
-#                     label="Guidance Scale:(Large => more relevant to text but the quality may drop)", minimum=0.1, maximum=4.0, value=1.5, step=0.1
-#                 )
-#                 seed = gr.Slider(
-#                     label="Seed:Change this value (any integer number) will lead to a different generation result.",
-#                     minimum=0,
-#                     maximum=2147483647,
-#                     step=1,
-#                     value=44,
-#                 )
-#         with gr.Column():
-#             # audio_list = []
-#             # for i in range(int(num_samples)):
-#             #     audio_list.append(gr.outputs.Audio())
-#             outaudio = gr.Audio()
-#     run_button.click(fn=predict, inputs=[
-#                     prompt,ddim_steps, num_samples, scale, seed], outputs=[outaudio])# inputs的参数只能传gr.xxx
-#     with gr.Row():
-#         with gr.Column():
-#             gr.Examples(
-#                         examples = [['a dog barking and a bird chirping',100,3,1.5,55],['fireworks pop and explode',100,3,1.5,55],
-#                                         ['piano and violin plays',100,3,1.5,55],['wind thunder and rain falling',100,3,1.5,55],['music made by drum kit',100,3,1.5,55]],
-#                         inputs = [prompt,ddim_steps, num_samples, scale, seed],
-#                         outputs = [outaudio]
-#                         )
-#         with gr.Column():
-#             pass
-# demo.launch()
 from langchain.agents.initialize import initialize_agent
 from langchain.agents.tools import Tool
 from langchain.chains.conversation.memory import ConversationBufferMemory
@@ -149,7 +6,7 @@ from audio_foundation_models import *
 import gradio as gr
 _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
-_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head</a>. </p>'
 _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
 if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
     _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
@@ -186,19 +43,23 @@ Previous conversation history:
 New input: {input}
 Thought: Do I need to use a tool? {agent_scratchpad}"""
-def cut_dialogue_history(history_memory, keep_last_n_words = 500):
     tokens = history_memory.split()
     n_tokens = len(tokens)
     print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
     if n_tokens < keep_last_n_words:
         return history_memory
-    else:
-        paragraphs = history_memory.split('\n')
-        last_n_tokens = n_tokens
-        while last_n_tokens >= keep_last_n_words:
-            last_n_tokens = last_n_tokens - len(paragraphs[0].split(' '))
-            paragraphs = paragraphs[1:]
-        return '\n' + '\n'.join(paragraphs)
 class ConversationBot:
     def __init__(self, load_dict):
@@ -208,11 +69,6 @@ class ConversationBot:
         self.models = dict()
         for class_name, device in load_dict.items():
             self.models[class_name] = globals()[class_name](device=device)
-        for class_name, instance in self.models.items():
-            for e in dir(instance):
-                if e.startswith('inference'):
-                    func = getattr(instance, e)
-                    self.tools.append(Tool(name=func.name, description=func.description, func=func))
     def run_text(self, text, state):
         print("===============Running run_text =============")
@@ -225,7 +81,7 @@ class ConversationBot:
             response = res['output']
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
         else:
             tool = res['intermediate_steps'][0][0].tool
             if tool == "Generate Image From User Input Text":
@@ -234,14 +90,14 @@ class ConversationBot:
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
-                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Detect The Sound Event From The Audio":
                 image_filename = res['intermediate_steps'][0][1]
                 response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
-                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
                 print("======>Current memory:\n %s" % self.agent.memory)
                 response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
@@ -249,22 +105,21 @@ class ConversationBot:
                 #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print("Outputs:", state)
-                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Audio Inpainting":
                 audio_filename = res['intermediate_steps'][0][0].tool_input
                 image_filename = res['intermediate_steps'][0][1]
                 print("======>Current memory:\n %s" % self.agent.memory)
-                print(res)
                 response = res['output']
                 state = state + [(text, response)]
                 print("Outputs:", state)
-                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
             print("======>Current memory:\n %s" % self.agent.memory)
             response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
             audio_filename = res['intermediate_steps'][0][1]
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
     def run_image_or_audio(self, file, state, txt):
         file_type = file.name[-3:]
@@ -273,8 +128,9 @@ class ConversationBot:
             print("Inputs:", file, state)
             print("======>Previous memory:\n %s" % self.agent.memory)
             audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            audio_load = whisper.load_audio(file.name)
-            soundfile.write(audio_filename, audio_load, samplerate = 16000)
             description = self.models['A2T'].inference(audio_filename)
             Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
                            "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
@@ -286,7 +142,7 @@ class ConversationBot:
             #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
             state = state + [(f"*{audio_filename}*", AI_prompt)]
             print("Outputs:", state)
-            return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
         else:
             # print("===============Running run_image =============")
             # print("Inputs:", file, state)
@@ -312,13 +168,69 @@ class ConversationBot:
             state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
             print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
                   f"Current Memory: {self.agent.memory.buffer}")
-            return state, state, txt + f'{txt} {image_filename} ', gr.Audio.update(visible=False)
     def inpainting(self, state, audio_filename, image_filename):
         print("===============Running inpainting =============")
         print("Inputs:", state)
         print("======>Previous memory:\n %s" % self.agent.memory)
-        # inpaint = Inpaint(device="cpu")
         new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
         AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
         self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
@@ -328,33 +240,62 @@ class ConversationBot:
         return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
     def clear_audio(self):
         return gr.Audio.update(value=None, visible=False)
     def clear_image(self):
         return gr.Image.update(value=None, visible=False)
     def clear_button(self):
         return gr.Button.update(visible=False)
-    def init_agent(self, openai_api_key):
-        self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
-        self.agent = initialize_agent(
-            self.tools,
-            self.llm,
-            agent="conversational-react-description",
-            verbose=True,
-            memory=self.memory,
-            return_intermediate_steps=True,
-            agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
-        return gr.update(visible = True)
 if __name__ == '__main__':
     bot = ConversationBot({'ImageCaptioning': 'cuda:0',
-                           # 'T2A': 'cuda:0',
-                           # 'I2A': 'cuda:0',
                            'TTS': 'cpu',
                            'T2S': 'cpu',
                            'ASR': 'cuda:0',
                            'A2T': 'cpu',
-                           # 'Inpaint': 'cuda:0',
                            'SoundDetection': 'cpu',
                            'Binaural': 'cuda:0',
                            'SoundExtraction': 'cuda:0',
@@ -362,37 +303,50 @@ if __name__ == '__main__':
                            'Speech_Enh_SC': 'cuda:0',
                            'Speech_SS': 'cuda:0'
                            })
-    with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
-        gr.Markdown(_DESCRIPTION)
         with gr.Row():
             openai_api_key_textbox = gr.Textbox(
-                placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
                 show_label=False,
                 lines=1,
                 type="password",
             )
-        chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
-        state = gr.State([])
-        with gr.Row(visible = False) as input_raws:
             with gr.Column(scale=0.7):
                 txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
             with gr.Column(scale=0.1, min_width=0):
                 run = gr.Button("🏃‍♂️Run")
             with gr.Column(scale=0.1, min_width=0):
-                clear = gr.Button("🔄Clear️")
             with gr.Column(scale=0.1, min_width=0):
                 btn = gr.UploadButton("🖼️/🎙️ Upload", file_types=["image","audio"])
-        with gr.Row():
-            with gr.Column():
-                outaudio = gr.Audio(visible=False)
-        with gr.Row():
-            with gr.Column():
-                show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
-        with gr.Row():
-            with gr.Column():
-                run_button = gr.Button("Predict Masked Place",visible=False)
         gr.Examples(
             examples=["Generate a speech with text 'here we go'",
                       "Transcribe this speech",
@@ -409,18 +363,27 @@ if __name__ == '__main__':
             inputs=txt
         )
-        openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
-        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
         txt.submit(lambda: "", None, txt)
-        run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
         run.click(lambda: "", None, txt)
-        btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
-        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
-        clear.click(bot.memory.clear)
-        clear.click(lambda: [], None, chatbot)
-        clear.click(lambda: [], None, state)
-        clear.click(lambda:None, None, txt)
-        clear.click(bot.clear_button, None, run_button)
-        clear.click(bot.clear_image, None, show_mel)
-        clear.click(bot.clear_audio, None, outaudio)
         demo.launch(server_name="0.0.0.0", server_port=7860)

 from langchain.agents.initialize import initialize_agent
 from langchain.agents.tools import Tool
 from langchain.chains.conversation.memory import ConversationBufferMemory
 import gradio as gr
 _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
+_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'
 _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
 if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
     _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 New input: {input}
 Thought: Do I need to use a tool? {agent_scratchpad}"""
def cut_dialogue_history(history_memory, keep_last_n_words=400):
    """Trim the oldest paragraphs from a conversation history string.

    Args:
        history_memory: The history text, or ``None``. ``None`` and the empty
            string are returned unchanged.
        keep_last_n_words: Word budget. A history with fewer
            whitespace-separated words than this is returned unchanged.

    Returns:
        ``history_memory`` itself when no trimming is needed; otherwise a
        newline followed by the paragraphs that remain once leading
        paragraphs have been dropped. If every paragraph has to be dropped
        the result is just ``'\\n'``.

    Side effects:
        Prints the history and its word count whenever the history is
        non-empty.
    """
    if not history_memory:
        return history_memory

    n_tokens = len(history_memory.split())
    print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
    if n_tokens < keep_last_n_words:
        return history_memory

    paragraphs = history_memory.split('\n')
    remaining = n_tokens
    first_kept = 0
    # The total above is counted with split() (any whitespace) while each
    # paragraph is charged with split(' ') (single spaces only), so the
    # paragraphs can run out before `remaining` falls under the budget, e.g.
    # with tab-separated words or a non-positive budget. Stop at the end of
    # the list instead of indexing past it.
    while first_kept < len(paragraphs) and remaining >= keep_last_n_words:
        remaining -= len(paragraphs[first_kept].split(' '))
        first_kept += 1
    return '\n' + '\n'.join(paragraphs[first_kept:])
 class ConversationBot:
     def __init__(self, load_dict):
         self.models = dict()
         for class_name, device in load_dict.items():
             self.models[class_name] = globals()[class_name](device=device)
     def run_text(self, text, state):
         print("===============Running run_text =============")
             response = res['output']
             state = state + [(text, response)]
             print("Outputs:", state)
+            return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
         else:
             tool = res['intermediate_steps'][0][0].tool
             if tool == "Generate Image From User Input Text":
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
+                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Detect The Sound Event From The Audio":
                 image_filename = res['intermediate_steps'][0][1]
                 response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
+                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
                 print("======>Current memory:\n %s" % self.agent.memory)
                 response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
                 #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print("Outputs:", state)
+                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Audio Inpainting":
                 audio_filename = res['intermediate_steps'][0][0].tool_input
                 image_filename = res['intermediate_steps'][0][1]
                 print("======>Current memory:\n %s" % self.agent.memory)
                 response = res['output']
                 state = state + [(text, response)]
                 print("Outputs:", state)
+                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
             print("======>Current memory:\n %s" % self.agent.memory)
             response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
             audio_filename = res['intermediate_steps'][0][1]
             state = state + [(text, response)]
             print("Outputs:", state)
+            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
     def run_image_or_audio(self, file, state, txt):
         file_type = file.name[-3:]
             print("Inputs:", file, state)
             print("======>Previous memory:\n %s" % self.agent.memory)
             audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+            # audio_load = whisper.load_audio(file.name)
+            audio_load, sr = soundfile.read(file.name)
+            soundfile.write(audio_filename, audio_load, samplerate = sr)
             description = self.models['A2T'].inference(audio_filename)
             Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
                            "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
             #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
             state = state + [(f"*{audio_filename}*", AI_prompt)]
             print("Outputs:", state)
+            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False)
         else:
             # print("===============Running run_image =============")
             # print("Inputs:", file, state)
             state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
             print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
                   f"Current Memory: {self.agent.memory.buffer}")
+            return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False)
+    def speech(self, speech_input, state):
+        input_audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        text = self.models['ASR'].translate_english(speech_input)
+        print("Inputs:", text, state)
+        print("======>Previous memory:\n %s" % self.agent.memory)
+        self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
+        res = self.agent({"input": text})
+        if res['intermediate_steps'] == []:
+            print("======>Current memory:\n %s" % self.agent.memory)
+            response = res['output']
+            output_audio_filename = self.models['TTS'].inference(response)
+            state = state + [(text, response)]
+            print("Outputs:", state)
+            return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
+        else:
+            tool = res['intermediate_steps'][0][0].tool
+            if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Target Sound Detection":
+                print("======>Current memory:\n %s" % self.agent.memory)
+                response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
+                output_audio_filename = self.models['TTS'].inference(res['output'])
+                state = state + [(text, response)]
+                print("Outputs:", state)
+                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
+            elif tool == "Transcribe Speech":
+                print("======>Current memory:\n %s" % self.agent.memory)
+                output_audio_filename = self.models['TTS'].inference(res['output'])
+                response = res['output']
+                state = state + [(text, response)]
+                print("Outputs:", state)
+                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
+            elif tool == "Detect The Sound Event From The Audio":
+                print("======>Current memory:\n %s" % self.agent.memory)
+                image_filename = res['intermediate_steps'][0][1]
+                output_audio_filename = self.models['TTS'].inference(res['output'])
+                response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
+                state = state + [(text, response)]
+                print("Outputs:", state)
+                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
+            elif tool == "Generate a talking human portrait video given a input Audio":
+                video_filename = res['intermediate_steps'][0][1]
+                print("======>Current memory:\n %s" % self.agent.memory)
+                response = res['output']
+                output_audio_filename = self.models['TTS'].inference(res['output'])
+                state = state + [(text, response)]
+                print("Outputs:", state)
+                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(value=video_filename,visible=True)
+            print("======>Current memory:\n %s" % self.agent.memory)
+            response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
+            audio_filename = res['intermediate_steps'][0][1]
+            Res = "The audio file has been generated and the audio is "
+            output_audio_filename = merge_audio(self.models['TTS'].inference(Res), audio_filename)
+            print(output_audio_filename)
+            state = state + [(text, response)]
+            response = res['output']
+            print("Outputs:", state)
+            return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
     def inpainting(self, state, audio_filename, image_filename):
         print("===============Running inpainting =============")
         print("Inputs:", state)
         print("======>Previous memory:\n %s" % self.agent.memory)
         new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
         AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
         self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
         return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
     def clear_audio(self):
         # Build a gr.Audio update that drops the current value and hides the component.
         return gr.Audio.update(value=None, visible=False)
+    def clear_input_audio(self):
         # Reset only the value; unlike clear_audio, visibility is left untouched.
+        return gr.Audio.update(value=None)
     def clear_image(self):
         # Build a gr.Image update that drops the current value and hides the component.
         return gr.Image.update(value=None, visible=False)
+    def clear_video(self):
         # Build a gr.Video update that drops the current value and hides the component.
+        return gr.Video.update(value=None, visible=False)
     def clear_button(self):
         # Build a gr.Button update that only hides the button; no value is changed.
         return gr.Button.update(visible=False)
+    def init_agent(self, openai_api_key, interaction_type):
+        if interaction_type == "text":
+            for class_name, instance in self.models.items():
+                for e in dir(instance):
+                    if e.startswith('inference'):
+                        func = getattr(instance, e)
+                        self.tools.append(Tool(name=func.name, description=func.description, func=func))
+            self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
+            self.agent = initialize_agent(
+                self.tools,
+                self.llm,
+                agent="conversational-react-description",
+                verbose=True,
+                memory=self.memory,
+                return_intermediate_steps=True,
+                agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
+            return gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)
+        else:
+            for class_name, instance in self.models.items():
+                if class_name != 'T2A' and class_name != 'I2A' and class_name != 'Inpaint' and class_name != 'ASR' and class_name != 'SoundDetection' and class_name != 'Speech_Enh_SC' and class_name != 'Speech_SS':
+                    for e in dir(instance):
+                        if e.startswith('inference'):
+                            func = getattr(instance, e)
+                            self.tools.append(Tool(name=func.name, description=func.description, func=func))
+            self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
+            self.agent = initialize_agent(
+                self.tools,
+                self.llm,
+                agent="conversational-react-description",
+                verbose=True,
+                memory=self.memory,
+                return_intermediate_steps=True,
+                agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
+            return gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)
 if __name__ == '__main__':
     bot = ConversationBot({'ImageCaptioning': 'cuda:0',
+                           'T2A': 'cuda:0',
+                           'I2A': 'cuda:0',
                            'TTS': 'cpu',
                            'T2S': 'cpu',
                            'ASR': 'cuda:0',
                            'A2T': 'cpu',
+                           'Inpaint': 'cuda:0',
                            'SoundDetection': 'cpu',
                            'Binaural': 'cuda:0',
                            'SoundExtraction': 'cuda:0',
                            'Speech_Enh_SC': 'cuda:0',
                            'Speech_SS': 'cuda:0'
                            })
+    with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
         with gr.Row():
+            gr.Markdown("## AudioGPT")
+        chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT", visible=False)
+        state = gr.State([])
+        with gr.Row() as select_raws:
+            with gr.Column(scale=0.7):
+                interaction_type = gr.Radio(choices=['text', 'speech'], value='text', label='Interaction Type')
             openai_api_key_textbox = gr.Textbox(
+                placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter 鈫碉笍",
                 show_label=False,
                 lines=1,
                 type="password",
             )
+        with gr.Row(visible=False) as text_input_raws:
             with gr.Column(scale=0.7):
                 txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
             with gr.Column(scale=0.1, min_width=0):
                 run = gr.Button("🏃‍♂️Run")
             with gr.Column(scale=0.1, min_width=0):
+                clear_txt = gr.Button("🔄Clear️")
             with gr.Column(scale=0.1, min_width=0):
                 btn = gr.UploadButton("🖼️/🎙️ Upload", file_types=["image","audio"])
+        with gr.Row():
+            outaudio = gr.Audio(visible=False)
+        with gr.Row():
+            with gr.Column(scale=0.3, min_width=0):
+                outvideo = gr.Video(visible=False)
+        with gr.Row():
+            show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
+        with gr.Row():
+            run_button = gr.Button("Predict Masked Place",visible=False)
+        with gr.Row(visible=False) as speech_input_raws:
+            with gr.Column(scale=0.7):
+                speech_input = gr.Audio(source="microphone", type="filepath", label="Input")
+            with gr.Column(scale=0.15, min_width=0):
+                submit_btn = gr.Button("🏃‍♂️submit")
+            with gr.Column(scale=0.15, min_width=0):
+                clear_speech = gr.Button("🔄Clear️")
+            with gr.Row():
+                speech_output = gr.Audio(label="Output",visible=False)
         gr.Examples(
             examples=["Generate a speech with text 'here we go'",
                       "Transcribe this speech",
             inputs=txt
         )
+        openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox, interaction_type], [select_raws, chatbot, text_input_raws, speech_input_raws])
+        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
         txt.submit(lambda: "", None, txt)
+        run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
         run.click(lambda: "", None, txt)
+        btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, outaudio, outvideo])
+        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, outvideo, run_button])
+        clear_txt.click(bot.memory.clear)
+        clear_txt.click(lambda: [], None, chatbot)
+        clear_txt.click(lambda: [], None, state)
+        clear_txt.click(lambda:None, None, txt)
+        clear_txt.click(bot.clear_button, None, run_button)
+        clear_txt.click(bot.clear_image, None, show_mel)
+        clear_txt.click(bot.clear_audio, None, outaudio)
+        clear_txt.click(bot.clear_video, None, outvideo)
+        submit_btn.click(bot.speech, [speech_input, state], [speech_input, speech_output, state, outvideo])
+        clear_speech.click(bot.clear_input_audio, None, speech_input)
+        clear_speech.click(bot.clear_audio, None, speech_output)
+        clear_speech.click(lambda: [], None, state)
+        clear_speech.click(bot.clear_video, None, outvideo)
         demo.launch(server_name="0.0.0.0", server_port=7860)