Spaces:

AIGC-Audio
/

Make_An_Audio

Sleeping

App Files Files Community

lmzjms committed on May 22, 2023

Commit

85c8fb1

1 Parent(s): 71e76ce

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -176

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from audio_foundation_models import *
 import gradio as gr
 _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
-_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'
 _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
 if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
     _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
@@ -43,23 +43,19 @@ Previous conversation history:
 New input: {input}
 Thought: Do I need to use a tool? {agent_scratchpad}"""
-def cut_dialogue_history(history_memory, keep_last_n_words=400):
-    if history_memory is None or len(history_memory) == 0:
-        return history_memory
     tokens = history_memory.split()
     n_tokens = len(tokens)
     print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
     if n_tokens < keep_last_n_words:
         return history_memory
-    paragraphs = history_memory.split('\n')
-    last_n_tokens = n_tokens
-    while last_n_tokens >= keep_last_n_words:
-        last_n_tokens -= len(paragraphs[0].split(' '))
-        paragraphs = paragraphs[1:]
-    return '\n' + '\n'.join(paragraphs)
 class ConversationBot:
     def __init__(self, load_dict):
@@ -69,6 +65,11 @@ class ConversationBot:
         self.models = dict()
         for class_name, device in load_dict.items():
             self.models[class_name] = globals()[class_name](device=device)
     def run_text(self, text, state):
         print("===============Running run_text =============")
@@ -81,7 +82,7 @@ class ConversationBot:
             response = res['output']
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
         else:
             tool = res['intermediate_steps'][0][0].tool
             if tool == "Generate Image From User Input Text":
@@ -90,14 +91,14 @@ class ConversationBot:
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
-                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Detect The Sound Event From The Audio":
                 image_filename = res['intermediate_steps'][0][1]
                 response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
-                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
                 print("======>Current memory:\n %s" % self.agent.memory)
                 response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
@@ -105,21 +106,22 @@ class ConversationBot:
                 #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print("Outputs:", state)
-                return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Audio Inpainting":
                 audio_filename = res['intermediate_steps'][0][0].tool_input
                 image_filename = res['intermediate_steps'][0][1]
                 print("======>Current memory:\n %s" % self.agent.memory)
                 response = res['output']
                 state = state + [(text, response)]
                 print("Outputs:", state)
-                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
             print("======>Current memory:\n %s" % self.agent.memory)
             response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
             audio_filename = res['intermediate_steps'][0][1]
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
     def run_image_or_audio(self, file, state, txt):
         file_type = file.name[-3:]
@@ -128,9 +130,8 @@ class ConversationBot:
             print("Inputs:", file, state)
             print("======>Previous memory:\n %s" % self.agent.memory)
             audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            # audio_load = whisper.load_audio(file.name)
-            audio_load, sr = soundfile.read(file.name)
-            soundfile.write(audio_filename, audio_load, samplerate = sr)
             description = self.models['A2T'].inference(audio_filename)
             Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
                            "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
@@ -142,7 +143,7 @@ class ConversationBot:
             #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
             state = state + [(f"*{audio_filename}*", AI_prompt)]
             print("Outputs:", state)
-            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Video.update(visible=False)
         else:
             # print("===============Running run_image =============")
             # print("Inputs:", file, state)
@@ -168,69 +169,13 @@ class ConversationBot:
             state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
             print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
                   f"Current Memory: {self.agent.memory.buffer}")
-            return state, state, gr.Audio.update(visible=False), gr.Video.update(visible=False)
-    def speech(self, speech_input, state):
-        input_audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        text = self.models['ASR'].translate_english(speech_input)
-        print("Inputs:", text, state)
-        print("======>Previous memory:\n %s" % self.agent.memory)
-        self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
-        res = self.agent({"input": text})
-        if res['intermediate_steps'] == []:
-            print("======>Current memory:\n %s" % self.agent.memory)
-            response = res['output']
-            output_audio_filename = self.models['TTS'].inference(response)
-            state = state + [(text, response)]
-            print("Outputs:", state)
-            return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-        else:
-            tool = res['intermediate_steps'][0][0].tool
-            if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Target Sound Detection":
-                print("======>Current memory:\n %s" % self.agent.memory)
-                response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-            elif tool == "Transcribe Speech":
-                print("======>Current memory:\n %s" % self.agent.memory)
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                response = res['output']
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-            elif tool == "Detect The Sound Event From The Audio":
-                print("======>Current memory:\n %s" % self.agent.memory)
-                image_filename = res['intermediate_steps'][0][1]
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
-            elif tool == "Generate a talking human portrait video given a input Audio":
-                video_filename = res['intermediate_steps'][0][1]
-                print("======>Current memory:\n %s" % self.agent.memory)
-                response = res['output']
-                output_audio_filename = self.models['TTS'].inference(res['output'])
-                state = state + [(text, response)]
-                print("Outputs:", state)
-                return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(value=video_filename,visible=True)
-            print("======>Current memory:\n %s" % self.agent.memory)
-            response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
-            audio_filename = res['intermediate_steps'][0][1]
-            Res = "The audio file has been generated and the audio is "
-            output_audio_filename = merge_audio(self.models['TTS'].inference(Res), audio_filename)
-            print(output_audio_filename)
-            state = state + [(text, response)]
-            response = res['output']
-            print("Outputs:", state)
-            return gr.Audio.update(value=None), gr.Audio.update(value=output_audio_filename,visible=True), state, gr.Video.update(visible=False)
     def inpainting(self, state, audio_filename, image_filename):
         print("===============Running inpainting =============")
         print("Inputs:", state)
         print("======>Previous memory:\n %s" % self.agent.memory)
         new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
         AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
         self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
@@ -240,62 +185,33 @@ class ConversationBot:
         return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
     def clear_audio(self):
         return gr.Audio.update(value=None, visible=False)
-    def clear_input_audio(self):
-        return gr.Audio.update(value=None)
     def clear_image(self):
         return gr.Image.update(value=None, visible=False)
-    def clear_video(self):
-        return gr.Video.update(value=None, visible=False)
     def clear_button(self):
         return gr.Button.update(visible=False)
-    def init_agent(self, openai_api_key, interaction_type):
-        if interaction_type == "text":
-            for class_name, instance in self.models.items():
-                for e in dir(instance):
-                    if e.startswith('inference'):
-                        func = getattr(instance, e)
-                        self.tools.append(Tool(name=func.name, description=func.description, func=func))
-            self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
-            self.agent = initialize_agent(
-                self.tools,
-                self.llm,
-                agent="conversational-react-description",
-                verbose=True,
-                memory=self.memory,
-                return_intermediate_steps=True,
-                agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
-            return gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)
-        else:
-            for class_name, instance in self.models.items():
-                if class_name != 'T2A' and class_name != 'I2A' and class_name != 'Inpaint' and class_name != 'ASR' and class_name != 'SoundDetection' and class_name != 'Speech_Enh_SC' and class_name != 'Speech_SS':
-                    for e in dir(instance):
-                        if e.startswith('inference'):
-                            func = getattr(instance, e)
-                            self.tools.append(Tool(name=func.name, description=func.description, func=func))
-            self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
-            self.agent = initialize_agent(
-                self.tools,
-                self.llm,
-                agent="conversational-react-description",
-                verbose=True,
-                memory=self.memory,
-                return_intermediate_steps=True,
-                agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
-            return gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)
 if __name__ == '__main__':
     bot = ConversationBot({'ImageCaptioning': 'cuda:0',
-                           #'T2A': 'cuda:0',
-                           #'I2A': 'cuda:0',
                            'TTS': 'cpu',
                            'T2S': 'cpu',
                            'ASR': 'cuda:0',
                            'A2T': 'cpu',
-                           #'Inpaint': 'cuda:0',
                            'SoundDetection': 'cpu',
                            'Binaural': 'cuda:0',
                            'SoundExtraction': 'cuda:0',
@@ -303,50 +219,37 @@ if __name__ == '__main__':
                            'Speech_Enh_SC': 'cuda:0',
                            'Speech_SS': 'cuda:0'
                            })
-    with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
-        with gr.Row():
-            gr.Markdown("## Audio ChatGPT")
-        chatbot = gr.Chatbot(elem_id="chatbot", label="Audio ChatGPT", visible=False)
-        state = gr.State([])
-        with gr.Row() as select_raws:
-            with gr.Column(scale=0.7):
-                interaction_type = gr.Radio(choices=['text', 'speech'], value='text', label='Interaction Type')
             openai_api_key_textbox = gr.Textbox(
-                placeholder="Paste your OpenAI API key here to start Audio ChatGPT(sk-...) and press Enter ↵️",
                 show_label=False,
                 lines=1,
                 type="password",
             )
-        with gr.Row(visible=False) as text_input_raws:
             with gr.Column(scale=0.7):
                 txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
             with gr.Column(scale=0.1, min_width=0):
                 run = gr.Button("🏃‍♂️Run")
             with gr.Column(scale=0.1, min_width=0):
-                clear_txt = gr.Button("🔄Clear️")
             with gr.Column(scale=0.1, min_width=0):
                 btn = gr.UploadButton("🖼️/🎙️ Upload", file_types=["image","audio"])
-        with gr.Row():
-            outaudio = gr.Audio(visible=False)
-        with gr.Row():
-            with gr.Column(scale=0.3, min_width=0):
-                outvideo = gr.Video(visible=False)
-        with gr.Row():
-            show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
-        with gr.Row():
-            run_button = gr.Button("Predict Masked Place",visible=False)
-        with gr.Row(visible=False) as speech_input_raws:
-            with gr.Column(scale=0.7):
-                speech_input = gr.Audio(source="microphone", type="filepath", label="Input")
-            with gr.Column(scale=0.15, min_width=0):
-                submit_btn = gr.Button("🏃‍♂️submit")
-            with gr.Column(scale=0.15, min_width=0):
-                clear_speech = gr.Button("🔄Clear️")
-            with gr.Row():
-                speech_output = gr.Audio(label="Output",visible=False)
         gr.Examples(
             examples=["Generate a speech with text 'here we go'",
                       "Transcribe this speech",
@@ -363,27 +266,18 @@ if __name__ == '__main__':
             inputs=txt
         )
-        openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox, interaction_type], [select_raws, chatbot, text_input_raws, speech_input_raws])
-        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
         txt.submit(lambda: "", None, txt)
-        run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, outvideo, show_mel, run_button])
         run.click(lambda: "", None, txt)
-        btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, outaudio, outvideo])
-        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, outvideo, run_button])
-        clear_txt.click(bot.memory.clear)
-        clear_txt.click(lambda: [], None, chatbot)
-        clear_txt.click(lambda: [], None, state)
-        clear_txt.click(lambda:None, None, txt)
-        clear_txt.click(bot.clear_button, None, run_button)
-        clear_txt.click(bot.clear_image, None, show_mel)
-        clear_txt.click(bot.clear_audio, None, outaudio)
-        clear_txt.click(bot.clear_video, None, outvideo)
-        submit_btn.click(bot.speech, [speech_input, state], [speech_input, speech_output, state, outvideo])
-        clear_speech.click(bot.clear_input_audio, None, speech_input)
-        clear_speech.click(bot.clear_audio, None, speech_output)
-        clear_speech.click(lambda: [], None, state)
-        clear_speech.click(bot.clear_video, None, outvideo)
         demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
+_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head</a>. </p>'
 _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes.'
 if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
     _DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 New input: {input}
 Thought: Do I need to use a tool? {agent_scratchpad}"""
def cut_dialogue_history(history_memory, keep_last_n_words=500):
    """Trim the oldest paragraphs from a conversation buffer.

    The buffer is measured in whitespace-separated words. If it holds fewer
    than ``keep_last_n_words`` words it is returned unchanged; otherwise
    leading newline-separated paragraphs are dropped until the running word
    count falls below the limit.

    Args:
        history_memory: The conversation buffer as a string. ``None`` and the
            empty string are returned as-is.
        keep_last_n_words: Word budget; trimming starts once the buffer holds
            at least this many words.

    Returns:
        The original buffer when no trimming is needed, otherwise a string
        starting with a newline followed by the kept paragraphs joined with
        newlines.
    """
    # Nothing to measure or trim; also avoids calling .split() on None.
    if not history_memory:
        return history_memory

    n_tokens = len(history_memory.split())
    print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
    if n_tokens < keep_last_n_words:
        return history_memory

    paragraphs = history_memory.split('\n')
    remaining = n_tokens
    first_kept = 0
    # Walk an index instead of re-slicing the list on every iteration, and
    # stop when the paragraphs run out so a non-positive budget cannot raise
    # IndexError.
    while remaining >= keep_last_n_words and first_kept < len(paragraphs):
        # Per-paragraph size uses split(' '), so blank paragraphs and repeated
        # spaces count here even though they do not count in n_tokens.
        remaining -= len(paragraphs[first_kept].split(' '))
        first_kept += 1
    return '\n' + '\n'.join(paragraphs[first_kept:])
 class ConversationBot:
     def __init__(self, load_dict):
         self.models = dict()
         for class_name, device in load_dict.items():
             self.models[class_name] = globals()[class_name](device=device)
+        for class_name, instance in self.models.items():
+            for e in dir(instance):
+                if e.startswith('inference'):
+                    func = getattr(instance, e)
+                    self.tools.append(Tool(name=func.name, description=func.description, func=func))
     def run_text(self, text, state):
         print("===============Running run_text =============")
             response = res['output']
             state = state + [(text, response)]
             print("Outputs:", state)
+            return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
         else:
             tool = res['intermediate_steps'][0][0].tool
             if tool == "Generate Image From User Input Text":
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
+                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Detect The Sound Event From The Audio":
                 image_filename = res['intermediate_steps'][0][1]
                 response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
                       f"Current Memory: {self.agent.memory.buffer}")
+                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Generate Text From The Audio" or tool == "Transcribe speech" or tool == "Target Sound Detection":
                 print("======>Current memory:\n %s" % self.agent.memory)
                 response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
                 #response = res['output'] + f"![](/file={image_filename})*{image_filename}*"
                 state = state + [(text, response)]
                 print("Outputs:", state)
+                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
             elif tool == "Audio Inpainting":
                 audio_filename = res['intermediate_steps'][0][0].tool_input
                 image_filename = res['intermediate_steps'][0][1]
                 print("======>Current memory:\n %s" % self.agent.memory)
+                print(res)
                 response = res['output']
                 state = state + [(text, response)]
                 print("Outputs:", state)
+                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
             print("======>Current memory:\n %s" % self.agent.memory)
             response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
             audio_filename = res['intermediate_steps'][0][1]
             state = state + [(text, response)]
             print("Outputs:", state)
+            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
     def run_image_or_audio(self, file, state, txt):
         file_type = file.name[-3:]
             print("Inputs:", file, state)
             print("======>Previous memory:\n %s" % self.agent.memory)
             audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+            audio_load = whisper.load_audio(file.name)
+            soundfile.write(audio_filename, audio_load, samplerate = 16000)
             description = self.models['A2T'].inference(audio_filename)
             Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
                            "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
             #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
             state = state + [(f"*{audio_filename}*", AI_prompt)]
             print("Outputs:", state)
+            return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
         else:
             # print("===============Running run_image =============")
             # print("Inputs:", file, state)
             state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
             print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
                   f"Current Memory: {self.agent.memory.buffer}")
+            return state, state, txt + f'{txt} {image_filename} ', gr.Audio.update(visible=False)
     def inpainting(self, state, audio_filename, image_filename):
         print("===============Running inpainting =============")
         print("Inputs:", state)
         print("======>Previous memory:\n %s" % self.agent.memory)
+        # inpaint = Inpaint(device="cpu")
         new_image_filename, new_audio_filename = self.models['Inpaint'].predict(audio_filename, image_filename)
         AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
         self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
         return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
     def clear_audio(self):
         """Return a Gradio audio update that sets the value to None and hides the component."""
         return gr.Audio.update(value=None, visible=False)
     def clear_image(self):
         """Return a Gradio image update that sets the value to None and hides the component."""
         return gr.Image.update(value=None, visible=False)
     def clear_button(self):
         """Return a Gradio button update that hides the component."""
         return gr.Button.update(visible=False)
+    def init_agent(self, openai_api_key):
+        self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
+        self.agent = initialize_agent(
+            self.tools,
+            self.llm,
+            agent="conversational-react-description",
+            verbose=True,
+            memory=self.memory,
+            return_intermediate_steps=True,
+            agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX, 'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': AUDIO_CHATGPT_SUFFIX}, )
+        return gr.update(visible = True)
 if __name__ == '__main__':
     bot = ConversationBot({'ImageCaptioning': 'cuda:0',
+                           'T2A': 'cuda:0',
+                           'I2A': 'cuda:0',
                            'TTS': 'cpu',
                            'T2S': 'cpu',
                            'ASR': 'cuda:0',
                            'A2T': 'cpu',
+                           'Inpaint': 'cuda:0',
                            'SoundDetection': 'cpu',
                            'Binaural': 'cuda:0',
                            'SoundExtraction': 'cuda:0',
                            'Speech_Enh_SC': 'cuda:0',
                            'Speech_SS': 'cuda:0'
                            })
+    with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
+        gr.Markdown(_DESCRIPTION)
+        with gr.Row():
             openai_api_key_textbox = gr.Textbox(
+                placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
                 show_label=False,
                 lines=1,
                 type="password",
             )
+        chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
+        state = gr.State([])
+        with gr.Row(visible = False) as input_raws:
             with gr.Column(scale=0.7):
                 txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
             with gr.Column(scale=0.1, min_width=0):
                 run = gr.Button("🏃‍♂️Run")
             with gr.Column(scale=0.1, min_width=0):
+                clear = gr.Button("🔄Clear️")
             with gr.Column(scale=0.1, min_width=0):
                 btn = gr.UploadButton("🖼️/🎙️ Upload", file_types=["image","audio"])
+        with gr.Row():
+            with gr.Column():
+                outaudio = gr.Audio(visible=False)
+        with gr.Row():
+            with gr.Column():
+                show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
+        with gr.Row():
+            with gr.Column():
+                run_button = gr.Button("Predict Masked Place",visible=False)
         gr.Examples(
             examples=["Generate a speech with text 'here we go'",
                       "Transcribe this speech",
             inputs=txt
         )
+        openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
+        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
         txt.submit(lambda: "", None, txt)
+        run.click(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
         run.click(lambda: "", None, txt)
+        btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
+        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
+        clear.click(bot.memory.clear)
+        clear.click(lambda: [], None, chatbot)
+        clear.click(lambda: [], None, state)
+        clear.click(lambda:None, None, txt)
+        clear.click(bot.clear_button, None, run_button)
+        clear.click(bot.clear_image, None, show_mel)
+        clear.click(bot.clear_audio, None, outaudio)
         demo.launch(server_name="0.0.0.0", server_port=7860)