Update app.py
app.py
CHANGED
@@ -1,20 +1,22 @@
+# from .demo_modelpart import InferenceDemo
 import gradio as gr
 import os
 from threading import Thread
-
-import time
+
+# import time
 import cv2
+
 import datetime
+# import copy
 import torch
+
 import spaces
 import numpy as np
-import json
-import hashlib
-import PIL
-from typing import Iterator
 
 from llava import conversation as conversation_lib
 from llava.constants import DEFAULT_IMAGE_TOKEN
+
+
 from llava.constants import (
     IMAGE_TOKEN_INDEX,
     DEFAULT_IMAGE_TOKEN,
@@ -29,14 +31,24 @@ from llava.mm_utils import (
     get_model_name_from_path,
     KeywordsStoppingCriteria,
 )
-
+
 from serve_constants import html_header
 
 import requests
 from PIL import Image
 from io import BytesIO
-from transformers import TextIteratorStreamer
+from transformers import TextStreamer, TextIteratorStreamer
+
+import hashlib
+import PIL
+import base64
+import json
+
+import datetime
+import gradio as gr
+import gradio_client
 import subprocess
+import sys
 
 external_log_dir = "./logs"
 LOGDIR = external_log_dir
@@ -51,9 +63,13 @@ def install_gradio_4_35_0():
     else:
         print("Gradio 4.35.0 is already installed.")
 
+# Call the function to install Gradio 4.35.0 if needed
install_gradio_4_35_0()
 
+import gradio as gr
+import gradio_client
 print(f"Gradio version: {gr.__version__}")
+print(f"Gradio-client version: {gradio_client.__version__}")
 
 def get_conv_log_filename():
     t = datetime.datetime.now()
@@ -66,12 +82,12 @@ class InferenceDemo(object):
     ) -> None:
         disable_torch_init()
 
-        self.tokenizer =
-
-
-
-
-
+        self.tokenizer, self.model, self.image_processor, self.context_len = (
+            tokenizer,
+            model,
+            image_processor,
+            context_len,
+        )
 
         if "llama-2" in model_name.lower():
             conv_mode = "llava_llama_2"
@@ -94,43 +110,31 @@ class InferenceDemo(object):
             )
         else:
            args.conv_mode = conv_mode
-
        self.conv_mode = conv_mode
        self.conversation = conv_templates[args.conv_mode].copy()
        self.num_frames = args.num_frames
 
-def process_stream(streamer: TextIteratorStreamer, history: list, q: Queue):
-    """Process the output stream and put partial text into a queue"""
-    try:
-        current_message = ""
-        for new_text in streamer:
-            current_message += new_text
-            history[-1][1] = current_message
-            q.put(history.copy())
-            time.sleep(0.02)  # Add a small delay to prevent overloading
-    except Exception as e:
-        print(f"Error in process_stream: {e}")
-    finally:
-        q.put(None)  # Signal that we're done
-
-def stream_output(history: list, q: Queue) -> Iterator[list]:
-    """Yield updated history as it comes through the queue"""
-    while True:
-        val = q.get()
-        if val is None:
-            break
-        yield val
-        q.task_done()
 
+
 def is_valid_video_filename(name):
     video_extensions = ["avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg"]
+
     ext = name.split(".")[-1].lower()
-
+
+    if ext in video_extensions:
+        return True
+    else:
+        return False
 
 def is_valid_image_filename(name):
-    image_extensions = ["jpg", "jpeg", "png", "bmp", "gif", "tiff", "webp", "heic", "heif", "jfif", "svg", "eps", "raw"]
+    image_extensions = ["jpg", "jpeg", "png", "bmp", "gif", "tiff", "webp", "heic", "heif", "jfif", "svg", "eps", "raw"]
+
     ext = name.split(".")[-1].lower()
-
+
+    if ext in image_extensions:
+        return True
+    else:
+        return False
+
 
 def sample_frames(video_file, num_frames):
     video = cv2.VideoCapture(video_file)
@@ -139,33 +143,54 @@ def sample_frames(video_file, num_frames):
     frames = []
     for i in range(total_frames):
         ret, frame = video.read()
+        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
         if not ret:
             continue
-        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
         if i % interval == 0:
             frames.append(pil_img)
     video.release()
     return frames
 
+
 def load_image(image_file):
-    if image_file.startswith(
+    if image_file.startswith("http") or image_file.startswith("https"):
         response = requests.get(image_file)
         if response.status_code == 200:
             image = Image.open(BytesIO(response.content)).convert("RGB")
         else:
-            print("
-            return None
+            print("failed to load the image")
     else:
-        print("Load image from local file
+        print("Load image from local file")
+        print(image_file)
         image = Image.open(image_file).convert("RGB")
+
     return image
 
+
 def clear_history(history):
-
+
     our_chatbot.conversation = conv_templates[our_chatbot.conv_mode].copy()
+
     return None
 
+
+def clear_response(history):
+    for index_conv in range(1, len(history)):
+        # loop until get a text response from our model.
+        conv = history[-index_conv]
+        if not (conv[0] is None):
+            break
+    question = history[-index_conv][0]
+    history = history[:-index_conv]
+    return history, question
+
+
+# def print_like_dislike(x: gr.LikeData):
+#     print(x.index, x.value, x.liked)
+
+
 def add_message(history, message):
+    # history=[]
     global our_chatbot
     if len(history) == 0:
         our_chatbot = InferenceDemo(
@@ -178,47 +203,38 @@ def add_message(history, message):
     history.append((message["text"], None))
     return history, gr.MultimodalTextbox(value=None, interactive=False)
 
+
 @spaces.GPU
 def bot(history):
-    global start_tstamp, finish_tstamp
-
-    start_tstamp = time.time()
     text = history[-1][0]
     images_this_term = []
+    text_this_term = ""
+    # import pdb;pdb.set_trace()
     num_new_images = 0
-
     for i, message in enumerate(history[:-1]):
-        if
+        if type(message[0]) is tuple:
             images_this_term.append(message[0][0])
             if is_valid_video_filename(message[0][0]):
+                # videos are not accepted
                 raise ValueError("Video is not supported")
+                num_new_images += our_chatbot.num_frames
            elif is_valid_image_filename(message[0][0]):
+                print("#### Load image from local file", message[0][0])
                 num_new_images += 1
             else:
                 raise ValueError("Invalid image file")
         else:
             num_new_images = 0
 
-
-
-
-
-
-
-
-            image_list.append(load_image(f))
-        else:
-            raise ValueError("Invalid image file")
-
-    image_tensor = [
-        our_chatbot.image_processor.preprocess(f, return_tensors="pt")["pixel_values"][0]
-        .half()
-        .to(our_chatbot.model.device)
-        for f in image_list
-    ]
-
-    # Process image hashes
+    # for message in history[-i-1:]:
+    #     images_this_term.append(message[0][0])
+
+    assert len(images_this_term) > 0, "must have an image"
+    # image_files = (args.image_file).split(',')
+    # image = [load_image(f) for f in images_this_term if f]
+
     all_image_hash = []
+    all_image_path = []
     for image_path in images_this_term:
         with open(image_path, "rb") as image_file:
             image_data = image_file.read()
@@ -232,26 +248,54 @@ def bot(history):
             f"{t.year}-{t.month:02d}-{t.day:02d}",
             f"{image_hash}.jpg",
         )
+        all_image_path.append(filename)
         if not os.path.isfile(filename):
             os.makedirs(os.path.dirname(filename), exist_ok=True)
+            print("image save to", filename)
             image.save(filename)
+
+    image_list = []
+    for f in images_this_term:
+        if is_valid_video_filename(f):
+            image_list += sample_frames(f, our_chatbot.num_frames)
+        elif is_valid_image_filename(f):
+            image_list.append(load_image(f))
+        else:
+            raise ValueError("Invalid image file")
+
+    image_tensor = [
+        our_chatbot.image_processor.preprocess(f, return_tensors="pt")["pixel_values"][
+            0
+        ]
+        .half()
+        .to(our_chatbot.model.device)
+        for f in image_list
+    ]
+
 
     image_tensor = torch.stack(image_tensor)
     image_token = DEFAULT_IMAGE_TOKEN * num_new_images
-
-
+    # if our_chatbot.model.config.mm_use_im_start_end:
+    #     inp = DEFAULT_IM_START_TOKEN + image_token + DEFAULT_IM_END_TOKEN + "\n" + inp
+    # else:
+    inp = text
+    inp = image_token + "\n" + inp
     our_chatbot.conversation.append_message(our_chatbot.conversation.roles[0], inp)
+    # image = None
     our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], None)
     prompt = our_chatbot.conversation.get_prompt()
 
-    input_ids = (
-        tokenizer_image_token(
-            prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
-        )
-        .unsqueeze(0)
-        .to(our_chatbot.model.device)
-    )
-
+    # input_ids = (
+    #     tokenizer_image_token(
+    #         prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+    #     )
+    #     .unsqueeze(0)
+    #     .to(our_chatbot.model.device)
+    # )
+    input_ids = tokenizer_image_token(
        prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+    ).unsqueeze(0).to(our_chatbot.model.device)
+    # print("### input_id", input_ids)
     stop_str = (
         our_chatbot.conversation.sep
         if our_chatbot.conversation.sep_style != SeparatorStyle.TWO
@@ -261,54 +305,85 @@ def bot(history):
     stopping_criteria = KeywordsStoppingCriteria(
         keywords, our_chatbot.tokenizer, input_ids
     )
-
-    #
-
+    # streamer = TextStreamer(
+    #     our_chatbot.tokenizer, skip_prompt=True, skip_special_tokens=True
+    # )
     streamer = TextIteratorStreamer(
-        our_chatbot.tokenizer,
-        skip_prompt=True,
-        skip_special_tokens=True
+        our_chatbot.tokenizer, skip_prompt=True, skip_special_tokens=True
     )
-
-
-
-
-
+    print(our_chatbot.model.device)
+    print(input_ids.device)
+    print(image_tensor.device)
+
+    # with torch.inference_mode():
+    #     output_ids = our_chatbot.model.generate(
+    #         input_ids,
+    #         images=image_tensor,
+    #         do_sample=True,
+    #         temperature=0.7,
+    #         top_p=1.0,
+    #         max_new_tokens=4096,
+    #         streamer=streamer,
+    #         use_cache=False,
+    #         stopping_criteria=[stopping_criteria],
+    #     )
+
+    # outputs = our_chatbot.tokenizer.decode(output_ids[0]).strip()
+    # if outputs.endswith(stop_str):
+    #     outputs = outputs[: -len(stop_str)]
+    # our_chatbot.conversation.messages[-1][-1] = outputs
+
+    # history[-1] = [text, outputs]
+
+    # return history
+    generate_kwargs = dict(
+        inputs=input_ids,
+        streamer=streamer,
+        images=image_tensor,
+        max_new_tokens=1024,
+        do_sample=True,
+        temperature=0.2,
+        num_beams=1,
+        use_cache=False,
+        stopping_criteria=[stopping_criteria],
     )
-    thread.start()
-
-    # Start the generation
-    with torch.inference_mode():
-        output_ids = our_chatbot.model.generate(
-            input_ids,
-            images=image_tensor,
-            do_sample=True,
-            temperature=0.2,
-            max_new_tokens=1024,
-            streamer=streamer,
-            use_cache=True,
-            stopping_criteria=[stopping_criteria],
-        )
-
-    finish_tstamp = time.time()
 
-
+    t = Thread(target=our_chatbot.model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        our_chatbot.conversation.messages[-1][-1] = "".join(outputs)
+        history[-1] = [text, "".join(outputs)]
+        yield history
+
     with open(get_conv_log_filename(), "a") as fout:
         data = {
-            "tstamp": round(finish_tstamp, 4),
             "type": "chat",
             "model": "Pangea-7b",
-            "start": round(start_tstamp, 4),
-            "finish": round(finish_tstamp, 4),
             "state": history,
             "images": all_image_hash,
+            "images_path": all_image_path
         }
+        print("#### conv log", data)
        fout.write(json.dumps(data) + "\n")
+
+
 
-
-
-with gr.Blocks(css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-width: 40px}") as demo:
+txt = gr.Textbox(
+    scale=4,
+    show_label=False,
+    placeholder="Enter text and press enter.",
+    container=False,
+)
+
+with gr.Blocks(
+    css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-width: 40px}",
+) as demo:
 
+    cur_dir = os.path.dirname(os.path.abspath(__file__))
+    # gr.Markdown(title_markdown)
     gr.HTML(html_header)
 
     with gr.Column():
@@ -319,8 +394,10 @@ with gr.Blocks(css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-wid
             upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
             downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
             flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
+            # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=True)
            regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
            clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
+
 
         chat_input = gr.MultimodalTextbox(
             interactive=True,
@@ -330,11 +407,11 @@ with gr.Blocks(css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-wid
             submit_btn="🚀"
         )
 
-        cur_dir = os.path.dirname(os.path.abspath(__file__))
+        print(cur_dir)
         gr.Examples(
-
-
-
+            examples_per_page=20,
+            examples=[
+                [
                     {
                         "files": [
                             f"{cur_dir}/examples/user_example_07.jpg",
@@ -358,45 +435,158 @@ with gr.Blocks(css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-wid
                         "text": "Why this image funny?",
                     },
                 ],
-
-
-
-
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/norway.jpg",
+                        ],
+                        "text": "Analysieren, in welchem Land diese Szene höchstwahrscheinlich gedreht wurde.",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/totoro.jpg",
+                        ],
+                        "text": "¿En qué anime aparece esta escena? ¿Puedes presentarlo?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/africa.jpg",
+                        ],
+                        "text": "इस तस्वीर में हर एक दृश्य तत्व का क्या प्रतिनिधित्व करता है?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/hot_ballon.jpg",
+                        ],
+                        "text": "ฉากบอลลูนลมร้อนในภาพนี้อาจอยู่ที่ไหน? สถานที่นี้มีความพิเศษอย่างไร?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/bar.jpg",
+                        ],
+                        "text": "Você pode me dar ideias de design baseadas no tema de coquetéis deste letreiro?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/pink_lake.jpg",
+                        ],
+                        "text": "Обясни защо езерото на този остров е в този цвят.",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/hanzi.jpg",
+                        ],
+                        "text": "Can you describe in Hebrew the evolution process of these four Chinese characters from pictographs to modern characters?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/ballon.jpg",
+                        ],
+                        "text": "இந்த காட்சியை விவரிக்கவும், மேலும் இந்த படத்தின் அடிப்படையில் துருக்கியில் இந்த காட்சியுடன் தொடர்பான சில பிரபலமான நிகழ்வுகள் என்ன?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/pie.jpg",
+                        ],
+                        "text": "Décrivez ce graphique. Quelles informations pouvons-nous en tirer?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/camera.jpg",
+                        ],
+                        "text": "Apa arti dari dua angka di sebelah kiri yang ditampilkan di layar kamera?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/dog.jpg",
+                        ],
+                        "text": "이 강아지의 표정을 보고 어떤 기분이나 감정을 느끼고 있는지 설명해 주시겠어요?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/book.jpg",
+                        ],
+                        "text": "What language is the text in, and what does the title mean in English?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/food.jpg",
+                        ],
+                        "text": "Unaweza kunipa kichocheo cha kutengeneza hii pancake?",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/line chart.jpg",
+                        ],
+                        "text": "Hãy trình bày những xu hướng mà bạn quan sát được từ biểu đồ và hiện tượng xã hội tiềm ẩn từ đó.",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/south africa.jpg",
+                        ],
+                        "text": "Waar is hierdie plek? Help my om ’n reisroete vir hierdie land te beplan.",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/girl.jpg",
+                        ],
+                        "text": "لماذا هذه الصورة مضحكة؟",
+                    },
+                ],
+                [
+                    {
+                        "files": [
+                            f"{cur_dir}/examples/eagles.jpg",
+                        ],
+                        "text": "Какой креатив должен быть в этом логотипе?",
+                    },
+                ],
+            ],
+            inputs=[chat_input],
+            label="Image",
+        )
 
         chat_msg = chat_input.submit(
-            add_message,
-            [chatbot, chat_input],
-            [chatbot, chat_input],
-            queue=False
-        ).then(
-            bot,
-            chatbot,
-            chatbot,
-            api_name="bot_response"
-        ).then(
-            lambda: gr.MultimodalTextbox(interactive=True),
-            None,
-            [chat_input]
+            add_message, [chatbot, chat_input], [chatbot, chat_input]
         )
+        bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
+        bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
 
+        # chatbot.like(print_like_dislike, None, None)
         clear_btn.click(
-            fn=clear_history,
-            inputs=[chatbot],
-            outputs=[chatbot],
-            api_name="clear_all",
-            queue=False
+            fn=clear_history, inputs=[chatbot], outputs=[chatbot], api_name="clear_all"
         )
 
-        regenerate_btn.click(
-            fn=lambda history: history[:-1],
-            inputs=[chatbot],
-            outputs=[chatbot],
-            queue=False
-        ).then(
-            bot,
-            chatbot,
-            chatbot
-        )
 
+
 demo.queue()