Update app.py
app.py
CHANGED
@@ -25,23 +25,29 @@ def model_inference(
     user_content = []
     media_queue = []
     if history == []:
-
+        text = input_dict["text"].strip()
+
+        for file in input_dict.get("files", []):
             if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                 media_queue.append({"type": "image", "path": file})
             elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
                 media_queue.append({"type": "video", "path": file})
 
-        text
-
-        for part in parts:
-            if part == "<image>" and media_queue:
-                user_content.append(media_queue.pop(0))
-            elif part == "<video>" and media_queue:
-                user_content.append(media_queue.pop(0))
-            elif part.strip():
-                user_content.append({"type": "text", "text": part.strip()})
-
+        if "<image>" in text or "<video>" in text:
+            parts = re.split(r'(<image>|<video>)', text)
+            for part in parts:
+                if part == "<image>" and media_queue:
+                    user_content.append(media_queue.pop(0))
+                elif part == "<video>" and media_queue:
+                    user_content.append(media_queue.pop(0))
+                elif part.strip():
+                    user_content.append({"type": "text", "text": part.strip()})
+        else:
+            user_content.append({"type": "text", "text": text})
+
+            for media in media_queue:
+                user_content.append(media)
 
         resulting_messages = [{"role": "user", "content": user_content}]
 
     elif len(history) > 0:
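
For context, the placeholder handling added in this hunk can be read as a small standalone helper. The sketch below reproduces the same logic outside the app; the name build_user_content and the sample inputs are illustrative, and the two identical <image>/<video> branches are merged into a single check.

import re

def build_user_content(text, files):
    # Queue the uploaded files in order, tagged by type (mirrors the hunk above).
    media_queue = []
    for f in files:
        if f.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": f})
        elif f.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": f})

    user_content = []
    if "<image>" in text or "<video>" in text:
        # The capturing group keeps the placeholders in the split output,
        # so each one can be replaced by the next queued media item.
        for part in re.split(r"(<image>|<video>)", text):
            if part in ("<image>", "<video>") and media_queue:
                user_content.append(media_queue.pop(0))
            elif part.strip():
                user_content.append({"type": "text", "text": part.strip()})
    else:
        # No placeholders: send the text first, then every uploaded file.
        user_content.append({"type": "text", "text": text})
        user_content.extend(media_queue)
    return user_content

# Two <image> placeholders are filled by the two uploads, in order:
# text, image a.jpg, text, image b.png, text.
print(build_user_content("Compare <image> and <image>.", ["a.jpg", "b.png"]))
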
@@ -51,7 +57,7 @@ def model_inference(
         for hist in history:
             if hist["role"] == "user" and isinstance(hist["content"], tuple):
                 file_name = hist["content"][0]
-                if file_name.endswith((".png", ".jpg", ".jpeg"
+                if file_name.endswith((".png", ".jpg", ".jpeg")):
                     media_queue.append({"type": "image", "path": file_name})
                 elif file_name.endswith(".mp4"):
                     media_queue.append({"type": "video", "path": file_name})
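
The one-line fix in this hunk closes the file-type check used when the conversation already has history. As the surrounding code assumes, a user turn whose content is a tuple carries an uploaded file path; a short illustrative run with a made-up history:

# Illustrative only: a made-up Gradio-style history with one uploaded image.
history = [
    {"role": "user", "content": ("example_images/dogs.jpg",)},
    {"role": "user", "content": "Can you describe this image?"},
]

media_queue = []
for hist in history:
    if hist["role"] == "user" and isinstance(hist["content"], tuple):
        file_name = hist["content"][0]
        if file_name.endswith((".png", ".jpg", ".jpeg")):
            media_queue.append({"type": "image", "path": file_name})
        elif file_name.endswith(".mp4"):
            media_queue.append({"type": "video", "path": file_name})

print(media_queue)  # [{'type': 'image', 'path': 'example_images/dogs.jpg'}]
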
@@ -120,19 +126,15 @@ def model_inference(
 
 
 examples=[
-        [{"text": "
-        [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
-        [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+        [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
         [{"text": "What art era this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
         [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
         [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
         [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
-        [{"text": "What is
-        [{"text": "What is happening in the video?", "files": ["example_images/barcamadridhighlights.mp4"]}],
-
+        [{"text": "What is happening in the video?", "files": ["example_images/barcamadridhighlights.mp4"]}],
 ]
 demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video Model Ever 📺",
-        description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples.
+        description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
         examples=examples,
         textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
         cache_examples=False,
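
For reference, a minimal sketch of the wiring that feeds model_inference: with multimodal=True, gr.ChatInterface passes the MultimodalTextbox value to the function as a dict with "text" and "files" keys, which is exactly what the parsing above consumes. echo_inference is a stand-in for the real model call, not code from app.py.

import gradio as gr

def echo_inference(input_dict, history):
    # input_dict mirrors what model_inference receives: {"text": str, "files": [paths]}.
    files = input_dict.get("files", [])
    return f"Received {len(files)} file(s) and the text: {input_dict['text']!r}"

demo = gr.ChatInterface(
    fn=echo_inference,
    multimodal=True,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
    stop_btn="Stop Generation",
)

if __name__ == "__main__":
    demo.launch()
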