echarlaix (HF Staff) committed
Commit 67f0068 · 1 Parent(s): 5ffe9aa
Files changed (1)
  1. app.py +98 -61
app.py CHANGED
@@ -1,5 +1,9 @@
 import gradio as gr
-from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
+from transformers import (
+    AutoProcessor,
+    AutoModelForImageTextToText,
+    TextIteratorStreamer,
+)
 from threading import Thread
 import re
 import time
@@ -7,21 +11,21 @@ import time
 from optimum.intel import OVModelForVisualCausalLM


-
-# model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"
-model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"
+# model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"
+model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"

 processor = AutoProcessor.from_pretrained(model_id)
 model = OVModelForVisualCausalLM.from_pretrained(model_id)

+
 def model_inference(input_dict, history, max_tokens):
     text = input_dict["text"]
     images = []
     user_content = []
     media_queue = []
     if history == []:
-        text = input_dict["text"].strip()
-
+        text = input_dict["text"].strip()
+
         for file in input_dict.get("files", []):
             if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                 media_queue.append({"type": "image", "path": file})
@@ -29,17 +33,17 @@ def model_inference(input_dict, history, max_tokens):
                 media_queue.append({"type": "video", "path": file})

         if "<image>" in text or "<video>" in text:
-            parts = re.split(r'(<image>|<video>)', text)
+            parts = re.split(r"(<image>|<video>)", text)
             for part in parts:
                 if part == "<image>" and media_queue:
-                    user_content.append(media_queue.pop(0))
+                    user_content.append(media_queue.pop(0))
                 elif part == "<video>" and media_queue:
-                    user_content.append(media_queue.pop(0))
-                elif part.strip():
+                    user_content.append(media_queue.pop(0))
+                elif part.strip():
                     user_content.append({"type": "text", "text": part.strip()})
         else:
             user_content.append({"type": "text", "text": text})
-
+
         for media in media_queue:
             user_content.append(media)

@@ -50,55 +54,54 @@ def model_inference(input_dict, history, max_tokens):
         user_content = []
         media_queue = []
         for hist in history:
-            if hist["role"] == "user" and isinstance(hist["content"], tuple):
+            if hist["role"] == "user" and isinstance(hist["content"], tuple):
                 file_name = hist["content"][0]
                 if file_name.endswith((".png", ".jpg", ".jpeg")):
                     media_queue.append({"type": "image", "path": file_name})
                 elif file_name.endswith(".mp4"):
                     media_queue.append({"type": "video", "path": file_name})

-
         for hist in history:
-            if hist["role"] == "user" and isinstance(hist["content"], str):
+            if hist["role"] == "user" and isinstance(hist["content"], str):
                 text = hist["content"]
-                parts = re.split(r'(<image>|<video>)', text)
-
+                parts = re.split(r"(<image>|<video>)", text)
+
                 for part in parts:
                     if part == "<image>" and media_queue:
-                        user_content.append(media_queue.pop(0))
+                        user_content.append(media_queue.pop(0))
                     elif part == "<video>" and media_queue:
-                        user_content.append(media_queue.pop(0))
-                    elif part.strip():
+                        user_content.append(media_queue.pop(0))
+                    elif part.strip():
                         user_content.append({"type": "text", "text": part.strip()})
-
-            elif hist["role"] == "assistant":
-                resulting_messages.append({
-                    "role": "user",
-                    "content": user_content
-                })
-                resulting_messages.append({
-                    "role": "assistant",
-                    "content": [{"type": "text", "text": hist["content"]}]
-                })
-                user_content = []

+            elif hist["role"] == "assistant":
+                resulting_messages.append({"role": "user", "content": user_content})
+                resulting_messages.append(
+                    {
+                        "role": "assistant",
+                        "content": [{"type": "text", "text": hist["content"]}],
+                    }
+                )
+                user_content = []

     if text == "" and not images:
         gr.Error("Please input a query and optionally image(s).")

     if text == "" and images:
         gr.Error("Please input a text query along the images(s).")
-    print("resulting_messages", resulting_messages)
+    # print("resulting_messages", resulting_messages)
     inputs = processor.apply_chat_template(
-        resulting_messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt",
-    )
+        resulting_messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )

     # Generate
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    streamer = TextIteratorStreamer(
+        processor, skip_prompt=True, skip_special_tokens=True
+    )
     generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
     # generated_text = ""

@@ -107,33 +110,67 @@ def model_inference(input_dict, history, max_tokens):

     yield "..."
     buffer = ""
-
+
     for new_text in streamer:
-
         buffer += new_text
         # generated_text_without_prompt = buffer#[len(ext_buffer):]
         time.sleep(0.01)
         yield buffer


-examples=[
-    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-    [{"text": "What art era this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
-    [{"text": "Describe this image.", "files": ["example_images/mosque.jpg"]}],
-    [{"text": "When was this purchase made and how much did it cost?", "files": ["example_images/fiche.jpg"]}],
-    [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
-    [{"text": "What is happening in the video?", "files": ["example_images/short.mp4"]}],
-]
-demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video Model Ever 📺",
-    description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
-    examples=examples,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
-    cache_examples=False,
-    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
-    type="messages"
-)
-
-
-
-demo.launch(debug=True)
-
+examples = [
+    [
+        {
+            "text": "Where do the severe droughts happen according to this diagram?",
+            "files": ["example_images/examples_weather_events.png"],
+        }
+    ],
+    [
+        {
+            "text": "What art era this artpiece <image> and this artpiece <image> belong to?",
+            "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"],
+        }
+    ],
+    [ {
+        "text": "Describe this image.",
+        "files": ["example_images/mosque.jpg"]
+    }
+    ],
+    [
+        {
+            "text": "When was this purchase made and how much did it cost?",
+            "files": ["example_images/fiche.jpg"],
+        }
+    ],
+    [
+        {
+            "text": "What is the date in this document?",
+            "files": ["example_images/document.jpg"],
+        }
+    ],
+    [
+        {
+            "text": "What is happening in the video?",
+            "files": ["example_images/short.mp4"],
+        }
+    ],
+]
+demo = gr.ChatInterface(
+    fn=model_inference,
+    title="SmolVLM2: The Smollest Video Model Ever 📺",
+    description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
+    examples=examples,
+    textbox=gr.MultimodalTextbox(
+        label="Query Input", file_types=["image", ".mp4"], file_count="multiple"
+    ),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+    additional_inputs=[
+        gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")
+    ],
+    type="messages",
+)
+
+
+demo.launch(debug=True)
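
Below is a minimal sketch (not part of the commit) of querying the OpenVINO export used by this Space directly through optimum.intel, without the Gradio UI. It reuses the model_id and the apply_chat_template arguments from the diff above; the image path is one of the demo's bundled example files and stands in for any local image.

from transformers import AutoProcessor
from optimum.intel import OVModelForVisualCausalLM

model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForVisualCausalLM.from_pretrained(model_id)

# Build a chat message in the same format the demo passes to apply_chat_template;
# "example_images/mosque.jpg" is one of the example files shipped with the Space.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "example_images/mosque.jpg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)

# Non-streaming generation; 200 matches the default of the demo's "Max Tokens" slider.
generated_ids = model.generate(**inputs, max_new_tokens=200)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])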