# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
import os
import re
import cv2
import json
import time
import numpy as np
import gradio as gr

from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
from visualizer import draw_boxes_points_with_labels

infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))
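# NOTE: MODEL_ID and API_KEY are read from the environment and must be set before launching,
# e.g. (illustrative placeholders, not real values):
#   export MODEL_ID=<your model endpoint id>
#   export API_KEY=<your api key>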
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
        "中文": "对话界面"
    },
    "gr_tab_ol": {
        "English": "Online",
        "中文": "在线模式"
    },
    "gr_tab_ofl": {
        "English": "Offline",
        "中文": "离线模式"
    },
    "gr_thinking": {
        "English": ConversationModeI18N.D,
        "中文": ConversationModeCN.D,
    },
    "gr_temperature": {
        "English": "Temperature",
        "中文": "温度系数"
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
        "中文": "🤳 打开摄像头"
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
        "中文": "📹 录制的视频帧"
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English":
        "Ask me anything. You can also drop in images and .mp4 videos.",
        "中文": "有什么想问的?支持上传图片和.mp4视频。"
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
        "中文": "有什么想问的?"
    },
    "gr_clear_button": {
        "English": "🧹 Clear History",
        "中文": "🧹 清除历史对话"
    }
}
def add_escape(text: str):
    return text.replace('<', '\\<').replace('>', '\\>')


def remove_escape(text: str):
    return text.replace('\\<', '<').replace('\\>', '>')
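# Illustrative round trip: add_escape('<bbox>1 2 3 4</bbox>') -> '\\<bbox\\>1 2 3 4\\</bbox\\>',
# which remove_escape() reverses. Escaping presumably keeps raw tags from being swallowed as
# HTML/Markdown by gr.Chatbot (an assumption about the renderer, not stated in the original code).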
def plot_boxes_points_detections(image_path, message):
    # First look for open-vocabulary detection output as a JSON list of dicts.
    detection_pattern = r'\[\s*{.*?}\s*\]'
    detection_matches = re.finditer(detection_pattern, message, flags=re.DOTALL)
    bboxes, categories = [], []
    for match in detection_matches:
        matched_str = match.group(0)
        detections = json.loads(matched_str)
        for detection in detections:
            cat, bbox_str = detection['category'], detection['bbox']
            bbox_str = bbox_str.replace('<bbox>', '').replace('</bbox>', '').replace('</bbox', '')
            bbox = list(map(float, bbox_str.split(' ')))
            bboxes.append(bbox)
            categories.append(cat)
    if not bboxes:
        # Fall back to bare <bbox>x1 y1 x2 y2</bbox> tags.
        box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
        box_matches = re.finditer(box_pattern, message)
        bboxes = [
            [float(match.group(1)), float(match.group(2)),
             float(match.group(3)), float(match.group(4))]
            for match in box_matches
        ]
    points = []
    if not bboxes:
        # Fall back to <point>x y</point> tags.
        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
        point_matches = re.finditer(point_pattern, message)
        points = [
            [float(match.group(1)), float(match.group(2))]
            for match in point_matches
        ]
    if not bboxes and not points:
        return
    # Coordinates are normalized to a 0-1000 range; rescale them to pixel coordinates.
    bboxes = np.array(bboxes, dtype='float') / 1000
    points = np.array(points, dtype='float') / 1000
    image = cv2.imread(image_path)
    h, w, c = image.shape
    if bboxes.size:
        bboxes[:, 0::2] *= w
        bboxes[:, 1::2] *= h
    if points.size:
        points[:, 0] *= w
        points[:, 1] *= h
    output_image = draw_boxes_points_with_labels(image, bboxes, points, categories)
    return output_image
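# The parser above handles the grounding formats used by the demo prompts below, e.g.:
#   detection JSON: [{"category": "cup", "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, ...]
#   bare boxes:     <bbox>103 212 405 678</bbox>
#   points:         <point>500 372</point><point>640 210</point>
# (the numeric values here are illustrative only)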
def general_chat(inputs: dict, gr_history: list, infer_history: list,
                 if_thinking: bool, temperature: float, online: bool = False):
    if 'text' in inputs:
        inputs['text'] = remove_escape(inputs['text'])
    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
    for response_text, infer_history, finished in infer(inputs=inputs,
                                                        history=infer_history,
                                                        mode=mode,
                                                        temperature=temperature,
                                                        online=online):
        if if_thinking:
            # Split the streamed text into the reasoning part and the final answer.
            reasoning_text, response_text = response_text.split('</think>')
            reasoning_text = reasoning_text.removeprefix('<think>')
            response_message = [{
                "role": "assistant",
                "content": add_escape(reasoning_text),
                'metadata': {
                    'title': '🤔 Thinking'
                }
            }, {
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        else:
            response_message = [{
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        if finished and len(inputs.get('files', [])) == 1 and not inputs['files'][0].endswith('.mp4'):
            # For a single image input, try to visualize any boxes/points found in the final answer.
            image_path = inputs['files'][0]
            response_text = infer_history[-1]['content']
            try:
                if if_thinking:
                    reasoning_text, response_text = response_text.split('</think>')
                output_image = plot_boxes_points_detections(image_path, response_text)
                if output_image is not None:
                    response_message.append({
                        "role": "assistant",
                        "content": gr.Image(output_image),
                    })
            except Exception as e:
                print(e)
        yield response_message, infer_history
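# Minimal usage sketch (illustrative; requires a reachable SeedVLInfer backend, so it is not
# executed at import time):
#   inputs = {'text': 'Introduce this.', 'files': ['examples/bancopy.jpg']}
#   for messages, history in general_chat(inputs, [], [], if_thinking=True, temperature=0.0):
#       pass  # `messages` is the list of assistant messages streamed into the Chatbot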
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                       gr_counter: int, infer_history: list, if_thinking: bool,
                       temperature: float):
    if not gr_webcam_images:
        gr_webcam_images = []
    gr_webcam_images = gr_webcam_images[gr_counter:]
    inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
    yield f'received {len(gr_webcam_images)} new frames, processing...', gr_counter + len(
        gr_webcam_images), infer_history
    for response_message, infer_history in general_chat(
            inputs, gr_history, infer_history, if_thinking, temperature, online=True):
        yield response_message, gr.skip(), infer_history
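# Only the frames recorded since the previous turn (those after gr_counter) are attached to the
# message; the counter is then advanced so the same frames are not re-sent on the next turn.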
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr_title = gr.Markdown('# Seed1.5-VL')
            with gr.Row():
                gr.Markdown(
                    """
                    <div style="display:flex; flex-direction:column; gap:10px;">
                    <a
                        href="https://github.com/ByteDance-Seed/Seed1.5-VL"
                        target="_blank"
                        style="
                            display: inline-flex;
                            align-items: center;
                            gap: 8px;
                            white-space: nowrap;
                            text-decoration: none;
                        "
                    >
                        <img
                            src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg"
                            alt="GitHub"
                            width="24"
                        >
                        Seed1.5-VL Cookbook
                    </a>
                    </div>
                    """
                )
                gr.Markdown(
                    """
                    <div style="display:flex; flex-direction:column; gap:10px;">
                    <a
                        href="https://huggingface.co/papers/2505.07062"
                        target="_blank"
                        style="
                            display: inline-flex;
                            align-items: center;
                            gap: 8px;
                            white-space: nowrap;
                            text-decoration: none;
                        "
                    >
                        <img
                            src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
                            alt="Paper"
                            width="24"
                        >
                        Seed1.5-VL Paper
                    </a>
                    </div>
                    """,
                )
                gr.Markdown('')
                gr.Markdown('')
                gr.Markdown('')
        gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
                                       value="English",
                                       label="🌐 English Interface/中文界面",
                                       interactive=True,
                                       min_width=400,
                                       scale=0)
    with gr.Tabs():
        with gr.Tab("Offline") as gr_tab_ofl:
            gr_infer_history = gr.State([])
            gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
            gr_temperature_hidden = gr.Slider(minimum=0.0,
                                              maximum=2.0,
                                              step=0.1,
                                              value=0.0,
                                              interactive=True,
                                              visible=False)
            gr_chatinterface_ofl = gr.ChatInterface(
                fn=general_chat,
                type="messages",
                multimodal=True,
                chatbot=gr.Chatbot(height=600),
                textbox=gr.MultimodalTextbox(
                    file_count="multiple",
                    file_types=["image", ".mp4"],
                    sources=["upload"],
                    stop_btn=True,
                    placeholder=label_translations[
                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
                ),
                additional_inputs=[
                    gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
                ],
                additional_outputs=[gr_infer_history],
            )

            def add_escape_fn(inputs: dict):
                if inputs and 'text' in inputs:
                    inputs['text'] = add_escape(inputs['text'])
                return inputs

            gr_chatinterface_ofl.textbox.submit(
                fn=add_escape_fn,
                inputs=[gr_chatinterface_ofl.saved_input],
                outputs=[gr_chatinterface_ofl.saved_input]
            )
            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                  fn=lambda: [],
                  outputs=[gr_infer_history])
            with gr.Row():
                gr_thinking_ofl = gr.Checkbox(
                    value=True,
                    label=label_translations['gr_thinking']['English'],
                )
                gr_thinking_ofl.change(lambda x: x,
                                       inputs=gr_thinking_ofl,
                                       outputs=gr_thinking_hidden)
                gr_temperature_ofl = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=0.0,
                    label=label_translations['gr_temperature']['English'],
                    interactive=True)
                gr_temperature_ofl.change(lambda x: x,
                                          inputs=gr_temperature_ofl,
                                          outputs=gr_temperature_hidden)
                gr_clear_button_ofl = gr.Button(value=label_translations['gr_clear_button']['English'])

            def clear_history_fn():
                return None, [], [], [], []

            gr_clear_button_ofl.click(
                fn=clear_history_fn,
                outputs=[
                    gr_chatinterface_ofl.conversation_id,
                    gr_chatinterface_ofl.saved_conversations,
                    gr_chatinterface_ofl.chatbot,
                    gr_chatinterface_ofl.chatbot_state,
                    gr_infer_history
                ]
            )
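            # Note on the wiring above: the visible checkbox/slider mirror their values into the
            # hidden components via .change(lambda x: x, ...), and the ChatInterface reads only
            # the hidden ones through additional_inputs.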
            with gr.Column(visible=True) as gr_examples_en:
                gr.Examples(
                    label='7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
                    examples=[
                        {
                            "text": "Who are you?",
                            "files": []
                        },
                        {
                            "text": "Introduce this.",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text":
                            """Find Curry's "Good Night" celebration time.""",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text": "Share your feelings.",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "Look and answer.",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            "text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
            with gr.Column(visible=False) as gr_examples_cn:
                gr.Examples(
                    label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
                    examples=[
                        {
                            "text": "你是谁?",
                            "files": []
                        },
                        {
                            "text": "介绍一下。",
                            "files": ["examples/bancopy.jpg"]
                        },
                        {
                            "text": "找到库里的“晚安”庆祝时间段。",
                            "files":
                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                        },
                        {
                            "text": "你有什么感想?",
                            "files": [
                                "examples/newyork.jpg",
                                "examples/beijing.jpg"
                            ]
                        },
                        {
                            "text": "看图回答。",
                            "files": ["examples/puzzle.jpg"]
                        },
                        {
                            "text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
                            "files": ["examples/000000001000.jpeg"]
                        },
                        {
                            "text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                            "files": ["examples/000000018380.jpeg"]
                        }
                    ],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
| with gr.Tab("Online") as gr_tab_ol: | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr_infer_history_ol = gr.State([]) | |
| gr_thinking_hidden = gr.Checkbox(value=True, visible=False) | |
| gr_temperature_hidden = gr.Slider(minimum=0.0, | |
| maximum=2.0, | |
| step=0.1, | |
| value=1.0, | |
| interactive=True, | |
| visible=False) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr_webcam_image = gr.Image( | |
| label=label_translations['gr_webcam_image'] | |
| ['English'], | |
| sources="webcam", | |
| height=250, | |
| type='filepath') | |
| gr_webcam_images = gr.Gallery( | |
| label=label_translations['gr_webcam_images'] | |
| ['English'], | |
| show_label=True, | |
| format='webp', | |
| columns=1, | |
| height=250, | |
| preview=True, | |
| interactive=False) | |
| gr_counter = gr.Number(value=0, visible=False) | |
| with gr.Column(scale=3): | |
| gr_chatinterface_ol = gr.ChatInterface( | |
| fn=online_record_chat, | |
| type="messages", | |
| multimodal=False, | |
| chatbot=gr.Chatbot(height=600), | |
| textbox=gr. | |
| Textbox(placeholder=label_translations[ | |
| 'gr_chatinterface_ol.textbox.placeholder'] | |
| ['English'], | |
| submit_btn=True, | |
| stop_btn=True), | |
| additional_inputs=[ | |
| gr_webcam_images, gr_counter, | |
| gr_infer_history_ol, gr_thinking_hidden, | |
| gr_temperature_hidden | |
| ], | |
| additional_outputs=[ | |
| gr_counter, gr_infer_history_ol | |
| ], | |
| ) | |
| def cache_webcam(recorded_image: str, | |
| recorded_images: list): | |
| if not recorded_images: | |
| recorded_images = [] | |
| return recorded_images + [recorded_image] | |
| gr_webcam_image.stream( | |
| fn=cache_webcam, | |
| inputs=[gr_webcam_image, gr_webcam_images], | |
| outputs=[gr_webcam_images], | |
| stream_every=1, | |
| concurrency_limit=30, | |
| ) | |
| with gr.Row(): | |
| gr_thinking_ol = gr.Checkbox( | |
| value=True, | |
| label=label_translations['gr_thinking'] | |
| ['English'], | |
| ) | |
| gr_thinking_ol.change( | |
| lambda x: x, | |
| inputs=gr_thinking_ol, | |
| outputs=gr_thinking_hidden) | |
| gr_temperature_ol = gr.Slider( | |
| minimum=0.0, | |
| maximum=2.0, | |
| step=0.1, | |
| value=1.0, | |
| label=label_translations['gr_temperature'] | |
| ['English'], | |
| interactive=True) | |
| gr_temperature_ol.change( | |
| lambda x: x, | |
| inputs=gr_temperature_ol, | |
| outputs=gr_temperature_hidden) | |
| gr_clear_button_ol = gr.Button(value=label_translations['gr_clear_button']['English']) | |
| def clear_history_fn(): | |
| return None, [], [], [], [] | |
| gr_clear_button_ol.click( | |
| fn=clear_history_fn, | |
| outputs=[ | |
| gr_chatinterface_ol.conversation_id, | |
| gr_chatinterface_ol.saved_conversations, | |
| gr_chatinterface_ol.chatbot, | |
| gr_chatinterface_ol.chatbot_state, | |
| gr_infer_history_ol | |
| ] | |
| ) | |
    def update_lang(lang: str):
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(visible=lang != 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_thinking_ofl,
                                gr_thinking_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_examples_cn,
                                gr_webcam_image,
                                gr_webcam_images,
                                gr_clear_button_ofl,
                                gr_clear_button_ol,
                            ])

demo.queue(default_concurrency_limit=100, max_size=100).launch(
    share=True, max_threads=100, ssr_mode=False)
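# To run locally (an assumption about the file layout; the original does not state it):
#   MODEL_ID=<model id> API_KEY=<api key> python app.py
# with infer.py, visualizer.py, and the examples/ folder next to this script.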