import gradio as gr from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer from threading import Thread import re import time from optimum.intel import OVModelForVisualCausalLM # model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino" model_id = "echarlaix/SmolVLM-256M-Instruct-openvino" processor = AutoProcessor.from_pretrained(model_id) model = OVModelForVisualCausalLM.from_pretrained(model_id) def model_inference(input_dict, history, max_tokens): text = input_dict["text"] images = [] user_content = [] media_queue = [] if history == []: text = input_dict["text"].strip() for file in input_dict.get("files", []): if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")): media_queue.append({"type": "image", "path": file}) elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")): media_queue.append({"type": "video", "path": file}) if "" in text or "