File size: 6,169 Bytes
ce3b288
 
 
 
 
 
0e1bb1e
ce3b288
 
 
3755dbb
ce3b288
3755dbb
ce3b288
7bf2a69
 
ce3b288
 
0e1bb1e
5b3c141
0e1bb1e
4a3e69a
ce3b288
4a3e69a
ce3b288
4a3e69a
0e1bb1e
ce3b288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64778e5
ce3b288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e1bb1e
 
ce3b288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bf2a69
ce3b288
 
 
 
3755dbb
ce3b288
3755dbb
7bf2a69
ce3b288
eccb754
510191b
eccb754
ce3b288
 
 
5b3c141
ce3b288
 
7bf2a69
8f5eed9
ce3b288
 
5b3c141
ce3b288
 
 
 
3755dbb
ce3b288
 
 
 
 
 
 
 
eebdd94
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import gradio as gr
import cv2
import torch
from PIL import Image
from pathlib import Path
from threading import Thread
from transformers import AutoModelForCausalLM, AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
import spaces
import time

# UI strings handed to gr.ChatInterface as its title and description.
TITLE = " google/gemma-3-270m-it  "
DESCRIPTION= """    
       It's so small
       """
# Text-direction settings forwarded to the chatbot and textbox components.
IS_RTL = False
TEXT_ALIGN = "left"

# model config
# The checkpoint is loaded once at import time; `.eval()` puts the module in
# inference mode. dtype and device placement are left to the library ("auto").
model_name = "google/gemma-3-270m-it"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager"
).eval()

# The processor applies the chat template in generate_response and is also
# passed to TextIteratorStreamer to decode the streamed tokens.
processor = AutoProcessor.from_pretrained(model_name)
# TODO: add timestamp support later.
def extract_video_frames(video_path, num_frames=8):
    """Sample up to ``num_frames`` evenly spaced frames from a video.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to sample. Values below 1
            produce an empty list.

    Returns:
        A list of RGB ``PIL.Image`` objects in playback order. Frames that
        cannot be read are skipped, so the list may hold fewer than
        ``num_frames`` items (or none at all).
    """
    # Nothing to sample; also avoids dividing by zero when computing the step.
    if num_frames <= 0:
        return []

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A step of at least 1 keeps short clips from re-reading frame 0.
        step = max(total_frames // num_frames, 1)

        for i in range(num_frames):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
            ok, frame = cap.read()
            if ok:
                # OpenCV decodes to BGR; PIL expects RGB channel order.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))
    finally:
        # Release the capture even if decoding or conversion raised.
        cap.release()
    return frames

def format_message(content, files):
    """Build the ``content`` list for a single user turn.

    The text is split on ``<image>`` placeholders. Each placeholder consumes
    the next attached file (opened as an image) so that it is interleaved
    with the surrounding text. Files left over afterwards are appended at the
    end: still images as one entry each, videos as one entry per sampled
    frame. Files with any other extension are skipped.

    Args:
        content: The user's text; may be empty or ``None``.
        files: Iterable of attachments, each a path string or an object
            exposing the path as ``.name``. ``None`` means no attachments.
            The iterable is not modified.

    Returns:
        A list of ``{"type": "text", "text": ...}`` and
        ``{"type": "image", "image": ...}`` dicts.
    """
    # Work on a private copy: consuming inline attachments must not empty the
    # list owned by the caller, and None / tuples must be accepted too.
    pending_files = list(files) if files else []

    message_content = []

    if content:
        parts = content.split('<image>')
        last_index = len(parts) - 1
        for i, part in enumerate(parts):
            stripped = part.strip()
            if stripped:
                message_content.append({"type": "text", "text": stripped})
            # Every segment except the last is followed by a placeholder.
            if i < last_index and pending_files:
                img = Image.open(pending_files.pop(0))
                message_content.append({"type": "image", "image": img})

    for file in pending_files:
        file_path = file if isinstance(file, str) else file.name
        suffix = Path(file_path).suffix.lower()
        if suffix in ('.jpg', '.jpeg', '.png'):
            img = Image.open(file_path)
            message_content.append({"type": "image", "image": img})
        elif suffix in ('.mp4', '.mov'):
            message_content.extend(
                {"type": "image", "image": frame}
                for frame in extract_video_frames(file_path)
            )
    return message_content

def format_conversation_history(chat_history):
    """Convert chat history entries into chat-template messages.

    Consecutive user entries are merged into one user message. Every
    assistant entry becomes its own message with a single text part.
    Entries with any other role are ignored.

    Args:
        chat_history: Iterable of dicts with ``"role"`` and ``"content"``
            keys. User content may be a string, a list of ready-made content
            parts, or any other object (which is stringified).

    Returns:
        A list of ``{"role": ..., "content": [...]}`` dicts in conversation
        order.
    """
    conversation = []
    pending_user_parts = []

    def flush_user_turn():
        # Emit the accumulated user parts (if any) as one message.
        nonlocal pending_user_parts
        if pending_user_parts:
            conversation.append({"role": "user", "content": pending_user_parts})
            pending_user_parts = []

    for entry in chat_history:
        speaker = entry["role"]
        body = entry["content"]

        if speaker == "assistant":
            # An assistant reply closes whatever user turn came before it.
            flush_user_turn()
            reply_part = {"type": "text", "text": str(body)}
            conversation.append({"role": "assistant", "content": [reply_part]})
            continue

        if speaker != "user":
            continue

        if isinstance(body, list):
            # Already a list of content parts: merge them as they are.
            pending_user_parts.extend(body)
        else:
            text_value = body if isinstance(body, str) else str(body)
            pending_user_parts.append({"type": "text", "text": text_value})

    # A trailing user turn has no assistant reply to trigger the flush.
    flush_user_turn()
    return conversation

@spaces.GPU(duration=120)
def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
    """Stream the model's reply to the latest user message.

    Args:
        input_data: Either a dict with a ``"text"`` key and optional
            ``"files"`` list, or any other object, which is stringified and
            treated as plain text.
        chat_history: Earlier turns as dicts with ``"role"`` and
            ``"content"`` keys.
        max_new_tokens: Upper bound on the number of generated tokens.
        system_prompt: Optional system instruction; omitted when empty.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus sampling threshold forwarded to ``model.generate``.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to
            ``model.generate``.

    Yields:
        The reply accumulated so far, once per chunk emitted by the streamer.
    """
    if isinstance(input_data, dict) and "text" in input_data:
        text = input_data["text"]
        files = input_data.get("files", [])
    else:
        text = str(input_data)
        files = []

    new_message = {"role": "user", "content": format_message(text, files)}
    system_message = [{"role": "system", "content": [{"type": "text", "text": system_prompt}]}] if system_prompt else []
    messages = system_message + format_conversation_history(chat_history)
    # If the history ends on an unanswered user turn, fold the new input into
    # it so the template never sees two consecutive user messages.
    if messages and messages[-1]["role"] == "user":
        messages[-1]["content"].extend(new_message["content"])
    else:
        messages.append(new_message)

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True
    ).to(model.device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty
    )
    # Generation runs on a worker thread so this generator can consume the
    # streamer concurrently and yield partial output.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    outputs = []
    # Use a distinct loop name so the prompt text above is not shadowed.
    for chunk in streamer:
        outputs.append(chunk)
        yield "".join(outputs)

    # The stream is exhausted, so generation is finishing; wait for the worker
    # so it does not outlive this call.
    thread.join()

# Chat UI definition. generate_response receives the textbox value and the chat
# history first, followed by the values of `additional_inputs` in the order
# they are listed below.
chat_interface = gr.ChatInterface(
    fn=generate_response,
    chatbot=gr.Chatbot(rtl=IS_RTL, show_copy_button=True,type="messages"),    
    # Order must match generate_response's parameters after chat_history:
    # max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty.
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=100, maximum=2000, step=1, value=512),
        gr.Textbox(
            label="System Prompt",
            value="You are a very helpful multimodal assistant",
            lines=4,
            placeholder="Change the settings",
            text_align = TEXT_ALIGN, rtl = IS_RTL
        ),
        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.2),
        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.4),
        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=30),
        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1),
    ],
    # Each example is a list holding one multimodal textbox value.
    examples=[
        [{"text": "Write a poem which describes potatoes"}],
    ],
    # NOTE(review): the placeholder says attachments are ignored, but
    # format_message still turns uploaded images/videos into image entries —
    # confirm which behaviour is intended.
    textbox=gr.MultimodalTextbox(
        rtl=IS_RTL,
        label="input",
        file_types=["image", "video"],
        file_count="multiple",
        placeholder="Input text, Any image or video will be ignored",
    ),
    cache_examples=False,
    type="messages",
    fill_height=True,
    stop_btn="Stop",
    # NOTE(review): expects a style.css file next to this script — confirm it exists.
    css_paths=["style.css"],
    multimodal=True,
    title=TITLE,
    description=DESCRIPTION,
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    # Enable request queueing (at most 20 waiting requests), then start the
    # web server on the object returned by queue().
    queued_app = chat_interface.queue(max_size=20)
    queued_app.launch()