Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,169 Bytes
ce3b288 0e1bb1e ce3b288 3755dbb ce3b288 3755dbb ce3b288 7bf2a69 ce3b288 0e1bb1e 5b3c141 0e1bb1e 4a3e69a ce3b288 4a3e69a ce3b288 4a3e69a 0e1bb1e ce3b288 64778e5 ce3b288 0e1bb1e ce3b288 7bf2a69 ce3b288 3755dbb ce3b288 3755dbb 7bf2a69 ce3b288 eccb754 510191b eccb754 ce3b288 5b3c141 ce3b288 7bf2a69 8f5eed9 ce3b288 5b3c141 ce3b288 3755dbb ce3b288 eebdd94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import gradio as gr
import cv2
import torch
from PIL import Image
from pathlib import Path
from threading import Thread
from transformers import AutoModelForCausalLM, AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
import spaces
import time
# --- UI text and layout -------------------------------------------------
TITLE = " google/gemma-3-270m-it "
DESCRIPTION = "\nIt's so small\n"
IS_RTL = False
TEXT_ALIGN = "left"

# --- Model setup --------------------------------------------------------
# The checkpoint and its processor are loaded once, at import time, and
# shared by every request handled by this module.
model_name = "google/gemma-3-270m-it"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager",
)
# Switch the module to evaluation mode; ``eval()`` returns the same object.
model.eval()
processor = AutoProcessor.from_pretrained(model_name)
# TODO: add timestamps to the extracted frames (not implemented yet).
def extract_video_frames(video_path, num_frames=8):
    """Sample up to ``num_frames`` frames from a video at a fixed stride.

    The stride is ``frame_count // num_frames`` (at least 1), so the samples
    start at frame 0 and are spread over the reported length of the video.

    Args:
        video_path: Path of the video file, handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to return.

    Returns:
        A list of RGB ``PIL.Image`` objects in playback order. The list is
        shorter than ``num_frames`` when reads fail (for example when the
        video has fewer frames than requested) and empty when ``num_frames``
        is not positive or the video cannot be opened.
    """
    # Guard first: a count of zero would otherwise divide by zero when the
    # stride is computed, and a negative count can never yield frames.
    if num_frames <= 0:
        return []

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return frames

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(total_frames // num_frames, 1)
        for index in range(num_frames):
            cap.set(cv2.CAP_PROP_POS_FRAMES, index * step)
            ret, frame = cap.read()
            if not ret:
                # Seeking past the end (or a decode failure) just skips
                # this sample instead of aborting the whole extraction.
                continue
            # OpenCV decodes to BGR; PIL expects RGB channel order.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(rgb))
    finally:
        # Always release the capture handle, even if conversion raises.
        cap.release()
    return frames
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")
_VIDEO_SUFFIXES = (".mp4", ".mov")


def _media_path(file):
    """Return the filesystem path of an upload.

    Uploads arrive either as a plain path string or as an object exposing
    the path through its ``name`` attribute.
    """
    return file if isinstance(file, str) else file.name


def format_message(content, files):
    """Build the content parts of one user message from text and uploads.

    ``content`` is split on the literal ``<image>`` placeholder. Each
    non-blank text fragment becomes a ``{"type": "text"}`` part, and each
    placeholder consumes the next uploaded file (in order) as an
    ``{"type": "image"}`` part. Files not consumed by a placeholder are
    appended afterwards: ``.jpg``/``.jpeg``/``.png`` files as one image part
    each, ``.mp4``/``.mov`` files as one image part per extracted frame.
    Leftover files with any other suffix are ignored.

    Args:
        content: The message text; may be empty or ``None``.
        files: Uploaded files as path strings or objects with a ``name``
            attribute; may be empty or ``None``. The sequence is only read,
            never modified.

    Returns:
        A list of content-part dicts in the order described above.
    """
    # Iterate instead of popping so the caller's list is left untouched;
    # the same iterator then hands the leftovers to the trailing loop.
    pending = iter(files or ())
    message_content = []

    if content:
        parts = content.split("<image>")
        last_index = len(parts) - 1
        for i, part in enumerate(parts):
            text = part.strip()
            if text:
                message_content.append({"type": "text", "text": text})
            # A placeholder sits between fragment i and i + 1.
            if i < last_index:
                file = next(pending, None)
                if file is not None:
                    img = Image.open(_media_path(file))
                    message_content.append({"type": "image", "image": img})

    for file in pending:
        file_path = _media_path(file)
        suffix = Path(file_path).suffix.lower()
        if suffix in _IMAGE_SUFFIXES:
            img = Image.open(file_path)
            message_content.append({"type": "image", "image": img})
        elif suffix in _VIDEO_SUFFIXES:
            message_content.extend(
                {"type": "image", "image": frame}
                for frame in extract_video_frames(file_path)
            )
    return message_content
def _as_content_parts(content):
    """Normalise message content to a list of content-part dicts.

    A string becomes a single text part, a list is taken to already be a
    list of parts (and is shallow-copied), and anything else is stringified
    into a single text part.
    """
    if isinstance(content, str):
        return [{"type": "text", "text": content}]
    if isinstance(content, list):
        return list(content)
    return [{"type": "text", "text": str(content)}]


def format_conversation_history(chat_history):
    """Convert chat history into chat-template style messages.

    Consecutive ``user`` entries are merged into one user message whose
    content is the concatenation of their parts. Each ``assistant`` entry
    first flushes any pending user parts and then becomes its own message.
    Entries with any other role are skipped.

    Args:
        chat_history: Iterable of dicts with ``"role"`` and ``"content"``
            keys, or ``None``/empty for a fresh conversation.

    Returns:
        A list of ``{"role": ..., "content": [parts...]}`` dicts.

    Raises:
        KeyError: If an entry lacks ``"role"`` or ``"content"``.
    """
    messages = []
    pending_user_parts = []

    for item in chat_history or ():
        role = item["role"]
        content = item["content"]
        if role == "user":
            pending_user_parts.extend(_as_content_parts(content))
        elif role == "assistant":
            if pending_user_parts:
                messages.append({"role": "user", "content": pending_user_parts})
                # Rebind rather than clear: the flushed list now belongs to
                # the message that was just appended.
                pending_user_parts = []
            messages.append(
                {"role": "assistant", "content": _as_content_parts(content)}
            )

    # History may end on user entries that no assistant turn has flushed.
    if pending_user_parts:
        messages.append({"role": "user", "content": pending_user_parts})
    return messages
@spaces.GPU(duration=120)
def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
    """Stream a model reply for the newest user input.

    The prompt is assembled from the optional system prompt, the formatted
    chat history and the new input; generation runs in a worker thread
    while this generator relays the text produced so far.

    Args:
        input_data: Either a dict with a ``"text"`` key and an optional
            ``"files"`` key, or any other value, which is stringified and
            used as the text.
        chat_history: Previous turns, passed to
            ``format_conversation_history``.
        max_new_tokens: Generation length limit; coerced to ``int``.
        system_prompt: Text of the system message; omitted when empty.
        temperature: Sampling temperature.
        top_p: Nucleus sampling threshold.
        top_k: Top-k sampling cutoff; coerced to ``int``.
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Yields:
        The accumulated response text after each streamed chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has ended.
    """
    if isinstance(input_data, dict) and "text" in input_data:
        text = input_data["text"]
        # ``files`` may be present but None; treat that like "no files".
        files = input_data.get("files") or []
    else:
        text = str(input_data)
        files = []

    new_message = {"role": "user", "content": format_message(text, files)}
    system_message = (
        [{"role": "system", "content": [{"type": "text", "text": system_prompt}]}]
        if system_prompt
        else []
    )
    messages = system_message + format_conversation_history(chat_history)

    # Fold the new input into a trailing user turn so that the conversation
    # never contains two consecutive user messages.
    if messages and messages[-1]["role"] == "user":
        messages[-1]["content"].extend(new_message["content"])
    else:
        messages.append(new_message)

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        # UI sliders can deliver floats; these two settings must be ints.
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        repetition_penalty=repetition_penalty,
    )

    errors = []

    def _run_generation():
        # Runs in the worker thread. If generation fails, the streamer would
        # never receive its end-of-stream signal and the loop below would
        # block forever, so close the stream and keep the error for later.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    outputs = []
    for chunk in streamer:
        outputs.append(chunk)
        yield "".join(outputs)

    thread.join()
    if errors:
        raise errors[0]
# Conversation display.
_chatbot = gr.Chatbot(rtl=IS_RTL, show_copy_button=True, type="messages")

# Extra controls. Their order must match the parameters of
# ``generate_response`` that follow ``chat_history``: max_new_tokens,
# system_prompt, temperature, top_p, top_k, repetition_penalty.
_generation_controls = [
    gr.Slider(label="Max new tokens", minimum=100, maximum=2000, step=1, value=512),
    gr.Textbox(
        label="System Prompt",
        value="You are a very helpful multimodal assistant",
        lines=4,
        placeholder="Change the settings",
        text_align=TEXT_ALIGN,
        rtl=IS_RTL,
    ),
    gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.2),
    gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.4),
    gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=30),
    gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1),
]

# Message entry box; accepts text plus any number of image/video uploads.
_prompt_box = gr.MultimodalTextbox(
    rtl=IS_RTL,
    label="input",
    file_types=["image", "video"],
    file_count="multiple",
    placeholder="Input text, Any image or video will be ignored",
)

# Assemble the chat UI around ``generate_response``.
chat_interface = gr.ChatInterface(
    fn=generate_response,
    chatbot=_chatbot,
    additional_inputs=_generation_controls,
    examples=[
        [{"text": "Write a poem which describes potatoes"}],
    ],
    textbox=_prompt_box,
    cache_examples=False,
    type="messages",
    fill_height=True,
    stop_btn="Stop",
    css_paths=["style.css"],
    multimodal=True,
    title=TITLE,
    description=DESCRIPTION,
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    # Enable the request queue (at most 20 waiting requests), then serve.
    chat_interface.queue(max_size=20)
    chat_interface.launch()
|