# Softie / app.py
# Author: Pectics
# Commit: "First trial" (03d2f46)
# File size: 2.28 kB
import gradio as gr
import spaces
from threading import Thread
from torch import bfloat16
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor
from qwen_vl_utils import process_vision_info
# Hub repository that provides both the model weights and the processor config.
model_path = "Pectics/Softie-VL-7B-250123"

# Lower/upper bounds on image resolution handed to the processor, expressed as
# a pixel count (N * 28 * 28).
# NOTE(review): presumably N is the visual-token budget per image as in the
# upstream Qwen2-VL examples — confirm against the model card.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28

# Load the weights in bfloat16 and let `device_map="auto"` decide placement.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=bfloat16,
    # attn_implementation="flash_attention_2",  # optional; currently disabled
)

# The processor is used for chat templating, input preparation and decoding.
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(
    model_path,
    max_pixels=max_pixels,
    min_pixels=min_pixels,
)
@spaces.GPU
def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The newest user message; appended as the final ``user`` turn.
        history: Earlier turns, each a mapping with ``"role"`` and
            ``"content"`` keys. ``None`` or empty means no prior turns.
        system_message: Content of the leading ``system`` turn.
        max_tokens: Upper bound on newly generated tokens (coerced to ``int``).
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Yields:
        The response accumulated so far, once per chunk produced by the
        streamer, so the caller can re-render the partial answer.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here after the stream has been drained.
    """
    # Conversation layout: system prompt, prior turns, then the new message.
    messages = [{"role": "system", "content": system_message}]
    messages.extend(
        {"role": turn["role"], "content": turn["content"]}
        for turn in (history or [])
    )
    messages.append({"role": "user", "content": message})

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    # Follow the model's actual placement (it is loaded with
    # device_map="auto") instead of assuming a device literally named "cuda".
    ).to(model.device)

    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        # Sliders may deliver a float; generate expects an integer count.
        max_new_tokens=int(max_tokens),
        # temperature/top_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    errors = []

    def _generate():
        """Run generation; on failure, record the error and unblock the reader."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller below
            errors.append(exc)
            # Without an end signal the consumer loop would wait forever.
            streamer.end()

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    worker = Thread(target=_generate)
    worker.start()

    response = ""
    for token in streamer:
        response += token
        yield response

    # The stream has ended, so the worker is finishing; reap it and surface
    # any failure instead of silently returning a truncated answer.
    worker.join()
    if errors:
        raise errors[0]
# Chat UI around `respond`. The additional inputs are listed in the same order
# as respond's trailing parameters: system_message, max_tokens, temperature,
# top_p.
app = gr.ChatInterface(
    fn=respond,
    type="messages",
    additional_inputs=[
        # System prompt.
        gr.Textbox(label="系统设定", value="You are Softie, a helpful assistant."),
        # Maximum number of generated tokens.
        gr.Slider(label="最大生成长度", value=512, minimum=1, maximum=2048, step=1),
        # Sampling temperature.
        gr.Slider(
            label="温度系数(Temperature)",
            value=0.75,
            minimum=0.01,
            maximum=4.0,
            step=0.01,
        ),
        # Nucleus sampling (top-p) threshold.
        gr.Slider(
            label="核取样系数(Top-p)",
            value=0.5,
            minimum=0.01,
            maximum=1.0,
            step=0.01,
        ),
    ],
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    app.launch()