Dec ZeroGPU usage
- app.py +17 -10
- requirements.txt +2 -1
app.py CHANGED
@@ -11,7 +11,7 @@ model_path = "Pectics/Softie-VL-7B-250123"
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_path,
     torch_dtype=bfloat16,
-
+    attn_implementation="flash_attention_2",
     device_map="auto",
 )
 min_pixels = 256 * 28 * 28
@@ -19,19 +19,12 @@ max_pixels = 1280 * 28 * 28
 processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)

 @spaces.GPU
-def respond(
-    message,
-    history,
-    system_message,
+def infer(
+    messages,
     max_tokens,
     temperature,
     top_p,
 ):
-    messages = [{"role": "system", "content": system_message}]
-    for m in history:
-        messages.append({"role": m["role"], "content": m["content"]})
-    messages.append({"role": "user", "content": message})
-
     text_inputs = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
@@ -58,6 +51,20 @@ def respond(
         response += token
         yield response

+def respond(
+    message,
+    history,
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    messages = [{"role": "system", "content": system_message}]
+    for m in history:
+        messages.append({"role": m["role"], "content": m["content"]})
+    messages.append({"role": "user", "content": message})
+    return infer(messages, max_tokens, temperature, top_p)
+
 app = gr.ChatInterface(
     respond,
     type="messages",
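The substantive change in app.py splits the old GPU-decorated respond into two functions: infer keeps the @spaces.GPU decorator and does only the model work, while the new respond assembles the chat messages on the CPU and delegates to it. On ZeroGPU Spaces, @spaces.GPU allocates a GPU for each decorated call, so keeping pure-Python bookkeeping outside the decorated function shortens every allocation, which is what the commit title refers to. A minimal sketch of the pattern (the duration argument is an optional ZeroGPU knob, an assumption here rather than part of this commit):

import spaces

@spaces.GPU(duration=60)  # GPU is held only while this call runs; duration caps the window
def infer(messages, max_tokens, temperature, top_p):
    # Model work only: move inputs to the GPU, stream tokens from generate().
    yield "..."

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Pure-Python message assembly; no GPU allocation is triggered here.
    messages = [{"role": "system", "content": system_message}]
    messages += [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    return infer(messages, max_tokens, temperature, top_p)

Since infer is a generator (it yields partial responses), respond returning it unchanged hands the generator straight to gr.ChatInterface, so streaming behavior is preserved.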
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torchvision
 transformers
 accelerate
 qwen-vl-utils
-gradio
+gradio
+flash-attn
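The other change enables FlashAttention-2 via attn_implementation="flash_attention_2", which is why flash-attn joins requirements.txt. Note that flash-attn compiles CUDA kernels at install time unless a matching prebuilt wheel is available, so it needs a CUDA toolchain and a torch version the wheel was built against. A small sanity check, as a sketch assuming a local CUDA GPU (config._attn_implementation is a private transformers attribute and may change across versions):

import torch
from transformers import Qwen2VLForConditionalGeneration

# from_pretrained raises immediately if flash-attn is missing or incompatible,
# so loading the model doubles as an environment check.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Pectics/Softie-VL-7B-250123",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
print(model.config._attn_implementation)  # expected: "flash_attention_2"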