import torch
import gradio as gr
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
# Model path
model_path = "deepseek-ai/deepseek-vl-7b-chat"
# ==== BitsAndBytes 4-bit quantization settings ====
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # force float16 compute
    bnb_4bit_use_double_quant=True
)
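# Note: with bitsandbytes, double quantization also quantizes the quantization
# constants themselves, trimming extra memory from the 4-bit weights, and the
# float16 compute dtype keeps dequantized matmuls in the same dtype this
# script assumes everywhere else.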
# Load the processor and tokenizer
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer
# Load the model (4-bit quantization + float16)
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
).eval()
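# device_map="auto" lets Accelerate place the quantized weights on the
# available GPU(s), spilling to CPU if necessary; .eval() disables dropout
# for inference.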
# ==== Single-image inference function ====
def chat_with_image(image, user_message):
    try:
        conversation = [
            {"role": "User", "content": "<image_placeholder>" + user_message, "images": [image]},
            {"role": "Assistant", "content": ""}
        ]
        # Pass the PIL.Image in directly instead of going through load_pil_images
        prepare_inputs = vl_chat_processor(
            conversations=conversation,
            images=[image],
            force_batchify=True
        ).to(vl_gpt.device)
        # Dtype handling: cast only floating-point tensors (i.e. pixel_values)
        # to float16 to match the bnb compute dtype. Token ids must stay int64,
        # and the boolean image masks must keep their dtype because
        # prepare_inputs_embeds uses them for boolean indexing. The processor
        # output exposes keys()/__getitem__ rather than items(), so iterate
        # over keys().
        new_inputs = {}
        for k in prepare_inputs.keys():
            v = prepare_inputs[k]
            if torch.is_tensor(v) and v.is_floating_point():
                new_inputs[k] = v.to(torch.float16)
            else:
                new_inputs[k] = v
        prepare_inputs = new_inputs
        # Fuse the text tokens and image features into one embedding sequence
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
        # Generate the answer (greedy decoding, up to 128 new tokens)
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=128,
            do_sample=False,
            use_cache=True
        )
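        # Because generation starts from inputs_embeds rather than input_ids,
        # recent transformers versions return only the newly generated tokens,
        # so outputs[0] can be decoded without stripping the prompt (an
        # assumption worth re-checking against your installed version).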
        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer
    except Exception as e:
        return f"Error: {str(e)}"
# ==== Gradio Web UI ====
demo = gr.Interface(
    fn=chat_with_image,
    inputs=[gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(lines=2, placeholder="Ask about the image...")],
    outputs="text",
    title="DeepSeek-VL-7B-Chat Demo (4-bit, float16)",
    description="Upload an image and enter a question; the model will generate an answer related to the image."
)
if __name__ == "__main__":
    demo.launch()
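# Usage (a sketch, assuming this file is saved as app.py): run `python app.py`;
# Gradio serves the UI at http://127.0.0.1:7860 by default. Pass share=True
# to demo.launch() for a temporary public link.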