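# DeepSeek-VL-7B-Chat Gradio demo: loads the model with BitsAndBytes 4-bit
# quantization (float16 compute dtype) and answers questions about a single
# uploaded image through a simple web UI.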
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM

# Model path
model_path = "deepseek-ai/deepseek-vl-7b-chat"

# ==== BitsAndBytes 4-bit quantization config ====
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # force float16 compute
    bnb_4bit_use_double_quant=True
)

# Load the processor and tokenizer
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# Load the model (4-bit quantization + float16)
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
).eval()

# ==== Single-image inference function ====
def chat_with_image(image, user_message):
    try:
        conversation = [
            {"role": "User", "content": "<image_placeholder>" + user_message, "images": [image]},
            {"role": "Assistant", "content": ""}
        ]

        # Pass the PIL.Image directly instead of going through load_pil_images
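        # force_batchify=True wraps this single conversation into a batch of one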
        prepare_inputs = vl_chat_processor(
            conversations=conversation,
            images=[image],
            force_batchify=True
        ).to(vl_gpt.device)

        # dtype handling: cast floating-point tensors (pixel_values) to float16 to
        # match the 4-bit compute dtype; keep input_ids, attention_mask and the
        # image masks in their integer/bool dtypes so boolean indexing still works
        for k in prepare_inputs.keys():
            v = prepare_inputs[k]
            if torch.is_tensor(v) and torch.is_floating_point(v):
                prepare_inputs[k] = v.to(torch.float16)

        # ๅ–ๅพ— embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # Generate the answer
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=128,
            do_sample=False,
            use_cache=True
        )

        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer

    except Exception as e:
        return f"Error: {str(e)}"

# ==== Gradio Web UI ====
demo = gr.Interface(
    fn=chat_with_image,
    inputs=[gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(lines=2, placeholder="Ask about the image...")],
    outputs="text",
    title="DeepSeek-VL-7B-Chat Demo (4-bit, float16)",
    description="Upload an image and enter a question; the model will generate an answer about the image."
)

if __name__ == "__main__":
    demo.launch()