Spaces:

robot0820
/

VLM_Test

Running

App Files Community

robot0820 committed 6 days ago

Commit

513a480

verified ·

1 Parent(s): 7c695c3

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -27

app.py CHANGED Viewed

@@ -1,15 +1,16 @@
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
-# 模型路徑
 model_path = "deepseek-ai/deepseek-vl-7b-chat"
-# ==== BitsAndBytes 4-bit 量化設定 ====
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,  # 強制 float16
     bnb_4bit_use_double_quant=True
 )
@@ -17,7 +18,7 @@ bnb_config = BitsAndBytesConfig(
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
-# 載入模型 (4-bit 量化 + float16)
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
     model_path,
     quantization_config=bnb_config,
@@ -25,30 +26,35 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# ==== 單張圖片推理函式 ====
 def chat_with_image(image, user_message):
     try:
-        # 建立對話
-        conversation = [
-            {"role": "User", "content": "<image_placeholder>" + user_message, "images": [image]},
-            {"role": "Assistant", "content": ""}
-        ]
-        # 直接傳入 PIL.Image，不使用 load_pil_images
         prepare_inputs = vl_chat_processor(
             conversations=conversation,
-            images=[image],
             force_batchify=True
         ).to(vl_gpt.device)
-        # 🚨 將 BatchedVLChatProcessorOutput 轉 dict
         prepare_inputs = {k: getattr(prepare_inputs, k) for k in prepare_inputs.__dataclass_fields__.keys()}
-        # 正確 dtype：input_ids/labels 保持 long，其他 tensor 轉 float16
         new_inputs = {}
         for k, v in prepare_inputs.items():
             if torch.is_tensor(v):
-                if k in ["input_ids", "labels","attention_mask"]:
                     new_inputs[k] = v.to(torch.long)
                 else:
                     new_inputs[k] = v.to(torch.float16)
@@ -66,27 +72,40 @@ def chat_with_image(image, user_message):
             pad_token_id=tokenizer.eos_token_id,
             bos_token_id=tokenizer.bos_token_id,
             eos_token_id=tokenizer.eos_token_id,
-            max_new_tokens=128,  # 減少記憶體
             do_sample=False,
             use_cache=True
         )
         # 解碼
         answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
-        return answer
     except Exception as e:
-        return f"Error: {str(e)}"
 # ==== Gradio Web UI ====
-demo = gr.Interface(
-    fn=chat_with_image,
-    inputs=[gr.Image(type="pil", label="Upload Image"),
-            gr.Textbox(lines=2, placeholder="Ask about the image...")],
-    outputs="text",
-    title="DeepSeek-VL-7B-Chat Demo (4-bit, float16)",
-    description="上傳圖片並輸入問題，模型會生成與圖片相關的回答"
-)
 if __name__ == "__main__":
     demo.launch()

+# app.py
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
+# ==== 模型設定 ====
 model_path = "deepseek-ai/deepseek-vl-7b-chat"
+# BitsAndBytes 4-bit 量化設定
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
     bnb_4bit_use_double_quant=True
 )
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
+# 載入模型
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
     model_path,
     quantization_config=bnb_config,
     trust_remote_code=True
 ).eval()
+# ==== 對話歷史 ====
+chat_history = []
+# ==== 文字+圖片推理函式 ====
 def chat_with_image(image, user_message):
+    global chat_history
     try:
+        # 建立對話內容
+        conversation = chat_history.copy()
+        conversation.append({
+            "role": "User",
+            "content": "<image_placeholder>" + user_message,
+            "images": [image] if image else []
+        })
+        conversation.append({"role": "Assistant", "content": ""})
+        # 準備輸入
         prepare_inputs = vl_chat_processor(
             conversations=conversation,
+            images=[image] if image else [],
             force_batchify=True
         ).to(vl_gpt.device)
+        # 轉成 dict，並正確處理 dtype
         prepare_inputs = {k: getattr(prepare_inputs, k) for k in prepare_inputs.__dataclass_fields__.keys()}
         new_inputs = {}
         for k, v in prepare_inputs.items():
             if torch.is_tensor(v):
+                if k in ["input_ids", "labels"]:
                     new_inputs[k] = v.to(torch.long)
                 else:
                     new_inputs[k] = v.to(torch.float16)
             pad_token_id=tokenizer.eos_token_id,
             bos_token_id=tokenizer.bos_token_id,
             eos_token_id=tokenizer.eos_token_id,
+            max_new_tokens=128,
             do_sample=False,
             use_cache=True
         )
         # 解碼
         answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+        # 更新歷史
+        chat_history.append((user_message, answer))
+        return answer, chat_history
     except Exception as e:
+        return f"Error: {str(e)}", chat_history
+def reset_chat():
+    global chat_history
+    chat_history = []
+    return "", []
 # ==== Gradio Web UI ====
+with gr.Blocks() as demo:
+    gr.Markdown("# DeepSeek-VL-7B-Chat Demo (4-bit, float16)")
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Upload Image")
+        text_input = gr.Textbox(lines=2, placeholder="Ask about the image...")
+    with gr.Row():
+        submit_btn = gr.Button("Submit")
+        reset_btn = gr.Button("Reset Chat")
+    output_text = gr.Textbox(label="Answer")
+    chat_display = gr.Chatbot(label="Chat History")
+    submit_btn.click(chat_with_image, inputs=[image_input, text_input], outputs=[output_text, chat_display])
+    reset_btn.click(reset_chat, inputs=[], outputs=[output_text, chat_display])
 if __name__ == "__main__":
     demo.launch()