robot0820 committed on
Commit dcef8cb · verified · 1 Parent(s): df33f8b

Create app.py

Files changed (1): app.py (+52, -0)
app.py ADDED
@@ -0,0 +1,52 @@
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images


# specify the path to the model
model_path = "deepseek-ai/deepseek-vl-7b-chat"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# load the model in bfloat16 and move it to the GPU for inference
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# single-turn conversation; <image_placeholder> marks where the image is injected
conversation = [
    {
        "role": "User",
        "content": "<image_placeholder>Describe each stage of this image.",
        "images": ["./images/training_pipelines.png"]
    },
    {
        "role": "Assistant",
        "content": ""
    }
]

# load images and prepare the batched model inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True
).to(vl_gpt.device)

# run the image encoder to get the image embeddings
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# run the language model to generate the response
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True
)

# decode the generated token ids and print the formatted prompt followed by the answer
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(f"{prepare_inputs['sft_format'][0]}", answer)
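
For reuse (e.g., answering different questions about different images without repeating the boilerplate), the inference steps above could be wrapped in a helper. The following is a minimal sketch, not part of the committed file: it assumes the vl_chat_processor, vl_gpt, and tokenizer objects have already been created as in app.py, and the function name and signature are illustrative only.

# sketch: reusable wrapper around the pipeline in app.py (hypothetical helper,
# assumes vl_chat_processor, vl_gpt, tokenizer, and load_pil_images are in scope)
def describe_image(image_path: str, question: str) -> str:
    # build the same single-turn conversation structure used above
    conversation = [
        {
            "role": "User",
            "content": f"<image_placeholder>{question}",
            "images": [image_path]
        },
        {"role": "Assistant", "content": ""}
    ]
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True
    ).to(vl_gpt.device)
    # encode the image, then generate greedily, exactly as in the script
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True
    )
    return tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)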