{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eb580f54d0ec4077b5c39da99f35c4f0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/38 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
    "from qwen_vl_utils import process_vision_info\n",
    "import torch\n",
    "\n",
    "#torch_device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "local_model_path = \"models/QVQ-72B-Preview\"\n",
    "\n",
    "# default: Load the model on the available device(s)\n",
    "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
    "    #\"Qwen/QVQ-72B-Preview\", \n",
    "    local_model_path,\n",
    "    torch_dtype=\"auto\", \n",
    "    device_map=\"auto\"\n",
    ")\n",
    "#model.to(\"cuda\") \n",
    "# default processer\n",
    "processor = AutoProcessor.from_pretrained(local_model_path)\n",
    "\n",
    "# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.\n",
    "# min_pixels = 256*28*28\n",
    "# max_pixels = 1280*28*28\n",
    "# processor = AutoProcessor.from_pretrained(\"Qwen/QVQ-72B-Preview\", min_pixels=min_pixels, max_pixels=max_pixels)\n"
   ]
  },
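  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (added sketch, not part of the original run):\n",
    "# with device_map=\"auto\" the checkpoint is sharded across the visible GPUs,\n",
    "# so it can help to confirm how much memory each device actually holds.\n",
    "if torch.cuda.is_available():\n",
    "    for i in range(torch.cuda.device_count()):\n",
    "        allocated_gb = torch.cuda.memory_allocated(i) / 1024**3\n",
    "        print(f\"cuda:{i} ({torch.cuda.get_device_name(i)}): {allocated_gb:.1f} GiB allocated\")\n"
   ]
  },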
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "######################################################\n",
      "{'visual': 0, 'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.layers.32': 1, 'model.layers.33': 1, 'model.layers.34': 1, 'model.layers.35': 1, 'model.layers.36': 1, 'model.layers.37': 1, 'model.layers.38': 1, 'model.layers.39': 1, 'model.layers.40': 2, 'model.layers.41': 2, 'model.layers.42': 2, 'model.layers.43': 2, 'model.layers.44': 2, 'model.layers.45': 2, 'model.layers.46': 2, 'model.layers.47': 2, 'model.layers.48': 2, 'model.layers.49': 2, 'model.layers.50': 2, 'model.layers.51': 2, 'model.layers.52': 2, 'model.layers.53': 2, 'model.layers.54': 2, 'model.layers.55': 2, 'model.layers.56': 2, 'model.layers.57': 2, 'model.layers.58': 2, 'model.layers.59': 2, 'model.layers.60': 2, 'model.layers.61': 2, 'model.layers.62': 3, 'model.layers.63': 3, 'model.layers.64': 3, 'model.layers.65': 3, 'model.layers.66': 3, 'model.layers.67': 3, 'model.layers.68': 3, 'model.layers.69': 3, 'model.layers.70': 3, 'model.layers.71': 3, 'model.layers.72': 3, 'model.layers.73': 3, 'model.layers.74': 3, 'model.layers.75': 3, 'model.layers.76': 3, 'model.layers.77': 3, 'model.layers.78': 3, 'model.layers.79': 3, 'model.norm': 3, 'model.rotary_emb': 3, 'lm_head': 3}\n",
      "cuda:0\n",
      "input_ids: cuda:0\n",
      "attention_mask: cuda:0\n",
      "pixel_values: cuda:0\n",
      "image_grid_thw: cuda:0\n",
      "######################################################\n",
      "[\"So I've got this puzzle here with emojis representing numbers, and I need to figure out what goes in the blank space. Let's see, there are four equations, and each one uses hearts, bows, and dogs as symbols. I need to assign numbers to these symbols based on the equations provided.\\n\\nFirst equation: four hearts added together equal 24. So, 4 hearts = 24. That seems straightforward. If I divide both sides by 4, then one heart equals 6. Okay, so heart = 6.\\n\\nSecond equation: one heart minus one bow equals 1. So, heart - bow = 1. I already know that heart is 6, so 6 - bow = 1. To find bow, I can subtract 6 from both sides, but wait, 6 - bow = 1 would mean bow is 5, because 6 - 5 = 1. Yeah, that makes sense.\\n\\nThird equation: one heart plus one bow plus one dog equals 19. So, heart + bow + dog = 19. I know heart is 6 and bow is 5, so 6 + 5 + dog = 19. That means 11 + dog = 19. Subtracting 11 from both sides, dog = 8. Okay, dog = 8.\\n\\nNow, the fourth equation is: one heart plus one bow times one dog equals what? So, heart + bow × dog = ?. Plugging in the values I have: 6 + 5 × 8 = ?\\n\\nWait a minute, I need to remember the order of operations here. Multiplication comes before addition in PEMDAS, so I should do the multiplication first and then add.\\n\\nSo, 5 × 8 is 40, and then 6 + 40 is 46. Therefore, the blank space should be 46.\\n\\nLet me double-check to make sure I didn't make any mistakes. Starting with heart = 6, bow = 5, and dog = 8.\\n\\nFirst equation: 4 hearts = 24. 4 × 6 = 24. Correct.\\n\\nSecond equation: 6 - 5 = 1. Correct.\\n\\nThird equation: 6 + 5 + 8 = 19. Correct.\\n\\nFourth equation: 6 + (5 × 8) = 46. That seems right.\\n\\nI think that's the answer\"]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"system\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"text\", \"text\": \"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.\"}\n",
    "        ],\n",
    "    },\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\n",
    "                \"type\": \"image\",\n",
    "                \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png\",\n",
    "            },\n",
    "            {\"type\": \"text\", \"text\": \"What value should be filled in the blank space?\"},\n",
    "        ],\n",
    "    }\n",
    "]\n",
    "\n",
    "# Preparation for inference\n",
    "text = processor.apply_chat_template(\n",
    "    messages, tokenize=False, add_generation_prompt=True\n",
    ")\n",
    "\n",
    "image_inputs, video_inputs = process_vision_info(messages)\n",
    "inputs = processor(\n",
    "    text=[text] ,\n",
    "    images=image_inputs ,\n",
    "    videos=video_inputs ,\n",
    "    padding=True,\n",
    "    return_tensors=\"pt\",\n",
    ")\n",
    "inputs = inputs.to(\"cuda\")\n",
    "torch.set_num_threads(16)\n",
    "print(\"######################################################\")\n",
    "print(model.hf_device_map)\n",
    "print(next(model.parameters()).device)\n",
    "for key, value in inputs.items():\n",
    "    if isinstance(value, torch.Tensor):\n",
    "        print(f\"{key}: {value.device}\")\n",
    "print(\"######################################################\")\n",
    "# Inference: Generation of the output\n",
    "\n",
    "generated_ids = model.generate(**inputs, max_new_tokens=512)\n",
    "\n",
    "generated_ids_trimmed = [\n",
    "    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
    "]\n",
    "output_text = processor.batch_decode(\n",
    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    ")\n",
    "print(output_text)\n"
   ]
  },
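  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional variant (added sketch): the run above decodes greedily by default.\n",
    "# Sampling parameters such as temperature/top_p can be passed to generate()\n",
    "# for more varied reasoning traces; the values below are illustrative, not tuned.\n",
    "sampled_ids = model.generate(\n",
    "    **inputs,\n",
    "    max_new_tokens=512,\n",
    "    do_sample=True,\n",
    "    temperature=0.7,\n",
    "    top_p=0.9,\n",
    ")\n",
    "sampled_ids_trimmed = [\n",
    "    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, sampled_ids)\n",
    "]\n",
    "print(processor.batch_decode(sampled_ids_trimmed, skip_special_tokens=True)[0])\n"
   ]
  }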
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "test_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}