Wendy-Fly committed
Commit ed213a5 · verified · 1 Parent(s): 2f0ef24

Upload test.ipynb with huggingface_hub
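For context, a minimal sketch of how a notebook is typically pushed with the huggingface_hub client, matching this commit message, is shown below; the repo_id and repo_type are hypothetical placeholders, not values taken from this commit.

# Hedged sketch: upload a local notebook to a Hub repo with huggingface_hub.
# repo_id and repo_type are assumptions for illustration only.
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="test.ipynb",      # local file to push
    path_in_repo="test.ipynb",         # destination path inside the repo
    repo_id="Wendy-Fly/<repo-name>",   # hypothetical repo id
    repo_type="model",                 # assumption; could be "dataset" or "space"
    commit_message="Upload test.ipynb with huggingface_hub",
)

Each such upload_file call produces a single commit like the one shown here: one file changed, all lines added.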

Files changed (1)
  1. test.ipynb +155 -0
test.ipynb ADDED
@@ -0,0 +1,155 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "eb580f54d0ec4077b5c39da99f35c4f0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/38 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
+ "from qwen_vl_utils import process_vision_info\n",
+ "import torch\n",
+ "\n",
+ "#torch_device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "\n",
+ "local_model_path = \"models/QVQ-72B-Preview\"\n",
+ "\n",
+ "# default: Load the model on the available device(s)\n",
+ "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
+ " #\"Qwen/QVQ-72B-Preview\", \n",
+ " local_model_path,\n",
+ " torch_dtype=\"auto\", \n",
+ " device_map=\"auto\"\n",
+ ")\n",
+ "#model.to(\"cuda\") \n",
+ "# default processor\n",
+ "processor = AutoProcessor.from_pretrained(local_model_path)\n",
+ "\n",
+ "# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.\n",
+ "# min_pixels = 256*28*28\n",
+ "# max_pixels = 1280*28*28\n",
+ "# processor = AutoProcessor.from_pretrained(\"Qwen/QVQ-72B-Preview\", min_pixels=min_pixels, max_pixels=max_pixels)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "######################################################\n",
+ "{'visual': 0, 'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.layers.32': 1, 'model.layers.33': 1, 'model.layers.34': 1, 'model.layers.35': 1, 'model.layers.36': 1, 'model.layers.37': 1, 'model.layers.38': 1, 'model.layers.39': 1, 'model.layers.40': 2, 'model.layers.41': 2, 'model.layers.42': 2, 'model.layers.43': 2, 'model.layers.44': 2, 'model.layers.45': 2, 'model.layers.46': 2, 'model.layers.47': 2, 'model.layers.48': 2, 'model.layers.49': 2, 'model.layers.50': 2, 'model.layers.51': 2, 'model.layers.52': 2, 'model.layers.53': 2, 'model.layers.54': 2, 'model.layers.55': 2, 'model.layers.56': 2, 'model.layers.57': 2, 'model.layers.58': 2, 'model.layers.59': 2, 'model.layers.60': 2, 'model.layers.61': 2, 'model.layers.62': 3, 'model.layers.63': 3, 'model.layers.64': 3, 'model.layers.65': 3, 'model.layers.66': 3, 'model.layers.67': 3, 'model.layers.68': 3, 'model.layers.69': 3, 'model.layers.70': 3, 'model.layers.71': 3, 'model.layers.72': 3, 'model.layers.73': 3, 'model.layers.74': 3, 'model.layers.75': 3, 'model.layers.76': 3, 'model.layers.77': 3, 'model.layers.78': 3, 'model.layers.79': 3, 'model.norm': 3, 'model.rotary_emb': 3, 'lm_head': 3}\n",
+ "cuda:0\n",
+ "input_ids: cuda:0\n",
+ "attention_mask: cuda:0\n",
+ "pixel_values: cuda:0\n",
+ "image_grid_thw: cuda:0\n",
+ "######################################################\n",
+ "[\"So I've got this puzzle here with emojis representing numbers, and I need to figure out what goes in the blank space. Let's see, there are four equations, and each one uses hearts, bows, and dogs as symbols. I need to assign numbers to these symbols based on the equations provided.\\n\\nFirst equation: four hearts added together equal 24. So, 4 hearts = 24. That seems straightforward. If I divide both sides by 4, then one heart equals 6. Okay, so heart = 6.\\n\\nSecond equation: one heart minus one bow equals 1. So, heart - bow = 1. I already know that heart is 6, so 6 - bow = 1. To find bow, I can subtract 6 from both sides, but wait, 6 - bow = 1 would mean bow is 5, because 6 - 5 = 1. Yeah, that makes sense.\\n\\nThird equation: one heart plus one bow plus one dog equals 19. So, heart + bow + dog = 19. I know heart is 6 and bow is 5, so 6 + 5 + dog = 19. That means 11 + dog = 19. Subtracting 11 from both sides, dog = 8. Okay, dog = 8.\\n\\nNow, the fourth equation is: one heart plus one bow times one dog equals what? So, heart + bow × dog = ?. Plugging in the values I have: 6 + 5 × 8 = ?\\n\\nWait a minute, I need to remember the order of operations here. Multiplication comes before addition in PEMDAS, so I should do the multiplication first and then add.\\n\\nSo, 5 × 8 is 40, and then 6 + 40 is 46. Therefore, the blank space should be 46.\\n\\nLet me double-check to make sure I didn't make any mistakes. Starting with heart = 6, bow = 5, and dog = 8.\\n\\nFirst equation: 4 hearts = 24. 4 × 6 = 24. Correct.\\n\\nSecond equation: 6 - 5 = 1. Correct.\\n\\nThird equation: 6 + 5 + 8 = 19. Correct.\\n\\nFourth equation: 6 + (5 × 8) = 46. That seems right.\\n\\nI think that's the answer\"]\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "messages = [\n",
+ " {\n",
+ " \"role\": \"system\",\n",
+ " \"content\": [\n",
+ " {\"type\": \"text\", \"text\": \"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.\"}\n",
+ " ],\n",
+ " },\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"image\",\n",
+ " \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png\",\n",
+ " },\n",
+ " {\"type\": \"text\", \"text\": \"What value should be filled in the blank space?\"},\n",
+ " ],\n",
+ " }\n",
+ "]\n",
+ "\n",
+ "# Preparation for inference\n",
+ "text = processor.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True\n",
+ ")\n",
+ "\n",
+ "image_inputs, video_inputs = process_vision_info(messages)\n",
+ "inputs = processor(\n",
+ " text=[text],\n",
+ " images=image_inputs,\n",
+ " videos=video_inputs,\n",
+ " padding=True,\n",
+ " return_tensors=\"pt\",\n",
+ ")\n",
+ "inputs = inputs.to(\"cuda\")\n",
+ "torch.set_num_threads(16)\n",
+ "print(\"######################################################\")\n",
+ "print(model.hf_device_map)\n",
+ "print(next(model.parameters()).device)\n",
+ "for key, value in inputs.items():\n",
+ " if isinstance(value, torch.Tensor):\n",
+ " print(f\"{key}: {value.device}\")\n",
+ "print(\"######################################################\")\n",
+ "# Inference: Generation of the output\n",
+ "\n",
+ "generated_ids = model.generate(**inputs, max_new_tokens=512)\n",
+ "\n",
+ "generated_ids_trimmed = [\n",
+ " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
+ "]\n",
+ "output_text = processor.batch_decode(\n",
+ " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
+ ")\n",
+ "print(output_text)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "test_env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
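Once committed, the notebook can be fetched back from the repository; a small sketch with hf_hub_download follows, again using a hypothetical repo_id.

# Hedged sketch: retrieve the committed notebook from the Hub.
# repo_id is a placeholder; pass repo_type if the repo is not a model repo.
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="Wendy-Fly/<repo-name>",  # hypothetical repo id
    filename="test.ipynb",            # the file added in this commit
)
print(local_path)  # cached local path of the downloaded notebook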