{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eb580f54d0ec4077b5c39da99f35c4f0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/38 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
    "from qwen_vl_utils import process_vision_info\n",
    "import torch\n",
    "\n",
    "#torch_device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "local_model_path = \"models/QVQ-72B-Preview\"\n",
    "\n",
    "# default: Load the model on the available device(s)\n",
    "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
    "    #\"Qwen/QVQ-72B-Preview\", \n",
    "    local_model_path,\n",
    "    torch_dtype=\"auto\", \n",
    "    device_map=\"auto\"\n",
    ")\n",
    "#model.to(\"cuda\") \n",
    "# default processer\n",
    "processor = AutoProcessor.from_pretrained(local_model_path)\n",
    "\n",
    "# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.\n",
    "# min_pixels = 256*28*28\n",
    "# max_pixels = 1280*28*28\n",
    "# processor = AutoProcessor.from_pretrained(\"Qwen/QVQ-72B-Preview\", min_pixels=min_pixels, max_pixels=max_pixels)\n"
   ]
  },
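  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (added sketch, not part of the original run):\n",
    "# with device_map=\"auto\" the checkpoint is sharded across the visible GPUs,\n",
    "# so it can help to confirm how much memory each device actually holds.\n",
    "if torch.cuda.is_available():\n",
    "    for i in range(torch.cuda.device_count()):\n",
    "        allocated_gb = torch.cuda.memory_allocated(i) / 1024**3\n",
    "        print(f\"cuda:{i} ({torch.cuda.get_device_name(i)}): {allocated_gb:.1f} GiB allocated\")\n"
   ]
  },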
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "######################################################\n",
      "{'visual': 0, 'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.layers.32': 1, 'model.layers.33': 1, 'model.layers.34': 1, 'model.layers.35': 1, 'model.layers.36': 1, 'model.layers.37': 1, 'model.layers.38': 1, 'model.layers.39': 1, 'model.layers.40': 2, 'model.layers.41': 2, 'model.layers.42': 2, 'model.layers.43': 2, 'model.layers.44': 2, 'model.layers.45': 2, 'model.layers.46': 2, 'model.layers.47': 2, 'model.layers.48': 2, 'model.layers.49': 2, 'model.layers.50': 2, 'model.layers.51': 2, 'model.layers.52': 2, 'model.layers.53': 2, 'model.layers.54': 2, 'model.layers.55': 2, 'model.layers.56': 2, 'model.layers.57': 2, 'model.layers.58': 2, 'model.layers.59': 2, 'model.layers.60': 2, 'model.layers.61': 2, 'model.layers.62': 3, 'model.layers.63': 3, 'model.layers.64': 3, 'model.layers.65': 3, 'model.layers.66': 3, 'model.layers.67': 3, 'model.layers.68': 3, 'model.layers.69': 3, 'model.layers.70': 3, 'model.layers.71': 3, 'model.layers.72': 3, 'model.layers.73': 3, 'model.layers.74': 3, 'model.layers.75': 3, 'model.layers.76': 3, 'model.layers.77': 3, 'model.layers.78': 3, 'model.layers.79': 3, 'model.norm': 3, 'model.rotary_emb': 3, 'lm_head': 3}\n",
      "cuda:0\n",
      "input_ids: cuda:0\n",
      "attention_mask: cuda:0\n",
      "pixel_values: cuda:0\n",
      "image_grid_thw: cuda:0\n",
      "######################################################\n",
      "[\"So I've got this puzzle here with emojis representing numbers, and I need to figure out what goes in the blank space. Let's see, there are four equations, and each one uses hearts, bows, and dogs as symbols. I need to assign numbers to these symbols based on the equations provided.\\n\\nFirst equation: four hearts added together equal 24. So, 4 hearts = 24. That seems straightforward. If I divide both sides by 4, then one heart equals 6. Okay, so heart = 6.\\n\\nSecond equation: one heart minus one bow equals 1. So, heart - bow = 1. I already know that heart is 6, so 6 - bow = 1. To find bow, I can subtract 6 from both sides, but wait, 6 - bow = 1 would mean bow is 5, because 6 - 5 = 1. Yeah, that makes sense.\\n\\nThird equation: one heart plus one bow plus one dog equals 19. So, heart + bow + dog = 19. I know heart is 6 and bow is 5, so 6 + 5 + dog = 19. That means 11 + dog = 19. Subtracting 11 from both sides, dog = 8. Okay, dog = 8.\\n\\nNow, the fourth equation is: one heart plus one bow times one dog equals what? So, heart + bow × dog = ?. Plugging in the values I have: 6 + 5 × 8 = ?\\n\\nWait a minute, I need to remember the order of operations here. Multiplication comes before addition in PEMDAS, so I should do the multiplication first and then add.\\n\\nSo, 5 × 8 is 40, and then 6 + 40 is 46. Therefore, the blank space should be 46.\\n\\nLet me double-check to make sure I didn't make any mistakes. Starting with heart = 6, bow = 5, and dog = 8.\\n\\nFirst equation: 4 hearts = 24. 4 × 6 = 24. Correct.\\n\\nSecond equation: 6 - 5 = 1. Correct.\\n\\nThird equation: 6 + 5 + 8 = 19. Correct.\\n\\nFourth equation: 6 + (5 × 8) = 46. That seems right.\\n\\nI think that's the answer\"]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"system\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"text\", \"text\": \"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.\"}\n",
    "        ],\n",
    "    },\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\n",
    "                \"type\": \"image\",\n",
    "                \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png\",\n",
    "            },\n",
    "            {\"type\": \"text\", \"text\": \"What value should be filled in the blank space?\"},\n",
    "        ],\n",
    "    }\n",
    "]\n",
    "\n",
    "# Preparation for inference\n",
    "text = processor.apply_chat_template(\n",
    "    messages, tokenize=False, add_generation_prompt=True\n",
    ")\n",
    "\n",
    "image_inputs, video_inputs = process_vision_info(messages)\n",
    "inputs = processor(\n",
    "    text=[text] ,\n",
    "    images=image_inputs ,\n",
    "    videos=video_inputs ,\n",
    "    padding=True,\n",
    "    return_tensors=\"pt\",\n",
    ")\n",
    "inputs = inputs.to(\"cuda\")\n",
    "torch.set_num_threads(16)\n",
    "print(\"######################################################\")\n",
    "print(model.hf_device_map)\n",
    "print(next(model.parameters()).device)\n",
    "for key, value in inputs.items():\n",
    "    if isinstance(value, torch.Tensor):\n",
    "        print(f\"{key}: {value.device}\")\n",
    "print(\"######################################################\")\n",
    "# Inference: Generation of the output\n",
    "\n",
    "generated_ids = model.generate(**inputs, max_new_tokens=512)\n",
    "\n",
    "generated_ids_trimmed = [\n",
    "    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
    "]\n",
    "output_text = processor.batch_decode(\n",
    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
    ")\n",
    "print(output_text)\n"
   ]
  },
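  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional variant (added sketch): the run above decodes greedily by default.\n",
    "# Sampling parameters such as temperature/top_p can be passed to generate()\n",
    "# for more varied reasoning traces; the values below are illustrative, not tuned.\n",
    "sampled_ids = model.generate(\n",
    "    **inputs,\n",
    "    max_new_tokens=512,\n",
    "    do_sample=True,\n",
    "    temperature=0.7,\n",
    "    top_p=0.9,\n",
    ")\n",
    "sampled_ids_trimmed = [\n",
    "    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, sampled_ids)\n",
    "]\n",
    "print(processor.batch_decode(sampled_ids_trimmed, skip_special_tokens=True)[0])\n"
   ]
  }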
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "test_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}