{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/envs/llava/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "Loading checkpoint shards: 100%|██████████| 17/17 [01:22<00:00, 4.86s/it]\n" ] } ], "source": [ "\n", "\n", "import json\n", "import pprint \n", "\n", "\n", "def read_json(file_path): \n", " with open(file_path, 'r', encoding='utf-8') as file:\n", " data = json.load(file)\n", " return data\n", "\n", "def write_json(file_path, data):\n", " with open(file_path, 'w', encoding='utf-8') as file:\n", " json.dump(data, file, ensure_ascii=False, indent=4)\n", " \n", "# data = read_json(\"/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/DataSet/MiniCPM-V/all_blip_train_llava_coco_layout_caption_s1s3.json\") \n", "data = read_json(\"/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/merged_data.json\") \n", " \n", "\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_name = \"Model/QwQ-32B-Preview\"\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_name,\n", " torch_dtype=\"auto\",\n", " device_map=\"auto\"\n", ")\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "\n", "def chat_QwQ(prompt):\n", " \n", " messages = [\n", " {\"role\": \"system\", \"content\": \"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.\"},\n", " {\"role\": \"user\", \"content\": prompt}\n", " ]\n", " \n", " text = tokenizer.apply_chat_template(\n", " messages,\n", " tokenize=False,\n", " add_generation_prompt=True\n", " )\n", " \n", " model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n", "\n", " generated_ids = model.generate(\n", " **model_inputs,\n", " max_new_tokens=512\n", " )\n", " \n", " generated_ids = [\n", " output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n", " ]\n", "\n", " response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n", " # print(response)\n", " return response\n", "\n", "\n", "\n", "def chat_QwQ_batch(batch):\n", " \n", " all_text = []\n", " for prompt in batch:\n", " \n", " messages = [\n", " {\"role\": \"system\", \"content\": \"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. 
You should think step-by-step.\"},\n",
"            {\"role\": \"user\", \"content\": prompt}\n",
"        ]\n",
"\n",
"        text = tokenizer.apply_chat_template(\n",
"            messages,\n",
"            tokenize=False,\n",
"            add_generation_prompt=True\n",
"        )\n",
"\n",
"        all_text.append(text)\n",
"\n",
"    # Decoder-only models need left padding for batched generation; otherwise\n",
"    # the prompts end in pad tokens and the completions are corrupted.\n",
"    tokenizer.padding_side = 'left'\n",
"    model_inputs = tokenizer(all_text, return_tensors=\"pt\", padding=True).to(model.device)\n",
"\n",
"    generated_ids = model.generate(\n",
"        **model_inputs,\n",
"        max_new_tokens=512\n",
"    )\n",
"\n",
"    # Strip the prompt tokens so only the newly generated answers are decoded.\n",
"    generated_ids = [\n",
"        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
"    ]\n",
"\n",
"    responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)\n",
"    return responses\n",
"\n"
] },
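{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Minimal sanity check (sketch) before launching the batched loop below: run a\n",
"# single prompt through chat_QwQ and inspect the answer. It assumes the loaded\n",
"# `data` dict is keyed by string indices starting at '1' and that each item has\n",
"# a 'content' field, as inspected further down in this notebook.\n",
"sample_prompt = data['1']['content']\n",
"sample_answer = chat_QwQ(sample_prompt)\n",
"pprint.pprint(sample_prompt)\n",
"pprint.pprint(sample_answer)"
] },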
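{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Optional sketch: the CUDA OOM message recorded in the next cell's output\n",
"# suggests max_split_size_mb to reduce allocator fragmentation. The setting is\n",
"# only picked up before the first CUDA allocation, so it belongs at the top of\n",
"# a fresh session; the value 128 is an assumption, not a tuned number.\n",
"import os\n",
"os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:128\""
] },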
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
"100%|██████████| 2000/2000 [00:00<00:00, 1097410.78it/s]\n",
"A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.\n"
] }, { "ename": "OutOfMemoryError", "evalue": "CUDA out of memory. Tried to allocate 32.85 GiB. GPU 0 has a total capacty of 79.33 GiB of which 11.21 GiB is free. Process 2176602 has 68.11 GiB memory in use. Of the allocated memory 67.07 GiB is allocated by PyTorch, and 577.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF", "output_type": "error", "traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 32.85 GiB. GPU 0 has a total capacty of 79.33 GiB of which 11.21 GiB is free. Process 2176602 has 68.11 GiB memory in use. Of the allocated memory 67.07 GiB is allocated by PyTorch, and 577.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
] } ], "source": [
"# for i in data:\n",
"#     # sent1 = i['conversations'][0]['value']\n",
"#     # sent2 = i['conversations'][1]['value']\n",
"#     # sentence = sent1 + sent2\n",
"\n",
"#     sent1 = i['caption']\n",
"#     sent2 = i['action_target']\n",
"#     goal = i['ori_question'].split('Goal:')[1]\n",
"\n",
"#     prompt = \"The description of image is: \" + sent1 + \" The goal is :\" + goal + \" ###### Infer from the content of the image and the goal, why the following actions were performed in this step.###### \" + \" The action is :\" + sent2\n",
"#     answer = chat_QwQ(prompt)\n",
"\n",
"#     # english_text = answer\n",
"#     # inputs = tokenizer.encode(english_text, return_tensors=\"pt\", truncation=True)\n",
"#     # translated = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)\n",
"#     # chinese_translation = tokenizer.decode(translated[0], skip_special_tokens=True)\n",
"#     pprint.pprint(prompt)\n",
"#     pprint.pprint(answer)\n",
"#     break\n",
"\n",
"from tqdm import tqdm\n",
"\n",
"batch_size = 2\n",
"\n",
"# Process the prompts in small, fixed-size batches so generation with the 32B\n",
"# model fits in GPU memory, and store each decoded answer back on its item.\n",
"keys = [str(k) for k in range(1, len(data) + 1)]\n",
"for batch_start in tqdm(range(0, len(keys), batch_size)):\n",
"    batch_keys = keys[batch_start:batch_start + batch_size]\n",
"    batch_prompts = [data[k]['content'] for k in batch_keys]\n",
"\n",
"    responses = chat_QwQ_batch(batch_prompts)\n",
"\n",
"    for k, response in zip(batch_keys, responses):\n",
"        data[k]['answer_QwQ'] = response\n",
"        print(response)\n"
] },
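{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Persist the annotated items (sketch): `write_json` is defined in the first\n",
"# cell but never used. The output filename here is an assumption; point it at\n",
"# wherever the QwQ answers should be stored.\n",
"output_path = \"merged_data_with_QwQ_answers.json\"  # hypothetical path\n",
"write_json(output_path, data)"
] },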
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'你是一位数学专家,请解答以下问题并提供详细解答步骤:\\n\\n将点 $(0,3)$ 从直角坐标系转换到极坐标系。请以 $(r,\\\\theta)$ 的形式输入答案,其中 $r > 0$ 且 $0 \\\\le \\\\theta < 2 \\\\pi$。'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect one item to check the prompt format.\n", "data['1']['content']" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py",
"mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 4 }