ShaoRun committed on
Commit b2a1d7a · verified · 1 Parent(s): 076e1ef

Upload 36 files

Files changed (37)
  1. .gitattributes +3 -0
  2. inference/chat_vision_point.ipynb +0 -0
  3. inference/demo_assets/e393be9a47a24a7cae6142e13f5686d1_8192.npy +3 -0
  4. inference/demo_assets/image1.png +3 -0
  5. inference/demo_assets/image1_384.png +3 -0
  6. inference/demo_assets/image2.png +3 -0
  7. inference/forward_speed.ipynb +182 -0
  8. mm_models/__init__.py +2 -0
  9. mm_models/configuration_mm.py +23 -0
  10. mm_models/llms/__pycache__/llama_modal_moe.cpython-310.pyc +0 -0
  11. mm_models/llms/__pycache__/qwen_model_moe.cpython-310.pyc +0 -0
  12. mm_models/llms/qwen_model_moe.py +338 -0
  13. mm_models/modal_module/__init__.py +22 -0
  14. mm_models/modal_module/__pycache__/__init__.cpython-310.pyc +0 -0
  15. mm_models/modal_module/point/__pycache__/reconv2.cpython-310.pyc +0 -0
  16. mm_models/modal_module/point/recon/__pycache__/transformer.cpython-310.pyc +0 -0
  17. mm_models/modal_module/point/recon/reconv2_utils/AverageMeter.py +42 -0
  18. mm_models/modal_module/point/recon/reconv2_utils/__pycache__/knn.cpython-310.pyc +0 -0
  19. mm_models/modal_module/point/recon/reconv2_utils/__pycache__/logger.cpython-310.pyc +0 -0
  20. mm_models/modal_module/point/recon/reconv2_utils/__pycache__/misc.cpython-310.pyc +0 -0
  21. mm_models/modal_module/point/recon/reconv2_utils/checkpoint.py +129 -0
  22. mm_models/modal_module/point/recon/reconv2_utils/config.py +69 -0
  23. mm_models/modal_module/point/recon/reconv2_utils/data.py +109 -0
  24. mm_models/modal_module/point/recon/reconv2_utils/dist_utils.py +49 -0
  25. mm_models/modal_module/point/recon/reconv2_utils/knn.py +37 -0
  26. mm_models/modal_module/point/recon/reconv2_utils/logger.py +127 -0
  27. mm_models/modal_module/point/recon/reconv2_utils/misc.py +294 -0
  28. mm_models/modal_module/point/recon/reconv2_utils/parser.py +117 -0
  29. mm_models/modal_module/point/recon/reconv2_utils/randaugment.py +216 -0
  30. mm_models/modal_module/point/recon/reconv2_utils/registry.py +289 -0
  31. mm_models/modal_module/point/recon/reconv2_utils/transforms.py +78 -0
  32. mm_models/modal_module/point/recon/transformer.py +647 -0
  33. mm_models/modal_module/point/reconv2.py +266 -0
  34. mm_models/modal_module/vision/__pycache__/siglip.cpython-310.pyc +0 -0
  35. mm_models/modal_module/vision/siglip.py +122 -0
  36. mm_models/modeling_mm.py +259 -0
  37. utils.py +181 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ inference/demo_assets/image1_384.png filter=lfs diff=lfs merge=lfs -text
+ inference/demo_assets/image1.png filter=lfs diff=lfs merge=lfs -text
+ inference/demo_assets/image2.png filter=lfs diff=lfs merge=lfs -text
inference/chat_vision_point.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
inference/demo_assets/e393be9a47a24a7cae6142e13f5686d1_8192.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e9bee56f432fce81bc1d356f54656064ac4926d02ff95b1b4f2e09d18ba79c7
+ size 196736
inference/demo_assets/image1.png ADDED

Git LFS Details

  • SHA256: 69f615c932632dfba059d4704da9c530df30167e8ed9dab42d0cd09280b79876
  • Pointer size: 132 Bytes
  • Size of remote file: 4.94 MB
inference/demo_assets/image1_384.png ADDED

Git LFS Details

  • SHA256: 9a65eb53d31a7b031ce9f5a30355ffde812b1b434e2752ee15dcfa540af0783e
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
inference/demo_assets/image2.png ADDED

Git LFS Details

  • SHA256: 240666bf35e80c5e17b3b5188cd99a04989002ca80da1e2edc1c613cb32481f1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.13 MB
inference/forward_speed.ipynb ADDED
@@ -0,0 +1,182 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "import os\n",
+ "import random\n",
+ "import torch\n",
+ "sys.path.append(\"../\")\n",
+ "from mm_models import AllSparkForCausalLM\n",
+ "from transformers import AutoImageProcessor, AutoTokenizer\n",
+ "from PIL import Image\n",
+ "import numpy as np\n",
+ "from fvcore.nn import FlopCountAnalysis\n",
+ "from plyfile import PlyData\n",
+ "import plotly.graph_objects as go\n",
+ "from mm_datasets.data_utils import point_preprocess, load_pts, process_pts\n",
+ "from utils import SYSTEM_PROMPT\n",
+ "\n",
+ "system_prompt = SYSTEM_PROMPT\n",
+ "\n",
+ "\n",
+ "def show_pointcloud(data, background=None):\n",
+ " points = data[:, :3]\n",
+ " colors = data[:, 3:6]\n",
+ "\n",
+ " if colors is not None:\n",
+ " # * if colors in range(0-1)\n",
+ " if np.max(colors) <= 1:\n",
+ " color_data = np.multiply(colors, 255).astype(int) # Convert float values (0-1) to integers (0-255)\n",
+ " # * if colors in range(0-255)\n",
+ " elif np.max(colors) <= 255:\n",
+ " color_data = colors.astype(int)\n",
+ " else:\n",
+ " color_data = np.zeros_like(points).astype(int) # Default to black color if RGB information is not available\n",
+ " colors = color_data.astype(np.float32) / 255 # model input is (0-1)\n",
+ "\n",
+ " color_strings = ['rgb({},{},{})'.format(r, g, b) for r, g, b in color_data]\n",
+ "\n",
+ " fig = go.Figure(\n",
+ " data=[\n",
+ " go.Scatter3d(\n",
+ " x=points[:, 0], y=points[:, 1], z=points[:, 2],\n",
+ " mode='markers',\n",
+ " marker=dict(\n",
+ " size=1.2,\n",
+ " color=color_strings, # Use the list of RGB strings for the marker colors\n",
+ " )\n",
+ " )\n",
+ " ],\n",
+ " layout=dict(\n",
+ " scene=dict(\n",
+ " xaxis=dict(visible=False),\n",
+ " yaxis=dict(visible=False),\n",
+ " zaxis=dict(visible=False)\n",
+ " ),\n",
+ " paper_bgcolor='rgb(50,50,50)' if background is None else background # Set the background color to dark gray 50, 50, 50\n",
+ " ),\n",
+ " )\n",
+ " fig.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00, 1.73s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_path = \"[path/to/model]\"\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
+ "model = AllSparkForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()\n",
+ "img_processor = AutoImageProcessor.from_pretrained(model_path)\n",
+ "modal_place_token = dict()\n",
+ "for modal_cfg in model.config.modal_configs:\n",
+ " modal_place_token[modal_cfg['modal_tag']] = modal_cfg['modal_placeholder_token']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Unsupported operator aten::_convolution_mode encountered 1 time(s)\n",
+ "Unsupported operator aten::embedding encountered 3 time(s)\n",
+ "Unsupported operator aten::add encountered 224 time(s)\n",
+ "Unsupported operator aten::mul encountered 342 time(s)\n",
+ "Unsupported operator aten::softmax encountered 26 time(s)\n",
+ "Unsupported operator aten::gelu encountered 28 time(s)\n",
+ "Unsupported operator aten::pad encountered 1 time(s)\n",
+ "Unsupported operator aten::mul_ encountered 1 time(s)\n",
+ "Unsupported operator aten::ones_like encountered 1 time(s)\n",
+ "Unsupported operator aten::sub encountered 1 time(s)\n",
+ "Unsupported operator aten::cos encountered 1 time(s)\n",
+ "Unsupported operator aten::sin encountered 1 time(s)\n",
+ "Unsupported operator aten::pow encountered 57 time(s)\n",
+ "Unsupported operator aten::mean encountered 57 time(s)\n",
+ "Unsupported operator aten::rsqrt encountered 57 time(s)\n",
+ "Unsupported operator aten::neg encountered 56 time(s)\n",
+ "Unsupported operator prim::PythonOp.FlashAttnFunc encountered 28 time(s)\n",
+ "Unsupported operator aten::silu encountered 28 time(s)\n",
+ "Unsupported operator aten::cross_entropy_loss encountered 1 time(s)\n",
+ "The following submodules of the model were never called during the trace of the graph. They may be unused, or they were accessed by direct calls to .forward() or via other python methods. In the latter case they will have zeros for statistics, though their statistics will still contribute to their parent calling module.\n",
+ "llm.model.layers.0.self_attn.rotary_emb, llm.model.layers.1.self_attn.rotary_emb, llm.model.layers.10.self_attn.rotary_emb, llm.model.layers.11.self_attn.rotary_emb, llm.model.layers.12.self_attn.rotary_emb, llm.model.layers.13.self_attn.rotary_emb, llm.model.layers.14.self_attn.rotary_emb, llm.model.layers.15.self_attn.rotary_emb, llm.model.layers.16.self_attn.rotary_emb, llm.model.layers.17.self_attn.rotary_emb, llm.model.layers.18.self_attn.rotary_emb, llm.model.layers.19.self_attn.rotary_emb, llm.model.layers.2.self_attn.rotary_emb, llm.model.layers.20.self_attn.rotary_emb, llm.model.layers.21.self_attn.rotary_emb, llm.model.layers.22.self_attn.rotary_emb, llm.model.layers.23.self_attn.rotary_emb, llm.model.layers.24.self_attn.rotary_emb, llm.model.layers.25.self_attn.rotary_emb, llm.model.layers.26.self_attn.rotary_emb, llm.model.layers.27.self_attn.rotary_emb, llm.model.layers.3.self_attn.rotary_emb, llm.model.layers.4.self_attn.rotary_emb, llm.model.layers.5.self_attn.rotary_emb, llm.model.layers.6.self_attn.rotary_emb, llm.model.layers.7.self_attn.rotary_emb, llm.model.layers.8.self_attn.rotary_emb, llm.model.layers.9.self_attn.rotary_emb, modal_encoders.vision.vision_model.encoder.layers.26, modal_encoders.vision.vision_model.encoder.layers.26.layer_norm1, modal_encoders.vision.vision_model.encoder.layers.26.layer_norm2, modal_encoders.vision.vision_model.encoder.layers.26.mlp, modal_encoders.vision.vision_model.encoder.layers.26.mlp.activation_fn, modal_encoders.vision.vision_model.encoder.layers.26.mlp.fc1, modal_encoders.vision.vision_model.encoder.layers.26.mlp.fc2, modal_encoders.vision.vision_model.encoder.layers.26.self_attn, modal_encoders.vision.vision_model.encoder.layers.26.self_attn.k_proj, modal_encoders.vision.vision_model.encoder.layers.26.self_attn.out_proj, modal_encoders.vision.vision_model.encoder.layers.26.self_attn.q_proj, modal_encoders.vision.vision_model.encoder.layers.26.self_attn.v_proj, modal_encoders.vision.vision_model.head, modal_encoders.vision.vision_model.head.attention, modal_encoders.vision.vision_model.head.attention.out_proj, modal_encoders.vision.vision_model.head.layernorm, modal_encoders.vision.vision_model.head.mlp, modal_encoders.vision.vision_model.head.mlp.activation_fn, modal_encoders.vision.vision_model.head.mlp.fc1, modal_encoders.vision.vision_model.head.mlp.fc2, modal_encoders.vision.vision_model.post_layernorm\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2781.7917 GFLOPs\n"
+ ]
+ }
+ ],
+ "source": [
+ "image_path = \"../demo_images/image1_384.png\"\n",
+ "img = Image.open(image_path).convert(\"RGB\")\n",
+ "\n",
+ "img = img_processor(images=img, return_tensors=\"pt\").pixel_values.to(\"cuda\").squeeze().to(model.dtype)\n",
+ "\n",
+ "question_2 = modal_place_token['vision'] + \"\\nDescribe this image.\"\n",
+ "\n",
+ "modal_inputs = [('vision', img)]\n",
+ "\n",
+ "messages = [\n",
+ " {\"role\": \"system\", \"content\": system_prompt},\n",
+ " {\"role\": \"user\", \"content\": \"The 3D object is a football, specifically a soccer ball, which is used in the sport of soccer or football. The ball is designed with a series of black and white panels that form a spherical shape, making it easy to kick and control during gameplay. The design also allows for the ball to spin and bounce when in motion, adding a strategic element to the sport.\\n\" + question_2},\n",
+ "]\n",
+ "inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors='pt').to(model.device)\n",
+ "\n",
+ "flops = FlopCountAnalysis(model, (inputs, [modal_inputs]))\n",
+ "print(f\"{flops.total()/1e9:.4f} GFLOPs\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
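For reference, FlopCountAnalysis reports the FLOPs of a single forward pass and warns about operators it cannot count (the aten::* messages in the cell output above). A minimal sketch on a toy module, assuming only that fvcore is installed (not part of this commit):

# Toy fvcore FLOP count; the module and input shapes here are illustrative only.
import torch
import torch.nn as nn
from fvcore.nn import FlopCountAnalysis

toy = nn.Linear(512, 512)
x = torch.randn(1, 512)
flops = FlopCountAnalysis(toy, (x,))
print(f"{flops.total() / 1e9:.4f} GFLOPs")  # same reporting convention as the notebook cell above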
mm_models/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .configuration_mm import AllSparkConfig
+ from .modeling_mm import AllSparkForCausalLM
mm_models/configuration_mm.py ADDED
@@ -0,0 +1,23 @@
+ from transformers import PretrainedConfig
+ from typing import Optional, List, Dict
+
+
+ class AllSparkConfig(PretrainedConfig):
+
+     model_type = "allspark"
+
+     def __init__(self,
+                  llm_name_or_path: str = None,
+                  modal_configs: Optional[List[Dict]] = None,
+                  initializer_range: float = 0.02,
+                  ignore_index: int = -100,
+                  tokenizer_padding_side: str = "right",
+                  add_moe: bool = True,
+                  **kwargs):
+         self.llm_name_or_path = llm_name_or_path
+         self.modal_configs = modal_configs
+         self.initializer_range = initializer_range
+         self.ignore_index = ignore_index
+         self.tokenizer_padding_side = tokenizer_padding_side
+         self.add_moe = add_moe
+         super().__init__(**kwargs)
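A minimal sketch of constructing this config; the paths and placeholder tokens below are illustrative, but the modal_tag / modal_placeholder_token keys match what the inference notebook reads from config.modal_configs:

# Illustrative AllSparkConfig; the LLM path and placeholder tokens are made-up values.
from mm_models import AllSparkConfig

config = AllSparkConfig(
    llm_name_or_path="[path/to/llm]",
    modal_configs=[
        {"modal_tag": "vision", "modal_placeholder_token": "<image>"},
        {"modal_tag": "point", "modal_placeholder_token": "<point>"},
    ],
    add_moe=True,
)
print(config.model_type)  # "allspark"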
mm_models/llms/__pycache__/llama_modal_moe.cpython-310.pyc ADDED
Binary file (9.32 kB).
 
mm_models/llms/__pycache__/qwen_model_moe.cpython-310.pyc ADDED
Binary file (8.53 kB).
 
mm_models/llms/qwen_model_moe.py ADDED
@@ -0,0 +1,338 @@
+ import torch
+ import torch.nn as nn
+ from transformers import Qwen2ForCausalLM, Qwen2Model
+ from transformers.cache_utils import Cache, DynamicCache
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer, Qwen2MLP
+ from typing import Optional, Tuple, List, Union
+ from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
+ from utils import rank0_print
+ import copy
+
+
+ class Qwen2DecoderLayerMoE(Qwen2DecoderLayer):
+
+     def __init__(self, config, layer_idx, modal_tags, add_moe):
+         config._attn_implementation = "flash_attention_2"
+         super().__init__(config, layer_idx)
+         self.modal_tags = modal_tags
+
+         if modal_tags is not None and add_moe:
+             self.modal_moes = nn.ModuleDict()
+             for tag in modal_tags:
+                 self.modal_moes[tag] = Qwen2MLP(config)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         modal_token_idx_matrix,
+         modal_idx_mapping,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Tuple[torch.Tensor]] = None,
+         output_attentions: Optional[bool] = False,
+         use_cache: Optional[bool] = False,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+         **kwargs,
+     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+
+         residual = hidden_states
+
+         hidden_states = self.input_layernorm(hidden_states)
+
+         # Self Attention
+         hidden_states, self_attn_weights, present_key_value = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_value=past_key_value,
+             output_attentions=output_attentions,
+             use_cache=use_cache,
+             cache_position=cache_position,
+             position_embeddings=position_embeddings,
+         )
+         hidden_states = residual + hidden_states
+
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         if modal_token_idx_matrix is not None and hidden_states.shape[1] > 1 and hasattr(self, 'modal_moes'):
+             batch_size, sequence_length, hidden_dim = hidden_states.shape
+             hidden_states = hidden_states.view(-1, hidden_dim)
+             final_hidden_states = torch.zeros(
+                 (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+             )
+             for modal_tag, modal_idx in modal_idx_mapping.items():
+                 mask = modal_token_idx_matrix == modal_idx
+                 if not torch.any(mask):
+                     continue
+                 mask = mask.view(-1)
+                 assert mask.shape[0] == hidden_states.shape[0]
+                 if modal_tag == 'text':
+                     final_hidden_states[mask] = self.mlp(hidden_states[mask])
+                 else:
+                     final_hidden_states[mask] = self.modal_moes[modal_tag](hidden_states[mask])
+             hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
+         else:
+             hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+
+         outputs = (hidden_states,)
+
+         if output_attentions:
+             outputs += (self_attn_weights,)
+
+         if use_cache:
+             outputs += (present_key_value,)
+
+         return outputs
+
+
+ class Qwen2ModelMoE(Qwen2Model):
+
+     def __init__(self, config, modal_tags=None, add_moe=True):
+         super().__init__(config)
+
+         self.modal_tags = modal_tags
+
+         self.layers = nn.ModuleList(
+             [Qwen2DecoderLayerMoE(config, layer_idx, modal_tags, add_moe=add_moe)
+              for layer_idx in range(config.num_hidden_layers)]
+         )
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         modal_tag_pos_list: Optional[List[List[Tuple[str, int, int]]]] = None,  # batch, modal_num, (tag, start, end)
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if self.gradient_checkpointing and self.training:
+             if use_cache:
+                 rank0_print(
+                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                 )
+                 use_cache = False
+
+         # kept for BC (non `Cache` `past_key_values` inputs)
+         return_legacy_cache = False
+         if use_cache and not isinstance(past_key_values, Cache):
+             return_legacy_cache = True
+             if past_key_values is None:
+                 past_key_values = DynamicCache()
+             else:
+                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                 rank0_print(
+                     "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                     "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                     "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                 )
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids)
+
+         if cache_position is None:
+             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+             cache_position = torch.arange(
+                 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+             )
+         if position_ids is None:
+             position_ids = cache_position.unsqueeze(0)
+
+         causal_mask = self._update_causal_mask(
+             attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+         )
+
+         hidden_states = inputs_embeds
+
+         # create position embeddings to be shared across the decoder layers
+         position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attns = () if output_attentions else None
+         next_decoder_cache = None
+
+         modal_token_idx_matrix = torch.zeros(hidden_states.shape[:2], dtype=torch.int8)
+         modal_idx_mapping = {"text": 0}
+         if self.modal_tags and modal_tag_pos_list:
+             for modal_idx, tag in enumerate(self.modal_tags):
+                 modal_idx_mapping[tag] = modal_idx + 1
+             for sample_id, single_sample_mtp in enumerate(modal_tag_pos_list):
+                 for tag, spos, epos in single_sample_mtp:
+                     modal_token_idx_matrix[sample_id, spos:epos+1] = modal_idx_mapping[tag]
+
+         for decoder_layer in self.layers:
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+
+             if self.gradient_checkpointing and self.training:
+                 layer_outputs = self._gradient_checkpointing_func(
+                     decoder_layer.__call__,
+                     hidden_states,
+                     modal_token_idx_matrix,
+                     modal_idx_mapping,
+                     causal_mask,
+                     position_ids,
+                     past_key_values,
+                     output_attentions,
+                     use_cache,
+                     cache_position,
+                     position_embeddings,
+                 )
+             else:
+                 layer_outputs = decoder_layer(
+                     hidden_states,
+                     modal_token_idx_matrix=modal_token_idx_matrix,
+                     modal_idx_mapping=modal_idx_mapping,
+                     attention_mask=causal_mask,
+                     position_ids=position_ids,
+                     past_key_value=past_key_values,
+                     output_attentions=output_attentions,
+                     use_cache=use_cache,
+                     cache_position=cache_position,
+                     position_embeddings=position_embeddings,
+                 )
+
+             hidden_states = layer_outputs[0]
+
+             if use_cache:
+                 next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+             if output_attentions:
+                 all_self_attns += (layer_outputs[1],)
+
+         hidden_states = self.norm(hidden_states)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         next_cache = next_decoder_cache if use_cache else None
+         if return_legacy_cache:
+             next_cache = next_cache.to_legacy_cache()
+
+         if not return_dict:
+             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=next_cache,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attns,
+         )
+
+
+ class Qwen2ForCausalLMMoE(Qwen2ForCausalLM):
+
+     def __init__(self, config, modal_tags=None, add_moe=True):
+         super().__init__(config)
+         self.model = Qwen2ModelMoE(config, modal_tags, add_moe=add_moe)
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         num_logits_to_keep: int = 0,
+         modal_tag_pos_list: Optional[List[List[Tuple[str, int, int]]]] = None,  # batch, modal_num, (tag, start, end)
+         **loss_kwargs,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             cache_position=cache_position,
+             modal_tag_pos_list=modal_tag_pos_list,
+         )
+
+         hidden_states = outputs[0]
+         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+         logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+
+         loss = None
+         if labels is not None:
+             loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def prepare_inputs_for_generation(self,
+                                       input_ids: torch.LongTensor,
+                                       past_key_values: Optional[Cache] = None,
+                                       attention_mask: Optional[torch.LongTensor] = None,
+                                       inputs_embeds: Optional[torch.FloatTensor] = None,
+                                       cache_position: Optional[torch.LongTensor] = None,
+                                       **kwargs,):
+         model_inputs = super().prepare_inputs_for_generation(input_ids,
+                                                              past_key_values,
+                                                              attention_mask,
+                                                              inputs_embeds,
+                                                              cache_position,
+                                                              **kwargs)
+         model_inputs.update(
+             {
+                 "modal_tag_pos_list": kwargs.get("modal_tag_pos_list", None),
+             }
+         )
+         return model_inputs
+
+     def init_modal_moe_params(self, target_tag, src_tag):
+         for i, decoder_layer in enumerate(self.model.layers):
+             if hasattr(decoder_layer, "modal_moes"):
+                 if src_tag == "text":
+                     rank0_print(f"Initializing layer{i} {target_tag} moe params for text")
+                     mlp_module = decoder_layer.mlp
+                     decoder_layer.modal_moes[target_tag].load_state_dict(copy.deepcopy(mlp_module.state_dict()))
+                 else:
+                     rank0_print(f"Initializing layer{i} {target_tag} moe params for {src_tag}")
+                     assert src_tag in decoder_layer.modal_moes, f"src_tag {src_tag} not found in decoder_layer.modal_moes"
+                     src_module = decoder_layer.modal_moes[src_tag]
+                     decoder_layer.modal_moes[target_tag].load_state_dict(copy.deepcopy(src_module.state_dict()))
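The per-token expert routing above boils down to an integer matrix: 0 selects the shared text MLP, and each modality tag maps to its own MLP in modal_moes. A standalone sketch of how modal_tag_pos_list is turned into that matrix, with toy sizes and made-up positions:

# Sketch of the routing matrix built in Qwen2ModelMoE.forward (toy sizes, made-up positions).
import torch

modal_tags = ["vision", "point"]
modal_idx_mapping = {"text": 0}
for modal_idx, tag in enumerate(modal_tags):
    modal_idx_mapping[tag] = modal_idx + 1

# one sample: vision tokens at positions 2..5, point tokens at 8..10 (inclusive)
modal_tag_pos_list = [[("vision", 2, 5), ("point", 8, 10)]]
batch_size, seq_len = 1, 12
modal_token_idx_matrix = torch.zeros(batch_size, seq_len, dtype=torch.int8)
for sample_id, single_sample_mtp in enumerate(modal_tag_pos_list):
    for tag, spos, epos in single_sample_mtp:
        modal_token_idx_matrix[sample_id, spos:epos + 1] = modal_idx_mapping[tag]

print(modal_token_idx_matrix)  # tensor([[0, 0, 1, 1, 1, 1, 0, 0, 2, 2, 2, 0]], dtype=torch.int8)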
mm_models/modal_module/__init__.py ADDED
@@ -0,0 +1,22 @@
+ from .vision.siglip import build_vision_encoder, build_vision_projector, VISION_SIGLIP_MODAL_CFG
+ from .point.reconv2 import build_point_encoder, build_point_projector, POINT_RECON2_MODAL_CFG
+ # NOTE: import custom modal encoder and projector here
+
+
+ MODAL_CFG_MAPPING = {
+     'vision': VISION_SIGLIP_MODAL_CFG,
+     'point': POINT_RECON2_MODAL_CFG
+     # NOTE: add other modalities here
+ }
+
+ MODAL_ENCODERS_MAPPING = {
+     'vision': build_vision_encoder,
+     'point': build_point_encoder
+     # NOTE: add other modalities here
+ }
+
+ MODAL_PROJECTORS_MAPPING = {
+     'vision': build_vision_projector,
+     'point': build_point_projector
+     # NOTE: add other modalities here
+ }
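The NOTE comments mark the extension points. A sketch of registering a hypothetical extra modality; the audio module and its builders do not exist in this commit and are assumptions:

# Hypothetical third modality; .audio.encoder and its symbols are placeholders, not real files.
from .audio.encoder import build_audio_encoder, build_audio_projector, AUDIO_MODAL_CFG

MODAL_CFG_MAPPING['audio'] = AUDIO_MODAL_CFG
MODAL_ENCODERS_MAPPING['audio'] = build_audio_encoder
MODAL_PROJECTORS_MAPPING['audio'] = build_audio_projector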
mm_models/modal_module/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (546 Bytes).
 
mm_models/modal_module/point/__pycache__/reconv2.cpython-310.pyc ADDED
Binary file (7.89 kB).
 
mm_models/modal_module/point/recon/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (17.8 kB).
 
mm_models/modal_module/point/recon/reconv2_utils/AverageMeter.py ADDED
@@ -0,0 +1,42 @@
+
+ class AverageMeter(object):
+     def __init__(self, items=None):
+         self.items = items
+         self.n_items = 1 if items is None else len(items)
+         self.reset()
+
+     def reset(self):
+         self._val = [0] * self.n_items
+         self._sum = [0] * self.n_items
+         self._count = [0] * self.n_items
+
+     def update(self, values):
+         if type(values).__name__ == 'list':
+             for idx, v in enumerate(values):
+                 self._val[idx] = v
+                 self._sum[idx] += v
+                 self._count[idx] += 1
+         else:
+             self._val[0] = values
+             self._sum[0] += values
+             self._count[0] += 1
+
+     def val(self, idx=None):
+         if idx is None:
+             return self._val[0] if self.items is None else [self._val[i] for i in range(self.n_items)]
+         else:
+             return self._val[idx]
+
+     def count(self, idx=None):
+         if idx is None:
+             return self._count[0] if self.items is None else [self._count[i] for i in range(self.n_items)]
+         else:
+             return self._count[idx]
+
+     def avg(self, idx=None):
+         if idx is None:
+             return self._sum[0] / self._count[0] if self.items is None else [
+                 self._sum[i] / self._count[i] for i in range(self.n_items)
+             ]
+         else:
+             return self._sum[idx] / self._count[idx]
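A minimal usage sketch for AverageMeter, tracking two quantities with made-up values:

# AverageMeter usage sketch; the numbers are illustrative.
meter = AverageMeter(items=["loss", "acc"])
meter.update([1.0, 0.5])
meter.update([0.5, 0.75])
print(meter.val())     # [0.5, 0.75]   -> latest values
print(meter.avg())     # [0.75, 0.625] -> running means
print(meter.count(0))  # 2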
mm_models/modal_module/point/recon/reconv2_utils/__pycache__/knn.cpython-310.pyc ADDED
Binary file (1.44 kB).
 
mm_models/modal_module/point/recon/reconv2_utils/__pycache__/logger.cpython-310.pyc ADDED
Binary file (3.98 kB).
 
mm_models/modal_module/point/recon/reconv2_utils/__pycache__/misc.cpython-310.pyc ADDED
Binary file (9.2 kB).
 
mm_models/modal_module/point/recon/reconv2_utils/checkpoint.py ADDED
@@ -0,0 +1,129 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+ from collections import defaultdict
+ import torch.nn as nn
+
+ from typing import Any
+ from typing import Optional, List, Dict, NamedTuple, Tuple, Iterable
+
+ from termcolor import colored
+
+ def get_missing_parameters_message(keys: List[str]) -> str:
+     """
+     Get a logging-friendly message to report parameter names (keys) that are in
+     the model but not found in a checkpoint.
+     Args:
+         keys (list[str]): List of keys that were not found in the checkpoint.
+     Returns:
+         str: message.
+     """
+     groups = _group_checkpoint_keys(keys)
+     msg = "Some model parameters or buffers are not found in the checkpoint:\n"
+     msg += "\n".join(
+         "  " + colored(k + _group_to_str(v), "blue") for k, v in groups.items()
+     )
+     return msg
+
+
+ def get_unexpected_parameters_message(keys: List[str]) -> str:
+     """
+     Get a logging-friendly message to report parameter names (keys) that are in
+     the checkpoint but not found in the model.
+     Args:
+         keys (list[str]): List of keys that were not found in the model.
+     Returns:
+         str: message.
+     """
+     groups = _group_checkpoint_keys(keys)
+     msg = "The checkpoint state_dict contains keys that are not used by the model:\n"
+     msg += "\n".join(
+         "  " + colored(k + _group_to_str(v), "magenta") for k, v in groups.items()
+     )
+     return msg
+
+
+ def _strip_prefix_if_present(state_dict: Dict[str, Any], prefix: str) -> None:
+     """
+     Strip the prefix in metadata, if any.
+     Args:
+         state_dict (OrderedDict): a state-dict to be loaded to the model.
+         prefix (str): prefix.
+     """
+     keys = sorted(state_dict.keys())
+     if not all(len(key) == 0 or key.startswith(prefix) for key in keys):
+         return
+
+     for key in keys:
+         newkey = key[len(prefix):]
+         state_dict[newkey] = state_dict.pop(key)
+
+     # also strip the prefix in metadata, if any..
+     try:
+         metadata = state_dict._metadata  # pyre-ignore
+     except AttributeError:
+         pass
+     else:
+         for key in list(metadata.keys()):
+             # for the metadata dict, the key can be:
+             # '': for the DDP module, which we want to remove.
+             # 'module': for the actual model.
+             # 'module.xx.xx': for the rest.
+
+             if len(key) == 0:
+                 continue
+             newkey = key[len(prefix):]
+             metadata[newkey] = metadata.pop(key)
+
+
+ def _group_checkpoint_keys(keys: List[str]) -> Dict[str, List[str]]:
+     """
+     Group keys based on common prefixes. A prefix is the string up to the final
+     "." in each key.
+     Args:
+         keys (list[str]): list of parameter names, i.e. keys in the model
+             checkpoint dict.
+     Returns:
+         dict[list]: keys with common prefixes are grouped into lists.
+     """
+     groups = defaultdict(list)
+     for key in keys:
+         pos = key.rfind(".")
+         if pos >= 0:
+             head, tail = key[:pos], [key[pos + 1:]]
+         else:
+             head, tail = key, []
+         groups[head].extend(tail)
+     return groups
+
+
+ def _group_to_str(group: List[str]) -> str:
+     """
+     Format a group of parameter name suffixes into a loggable string.
+     Args:
+         group (list[str]): list of parameter name suffixes.
+     Returns:
+         str: formated string.
+     """
+     if len(group) == 0:
+         return ""
+
+     if len(group) == 1:
+         return "." + group[0]
+
+     return ".{" + ", ".join(group) + "}"
+
+
+ def _named_modules_with_dup(
+     model: nn.Module, prefix: str = ""
+ ) -> Iterable[Tuple[str, nn.Module]]:
+     """
+     The same as `model.named_modules()`, except that it includes
+     duplicated modules that have more than one name.
+     """
+     yield prefix, model
+     for name, module in model._modules.items():  # pyre-ignore
+         if module is None:
+             continue
+         submodule_prefix = prefix + ("." if prefix else "") + name
+         yield from _named_modules_with_dup(module, submodule_prefix)
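A small sketch of the key-grouping helpers, with made-up parameter names:

# Sketch: grouping missing checkpoint keys into a readable (ANSI-colored) report.
missing = ["encoder.layer1.weight", "encoder.layer1.bias", "head.weight"]
print(get_missing_parameters_message(missing))
# Some model parameters or buffers are not found in the checkpoint:
#   encoder.layer1.{weight, bias}
#   head.weight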
mm_models/modal_module/point/recon/reconv2_utils/config.py ADDED
@@ -0,0 +1,69 @@
+ import yaml
+ from easydict import EasyDict
+ import os
+ from .logger import print_log
+
+
+ def log_args_to_file(args, pre='args', logger=None):
+     for key, val in args.__dict__.items():
+         print_log(f'{pre}.{key} : {val}', logger=logger)
+
+
+ def log_config_to_file(cfg, pre='cfg', logger=None):
+     for key, val in cfg.items():
+         if isinstance(cfg[key], EasyDict):
+             print_log(f'{pre}.{key} = edict()', logger=logger)
+             log_config_to_file(cfg[key], pre=pre + '.' + key, logger=logger)
+             continue
+         print_log(f'{pre}.{key} : {val}', logger=logger)
+
+
+ def merge_new_config(config, new_config):
+     for key, val in new_config.items():
+         if not isinstance(val, dict):
+             if key == '_base_':
+                 with open(new_config['_base_'], 'r') as f:
+                     try:
+                         val = yaml.load(f, Loader=yaml.FullLoader)
+                     except:
+                         val = yaml.load(f)
+                 config[key] = EasyDict()
+                 merge_new_config(config[key], val)
+             else:
+                 config[key] = val
+             continue
+         if key not in config:
+             config[key] = EasyDict()
+         merge_new_config(config[key], val)
+     return config
+
+
+ def cfg_from_yaml_file(cfg_file):
+     config = EasyDict()
+     with open(cfg_file, 'r') as f:
+         try:
+             new_config = yaml.load(f, Loader=yaml.FullLoader)
+         except:
+             new_config = yaml.load(f)
+     merge_new_config(config=config, new_config=new_config)
+     return config
+
+
+ def get_config(args, logger=None):
+     if args.resume:
+         cfg_path = os.path.join(args.experiment_path, 'config.yaml')
+         if not os.path.exists(cfg_path):
+             print_log("Failed to resume", logger=logger)
+             raise FileNotFoundError()
+         print_log(f'Resume yaml from {cfg_path}', logger=logger)
+         args.config = cfg_path
+     config = cfg_from_yaml_file(args.config)
+     if not args.resume and args.local_rank == 0:
+         save_experiment_config(args, config, logger)
+     return config
+
+
+ def save_experiment_config(args, config, logger=None):
+     config_path = os.path.join(args.experiment_path, 'config.yaml')
+     os.system('cp %s %s' % (args.config, config_path))
+     print_log(f'Copy the Config file from {args.config} to {config_path}', logger=logger)
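A sketch of how the _base_ mechanism behaves, assuming two hypothetical YAML files:

# Hypothetical YAML pair showing the merge done by cfg_from_yaml_file / merge_new_config.
#
# base.yaml:      model: {NAME: PointTransformer, depth: 12}
# finetune.yaml:  _base_: base.yaml
#                 model: {depth: 24}
#
config = cfg_from_yaml_file("finetune.yaml")
print(config.model.depth)        # 24 (non-_base_ keys are merged normally)
print(config._base_.model.NAME)  # PointTransformer (the base file is loaded under the _base_ key)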
mm_models/modal_module/point/recon/reconv2_utils/data.py ADDED
@@ -0,0 +1,109 @@
+ import numpy as np
+
+
+ def random_rotate_z(pc):
+     # random roate around z axis
+     theta = np.random.uniform(0, 2 * np.pi)
+     R = np.array([[np.cos(theta), -np.sin(theta), 0],
+                   [np.sin(theta), np.cos(theta), 0],
+                   [0, 0, 1]])
+     return np.matmul(pc, R)
+
+
+ def normalize_pc(pc):
+     """ pc: NxC, return NxC """
+     centroid = np.mean(pc, axis=0)
+     pc = pc - centroid
+     m = np.max(np.sqrt(np.sum(pc ** 2, axis=1)))
+     if m < 1e-6:
+         pc = np.zeros_like(pc)
+     else:
+         pc = pc / m
+     return pc
+
+
+ def random_point_dropout(batch_pc, max_dropout_ratio=0.875):
+     """ batch_pc: BxNx3 """
+     for b in range(batch_pc.shape[0]):
+         dropout_ratio = np.random.random() * max_dropout_ratio # 0~0.875
+         drop_idx = np.where(np.random.random((batch_pc.shape[1])) <= dropout_ratio)[0]
+         if len(drop_idx) > 0:
+             batch_pc[b, drop_idx, :] = batch_pc[b, 0, :] # set to the first point
+     return batch_pc
+
+
+ def random_scale_point_cloud(data, scale_low=0.8, scale_high=1.25):
+
+     scales = np.random.uniform(scale_low, scale_high)
+     data *= scales
+     return data
+
+
+ def shift_point_cloud(batch_data, shift_range=0.1):
+     """ Randomly shift point cloud. Shift is per point cloud.
+         Input:
+           BxNx3 array, original batch of point clouds
+         Return:
+           BxNx3 array, shifted batch of point clouds
+     """
+     B, N, C = batch_data.shape
+     shifts = np.random.uniform(-shift_range, shift_range, (B, 3))
+     for batch_index in range(B):
+         batch_data[batch_index, :, :] += shifts[batch_index, :]
+     return batch_data
+
+
+ def rotate_perturbation_point_cloud(batch_data, angle_sigma=0.06, angle_clip=0.18):
+     """ Randomly perturb the point clouds by small rotations
+         Input:
+           BxNx3 array, original batch of point clouds
+         Return:
+           BxNx3 array, rotated batch of point clouds
+     """
+     rotated_data = np.zeros(batch_data.shape, dtype=np.float32)
+     for k in range(batch_data.shape[0]):
+         angles = np.clip(angle_sigma * np.random.randn(3), -angle_clip, angle_clip)
+         Rx = np.array([[1, 0, 0],
+                        [0, np.cos(angles[0]), -np.sin(angles[0])],
+                        [0, np.sin(angles[0]), np.cos(angles[0])]])
+         Ry = np.array([[np.cos(angles[1]), 0, np.sin(angles[1])],
+                        [0, 1, 0],
+                        [-np.sin(angles[1]), 0, np.cos(angles[1])]])
+         Rz = np.array([[np.cos(angles[2]), -np.sin(angles[2]), 0],
+                        [np.sin(angles[2]), np.cos(angles[2]), 0],
+                        [0, 0, 1]])
+         R = np.dot(Rz, np.dot(Ry, Rx))
+         shape_pc = batch_data[k, ...]
+         rotated_data[k, ...] = np.dot(shape_pc.reshape((-1, 3)), R)
+     return rotated_data
+
+
+ def rotate_point_cloud(batch_data):
+     """ Randomly rotate the point clouds to augument the dataset
+         rotation is per shape based along up direction
+         Input:
+           BxNx3 array, original batch of point clouds
+         Return:
+           BxNx3 array, rotated batch of point clouds
+     """
+     rotated_data = np.zeros(batch_data.shape, dtype=np.float32)
+     for k in range(batch_data.shape[0]):
+         rotation_angle = np.random.uniform() * 2 * np.pi
+         cosval = np.cos(rotation_angle)
+         sinval = np.sin(rotation_angle)
+         rotation_matrix = np.array([[cosval, 0, sinval],
+                                     [0, 1, 0],
+                                     [-sinval, 0, cosval]])
+         shape_pc = batch_data[k, ...]
+         rotated_data[k, ...] = np.dot(shape_pc.reshape((-1, 3)), rotation_matrix)
+     return rotated_data
+
+
+ def augment_pc(data):
+     # data = random_point_dropout(data[None, ...])
+     data = random_scale_point_cloud(data[None, ...])
+     data = shift_point_cloud(data)
+     data = rotate_perturbation_point_cloud(data)
+     data = rotate_point_cloud(data)
+     data = data.squeeze()
+     return data
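A sketch chaining the helpers above on a random cloud (sizes are arbitrary):

# Normalize then augment a random N x 3 point cloud.
import numpy as np

pc = np.random.rand(8192, 3).astype(np.float32)
pc = normalize_pc(pc)     # center at the origin, scale into the unit sphere
pc = augment_pc(pc)       # random scale, shift, small perturbations, full rotation
pc = random_rotate_z(pc)  # optional extra rotation about the z axis
print(pc.shape)           # (8192, 3)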
mm_models/modal_module/point/recon/reconv2_utils/dist_utils.py ADDED
@@ -0,0 +1,49 @@
+ import torch
+ from torch import distributed as dist
+
+
+ def init_dist(local_rank, backend='nccl', **kwargs):
+     torch.cuda.set_device(local_rank)
+     dist.init_process_group(backend=backend, **kwargs)
+     print(f'init distributed in rank {local_rank}')
+
+
+ def reduce_tensor(tensor, args):
+     '''
+         for acc kind, get the mean in each gpu
+     '''
+     rt = tensor.clone()
+     torch.distributed.all_reduce(rt, op=torch.distributed.ReduceOp.SUM)
+     rt /= args.world_size
+     return rt
+
+
+ def gather_tensor(tensor, args):
+     output_tensors = [tensor.clone() for _ in range(args.world_size)]
+     torch.distributed.all_gather(output_tensors, tensor)
+     concat = torch.cat(output_tensors, dim=0)
+     return concat
+
+
+ def set_batch_size(args, config):
+     if args.distributed:
+         assert config.total_bs % args.world_size == 0
+         if config.dataset.get('train'):
+             config.dataset.train.others.bs = config.total_bs // args.world_size
+         if config.dataset.get('extra_train'):
+             config.dataset.extra_train.others.bs = config.total_bs // args.world_size
+         if config.dataset.get('val'):
+             config.dataset.val.others.bs = config.total_bs // args.world_size
+         if config.dataset.get('test'):
+             config.dataset.test.others.bs = config.total_bs // args.world_size
+     else:
+         if config.dataset.get('train'):
+             config.dataset.train.others.bs = config.total_bs
+         if config.dataset.get('extra_train'):
+             config.dataset.extra_train.others.bs = config.total_bs
+         if config.dataset.get('extra_val'):
+             config.dataset.extra_val.others.bs = config.total_bs
+         if config.dataset.get('val'):
+             config.dataset.val.others.bs = config.total_bs
+         if config.dataset.get('test'):
+             config.dataset.test.others.bs = config.total_bs
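A sketch of how set_batch_size splits total_bs across ranks, using toy EasyDict objects in place of the real args/config:

# Toy args/config; only the fields read by set_batch_size are filled in.
from easydict import EasyDict

args = EasyDict(distributed=True, world_size=4)
config = EasyDict(total_bs=32, dataset=EasyDict(train=EasyDict(others=EasyDict(bs=None))))
set_batch_size(args, config)
print(config.dataset.train.others.bs)  # 8  (32 split over 4 ranks)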
mm_models/modal_module/point/recon/reconv2_utils/knn.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+
+
+ def square_distance(src, dst):
+     """
+     Calculate Euclid distance between each two points.
+     src^T * dst = xn * xm + yn * ym + zn * zm;
+     sum(src^2, dim=-1) = xn*xn + yn*yn + zn*zn;
+     sum(dst^2, dim=-1) = xm*xm + ym*ym + zm*zm;
+     dist = (xn - xm)^2 + (yn - ym)^2 + (zn - zm)^2
+          = sum(src**2,dim=-1)+sum(dst**2,dim=-1)-2*src^T*dst
+     Input:
+         src: source points, [B, N, C]
+         dst: target points, [B, M, C]
+     Output:
+         dist: per-point square distance, [B, N, M]
+     """
+     B, N, _ = src.shape
+     _, M, _ = dst.shape
+     dist = -2 * torch.matmul(src, dst.permute(0, 2, 1))
+     dist += torch.sum(src ** 2, -1).view(B, N, 1)
+     dist += torch.sum(dst ** 2, -1).view(B, 1, M)
+     return dist
+
+
+ def knn_point(nsample, xyz, new_xyz):
+     """
+     Input:
+         nsample: max sample number in local region
+         xyz: all points, [B, N, C]
+         new_xyz: query points, [B, S, C]
+     Return:
+         group_idx: grouped points index, [B, S, nsample]
+     """
+     sqrdists = square_distance(new_xyz, xyz)
+     _, group_idx = torch.topk(sqrdists, nsample, dim=-1, largest=False, sorted=False)
+     return group_idx
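A minimal usage sketch for knn_point on random tensors:

# k-nearest-neighbour indices for random query centers (shapes are arbitrary).
import torch

xyz = torch.rand(2, 1024, 3)     # B x N x C source points
new_xyz = torch.rand(2, 128, 3)  # B x S x C query points
idx = knn_point(16, xyz, new_xyz)
print(idx.shape)                 # torch.Size([2, 128, 16])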
mm_models/modal_module/point/recon/reconv2_utils/logger.py ADDED
@@ -0,0 +1,127 @@
+ import logging
+ import torch.distributed as dist
+
+ logger_initialized = {}
+
+ def get_root_logger(log_file=None, log_level=logging.INFO, name='main'):
+     """Get root logger and add a keyword filter to it.
+     The logger will be initialized if it has not been initialized. By default a
+     StreamHandler will be added. If `log_file` is specified, a FileHandler will
+     also be added. The name of the root logger is the top-level package name,
+     e.g., "mmdet3d".
+     Args:
+         log_file (str, optional): File path of log. Defaults to None.
+         log_level (int, optional): The level of logger.
+             Defaults to logging.INFO.
+         name (str, optional): The name of the root logger, also used as a
+             filter keyword. Defaults to 'mmdet3d'.
+     Returns:
+         :obj:`logging.Logger`: The obtained logger
+     """
+     logger = get_logger(name=name, log_file=log_file, log_level=log_level)
+     # add a logging filter
+     logging_filter = logging.Filter(name)
+     logging_filter.filter = lambda record: record.find(name) != -1
+
+     return logger
+
+
+ def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
+     """Initialize and get a logger by name.
+     If the logger has not been initialized, this method will initialize the
+     logger by adding one or two handlers, otherwise the initialized logger will
+     be directly returned. During initialization, a StreamHandler will always be
+     added. If `log_file` is specified and the process rank is 0, a FileHandler
+     will also be added.
+     Args:
+         name (str): Logger name.
+         log_file (str | None): The log filename. If specified, a FileHandler
+             will be added to the logger.
+         log_level (int): The logger level. Note that only the process of
+             rank 0 is affected, and other processes will set the level to
+             "Error" thus be silent most of the time.
+         file_mode (str): The file mode used in opening log file.
+             Defaults to 'w'.
+     Returns:
+         logging.Logger: The expected logger.
+     """
+     logger = logging.getLogger(name)
+     if name in logger_initialized:
+         return logger
+     # handle hierarchical names
+     # e.g., logger "a" is initialized, then logger "a.b" will skip the
+     # initialization since it is a child of "a".
+     for logger_name in logger_initialized:
+         if name.startswith(logger_name):
+             return logger
+
+     # handle duplicate logs to the console
+     # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler <stderr> (NOTSET)
+     # to the root logger. As logger.propagate is True by default, this root
+     # level handler causes logging messages from rank>0 processes to
+     # unexpectedly show up on the console, creating much unwanted clutter.
+     # To fix this issue, we set the root logger's StreamHandler, if any, to log
+     # at the ERROR level.
+     for handler in logger.root.handlers:
+         if type(handler) is logging.StreamHandler:
+             handler.setLevel(logging.ERROR)
+
+     stream_handler = logging.StreamHandler()
+     handlers = [stream_handler]
+
+     if dist.is_available() and dist.is_initialized():
+         rank = dist.get_rank()
+     else:
+         rank = 0
+
+     # only rank 0 will add a FileHandler
+     if rank == 0 and log_file is not None:
+         # Here, the default behaviour of the official logger is 'a'. Thus, we
+         # provide an interface to change the file mode to the default
+         # behaviour.
+         file_handler = logging.FileHandler(log_file, file_mode)
+         handlers.append(file_handler)
+
+     formatter = logging.Formatter(
+         '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+     for handler in handlers:
+         handler.setFormatter(formatter)
+         handler.setLevel(log_level)
+         logger.addHandler(handler)
+
+     if rank == 0:
+         logger.setLevel(log_level)
+     else:
+         logger.setLevel(logging.ERROR)
+
+     logger_initialized[name] = True
+
+
+     return logger
+
+
+ def print_log(msg, logger=None, level=logging.INFO):
+     """Print a log message.
+     Args:
+         msg (str): The message to be logged.
+         logger (logging.Logger | str | None): The logger to be used.
+             Some special loggers are:
+             - "silent": no message will be printed.
+             - other str: the logger obtained with `get_root_logger(logger)`.
+             - None: The `print()` method will be used to print log messages.
+         level (int): Logging level. Only available when `logger` is a Logger
+             object or "root".
+     """
+     if logger is None:
+         print(msg)
+     elif isinstance(logger, logging.Logger):
+         logger.log(level, msg)
+     elif logger == 'silent':
+         pass
+     elif isinstance(logger, str):
+         _logger = get_logger(logger)
+         _logger.log(level, msg)
+     else:
+         raise TypeError(
+             'logger should be either a logging.Logger object, str, '
+             f'"silent" or None, but got {type(logger)}')
mm_models/modal_module/point/recon/reconv2_utils/misc.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from mpl_toolkits.mplot3d import Axes3D
4
+ import random
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import os
9
+ from collections import abc
10
+ # from pointnet2_ops import pointnet2_utils
11
+
12
+
13
+ # def fps(data, number):
14
+ # '''
15
+ # data B N 3
16
+ # number int
17
+ # '''
18
+ # fps_idx = pointnet2_utils.furthest_point_sample(data, number)
19
+ # fps_data = pointnet2_utils.gather_operation(data.transpose(1, 2).contiguous(), fps_idx).transpose(1, 2).contiguous()
20
+ # return fps_data
21
+
22
+ def index_points(points, idx):
23
+ """
24
+ Input:
25
+ points: input points data, [B, N, C]
26
+ idx: sample index data, [B, S]
27
+ Return:
28
+ new_points:, indexed points data, [B, S, C]
29
+ """
30
+ device = points.device
31
+ B = points.shape[0]
32
+ view_shape = list(idx.shape)
33
+ view_shape[1:] = [1] * (len(view_shape) - 1)
34
+ repeat_shape = list(idx.shape)
35
+ repeat_shape[0] = 1
36
+ batch_indices = torch.arange(B, dtype=torch.long).to(device).view(view_shape).repeat(repeat_shape)
37
+ new_points = points[batch_indices, idx, :]
38
+ return new_points
39
+
40
+ def fps(point_data, npoint):
41
+ """
42
+ Input:
43
+ xyz: pointcloud data, [B, N, 3]
44
+ npoint: number of samples
45
+ Return:
46
+ centroids: sampled pointcloud index, [B, npoint]
47
+ """
48
+ xyz = point_data[:, :, :3]
49
+ device = xyz.device
50
+ B, N, C = xyz.shape
51
+ centroids = torch.zeros(B, npoint, dtype=torch.long).to(device)
52
+ distance = torch.ones(B, N).to(device) * 1e10
53
+ farthest = torch.randint(0, N, (B,), dtype=torch.long).to(device)
54
+ batch_indices = torch.arange(B, dtype=torch.long).to(device)
55
+ for i in range(npoint):
56
+ centroids[:, i] = farthest
57
+ centroid = xyz[batch_indices, farthest, :].view(B, 1, 3)
58
+ dist = torch.sum((xyz - centroid) ** 2, -1)
59
+ distance = torch.min(distance, dist)
60
+ farthest = torch.max(distance, -1)[1]
61
+ return index_points(point_data, centroids)
62
+
63
+
64
+ def worker_init_fn(worker_id):
65
+ np.random.seed(np.random.get_state()[1][0] + worker_id)
66
+
67
+
68
+ def build_lambda_sche(opti, config):
69
+ if config.get('decay_step') is not None:
70
+ lr_lbmd = lambda e: max(config.lr_decay ** (e / config.decay_step), config.lowest_decay)
71
+ scheduler = torch.optim.lr_scheduler.LambdaLR(opti, lr_lbmd)
72
+ else:
73
+ raise NotImplementedError()
74
+ return scheduler
75
+
76
+
77
+ def build_lambda_bnsche(model, config):
78
+ if config.get('decay_step') is not None:
79
+ bnm_lmbd = lambda e: max(config.bn_momentum * config.bn_decay ** (e / config.decay_step), config.lowest_decay)
80
+ bnm_scheduler = BNMomentumScheduler(model, bnm_lmbd)
81
+ else:
82
+ raise NotImplementedError()
83
+ return bnm_scheduler
84
+
85
+
86
+ def set_random_seed(seed, deterministic=False):
87
+ """Set random seed.
88
+ Args:
89
+ seed (int): Seed to be used.
90
+ deterministic (bool): Whether to set the deterministic option for
91
+ CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
92
+ to True and `torch.backends.cudnn.benchmark` to False.
93
+ Default: False.
94
+
95
+ # Speed-reproducibility tradeoff https://pytorch.org/docs/stable/notes/randomness.html
96
+ if cuda_deterministic: # slower, more reproducible
97
+ cudnn.deterministic = True
98
+ cudnn.benchmark = False
99
+ else: # faster, less reproducible
100
+ cudnn.deterministic = False
101
+ cudnn.benchmark = True
102
+
103
+ """
104
+ random.seed(seed)
105
+ np.random.seed(seed)
106
+ torch.manual_seed(seed)
107
+ torch.cuda.manual_seed_all(seed)
108
+ if deterministic:
109
+ torch.backends.cudnn.deterministic = True
110
+ torch.backends.cudnn.benchmark = False
111
+
112
+
113
+ def is_seq_of(seq, expected_type, seq_type=None):
114
+ """Check whether it is a sequence of some type.
115
+ Args:
116
+ seq (Sequence): The sequence to be checked.
117
+ expected_type (type): Expected type of sequence items.
118
+ seq_type (type, optional): Expected sequence type.
119
+ Returns:
120
+ bool: Whether the sequence is valid.
121
+ """
122
+ if seq_type is None:
123
+ exp_seq_type = abc.Sequence
124
+ else:
125
+ assert isinstance(seq_type, type)
126
+ exp_seq_type = seq_type
127
+ if not isinstance(seq, exp_seq_type):
128
+ return False
129
+ for item in seq:
130
+ if not isinstance(item, expected_type):
131
+ return False
132
+ return True
133
+
134
+
135
+ def set_bn_momentum_default(bn_momentum):
136
+ def fn(m):
137
+ if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
138
+ m.momentum = bn_momentum
139
+
140
+ return fn
141
+
142
+
143
+ class BNMomentumScheduler(object):
144
+
145
+ def __init__(
146
+ self, model, bn_lambda, last_epoch=-1,
147
+ setter=set_bn_momentum_default
148
+ ):
149
+ if not isinstance(model, nn.Module):
150
+ raise RuntimeError(
151
+ "Class '{}' is not a PyTorch nn Module".format(
152
+ type(model).__name__
153
+ )
154
+ )
155
+
156
+ self.model = model
157
+ self.setter = setter
158
+ self.lmbd = bn_lambda
159
+
160
+ self.step(last_epoch + 1)
161
+ self.last_epoch = last_epoch
162
+
163
+ def step(self, epoch=None):
164
+ if epoch is None:
165
+ epoch = self.last_epoch + 1
166
+
167
+ self.last_epoch = epoch
168
+ self.model.apply(self.setter(self.lmbd(epoch)))
169
+
170
+ def get_momentum(self, epoch=None):
171
+ if epoch is None:
172
+ epoch = self.last_epoch + 1
173
+ return self.lmbd(epoch)
174
+
175
+
176
+ def seprate_point_cloud(xyz, num_points, crop, fixed_points=None, padding_zeros=False):
177
+ '''
178
+ Separate point cloud: used to generate an incomplete point cloud with a given number of cropped points.
179
+ '''
180
+ _, n, c = xyz.shape
181
+
182
+ assert n == num_points
183
+ assert c == 3
184
+ if crop == num_points:
185
+ return xyz, None
186
+
187
+ INPUT = []
188
+ CROP = []
189
+ for points in xyz:
190
+ if isinstance(crop, list):
191
+ num_crop = random.randint(crop[0], crop[1])
192
+ else:
193
+ num_crop = crop
194
+
195
+ points = points.unsqueeze(0)
196
+
197
+ if fixed_points is None:
198
+ center = F.normalize(torch.randn(1, 1, 3), p=2, dim=-1).cuda()
199
+ else:
200
+ if isinstance(fixed_points, list):
201
+ fixed_point = random.sample(fixed_points, 1)[0]
202
+ else:
203
+ fixed_point = fixed_points
204
+ center = fixed_point.reshape(1, 1, 3).cuda()
205
+
206
+ distance_matrix = torch.norm(center.unsqueeze(2) - points.unsqueeze(1), p=2, dim=-1) # 1 1 2048
207
+
208
+ idx = torch.argsort(distance_matrix, dim=-1, descending=False)[0, 0] # 2048
209
+
210
+ if padding_zeros:
211
+ input_data = points.clone()
212
+ input_data[0, idx[:num_crop]] = input_data[0, idx[:num_crop]] * 0
213
+
214
+ else:
215
+ input_data = points.clone()[0, idx[num_crop:]].unsqueeze(0) # 1 N 3
216
+
217
+ crop_data = points.clone()[0, idx[:num_crop]].unsqueeze(0)
218
+
219
+ if isinstance(crop, list):
220
+ INPUT.append(fps(input_data, 2048))
221
+ CROP.append(fps(crop_data, 2048))
222
+ else:
223
+ INPUT.append(input_data)
224
+ CROP.append(crop_data)
225
+
226
+ input_data = torch.cat(INPUT, dim=0) # B N 3
227
+ crop_data = torch.cat(CROP, dim=0) # B M 3
228
+
229
+ return input_data.contiguous(), crop_data.contiguous()
230
+
231
+
232
+ def get_ptcloud_img(ptcloud, roll, pitch):
233
+ fig = plt.figure(figsize=(8, 8))
234
+
235
+ x, z, y = ptcloud.transpose(1, 0)
236
+ ax = fig.add_subplot(projection=Axes3D.name, adjustable='box')  # fig.gca(projection=...) was removed in Matplotlib 3.6
237
+ ax.axis('off')
238
+ # ax.axis('scaled')
239
+ ax.view_init(roll, pitch)
240
+ max, min = np.max(ptcloud), np.min(ptcloud)
241
+ ax.set_xbound(min, max)
242
+ ax.set_ybound(min, max)
243
+ ax.set_zbound(min, max)
244
+ ax.scatter(x, y, z, zdir='z', c=y, cmap='jet')
245
+
246
+ fig.canvas.draw()
247
+ img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated for binary buffers
248
+ img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))
249
+ return img
250
+
251
+
252
+ def visualize_KITTI(path, data_list, titles=['input', 'pred'], cmap=['bwr', 'autumn'], zdir='y',
253
+ xlim=(-1, 1), ylim=(-1, 1), zlim=(-1, 1)):
254
+ fig = plt.figure(figsize=(6 * len(data_list), 6))
255
+ cmax = data_list[-1][:, 0].max()
256
+
257
+ for i in range(len(data_list)):
258
+ data = data_list[i][:-2048] if i == 1 else data_list[i]
259
+ color = data[:, 0] / cmax
260
+ ax = fig.add_subplot(1, len(data_list), i + 1, projection='3d')
261
+ ax.view_init(30, -120)
262
+ b = ax.scatter(data[:, 0], data[:, 1], data[:, 2], zdir=zdir, c=color, vmin=-1, vmax=1, cmap=cmap[0], s=4,
263
+ linewidth=0.05, edgecolors='black')
264
+ ax.set_title(titles[i])
265
+
266
+ ax.set_axis_off()
267
+ ax.set_xlim(xlim)
268
+ ax.set_ylim(ylim)
269
+ ax.set_zlim(zlim)
270
+ plt.subplots_adjust(left=0, right=1, bottom=0, top=1, wspace=0.2, hspace=0)
271
+ if not os.path.exists(path):
272
+ os.makedirs(path)
273
+
274
+ pic_path = path + '.png'
275
+ fig.savefig(pic_path)
276
+
277
+ np.save(os.path.join(path, 'input.npy'), data_list[0].numpy())
278
+ np.save(os.path.join(path, 'pred.npy'), data_list[1].numpy())
279
+ plt.close(fig)
280
+
281
+
282
+ def random_dropping(pc, e):
283
+ up_num = max(64, 768 // (e // 50 + 1))
284
+ pc = pc
285
+ random_num = torch.randint(1, up_num, (1, 1))[0, 0]
286
+ pc = fps(pc, random_num)
287
+ padding = torch.zeros(pc.size(0), 2048 - pc.size(1), 3).to(pc.device)
288
+ pc = torch.cat([pc, padding], dim=1)
289
+ return pc
290
+
291
+
292
+ def random_scale(partial, scale_range=[0.8, 1.2]):
293
+ scale = torch.rand(1).cuda() * (scale_range[1] - scale_range[0]) + scale_range[0]
294
+ return partial * scale
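For orientation, a minimal usage sketch of the sampling helpers above. This is a sketch only: it assumes the functions of this misc module (including the earlier `index_points`) are in scope, and it runs the crop on the GPU because `seprate_point_cloud` moves its crop centers to CUDA.

    import torch

    # Minimal sketch: 4 clouds of 2048 xyz points on the GPU.
    pc = torch.rand(4, 2048, 3).cuda()

    pc_1024 = fps(pc, 1024)                                     # farthest point sampling -> [4, 1024, 3]
    partial, cropped = seprate_point_cloud(pc, 2048, crop=512)  # keep 1536 points, crop 512 around a random center
    print(pc_1024.shape, partial.shape, cropped.shape)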
mm_models/modal_module/point/recon/reconv2_utils/parser.py ADDED
@@ -0,0 +1,117 @@
1
+ import os
2
+ import argparse
3
+ from pathlib import Path
4
+
5
+
6
+ def get_args():
7
+ parser = argparse.ArgumentParser()
8
+ parser.add_argument(
9
+ '--config',
10
+ type=str,
11
+ help='yaml config file')
12
+ parser.add_argument('--distributed', action='store_true', default=False)
13
+ parser.add_argument('--local-rank', type=int, default=0)
14
+ parser.add_argument('--num_workers', type=int, default=8)
15
+ # seed
16
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
17
+ parser.add_argument(
18
+ '--deterministic',
19
+ action='store_true',
20
+ help='whether to set deterministic options for CUDNN backend.')
21
+ # bn
22
+ parser.add_argument(
23
+ '--sync_bn',
24
+ action='store_true',
25
+ default=False,
26
+ help='whether to use sync bn')
27
+ # some args
28
+ parser.add_argument('--exp_name', type=str, default='default', help='experiment name')
29
+ parser.add_argument('--start_ckpts', type=str, default=None, help='reload used ckpt path')
30
+ parser.add_argument('--ckpts', type=str, default=None, help='test used ckpt path')
31
+ parser.add_argument('--val_freq', type=int, default=1, help='test freq')
32
+ parser.add_argument(
33
+ '--vote',
34
+ action='store_true',
35
+ default=False,
36
+ help='vote acc')
37
+ parser.add_argument(
38
+ '--resume',
39
+ action='store_true',
40
+ default=False,
41
+ help='autoresume training (interrupted by accident)')
42
+ parser.add_argument(
43
+ '--svm',
44
+ action='store_true',
45
+ default=False,
46
+ help='svm')
47
+ parser.add_argument(
48
+ '--zeroshot',
49
+ action='store_true',
50
+ default=False,
51
+ help='zero-shot')
52
+ parser.add_argument(
53
+ '--test',
54
+ action='store_true',
55
+ default=False,
56
+ help='test mode for certain ckpt')
57
+ parser.add_argument(
58
+ '--reconstruct',
59
+ action='store_true',
60
+ default=False,
61
+ help='reconstruct pretraining stage')
62
+ parser.add_argument(
63
+ '--contrast',
64
+ action='store_true',
65
+ default=False,
66
+ help='contrast pretraining stage')
67
+ parser.add_argument(
68
+ '--finetune_model',
69
+ action='store_true',
70
+ default=False,
71
+ help='finetune modelnet with pretrained weight')
72
+ parser.add_argument(
73
+ '--way', type=int, default=-1)
74
+ parser.add_argument(
75
+ '--shot', type=int, default=-1)
76
+ parser.add_argument(
77
+ '--fold', type=int, default=-1)
78
+
79
+ args = parser.parse_args()
80
+
81
+ if args.test and args.resume:
82
+ raise ValueError(
83
+ '--test and --resume cannot be both activate')
84
+
85
+ if args.resume and args.start_ckpts is not None:
86
+ raise ValueError(
87
+ '--resume and --start_ckpts cannot be both activate')
88
+
89
+ if args.test and args.ckpts is None:
90
+ raise ValueError(
91
+ 'ckpts should not be None in test mode')
92
+
93
+ if args.finetune_model and args.ckpts is None:
94
+ print(
95
+ 'training from scratch')
96
+
97
+ if 'LOCAL_RANK' not in os.environ:
98
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
99
+
100
+ if args.test:
101
+ args.exp_name = 'test_' + args.exp_name
102
+ args.experiment_path = os.path.join('./experiments', Path(args.config).stem, Path(args.config).parent.stem,
103
+ args.exp_name)
104
+ args.tfboard_path = os.path.join('./experiments', Path(args.config).stem, Path(args.config).parent.stem, 'TFBoard',
105
+ args.exp_name)
106
+ args.log_name = Path(args.config).stem
107
+ create_experiment_dir(args)
108
+ return args
109
+
110
+
111
+ def create_experiment_dir(args):
112
+ if not os.path.exists(args.experiment_path):
113
+ os.makedirs(args.experiment_path, exist_ok=True)
114
+ print('Create experiment path successfully at %s' % args.experiment_path)
115
+ if not os.path.exists(args.tfboard_path):
116
+ os.makedirs(args.tfboard_path, exist_ok=True)
117
+ print('Create TFBoard path successfully at %s' % args.tfboard_path)
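A quick sketch of how this parser is typically driven. The config path below is a placeholder, not a file shipped with this repository.

    import sys

    # Hypothetical invocation: patch argv as a training script entry point would receive it.
    sys.argv = ['train.py', '--config', 'cfgs/pretrain.yaml', '--exp_name', 'debug']
    args = get_args()   # validates flag combinations, then creates ./experiments/... and the TFBoard dir
    print(args.experiment_path, args.tfboard_path, args.log_name)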
mm_models/modal_module/point/recon/reconv2_utils/randaugment.py ADDED
@@ -0,0 +1,216 @@
1
+ import logging
2
+ import random
3
+
4
+ import numpy as np
5
+ import PIL
6
+ import PIL.ImageOps
7
+ import PIL.ImageEnhance
8
+ import PIL.ImageDraw
9
+ from PIL import Image
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ PARAMETER_MAX = 10
14
+
15
+
16
+ def AutoContrast(img, **kwarg):
17
+ return PIL.ImageOps.autocontrast(img)
18
+
19
+
20
+ def Brightness(img, v, max_v, bias=0):
21
+ v = _float_parameter(v, max_v) + bias
22
+ return PIL.ImageEnhance.Brightness(img).enhance(v)
23
+
24
+
25
+ def Color(img, v, max_v, bias=0):
26
+ v = _float_parameter(v, max_v) + bias
27
+ return PIL.ImageEnhance.Color(img).enhance(v)
28
+
29
+
30
+ def Contrast(img, v, max_v, bias=0):
31
+ v = _float_parameter(v, max_v) + bias
32
+ return PIL.ImageEnhance.Contrast(img).enhance(v)
33
+
34
+
35
+ def Cutout(img, v, max_v, bias=0):
36
+ if v == 0:
37
+ return img
38
+ v = _float_parameter(v, max_v) + bias
39
+ v = int(v * min(img.size))
40
+ return CutoutAbs(img, v)
41
+
42
+
43
+ def CutoutAbs(img, v, **kwarg):
44
+ w, h = img.size
45
+ x0 = np.random.uniform(0, w)
46
+ y0 = np.random.uniform(0, h)
47
+ x0 = int(max(0, x0 - v / 2.))
48
+ y0 = int(max(0, y0 - v / 2.))
49
+ x1 = int(min(w, x0 + v))
50
+ y1 = int(min(h, y0 + v))
51
+ xy = (x0, y0, x1, y1)
52
+ # gray
53
+ color = (127, 127, 127)
54
+ img = img.copy()
55
+ PIL.ImageDraw.Draw(img).rectangle(xy, color)
56
+ return img
57
+
58
+
59
+ def Equalize(img, **kwarg):
60
+ return PIL.ImageOps.equalize(img)
61
+
62
+
63
+ def Identity(img, **kwarg):
64
+ return img
65
+
66
+
67
+ def Invert(img, **kwarg):
68
+ return PIL.ImageOps.invert(img)
69
+
70
+
71
+ def Posterize(img, v, max_v, bias=0):
72
+ v = _int_parameter(v, max_v) + bias
73
+ return PIL.ImageOps.posterize(img, v)
74
+
75
+
76
+ def Rotate(img, v, max_v, bias=0):
77
+ v = _int_parameter(v, max_v) + bias
78
+ if random.random() < 0.5:
79
+ v = -v
80
+ return img.rotate(v)
81
+
82
+
83
+ def Sharpness(img, v, max_v, bias=0):
84
+ v = _float_parameter(v, max_v) + bias
85
+ return PIL.ImageEnhance.Sharpness(img).enhance(v)
86
+
87
+
88
+ def ShearX(img, v, max_v, bias=0):
89
+ v = _float_parameter(v, max_v) + bias
90
+ if random.random() < 0.5:
91
+ v = -v
92
+ return img.transform(img.size, PIL.Image.AFFINE, (1, v, 0, 0, 1, 0))
93
+
94
+
95
+ def ShearY(img, v, max_v, bias=0):
96
+ v = _float_parameter(v, max_v) + bias
97
+ if random.random() < 0.5:
98
+ v = -v
99
+ return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, v, 1, 0))
100
+
101
+
102
+ def Solarize(img, v, max_v, bias=0):
103
+ v = _int_parameter(v, max_v) + bias
104
+ return PIL.ImageOps.solarize(img, 256 - v)
105
+
106
+
107
+ def SolarizeAdd(img, v, max_v, bias=0, threshold=128):
108
+ v = _int_parameter(v, max_v) + bias
109
+ if random.random() < 0.5:
110
+ v = -v
111
+ img_np = np.array(img).astype(int)  # np.int was removed in NumPy 1.24
112
+ img_np = img_np + v
113
+ img_np = np.clip(img_np, 0, 255)
114
+ img_np = img_np.astype(np.uint8)
115
+ img = Image.fromarray(img_np)
116
+ return PIL.ImageOps.solarize(img, threshold)
117
+
118
+
119
+ def TranslateX(img, v, max_v, bias=0):
120
+ v = _float_parameter(v, max_v) + bias
121
+ if random.random() < 0.5:
122
+ v = -v
123
+ v = int(v * img.size[0])
124
+ return img.transform(img.size, PIL.Image.AFFINE, (1, 0, v, 0, 1, 0))
125
+
126
+
127
+ def TranslateY(img, v, max_v, bias=0):
128
+ v = _float_parameter(v, max_v) + bias
129
+ if random.random() < 0.5:
130
+ v = -v
131
+ v = int(v * img.size[1])
132
+ return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, 0, 1, v))
133
+
134
+
135
+ def _float_parameter(v, max_v):
136
+ return float(v) * max_v / PARAMETER_MAX
137
+
138
+
139
+ def _int_parameter(v, max_v):
140
+ return int(v * max_v / PARAMETER_MAX)
141
+
142
+
143
+ def fixmatch_augment_pool():
144
+ # FixMatch paper
145
+ augs = [(AutoContrast, None, None),
146
+ (Brightness, 0.9, 0.05),
147
+ (Color, 0.9, 0.05),
148
+ (Contrast, 0.9, 0.05),
149
+ (Equalize, None, None),
150
+ (Identity, None, None),
151
+ (Posterize, 4, 4),
152
+ (Rotate, 30, 0),
153
+ (Sharpness, 0.9, 0.05),
154
+ (ShearX, 0.3, 0),
155
+ (ShearY, 0.3, 0),
156
+ (Solarize, 256, 0),
157
+ (TranslateX, 0.3, 0),
158
+ (TranslateY, 0.3, 0)]
159
+ return augs
160
+
161
+
162
+ def my_augment_pool():
163
+ # Test
164
+ augs = [(AutoContrast, None, None),
165
+ (Brightness, 1.8, 0.1),
166
+ (Color, 1.8, 0.1),
167
+ (Contrast, 1.8, 0.1),
168
+ (Cutout, 0.2, 0),
169
+ (Equalize, None, None),
170
+ (Invert, None, None),
171
+ (Posterize, 4, 4),
172
+ (Rotate, 30, 0),
173
+ (Sharpness, 1.8, 0.1),
174
+ (ShearX, 0.3, 0),
175
+ (ShearY, 0.3, 0),
176
+ (Solarize, 256, 0),
177
+ (SolarizeAdd, 110, 0),
178
+ (TranslateX, 0.45, 0),
179
+ (TranslateY, 0.45, 0)]
180
+ return augs
181
+
182
+
183
+ class RandAugmentPC(object):
184
+ def __init__(self, n, m):
185
+ assert n >= 1
186
+ assert 1 <= m <= 10
187
+ self.n = n
188
+ self.m = m
189
+ self.augment_pool = my_augment_pool()
190
+
191
+ def __call__(self, img):
192
+ ops = random.choices(self.augment_pool, k=self.n)
193
+ for op, max_v, bias in ops:
194
+ prob = np.random.uniform(0.2, 0.8)
195
+ if random.random() + prob >= 1:
196
+ img = op(img, v=self.m, max_v=max_v, bias=bias)
197
+ img = CutoutAbs(img, int(32*0.5))
198
+ return img
199
+
200
+
201
+ class RandAugmentMC(object):
202
+ def __init__(self, n, m):
203
+ assert n >= 1
204
+ assert 1 <= m <= 10
205
+ self.n = n
206
+ self.m = m
207
+ self.augment_pool = fixmatch_augment_pool()
208
+
209
+ def __call__(self, img):
210
+ ops = random.choices(self.augment_pool, k=self.n)
211
+ for op, max_v, bias in ops:
212
+ v = np.random.randint(1, self.m)
213
+ if random.random() < 0.5:
214
+ img = op(img, v=v, max_v=max_v, bias=bias)
215
+ img = CutoutAbs(img, int(32*0.5))
216
+ return img
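A small usage sketch of the augmentation wrappers above; 'photo.jpg' is a placeholder path.

    from PIL import Image

    img = Image.open('photo.jpg').convert('RGB')   # placeholder input image
    strong = RandAugmentMC(n=2, m=10)              # 2 random ops from the FixMatch pool, magnitude <= 10
    aug = strong(img)                              # ends with a fixed 16-pixel CutoutAbs
    aug.save('photo_aug.jpg')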
mm_models/modal_module/point/recon/reconv2_utils/registry.py ADDED
@@ -0,0 +1,289 @@
1
+ import inspect
2
+ import warnings
3
+ from functools import partial
4
+ from . import config, misc  # local copies in reconv2_utils; misc provides is_seq_of used below
5
+
6
+
7
+ class Registry:
8
+ """A registry to map strings to classes.
9
+ Registered object could be built from registry.
10
+ Example:
11
+ >>> MODELS = Registry('models')
12
+ >>> @MODELS.register_module()
13
+ >>> class ResNet:
14
+ >>> pass
15
+ >>> resnet = MODELS.build(dict(NAME='ResNet'))
16
+ Please refer to https://mmcv.readthedocs.io/en/latest/registry.html for
17
+ advanced usage.
18
+ Args:
19
+ name (str): Registry name.
20
+ build_func(func, optional): Build function to construct instance from
21
+ Registry; :func:`build_from_cfg` is used if neither ``parent`` nor
22
+ ``build_func`` is specified. If ``parent`` is specified and
23
+ ``build_func`` is not given, ``build_func`` will be inherited
24
+ from ``parent``. Default: None.
25
+ parent (Registry, optional): Parent registry. The class registered in
26
+ children registry could be built from parent. Default: None.
27
+ scope (str, optional): The scope of registry. It is the key to search
28
+ for children registry. If not specified, scope will be the name of
29
+ the package where class is defined, e.g. mmdet, mmcls, mmseg.
30
+ Default: None.
31
+ """
32
+
33
+ def __init__(self, name, build_func=None, parent=None, scope=None):
34
+ self._name = name
35
+ self._module_dict = dict()
36
+ self._children = dict()
37
+ self._scope = self.infer_scope() if scope is None else scope
38
+
39
+ # self.build_func will be set with the following priority:
40
+ # 1. build_func
41
+ # 2. parent.build_func
42
+ # 3. build_from_cfg
43
+ if build_func is None:
44
+ if parent is not None:
45
+ self.build_func = parent.build_func
46
+ else:
47
+ self.build_func = build_from_cfg
48
+ else:
49
+ self.build_func = build_func
50
+ if parent is not None:
51
+ assert isinstance(parent, Registry)
52
+ parent._add_children(self)
53
+ self.parent = parent
54
+ else:
55
+ self.parent = None
56
+
57
+ def __len__(self):
58
+ return len(self._module_dict)
59
+
60
+ def __contains__(self, key):
61
+ return self.get(key) is not None
62
+
63
+ def __repr__(self):
64
+ format_str = self.__class__.__name__ + \
65
+ f'(name={self._name}, ' \
66
+ f'items={self._module_dict})'
67
+ return format_str
68
+
69
+ @staticmethod
70
+ def infer_scope():
71
+ """Infer the scope of registry.
72
+ The name of the package where registry is defined will be returned.
73
+ Example:
74
+ # in mmdet/models/backbone/resnet.py
75
+ >>> MODELS = Registry('models')
76
+ >>> @MODELS.register_module()
77
+ >>> class ResNet:
78
+ >>> pass
79
+ The scope of ``ResNet`` will be ``mmdet``.
80
+ Returns:
81
+ scope (str): The inferred scope name.
82
+ """
83
+ # inspect.stack() traces where this function is called; index 2 is
84
+ # the frame in which `infer_scope()` is called
85
+ filename = inspect.getmodule(inspect.stack()[2][0]).__name__
86
+ split_filename = filename.split('.')
87
+ return split_filename[0]
88
+
89
+ @staticmethod
90
+ def split_scope_key(key):
91
+ """Split scope and key.
92
+ The first scope will be split from key.
93
+ Examples:
94
+ >>> Registry.split_scope_key('mmdet.ResNet')
95
+ 'mmdet', 'ResNet'
96
+ >>> Registry.split_scope_key('ResNet')
97
+ None, 'ResNet'
98
+ Return:
99
+ scope (str, None): The first scope.
100
+ key (str): The remaining key.
101
+ """
102
+ split_index = key.find('.')
103
+ if split_index != -1:
104
+ return key[:split_index], key[split_index + 1:]
105
+ else:
106
+ return None, key
107
+
108
+ @property
109
+ def name(self):
110
+ return self._name
111
+
112
+ @property
113
+ def scope(self):
114
+ return self._scope
115
+
116
+ @property
117
+ def module_dict(self):
118
+ return self._module_dict
119
+
120
+ @property
121
+ def children(self):
122
+ return self._children
123
+
124
+ def get(self, key):
125
+ """Get the registry record.
126
+ Args:
127
+ key (str): The class name in string format.
128
+ Returns:
129
+ class: The corresponding class.
130
+ """
131
+ scope, real_key = self.split_scope_key(key)
132
+ if scope is None or scope == self._scope:
133
+ # get from self
134
+ if real_key in self._module_dict:
135
+ return self._module_dict[real_key]
136
+ else:
137
+ # get from self._children
138
+ if scope in self._children:
139
+ return self._children[scope].get(real_key)
140
+ else:
141
+ # goto root
142
+ parent = self.parent
143
+ while parent.parent is not None:
144
+ parent = parent.parent
145
+ return parent.get(key)
146
+
147
+ def build(self, *args, **kwargs):
148
+ return self.build_func(*args, **kwargs, registry=self)
149
+
150
+ def _add_children(self, registry):
151
+ """Add children for a registry.
152
+ The ``registry`` will be added as children based on its scope.
153
+ The parent registry could build objects from children registry.
154
+ Example:
155
+ >>> models = Registry('models')
156
+ >>> mmdet_models = Registry('models', parent=models)
157
+ >>> @mmdet_models.register_module()
158
+ >>> class ResNet:
159
+ >>> pass
160
+ >>> resnet = models.build(dict(NAME='mmdet.ResNet'))
161
+ """
162
+
163
+ assert isinstance(registry, Registry)
164
+ assert registry.scope is not None
165
+ assert registry.scope not in self.children, \
166
+ f'scope {registry.scope} exists in {self.name} registry'
167
+ self.children[registry.scope] = registry
168
+
169
+ def _register_module(self, module_class, module_name=None, force=False):
170
+ if not inspect.isclass(module_class):
171
+ raise TypeError('module must be a class, '
172
+ f'but got {type(module_class)}')
173
+
174
+ if module_name is None:
175
+ module_name = module_class.__name__
176
+ if isinstance(module_name, str):
177
+ module_name = [module_name]
178
+ for name in module_name:
179
+ if not force and name in self._module_dict:
180
+ raise KeyError(f'{name} is already registered '
181
+ f'in {self.name}')
182
+ self._module_dict[name] = module_class
183
+
184
+ def deprecated_register_module(self, cls=None, force=False):
185
+ warnings.warn(
186
+ 'The old API of register_module(module, force=False) '
187
+ 'is deprecated and will be removed, please use the new API '
188
+ 'register_module(name=None, force=False, module=None) instead.')
189
+ if cls is None:
190
+ return partial(self.deprecated_register_module, force=force)
191
+ self._register_module(cls, force=force)
192
+ return cls
193
+
194
+ def register_module(self, name=None, force=False, module=None):
195
+ """Register a module.
196
+ A record will be added to `self._module_dict`, whose key is the class
197
+ name or the specified name, and value is the class itself.
198
+ It can be used as a decorator or a normal function.
199
+ Example:
200
+ >>> backbones = Registry('backbone')
201
+ >>> @backbones.register_module()
202
+ >>> class ResNet:
203
+ >>> pass
204
+ >>> backbones = Registry('backbone')
205
+ >>> @backbones.register_module(name='mnet')
206
+ >>> class MobileNet:
207
+ >>> pass
208
+ >>> backbones = Registry('backbone')
209
+ >>> class ResNet:
210
+ >>> pass
211
+ >>> backbones.register_module(ResNet)
212
+ Args:
213
+ name (str | None): The module name to be registered. If not
214
+ specified, the class name will be used.
215
+ force (bool, optional): Whether to override an existing class with
216
+ the same name. Default: False.
217
+ module (type): Module class to be registered.
218
+ """
219
+ if not isinstance(force, bool):
220
+ raise TypeError(f'force must be a boolean, but got {type(force)}')
221
+ # NOTE: This is a workaround to stay compatible with the old API,
222
+ # while it may introduce unexpected bugs.
223
+ if isinstance(name, type):
224
+ return self.deprecated_register_module(name, force=force)
225
+
226
+ # raise the error ahead of time
227
+ if not (name is None or isinstance(name, str) or misc.is_seq_of(name, str)):
228
+ raise TypeError(
229
+ 'name must be either of None, an instance of str or a sequence'
230
+ f' of str, but got {type(name)}')
231
+
232
+ # use it as a normal method: x.register_module(module=SomeClass)
233
+ if module is not None:
234
+ self._register_module(
235
+ module_class=module, module_name=name, force=force)
236
+ return module
237
+
238
+ # use it as a decorator: @x.register_module()
239
+ def _register(cls):
240
+ self._register_module(
241
+ module_class=cls, module_name=name, force=force)
242
+ return cls
243
+
244
+ return _register
245
+
246
+
247
+ def build_from_cfg(cfg, registry, default_args=None):
248
+ """Build a module from config dict.
249
+ Args:
250
+ cfg (edict): Config dict. It should at least contain the key "NAME".
251
+ registry (:obj:`Registry`): The registry to search the type from.
252
+ Returns:
253
+ object: The constructed object.
254
+ """
255
+ if not isinstance(cfg, dict):
256
+ raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
257
+ if 'NAME' not in cfg:
258
+ if default_args is None or 'NAME' not in default_args:
259
+ raise KeyError(
260
+ '`cfg` or `default_args` must contain the key "NAME", '
261
+ f'but got {cfg}\n{default_args}')
262
+ if not isinstance(registry, Registry):
263
+ raise TypeError('registry must be an mmcv.Registry object, '
264
+ f'but got {type(registry)}')
265
+
266
+ if not (isinstance(default_args, dict) or default_args is None):
267
+ raise TypeError('default_args must be a dict or None, '
268
+ f'but got {type(default_args)}')
269
+
270
+ if default_args is not None:
271
+ cfg = config.merge_new_config(cfg, default_args)
272
+
273
+ obj_type = cfg.get('NAME')
274
+
275
+ if isinstance(obj_type, str):
276
+ obj_cls = registry.get(obj_type)
277
+ if obj_cls is None:
278
+ raise KeyError(
279
+ f'{obj_type} is not in the {registry.name} registry')
280
+ elif inspect.isclass(obj_type):
281
+ obj_cls = obj_type
282
+ else:
283
+ raise TypeError(
284
+ f'type must be a str or valid type, but got {type(obj_type)}')
285
+ try:
286
+ return obj_cls(cfg)
287
+ except Exception as e:
288
+ # Normal TypeError does not print class name.
289
+ raise type(e)(f'{obj_cls.__name__}: {e}')
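The register/build round trip described in the Registry docstring looks roughly like this; the `PointBackbone` class and its `embed_dim` field are illustrative only.

    MODELS = Registry('models')

    @MODELS.register_module()
    class PointBackbone:
        def __init__(self, cfg):
            self.embed_dim = cfg.get('embed_dim', 768)

    # build_from_cfg resolves the "NAME" key and calls the class with the cfg dict.
    backbone = MODELS.build(dict(NAME='PointBackbone', embed_dim=1024))
    print(backbone.embed_dim)   # 1024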
mm_models/modal_module/point/recon/reconv2_utils/transforms.py ADDED
@@ -0,0 +1,78 @@
1
+ from PIL import Image
2
+ from torchvision import transforms
3
+ from .randaugment import RandAugmentMC  # local copy in reconv2_utils
4
+ __all__ = ['get_transforms']
5
+
6
+
7
+ class ResizeImage():
8
+ def __init__(self, size):
9
+ if isinstance(size, int):
10
+ self.size = (int(size), int(size))
11
+ else:
12
+ self.size = size
13
+
14
+ def __call__(self, img):
15
+ th, tw = self.size
16
+ return img.resize((th, tw))
17
+
18
+
19
+ class PlaceCrop(object):
20
+ """Crops the given PIL.Image at the particular index.
21
+ Args:
22
+ size (sequence or int): Desired output size of the crop. If size is an
23
+ int instead of sequence like (w, h), a square crop (size, size) is
24
+ made.
25
+ """
26
+
27
+ def __init__(self, size, start_x, start_y):
28
+ if isinstance(size, int):
29
+ self.size = (int(size), int(size))
30
+ else:
31
+ self.size = size
32
+ self.start_x = start_x
33
+ self.start_y = start_y
34
+
35
+ def __call__(self, img):
36
+ """
37
+ Args:
38
+ img (PIL.Image): Image to be cropped.
39
+ Returns:
40
+ PIL.Image: Cropped image.
41
+ """
42
+ th, tw = self.size
43
+ return img.crop((self.start_x, self.start_y, self.start_x + tw, self.start_y + th))
44
+
45
+
46
+ class ForceFlip(object):
47
+ """Horizontally flip the given PIL.Image randomly with a probability of 0.5."""
48
+
49
+ def __call__(self, img):
50
+ """
51
+ Args:
52
+ img (PIL.Image): Image to be flipped.
53
+ Returns:
54
+ PIL.Image: Horizontally flipped image.
55
+ """
56
+ return img.transpose(Image.FLIP_LEFT_RIGHT)
57
+
58
+
59
+ def transform_train(resize_size=256, crop_size=224):
60
+ normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
61
+ std=[0.229, 0.224, 0.225])
62
+ return transforms.Compose([
63
+ # ResizeImage(resize_size),
64
+ # transforms.RandomHorizontalFlip(),
65
+ # transforms.RandomResizedCrop(crop_size, scale=(0.64, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
66
+ # RandAugmentMC(n=2, m=10),
67
+ ResizeImage(crop_size),
68
+ transforms.ToTensor(),
69
+ normalize
70
+ ])
71
+
72
+
73
+ def get_transforms(resize_size=256, crop_size=224):
74
+ transforms = {
75
+ 'train': transform_train(resize_size, crop_size)
76
+ }
77
+
78
+ return transforms
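And a usage sketch for the transform factory above; 'photo.jpg' is again a placeholder path.

    from PIL import Image

    tfs = get_transforms(crop_size=224)
    img = Image.open('photo.jpg').convert('RGB')
    x = tfs['train'](img)          # resize to 224x224, ToTensor, ImageNet-mean/std normalize
    print(x.shape)                 # torch.Size([3, 224, 224])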
mm_models/modal_module/point/recon/transformer.py ADDED
@@ -0,0 +1,647 @@
1
+ import math
2
+ import timm
3
+ import torch
4
+ import numpy as np
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from .reconv2_utils import misc
9
+ from .reconv2_utils.logger import *
10
+ from .reconv2_utils.knn import knn_point
11
+ from timm.layers import Mlp, DropPath
12
+ from typing import Optional, List
13
+
14
+
15
+ class PatchEmbedding(nn.Module): # Embedding module
16
+ def __init__(self, embed_dim, input_channel=3, large=False):
17
+ super().__init__()
18
+ self.embed_dim = embed_dim
19
+ self.input_channel = input_channel
20
+
21
+ # embed_dim_list = [c * (embed_dim // 512 + 1) for c in [128, 256, 512]]
22
+ #
23
+ # self.first_conv = nn.Sequential(
24
+ # nn.Conv1d(self.input_channel, embed_dim_list[0], 1),
25
+ # nn.BatchNorm1d(embed_dim_list[0]),
26
+ # nn.ReLU(inplace=True),
27
+ # nn.Conv1d(embed_dim_list[0], embed_dim_list[1], 1)
28
+ # )
29
+ # self.second_conv = nn.Sequential(
30
+ # nn.Conv1d(embed_dim_list[2], embed_dim_list[2], 1),
31
+ # nn.BatchNorm1d(embed_dim_list[2]),
32
+ # nn.ReLU(inplace=True),
33
+ # nn.Conv1d(embed_dim_list[2], self.embed_dim, 1)
34
+ # )
35
+
36
+ if large:
37
+ self.first_conv = nn.Sequential(
38
+ nn.Conv1d(self.input_channel, 256, 1),
39
+ nn.BatchNorm1d(256),
40
+ nn.ReLU(inplace=True),
41
+ nn.Conv1d(256, 512, 1),
42
+ nn.BatchNorm1d(512),
43
+ nn.ReLU(inplace=True),
44
+ nn.Conv1d(512, 1024, 1)
45
+ )
46
+ self.second_conv = nn.Sequential(
47
+ nn.Conv1d(2048, 2048, 1),
48
+ nn.BatchNorm1d(2048),
49
+ nn.ReLU(inplace=True),
50
+ nn.Conv1d(2048, embed_dim, 1)
51
+ )
52
+ else:
53
+ self.first_conv = nn.Sequential(
54
+ nn.Conv1d(self.input_channel, 128, 1),
55
+ nn.BatchNorm1d(128),
56
+ nn.ReLU(inplace=True),
57
+ nn.Conv1d(128, 256, 1)
58
+ )
59
+ self.second_conv = nn.Sequential(
60
+ nn.Conv1d(512, 512, 1),
61
+ nn.BatchNorm1d(512),
62
+ nn.ReLU(inplace=True),
63
+ nn.Conv1d(512, embed_dim, 1)
64
+ )
65
+
66
+ def forward(self, point_groups):
67
+ '''
68
+ point_groups : B G N 3/6
69
+ -----------------
70
+ feature_global : B G C
71
+ '''
72
+ bs, g, n, _ = point_groups.shape
73
+ point_groups = point_groups.reshape(bs * g, n, self.input_channel)
74
+ # encoder
75
+ feature = self.first_conv(point_groups.transpose(2, 1))
76
+ feature_global = torch.max(feature, dim=2, keepdim=True)[0]
77
+ feature = torch.cat([feature_global.expand(-1, -1, n), feature], dim=1)
78
+ feature = self.second_conv(feature)
79
+ feature_global = torch.max(feature, dim=2, keepdim=False)[0]
80
+ return feature_global.reshape(bs, g, self.embed_dim)
81
+
82
+
83
+ class PositionEmbeddingCoordsSine(nn.Module):
84
+ """Similar to transformer's position encoding, but generalizes it to
85
+ arbitrary dimensions and continuous coordinates.
86
+
87
+ Args:
88
+ n_dim: Number of input dimensions, e.g. 2 for image coordinates.
89
+ d_model: Number of dimensions to encode into
90
+ temperature:
91
+ scale:
92
+ """
93
+
94
+ def __init__(self, n_dim: int = 1, d_model: int = 256, temperature=1.0, scale=None):
95
+ super().__init__()
96
+
97
+ self.n_dim = n_dim
98
+ self.num_pos_feats = d_model // n_dim // 2 * 2
99
+ self.temperature = temperature
100
+ self.padding = d_model - self.num_pos_feats * self.n_dim
101
+
102
+ if scale is None:
103
+ scale = 1.0
104
+ self.scale = scale * 2 * math.pi
105
+
106
+ def forward(self, xyz: torch.Tensor) -> torch.Tensor:
107
+ """
108
+ Args:
109
+ xyz: Point positions (*, d_in)
110
+
111
+ Returns:
112
+ pos_emb (*, d_out)
113
+ """
114
+ assert xyz.shape[-1] == self.n_dim
115
+
116
+ dim_t = torch.arange(self.num_pos_feats,
117
+ dtype=torch.float32, device=xyz.device)
118
+ dim_t = self.temperature ** (2 * torch.div(dim_t,
119
+ 2, rounding_mode='trunc') / self.num_pos_feats)
120
+
121
+ xyz = xyz * self.scale
122
+ pos_divided = xyz.unsqueeze(-1) / dim_t
123
+ pos_sin = pos_divided[..., 0::2].sin()
124
+ pos_cos = pos_divided[..., 1::2].cos()
125
+ pos_emb = torch.stack([pos_sin, pos_cos], dim=-1).reshape(*xyz.shape[:-1], -1)
126
+
127
+ # Pad unused dimensions with zeros
128
+ pos_emb = F.pad(pos_emb, (0, self.padding))
129
+ return pos_emb
130
+
131
+
132
+ class Group(nn.Module): # FPS + KNN
133
+ def __init__(self, num_group, group_size):
134
+ super().__init__()
135
+ self.num_group = num_group
136
+ self.group_size = group_size
137
+
138
+ def forward(self, pts):
139
+ '''
140
+ input: B N 3/6
141
+ ---------------------------
142
+ output: B G M 3/6
143
+ center : B G 3
144
+ '''
145
+ xyz = pts[:, :, :3]
146
+ c = pts.shape[2]
147
+ batch_size, num_points, _ = xyz.shape
148
+ # fps the centers out
149
+ xyz = xyz.float()
150
+ center = misc.fps(xyz.contiguous(), self.num_group) # B G 3
151
+ # knn to get the neighborhood
152
+ idx = knn_point(self.group_size, xyz, center)
153
+ assert idx.size(1) == self.num_group
154
+ assert idx.size(2) == self.group_size
155
+ idx_base = torch.arange(0, batch_size, device=xyz.device).view(-1, 1, 1) * num_points
156
+ idx = idx + idx_base
157
+ idx = idx.view(-1)
158
+ neighborhood = pts.view(batch_size * num_points, -1)[idx, :]
159
+ neighborhood = neighborhood.view(batch_size, self.num_group, self.group_size, c).contiguous()
160
+ # normalize
161
+ neighborhood[:, :, :, :3] = neighborhood[:, :, :, :3] - center.unsqueeze(2)
162
+ return neighborhood, center
163
+
164
+
165
+ class ZGroup(nn.Module):
166
+ def __init__(self, num_group, group_size):
167
+ super().__init__()
168
+ self.num_group = num_group
169
+ self.group_size = group_size
170
+
171
+ def simplied_morton_sorting(self, xyz, center):
172
+ """
173
+ Simplified Morton-code sorting: iteratively take the patch nearest to the last selected patch as the next one; we found this to be more efficient.
174
+ """
175
+ batch_size, num_points, _ = xyz.shape
176
+ distances_batch = torch.cdist(center, center)
177
+ distances_batch[:, torch.eye(self.num_group).bool()] = float("inf")
178
+ idx_base = torch.arange(
179
+ 0, batch_size, device=xyz.device) * self.num_group
180
+ sorted_indices_list = [idx_base]
181
+ distances_batch = distances_batch.view(batch_size, self.num_group, self.num_group).transpose(
182
+ 1, 2).contiguous().view(batch_size * self.num_group, self.num_group)
183
+ distances_batch[idx_base] = float("inf")
184
+ distances_batch = distances_batch.view(
185
+ batch_size, self.num_group, self.num_group).transpose(1, 2).contiguous()
186
+ for i in range(self.num_group - 1):
187
+ distances_batch = distances_batch.view(
188
+ batch_size * self.num_group, self.num_group)
189
+ distances_to_last_batch = distances_batch[sorted_indices_list[-1]]
190
+ closest_point_idx = torch.argmin(distances_to_last_batch, dim=-1)
191
+ closest_point_idx = closest_point_idx + idx_base
192
+ sorted_indices_list.append(closest_point_idx)
193
+ distances_batch = distances_batch.view(batch_size, self.num_group, self.num_group).transpose(
194
+ 1, 2).contiguous().view(batch_size * self.num_group, self.num_group)
195
+ distances_batch[closest_point_idx] = float("inf")
196
+ distances_batch = distances_batch.view(
197
+ batch_size, self.num_group, self.num_group).transpose(1, 2).contiguous()
198
+ sorted_indices = torch.stack(sorted_indices_list, dim=-1)
199
+ sorted_indices = sorted_indices.view(-1)
200
+ return sorted_indices
201
+
202
+ def forward(self, pts):
203
+ """
204
+ input: B N 3/6
205
+ ---------------------------
206
+ output: B G M 3/6
207
+ center : B G 3
208
+ """
209
+ xyz = pts[:, :, :3]
210
+ c = pts.shape[2]
211
+ batch_size, num_points, _ = xyz.shape
212
+ # fps the centers out
213
+ xyz = xyz.float()
214
+ center = misc.fps(xyz.contiguous(), self.num_group) # B G 3
215
+ # knn to get the neighborhood
216
+ idx = knn_point(self.group_size, xyz, center)
217
+ assert idx.size(1) == self.num_group
218
+ assert idx.size(2) == self.group_size
219
+ idx_base = torch.arange(0, batch_size, device=xyz.device).view(-1, 1, 1) * num_points
220
+ idx = idx + idx_base
221
+ idx = idx.view(-1)
222
+ neighborhood = pts.view(batch_size * num_points, -1)[idx, :]
223
+ neighborhood = neighborhood.view(batch_size, self.num_group, self.group_size, c).contiguous()
224
+ # normalize
225
+ neighborhood[:, :, :, :3] = neighborhood[:, :, :, :3] - center.unsqueeze(2)
226
+
227
+ # can utilize morton_sorting by choosing morton_sorting function
228
+ sorted_indices = self.simplied_morton_sorting(xyz, center)
229
+
230
+ neighborhood = neighborhood.view(
231
+ batch_size * self.num_group, self.group_size, c)[sorted_indices, :, :]
232
+ neighborhood = neighborhood.view(
233
+ batch_size, self.num_group, self.group_size, c).contiguous()
234
+ center = center.view(
235
+ batch_size * self.num_group, 3)[sorted_indices, :]
236
+ center = center.view(
237
+ batch_size, self.num_group, 3).contiguous()
238
+
239
+ return neighborhood, center
240
+
241
+
242
+ # Transformers
243
+ class Attention(nn.Module):
244
+ def __init__(
245
+ self,
246
+ dim: int,
247
+ num_heads: int = 8,
248
+ qkv_bias: bool = True,
249
+ qk_norm: bool = False,
250
+ attn_drop: float = 0.,
251
+ proj_drop: float = 0.,
252
+ norm_layer: nn.Module = nn.LayerNorm,
253
+ ) -> None:
254
+ super().__init__()
255
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
256
+ self.num_heads = num_heads
257
+ self.head_dim = dim // num_heads
258
+ self.scale = self.head_dim ** -0.5
259
+
260
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
261
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
262
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
263
+ self.attn_drop = nn.Dropout(attn_drop)
264
+ self.proj = nn.Linear(dim, dim)
265
+ self.proj_drop = nn.Dropout(proj_drop)
266
+
267
+ def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
268
+ B, N, C = x.shape
269
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
270
+ q, k, v = qkv.unbind(0)
271
+ q, k = self.q_norm(q), self.k_norm(k)
272
+
273
+ q = q * self.scale
274
+ attn = q @ k.transpose(-2, -1)
275
+ if mask is not None:
276
+ attn = attn.masked_fill(mask, float('-inf'))
277
+ attn = attn.softmax(dim=-1)
278
+ attn = self.attn_drop(attn)
279
+ x = attn @ v
280
+
281
+ x = x.transpose(1, 2).reshape(B, N, C)
282
+ x = self.proj(x)
283
+ x = self.proj_drop(x)
284
+ return x
285
+
286
+
287
+ class CrossAttention(nn.Module):
288
+ def __init__(
289
+ self,
290
+ dim: int,
291
+ num_heads: int = 8,
292
+ qkv_bias: bool = True,
293
+ qk_norm: bool = False,
294
+ attn_drop: float = 0.,
295
+ proj_drop: float = 0.,
296
+ norm_layer: nn.Module = nn.LayerNorm,
297
+ ) -> None:
298
+ super().__init__()
299
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
300
+ self.num_heads = num_heads
301
+ self.head_dim = dim // num_heads
302
+ self.scale = self.head_dim ** -0.5
303
+
304
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
305
+ self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
306
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
307
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
308
+ self.attn_drop = nn.Dropout(attn_drop)
309
+ self.proj = nn.Linear(dim, dim)
310
+ self.proj_drop = nn.Dropout(proj_drop)
311
+
312
+ def forward(self, x: torch.Tensor, y: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
313
+ B, N, C = y.shape
314
+ kv = self.kv(y).reshape(B, N, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
315
+ k, v = kv.unbind(0)
316
+
317
+ B, N, C = x.shape
318
+ q = self.q(x).reshape(B, N, 1, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)[0]
319
+
320
+ q, k = self.q_norm(q), self.k_norm(k)
321
+ q = q * self.scale
322
+ attn = q @ k.transpose(-2, -1)
323
+ if mask is not None:
324
+ attn = attn.masked_fill(mask, float('-inf'))
325
+ attn = attn.softmax(dim=-1)
326
+ attn = self.attn_drop(attn)
327
+ x = attn @ v
328
+
329
+ x = x.transpose(1, 2).reshape(B, N, C)
330
+ x = self.proj(x)
331
+ x = self.proj_drop(x)
332
+ return x
333
+
334
+
335
+ class LayerScale(nn.Module):
336
+ def __init__(
337
+ self,
338
+ dim: int,
339
+ init_values: float = 1e-5,
340
+ inplace: bool = False,
341
+ ) -> None:
342
+ super().__init__()
343
+ self.inplace = inplace
344
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
345
+
346
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
347
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
348
+
349
+
350
+ class Block(nn.Module):
351
+ def __init__(
352
+ self,
353
+ dim: int,
354
+ num_heads: int,
355
+ mlp_ratio: float = 4.,
356
+ qkv_bias: bool = True,
357
+ qk_norm: bool = False,
358
+ proj_drop: float = 0.,
359
+ attn_drop: float = 0.,
360
+ init_values: Optional[float] = None,
361
+ drop_path: float = 0.,
362
+ act_layer: nn.Module = nn.GELU,
363
+ norm_layer: nn.Module = nn.LayerNorm,
364
+ ) -> None:
365
+ super().__init__()
366
+ self.norm1 = norm_layer(dim)
367
+ self.attn = Attention(
368
+ dim,
369
+ num_heads=num_heads,
370
+ qkv_bias=qkv_bias,
371
+ qk_norm=qk_norm,
372
+ attn_drop=attn_drop,
373
+ proj_drop=proj_drop,
374
+ norm_layer=norm_layer,
375
+ )
376
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
377
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
378
+
379
+ self.norm2 = norm_layer(dim)
380
+ self.mlp = Mlp(
381
+ in_features=dim,
382
+ hidden_features=int(dim * mlp_ratio),
383
+ act_layer=act_layer,
384
+ drop=proj_drop,
385
+ )
386
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
387
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
388
+
389
+ def forward(self, x, attn_mask=None):
390
+ x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), attn_mask)))
391
+ x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
392
+ return x
393
+
394
+
395
+ class CrossBlock(nn.Module):
396
+ def __init__(
397
+ self,
398
+ dim: int,
399
+ num_heads: int,
400
+ mlp_ratio: float = 4.,
401
+ qkv_bias: bool = True,
402
+ qk_norm: bool = False,
403
+ proj_drop: float = 0.,
404
+ attn_drop: float = 0.,
405
+ init_values: Optional[float] = None,
406
+ drop_path: float = 0.,
407
+ act_layer: nn.Module = nn.GELU,
408
+ norm_layer: nn.Module = nn.LayerNorm,
409
+ stop_grad: bool = False
410
+ ) -> None:
411
+ super().__init__()
412
+ self.norm1 = norm_layer(dim)
413
+ self.attn = CrossAttention(
414
+ dim,
415
+ num_heads=num_heads,
416
+ qkv_bias=qkv_bias,
417
+ qk_norm=qk_norm,
418
+ attn_drop=attn_drop,
419
+ proj_drop=proj_drop,
420
+ norm_layer=norm_layer,
421
+ )
422
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
423
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
424
+
425
+ self.norm2 = norm_layer(dim)
426
+ self.mlp = Mlp(
427
+ in_features=dim,
428
+ hidden_features=int(dim * mlp_ratio),
429
+ act_layer=act_layer,
430
+ drop=proj_drop,
431
+ )
432
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
433
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
434
+
435
+ self.stop_grad = stop_grad
436
+
437
+ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
438
+ if self.stop_grad:
439
+ x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), self.norm1(y.detach()))))
440
+ else:
441
+ x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), self.norm1(y))))
442
+
443
+ x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
444
+ return x
445
+
446
+
447
+ class ReConBlocks(nn.Module):
448
+ def __init__(
449
+ self,
450
+ embed_dim: int = 768,
451
+ depth: int = 12,
452
+ num_heads: int = 12,
453
+ mlp_ratio: float = 4.,
454
+ qkv_bias: bool = True,
455
+ qk_norm: bool = False,
456
+ init_values: Optional[float] = None,
457
+ proj_drop: float = 0.,
458
+ attn_drop_rate: float = 0.,
459
+ drop_path_rate: List = [],
460
+ norm_layer: nn.Module = nn.LayerNorm,
461
+ act_layer: nn.Module = nn.GELU,
462
+ stop_grad: bool = False,
463
+ pretrained_model_name: str = 'vit_base_patch32_clip_224.openai',
464
+ every_layer_add_pos: bool = True,
465
+ ):
466
+ super().__init__()
467
+
468
+ self.depth = depth
469
+ self.stop_grad = stop_grad
470
+ self.pretrained_model_name = pretrained_model_name
471
+ self.every_layer_add_pos = every_layer_add_pos
472
+ if 'dino' in self.pretrained_model_name:
473
+ init_values = 1e-5
474
+ if 'giant' in self.pretrained_model_name:
475
+ mlp_ratio = 48 / 11
476
+ self.local_blocks = nn.Sequential(*[
477
+ Block(
478
+ dim=embed_dim,
479
+ num_heads=num_heads,
480
+ mlp_ratio=mlp_ratio,
481
+ qkv_bias=qkv_bias,
482
+ qk_norm=qk_norm,
483
+ init_values=init_values,
484
+ proj_drop=proj_drop,
485
+ attn_drop=attn_drop_rate,
486
+ drop_path=drop_path_rate[i],
487
+ norm_layer=norm_layer,
488
+ act_layer=act_layer
489
+ )
490
+ for i in range(depth)])
491
+
492
+ self.global_blocks = nn.Sequential(*[
493
+ CrossBlock(
494
+ dim=embed_dim,
495
+ num_heads=num_heads,
496
+ mlp_ratio=mlp_ratio,
497
+ qkv_bias=qkv_bias,
498
+ qk_norm=qk_norm,
499
+ init_values=init_values,
500
+ proj_drop=proj_drop,
501
+ attn_drop=attn_drop_rate,
502
+ drop_path=drop_path_rate[i],
503
+ norm_layer=norm_layer,
504
+ act_layer=act_layer,
505
+ stop_grad=stop_grad
506
+ )
507
+ for i in range(depth)])
508
+
509
+ def load_pretrained_timm_weights(self):
510
+ model = timm.create_model(self.pretrained_model_name, pretrained=True)
511
+ state_dict = model.blocks.state_dict()
512
+ self.local_blocks.load_state_dict(state_dict, strict=True)
513
+
514
+ cross_state_dict = {}
515
+ for k, v in state_dict.items():
516
+ if 'qkv' in k:
517
+ cross_state_dict[k.replace('qkv', 'q')] = v[:int(v.shape[0] / 3)]
518
+ cross_state_dict[k.replace('qkv', 'kv')] = v[int(v.shape[0] / 3):]
519
+ else:
520
+ cross_state_dict[k] = v
521
+ self.global_blocks.load_state_dict(cross_state_dict, strict=True)
522
+
523
+ def forward(self, x, pos, attn_mask=None, query=None):
524
+ if self.every_layer_add_pos:
525
+ for i in range(self.depth):
526
+ x = self.local_blocks[i](x + pos, attn_mask)
527
+ if query is not None:
528
+ query = self.global_blocks[i](query, x)
529
+ else:
530
+ x = x + pos
531
+ for i in range(self.depth):
532
+ x = self.local_blocks[i](x, attn_mask)
533
+ if query is not None:
534
+ query = self.global_blocks[i](query, x)
535
+ return x, query
536
+
537
+
538
+ class GPTExtractor(nn.Module):
539
+ def __init__(
540
+ self,
541
+ embed_dim: int = 768,
542
+ num_heads: int = 12,
543
+ depth: int = 12,
544
+ group_size: int = 32,
545
+ drop_path_rate: float = 0.0,
546
+ stop_grad: bool = False,
547
+ pretrained_model_name: str = 'vit_base_patch32_clip_224.openai',
548
+ ):
549
+ super(GPTExtractor, self).__init__()
550
+
551
+ self.embed_dim = embed_dim
552
+ self.group_size = group_size
553
+
554
+ # start of sequence token
555
+ self.sos = nn.Parameter(torch.zeros(1, 1, embed_dim))
556
+ self.sos_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
557
+ nn.init.normal_(self.sos)
558
+ nn.init.normal_(self.sos_pos)
559
+
560
+ drop_path_rate = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
561
+ self.blocks = ReConBlocks(
562
+ embed_dim=embed_dim,
563
+ num_heads=num_heads,
564
+ depth=depth,
565
+ drop_path_rate=drop_path_rate,
566
+ stop_grad=stop_grad,
567
+ pretrained_model_name=pretrained_model_name,
568
+ )
569
+
570
+ self.ln_f1 = nn.LayerNorm(embed_dim)
571
+ self.ln_f2 = nn.LayerNorm(embed_dim)
572
+
573
+ def forward(self, x, pos, attn_mask, query):
574
+ """
575
+ Expects input of shape [batch, sequence_len, embed_dim].
576
+ """
577
+
578
+ batch, length, _ = x.shape
579
+
580
+ # prepend sos token
581
+ sos = self.sos.expand(batch, -1, -1)
582
+ sos_pos = self.sos_pos.expand(batch, -1, -1)
583
+
584
+ x = torch.cat([sos, x[:, :-1]], dim=1)
585
+ pos = torch.cat([sos_pos, pos[:, :-1]], dim=1)
586
+
587
+ # transformer
588
+ x, query = self.blocks(x, pos, attn_mask, query)
589
+
590
+ encoded_points = self.ln_f1(x)
591
+ query = self.ln_f2(query)
592
+
593
+ return encoded_points, query
594
+
595
+
596
+ class MAEExtractor(nn.Module):
597
+ def __init__(
598
+ self,
599
+ embed_dim: int = 768,
600
+ num_heads: int = 12,
601
+ depth: int = 12,
602
+ group_size: int = 32,
603
+ drop_path_rate: float = 0.0,
604
+ stop_grad: bool = False,
605
+ pretrained_model_name: str = 'vit_base_patch32_clip_224.openai',
606
+ ):
607
+ super(MAEExtractor, self).__init__()
608
+
609
+ self.embed_dim = embed_dim
610
+ self.group_size = group_size
611
+
612
+ drop_path_rate = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
613
+ self.blocks = ReConBlocks(
614
+ embed_dim=embed_dim,
615
+ num_heads=num_heads,
616
+ depth=depth,
617
+ drop_path_rate=drop_path_rate,
618
+ stop_grad=stop_grad,
619
+ pretrained_model_name=pretrained_model_name,
620
+ )
621
+
622
+ self.ln_f1 = nn.LayerNorm(embed_dim)
623
+ self.ln_f2 = nn.LayerNorm(embed_dim)
624
+
625
+ def forward(self, x, pos, mask=None, query=None):
626
+ """
627
+ Expects input of shape [batch, sequence_len, embed_dim].
628
+ """
629
+
630
+ batch, length, C = x.shape
631
+ if mask is not None:
632
+ x_vis = x[~mask].reshape(batch, -1, C)
633
+ pos_vis = pos[~mask].reshape(batch, -1, C)
634
+ else:
635
+ x_vis = x
636
+ pos_vis = pos
637
+
638
+ # transformer
639
+ x_vis, query = self.blocks(x_vis, pos_vis, None, query)
640
+
641
+ encoded_points = self.ln_f1(x_vis)
642
+ query = self.ln_f2(query)
643
+
644
+ return encoded_points, query
645
+
646
+
647
+
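To make the data flow through the modules above concrete, a hedged tokenization sketch (shapes are assumptions; `misc.fps` and `knn_point` come from the reconv2_utils package imported at the top of this file):

    import torch

    pts = torch.rand(2, 8192, 6)                               # B N 6: xyz + rgb
    grouper = Group(num_group=512, group_size=32)              # FPS centers + kNN patches
    embed = PatchEmbedding(embed_dim=1024, input_channel=6)
    pos_enc = PositionEmbeddingCoordsSine(n_dim=3, d_model=1024)

    neighborhood, center = grouper(pts)                        # [2, 512, 32, 6], [2, 512, 3]
    tokens = embed(neighborhood)                               # [2, 512, 1024]
    pos = pos_enc(center)                                      # [2, 512, 1024]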
mm_models/modal_module/point/reconv2.py ADDED
@@ -0,0 +1,266 @@
1
+ import numpy as np
2
+ from easydict import EasyDict
3
+ import timm
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from .recon.transformer import Group, ZGroup, PatchEmbedding, PositionEmbeddingCoordsSine, GPTExtractor, MAEExtractor
8
+
9
+
10
+ POINT_RECON2_MODAL_CFG = {
11
+ 'modal_tag': 'point',
12
+ 'modal_placeholder_token': "<|point_placeholder|>",
13
+ 'model_path': None,
14
+ 'group_size': 32,
15
+ 'num_group': 512,
16
+ 'mask_type': 'rand',
17
+ 'embed_dim': 1024,
18
+ 'depth': 24,
19
+ 'drop_path_rate': 0.1,
20
+ 'num_heads': 16,
21
+ 'with_color': True,
22
+ 'stop_grad': False,
23
+ 'large_embedding': False,
24
+ 'img_queries': 13,
25
+ 'text_queries': 3,
26
+ 'pretrained_model_name': 'eva_large_patch14_336.in22k_ft_in22k_in1k',
27
+ 'output_dim': 896
28
+ }
29
+
30
+
31
+ class MaskTransformer(nn.Module):
32
+ def __init__(self, config):
33
+ super(MaskTransformer, self).__init__()
34
+
35
+ self.embed_dim = config.embed_dim
36
+ self.num_group = config.num_group
37
+ self.group_size = config.group_size
38
+ self.with_color = config.with_color
39
+ self.input_channel = 6 if self.with_color else 3
40
+ self.img_queries = config.img_queries
41
+ self.text_queries = config.text_queries
42
+ self.global_query_num = self.img_queries + self.text_queries
43
+ self.mask_type = config.mask_type
44
+ self.stop_grad = config.stop_grad
45
+
46
+ self.embed = PatchEmbedding(embed_dim=self.embed_dim, input_channel=self.input_channel,
47
+ large=config.large_embedding)
48
+
49
+         print(f'[ReCon] divide point cloud into G{config.num_group} x S{config.group_size} points ...')
+
+         if self.mask_type == 'causal':
+             self.group_divider = ZGroup(num_group=config.num_group, group_size=config.group_size)
+             self.encoder = GPTExtractor(
+                 embed_dim=config.embed_dim,
+                 num_heads=config.num_heads,
+                 depth=config.depth,
+                 group_size=config.group_size,
+                 drop_path_rate=config.drop_path_rate,
+                 stop_grad=self.stop_grad,
+                 pretrained_model_name=config.pretrained_model_name,
+             )
+             self.pos_embed = PositionEmbeddingCoordsSine(3, self.embed_dim, 1.0)
+         else:
+             self.group_divider = Group(num_group=config.num_group, group_size=config.group_size)
+             self.encoder = MAEExtractor(
+                 embed_dim=config.embed_dim,
+                 num_heads=config.num_heads,
+                 depth=config.depth,
+                 group_size=config.group_size,
+                 drop_path_rate=config.drop_path_rate,
+                 stop_grad=self.stop_grad,
+                 pretrained_model_name=config.pretrained_model_name,
+             )
+             self.pos_embed = nn.Sequential(
+                 nn.Linear(3, 128),
+                 nn.GELU(),
+                 nn.Linear(128, self.embed_dim)
+             )
+
+         self.norm = nn.LayerNorm(self.embed_dim)
+         self.global_query = nn.Parameter(torch.zeros(1, self.global_query_num, self.embed_dim))
+         self.apply(self._init_weights)
+
+         self.num_group = config.num_group
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             nn.init.normal_(m.weight, 0.02, 0.01)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.BatchNorm1d):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+     def inference(self, pts):
+         with torch.no_grad():
+             neighborhood, center = self.group_divider(pts)
+             group_input_tokens = self.embed(neighborhood)  # B G C
+             batch_size, seq_len, C = group_input_tokens.size()
+
+             global_query = self.global_query.expand(batch_size, -1, -1)
+             pos = self.pos_embed(center.to(group_input_tokens.dtype))
+
+             mask = torch.full(
+                 (seq_len, seq_len), -float("Inf"), device=group_input_tokens.device, dtype=group_input_tokens.dtype
+             ).to(torch.bool)
+             if self.mask_type == 'causal':
+                 mask = torch.triu(mask, diagonal=1)
+             else:
+                 mask = None
+
+             local_features, global_features = self.encoder(
+                 group_input_tokens, pos, mask, global_query)
+
+             return pos, local_features, global_features
+
+
+ class ReCon2(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.embed_dim = config.embed_dim
+         self.with_color = config.with_color
+         self.img_queries = config.img_queries
+         self.text_queries = config.text_queries
+         self.global_query_num = self.img_queries + self.text_queries
+         self.input_channel = 6 if self.with_color else 3
+
+         self.model = MaskTransformer(config)
+
+         self.img_proj = nn.Linear(self.embed_dim, 1280)
+         self.img_proj.apply(self._init_weights)
+         self.text_proj = nn.Linear(self.embed_dim, 1280)
+         self.text_proj.apply(self._init_weights)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             nn.init.normal_(m.weight, 0.02, 0.01)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.BatchNorm1d):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+     @property
+     def dtype(self):
+         return next(self.parameters()).dtype
+
+
+ class ReConv2PointEncoder(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.vision_tower = ReCon2(self.config)
+
+     @torch.no_grad()
+     def forward(self, pts):
+
+         pts = torch.stack(pts, dim=0)
+         pos_features, local_features, global_features = \
+             self.vision_tower.model.inference(pts.to(device=self.device, dtype=self.dtype))
+         local_features = local_features.to(pts.dtype)
+         global_features = global_features.to(pts.dtype)
+
+         return (pos_features, local_features, global_features)
+
+     @property
+     def dtype(self):
+         return self.vision_tower.dtype
+
+     @property
+     def device(self):
+         return self.vision_tower.device
+
+
+ class ReConProjector_MLP(nn.Module):
+     def __init__(self, in_channels, out_channels, mlp_depth, prompt_token_num,
+                  with_ape=True, with_local=True, with_global=True):
+         super().__init__()
+
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.mlp_depth = mlp_depth
+         self.prompt_token_num = prompt_token_num
+         self.with_ape = with_ape
+         self.with_local = with_local
+         self.with_global = with_global
+
+         if prompt_token_num > 0:
+             self.prompt1 = nn.Parameter(torch.zeros(1, prompt_token_num, out_channels))
+             self.prompt2 = nn.Parameter(torch.zeros(1, prompt_token_num, out_channels))
+             self.prompt3 = nn.Parameter(torch.zeros(1, prompt_token_num, out_channels))
+
+         self.proj1 = self.set_proj()
+         self.proj2 = self.set_proj()
+         self.proj3 = self.set_proj()
+
+     def set_proj(self):
+         modules = [nn.Linear(self.in_channels, self.out_channels)]
+         for i in range(1, self.mlp_depth):
+             modules.append(nn.GELU())
+             modules.append(nn.Linear(self.out_channels, self.out_channels))
+             modules.append(nn.GELU())
+             modules.append(nn.Linear(self.out_channels, self.out_channels))
+         return nn.Sequential(*modules)
+
+     def forward(self, proj_inps):
+         pos_feat, local_feat, global_feat = proj_inps
+         B = pos_feat.shape[0]
+         pos_feat = self.proj1(pos_feat)
+         local_feat = self.proj2(local_feat)
+         global_feat = self.proj3(global_feat)
+
+         if self.prompt_token_num > 0:
+             pos_feat = torch.cat([self.prompt1.expand(B, -1, -1), pos_feat], dim=1)
+             local_feat = torch.cat([self.prompt2.expand(B, -1, -1), local_feat], dim=1)
+             global_feat = torch.cat([self.prompt3.expand(B, -1, -1), global_feat], dim=1)
+
+         pts_feat = [feat for feat, flag in [(pos_feat, self.with_ape), (local_feat, self.with_local), (global_feat, self.with_global)] if flag]
+         pts_feat = torch.cat(pts_feat, dim=1)
+
+         pts_feat = torch.split(pts_feat, 1)
+         pts_feat = [item.squeeze() for item in pts_feat]
+         return pts_feat
+
+
+ def build_point_encoder(modal_cfg):
+     assert modal_cfg['modal_tag'] == 'point', f"building point encoder with '{modal_cfg['modal_tag']}' tag is not supported"
+     if "encoder_cfg" not in modal_cfg:  # init
+         cfg = EasyDict(modal_cfg)
+         model = ReConv2PointEncoder(cfg)
+         print(f"loading point encoder from {modal_cfg['model_path']}")
+         model_path = modal_cfg['model_path']
+         model.vision_tower.load_state_dict(torch.load(model_path, map_location='cpu'), strict=True)
+     else:
+         cfg = EasyDict(modal_cfg["encoder_cfg"])
+         model = ReConv2PointEncoder(cfg)
+     return model
+
+
+ def build_point_projector(modal_cfg):
+     assert modal_cfg['modal_tag'] == 'point', f"building point projector with '{modal_cfg['modal_tag']}' tag is not supported"
+     if "encoder_cfg" in modal_cfg:
+         proj_cfg = EasyDict(modal_cfg["encoder_cfg"])
+     else:
+         proj_cfg = EasyDict(modal_cfg)
+     projector = ReConProjector_MLP(in_channels=proj_cfg.embed_dim,
+                                    out_channels=proj_cfg.output_dim,
+                                    mlp_depth=2, prompt_token_num=1)
+     return projector
+
+
+ # if __name__ == '__main__':
+
+ #     encoder = build_point_encoder(POINT_RECON2_MODAL_CFG)
+ #     print(encoder)
+ #     data = torch.randn((1, 8192, 6))
+ #     pos_features, local_features, global_features = encoder(data)
+ #     print(pos_features.shape, local_features.shape, global_features.shape)
+ #     proj = build_point_projector(POINT_RECON2_MODAL_CFG)
+ #     point_token = proj((pos_features, local_features, global_features))
+ #     print(point_token.shape)
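For reference, a minimal shape check of the projector defined above (not part of the uploaded file). The feature sizes below are illustrative assumptions rather than values read from the released config: 512 point groups, a ReCon embedding width of 1024, 10 global query tokens, and an LLM hidden size of 896 (Qwen2.5-0.5B).

```python
import torch

# Hypothetical sizes, see the note above; ReConProjector_MLP is the class defined in reconv2.py.
proj = ReConProjector_MLP(in_channels=1024, out_channels=896, mlp_depth=2, prompt_token_num=1)

B = 2
pos_feat = torch.randn(B, 512, 1024)     # sine positional embeddings of the group centers
local_feat = torch.randn(B, 512, 1024)   # per-group local tokens from the ReCon encoder
global_feat = torch.randn(B, 10, 1024)   # pooled global query tokens

tokens = proj((pos_feat, local_feat, global_feat))
# One tensor per sample: (1+512) + (1+512) + (1+10) tokens, each projected to dim 896.
print(len(tokens), tokens[0].shape)      # 2, torch.Size([1037, 896])
```

Each of the three feature streams gets its own MLP and its own learnable prompt token before the streams are concatenated along the sequence dimension, so the point branch hands the LLM a single flat token sequence per sample.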
mm_models/modal_module/vision/__pycache__/siglip.cpython-310.pyc ADDED
Binary file (3.92 kB).
 
mm_models/modal_module/vision/siglip.py ADDED
@@ -0,0 +1,122 @@
+ from transformers import SiglipVisionModel, SiglipVisionConfig
+ import torch.nn as nn
+ import torch
+ import einops
+ import torch.nn.functional as F
+ import torch.nn.init as init
+
+
+ VISION_SIGLIP_MODAL_CFG = {
+     'modal_tag': 'vision',
+     'model_name_or_path': None,
+     'modal_placeholder_token': "<|vision_placeholder|>",
+     'proj_num_layers': 2,
+     'proj_input_dim': 1152,
+     'proj_output_dim': 896,  # for Qwen2.5-0.5B
+     'multi_grid': False
+ }
+
+
+ class SiglipVisionModelForMM(SiglipVisionModel):
+
+     def forward(self, pixel_values, **kwargs):
+
+         assert type(pixel_values) == list
+
+         split_sizes = []
+         temp = []
+         for pixel_value in pixel_values:
+             if pixel_value.dim() == 3:
+                 pixel_value = pixel_value.unsqueeze(0)
+             temp.append(pixel_value)
+             split_sizes.append(pixel_value.shape[0])
+         pixel_values = torch.cat(temp, dim=0)  # (BG) 3 H W
+
+         outputs = super().forward(pixel_values, output_hidden_states=True, **kwargs)
+         hidden_states = outputs['hidden_states'][-2]
+
+         return (hidden_states, split_sizes)
+
+
+ class VisionProjector_MLP(nn.Module):
+
+     def __init__(self, proj_num_layers, proj_input_dim, proj_output_dim, multi_grid=False):
+         super().__init__()
+         _proj_input_dim = int(4 * proj_input_dim)
+         module = [nn.Linear(_proj_input_dim, proj_output_dim)]
+         for _ in range(proj_num_layers - 1):
+             module.append(nn.GELU())
+             module.append(nn.Linear(proj_output_dim, proj_output_dim))
+             module.append(nn.GELU())
+             module.append(nn.Linear(proj_output_dim, proj_output_dim))
+         self.module = nn.Sequential(*module)
+
+         self.resample_pad_token = nn.Parameter(torch.randn((1, proj_input_dim)))
+         init.kaiming_normal_(self.resample_pad_token)
+
+         self.multi_grid = multi_grid
+         if self.multi_grid:
+             self.grid_sep = nn.Parameter(torch.randn((1, proj_output_dim)))
+             init.kaiming_normal_(self.grid_sep)
+
+     def forward(self, encoder_output):
+         visual_tokens, split_sizes = encoder_output
+
+         B, L, D = visual_tokens.shape
+
+         # Pooling: merge 2x2 patch neighborhoods into single tokens of dim 4*D.
+         n_patch = int(L ** 0.5)
+
+         visual_tokens = einops.rearrange(visual_tokens, "B (h w) D -> B D h w", h=n_patch, w=n_patch)
+         if n_patch % 2 != 0:
+             # Odd patch grids (e.g. 27x27 for SigLIP at 384px) are padded to an even size,
+             # and the padded corner is filled with a learned pad token.
+             visual_tokens = F.pad(visual_tokens, (0, 1, 0, 1), value=0)
+             visual_tokens = einops.rearrange(visual_tokens, "B D h w -> B h w D")
+             visual_tokens[:, -1, -1, :] = self.resample_pad_token.expand(B, -1)
+             n_patch += 1
+
+         visual_tokens = visual_tokens.view(B, n_patch // 2, 2, n_patch // 2, 2, D)  # (B, n//2, 2, n//2, 2, D)
+         visual_tokens = visual_tokens.permute(0, 1, 3, 2, 4, 5)  # (B, n//2, n//2, 2, 2, D)
+         visual_tokens = visual_tokens.contiguous().view(B, n_patch // 2, n_patch // 2, D * 4)  # (B, n//2, n//2, D*4)
+
+         visual_tokens = einops.rearrange(visual_tokens, "B h w D -> B (h w) D")
+
+         visual_tokens = self.module(visual_tokens)
+
+         # Grid: for multi-grid (anyres) inputs, join the per-grid tokens of each sample
+         # with a learned separator token.
+         if self.multi_grid:
+             visual_tokens = torch.split(visual_tokens, split_sizes)  # B [G n D]
+             visual_tokens_list = []
+             for grid_visual_tokens in visual_tokens:
+                 grid_visual_tokens = torch.cat([grid_visual_tokens,
+                                                 self.grid_sep.repeat(grid_visual_tokens.shape[0], 1, 1)], dim=1)
+                 grid_visual_tokens = einops.rearrange(grid_visual_tokens, "G n D -> (G n) D")[:-1, :]
+                 visual_tokens_list.append(grid_visual_tokens)
+         else:
+             visual_tokens_list = torch.split(visual_tokens, 1)
+             visual_tokens_list = [item.squeeze() for item in visual_tokens_list]
+
+         return visual_tokens_list
+
+
+ def build_vision_encoder(modal_cfg):
+     assert modal_cfg['modal_tag'] == 'vision', f"building vision encoder with '{modal_cfg['modal_tag']}' tag is not supported"
+     if "encoder_cfg" not in modal_cfg:  # from pretrained
+         model = SiglipVisionModelForMM.from_pretrained(modal_cfg['model_name_or_path'])
+     else:
+         cfg = SiglipVisionConfig(**modal_cfg['encoder_cfg'])
+         model = SiglipVisionModelForMM._from_config(cfg)
+     return model
+
+
+ def build_vision_projector(modal_cfg):
+     assert modal_cfg['modal_tag'] == 'vision', f"building vision projector with '{modal_cfg['modal_tag']}' tag is not supported"
+     return VisionProjector_MLP(modal_cfg['proj_num_layers'], modal_cfg['proj_input_dim'], modal_cfg['proj_output_dim'],
+                                modal_cfg['multi_grid'])
+
+
+ if __name__ == '__main__':
+     projector = VisionProjector_MLP(2, 1152, 2048)
+     inputs = torch.randn((5, 729, 1152))
+     # forward expects the (hidden_states, split_sizes) tuple produced by SiglipVisionModelForMM
+     outputs = projector((inputs, [1] * 5))
+     print(len(outputs), outputs[0].shape)
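A minimal shape sketch of the projector above (not part of the uploaded file). The numbers assume a SigLIP variant with a 27x27 patch grid and hidden size 1152, and an LLM width of 896; these are assumptions for illustration, not values taken from the released checkpoint.

```python
import torch

# Assumed sizes: one image -> 27*27 = 729 patch tokens of dim 1152 from the penultimate SigLIP layer.
projector = VisionProjector_MLP(proj_num_layers=2, proj_input_dim=1152, proj_output_dim=896)

hidden_states = torch.randn(1, 729, 1152)   # stand-in for SiglipVisionModelForMM hidden states
tokens = projector((hidden_states, [1]))    # split_sizes: one grid per sample

# 27x27 is padded to 28x28, 2x2 neighborhoods are merged (dim 4*1152), then projected:
# 14*14 = 196 visual tokens of dim 896 per image.
print(len(tokens), tokens[0].shape)         # 1, torch.Size([196, 896])
```

The 2x2 merge is the only token-reduction step, so the number of visual tokens handed to the LLM is roughly a quarter of the raw patch count.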
mm_models/modeling_mm.py ADDED
@@ -0,0 +1,259 @@
+ import torch
+ import torch.nn as nn
+ from transformers import PretrainedConfig, PreTrainedModel, Qwen2Config
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+ from transformers.generation.utils import GenerateOutput
+ from .configuration_mm import AllSparkConfig
+ from .modal_module import MODAL_ENCODERS_MAPPING, MODAL_PROJECTORS_MAPPING
+ from .llms.qwen_model_moe import Qwen2ForCausalLMMoE
+ from typing import Optional, List, Union, Tuple
+ import torch.nn.init as init
+ from utils import rank0_print
+
+
+ class AllSparkPreTrainedModel(PreTrainedModel):
+
+     config_class = AllSparkConfig
+     base_model_prefix = "allspark"
+
+     def _init_weights(self, module):
+         """Initialize the weights"""
+         std = (
+             self.config.initializer_range
+             if hasattr(self.config, "initializer_range")
+             else 0.02
+         )
+
+         if isinstance(module, (nn.Linear, nn.Conv2d)):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+
+     def prepare_multimodal_inputs(self, input_ids, modal_inputs, labels, attention_mask):
+         if modal_inputs is None:
+             return input_ids, None, labels, attention_mask, None
+
+         # Encode and project all modal inputs, grouped by modal tag.
+         modal_tensors = dict()
+         for single_sample_modal_inputs in modal_inputs:
+             for tag, modal_tensor in single_sample_modal_inputs:
+                 if tag in modal_tensors:
+                     modal_tensors[tag].append(modal_tensor)
+                 else:
+                     modal_tensors[tag] = [modal_tensor]
+         for tag in modal_tensors:
+             modal_tensors[tag] = self.modal_projectors[tag](self.modal_encoders[tag](modal_tensors[tag]))  # B [N D]
+         for sample_id, single_sample_modal_inputs in enumerate(modal_inputs):
+             for modal_id, (tag, _) in enumerate(single_sample_modal_inputs):
+                 modal_inputs[sample_id][modal_id] = (tag, modal_tensors[tag].pop(0).squeeze(0))
+         for tag in modal_tensors:
+             assert len(modal_tensors[tag]) == 0
+
+         if attention_mask is None:
+             attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+         else:
+             attention_mask = attention_mask.bool()
+         if labels is None:
+             labels = torch.full_like(input_ids, self.config.ignore_index)
+
+         # input_ids: (batch_size, seq_len)
+         # modal_inputs: (B, M, Tuple[str, Tensor])
+         # labels: (batch_size, seq_len)
+         assert input_ids.shape[0] == len(modal_inputs) == labels.shape[0], \
+             f"Batch size mismatch: {input_ids.shape[0]} vs {len(modal_inputs)} vs {labels.shape[0]}. " \
+             "If some sample has no modal inputs, please append an empty list to modal_inputs."
+
+         input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+         labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+
+         modal_tag_pos_list = []
+         new_input_embeds = []
+         new_labels = []
+         for single_sample_input_ids, single_sample_modal_inputs, single_sample_labels in zip(input_ids, modal_inputs, labels):
+             tag_num = dict()
+             cur_id = 0
+             single_sample_embeds = []
+             _labels = []
+             single_sample_modal_tag_pos_list = []
+             for modal_input in single_sample_modal_inputs:
+                 tag, modal_tensor = modal_input
+                 if tag not in self.modal_tags:
+                     raise ValueError(f"Unknown modal tag: {tag}. Valid modal tags: {self.modal_tags}")
+                 if tag not in tag_num:
+                     tag_num[tag] = 0
+
+                 for modal_config in self.config.modal_configs:
+                     if modal_config["modal_tag"] == tag:
+                         modal_placeholder_token_id = modal_config["modal_placeholder_token_id"]
+                         break
+
+                 cur_modal_idx = torch.where(single_sample_input_ids == modal_placeholder_token_id)[0].tolist()[tag_num[tag]]
+
+                 single_sample_embeds.append(self.llm.get_input_embeddings()(single_sample_input_ids[cur_id:cur_modal_idx]))
+                 single_sample_embeds.append(self.modal_embeds[tag][0:1, :])  # start embed
+                 single_sample_embeds.append(modal_tensor)
+                 single_sample_embeds.append(self.modal_embeds[tag][1:2, :])  # end embed
+
+                 _labels.append(single_sample_labels[cur_id:cur_modal_idx])
+                 _labels.append(torch.full((modal_tensor.shape[0] + 2,), self.config.ignore_index, device=single_sample_labels.device, dtype=single_sample_labels.dtype))
+
+                 single_sample_modal_tag_pos_list.append((tag, cur_modal_idx, cur_modal_idx + modal_tensor.shape[0] + 1))
+
+                 cur_id += cur_modal_idx + 1
+                 tag_num[tag] += 1
+
+             single_sample_embeds.append(self.llm.get_input_embeddings()(single_sample_input_ids[cur_id:]))
+             _labels.append(single_sample_labels[cur_id:])
+
+             new_input_embeds.append(torch.cat(single_sample_embeds, dim=0))
+             new_labels.append(torch.cat(_labels, dim=0))
+             modal_tag_pos_list.append(single_sample_modal_tag_pos_list)
+
+         tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
+         if tokenizer_model_max_length is not None:
+             new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+             new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+         max_len = max(x.shape[0] for x in new_input_embeds)
+         batch_size = len(new_input_embeds)
+
+         new_input_embeds_padded = []
+         new_labels_padded = torch.full((batch_size, max_len), self.config.ignore_index, dtype=new_labels[0].dtype, device=new_labels[0].device)
+         attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+
+         for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+             cur_len = cur_new_embed.shape[0]
+             if getattr(self.config, "tokenizer_padding_side", "right") == "left":
+                 new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
+                 if cur_len > 0:
+                     new_labels_padded[i, -cur_len:] = cur_new_labels
+                     attention_mask[i, -cur_len:] = True
+             else:
+                 new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
+                 if cur_len > 0:
+                     new_labels_padded[i, :cur_len] = cur_new_labels
+                     attention_mask[i, :cur_len] = True
+
+         new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+         return None, new_input_embeds, new_labels_padded, attention_mask, modal_tag_pos_list
+
+
+ class AllSparkForCausalLM(AllSparkPreTrainedModel):
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         if self.config.modal_configs is not None:
+             self.modal_tags = []
+             self.modal_encoders, self.modal_projectors = nn.ModuleDict(), nn.ModuleDict()
+             for modal_config in self.config.modal_configs:
+                 modal_tag = modal_config['modal_tag']
+                 assert modal_tag not in self.modal_tags, f"Duplicate modal tag: {modal_tag}"
+                 self.modal_tags.append(modal_tag)
+                 self.modal_encoders[modal_tag] = MODAL_ENCODERS_MAPPING[modal_tag](modal_config)
+                 encoder_cfg = self.modal_encoders[modal_tag].config
+                 if isinstance(encoder_cfg, PretrainedConfig):
+                     encoder_cfg = encoder_cfg.to_dict()
+                 modal_config['encoder_cfg'] = encoder_cfg
+                 self.modal_projectors[modal_tag] = MODAL_PROJECTORS_MAPPING[modal_tag](modal_config)
+         else:
+             self.modal_tags = None
+
+         if hasattr(config, 'llm_config'):
+             if "Qwen2" in config.llm_name_or_path:
+                 llm_config = Qwen2Config(**config.llm_config)
+                 self.llm = Qwen2ForCausalLMMoE._from_config(llm_config, modal_tags=self.modal_tags,
+                                                             add_moe=self.config.add_moe)
+             else:
+                 raise ValueError(config.llm_name_or_path)
+         else:
+             if "Qwen2" in config.llm_name_or_path:
+                 self.llm = Qwen2ForCausalLMMoE.from_pretrained(config.llm_name_or_path, modal_tags=self.modal_tags,
+                                                                add_moe=self.config.add_moe)
+             else:
+                 raise ValueError(config.llm_name_or_path)
+         self.config.llm_config = self.llm.config
+         self.config.hidden_size = self.llm.config.hidden_size
+
+         if self.config.modal_configs is not None:
+             self.modal_embeds = nn.ParameterDict()
+             for modal_config in self.config.modal_configs:
+                 modal_tag = modal_config['modal_tag']
+                 self.modal_embeds[modal_tag] = torch.randn((2, self.config.hidden_size))  # start and end embeds
+                 init.kaiming_normal_(self.modal_embeds[modal_tag])
+
+         self.post_init()
+
+     def initialize_tokenizer_for_multimodal(self, tokenizer, new_tag):
+         config = self.config
+         if config.modal_configs is None:
+             rank0_print("No modal configs provided, skipping multimodal tokenizer initialization.")
+             return None
+
+         for i, modal_config in enumerate(config.modal_configs):
+             # only add new tokens for the new modal
+             if modal_config['modal_tag'] != new_tag:
+                 continue
+             modal_placeholder_token = modal_config['modal_placeholder_token']
+             tokenizer.add_tokens([modal_placeholder_token], special_tokens=True)
+             self.config.modal_configs[i]['modal_placeholder_token_id'] = tokenizer.convert_tokens_to_ids(modal_placeholder_token)
+
+         self.llm.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         modal_inputs: Optional[List[List[Tuple[str, torch.FloatTensor]]]] = None,  # B M (modal_tag, modal_input)
+         attention_mask: Optional[torch.Tensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         **kwargs
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+         if modal_inputs is not None:
+             input_ids, inputs_embeds, labels, attention_mask, modal_tag_pos_list = \
+                 self.prepare_multimodal_inputs(input_ids, modal_inputs, labels, attention_mask)
+
+             return self.llm(input_ids=input_ids,
+                             attention_mask=attention_mask,
+                             inputs_embeds=inputs_embeds,
+                             labels=labels,
+                             modal_tag_pos_list=modal_tag_pos_list,
+                             **kwargs)
+         else:
+             return self.llm(input_ids=input_ids,
+                             attention_mask=attention_mask,
+                             labels=labels,
+                             **kwargs)
+
+     @torch.no_grad()
+     def generate(
+         self,
+         input_ids: torch.LongTensor = None,
+         modal_inputs: Optional[List[List[Tuple[str, torch.FloatTensor]]]] = None,  # B M (modal_tag, modal_input)
+         attention_mask: Optional[torch.Tensor] = None,
+         **kwargs
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+
+         if modal_inputs is not None:
+             input_ids, inputs_embeds, labels, attention_mask, modal_tag_pos_list = \
+                 self.prepare_multimodal_inputs(input_ids, modal_inputs, None, attention_mask)
+
+             return self.llm.generate(input_ids=input_ids, attention_mask=attention_mask,
+                                      inputs_embeds=inputs_embeds, modal_tag_pos_list=modal_tag_pos_list, **kwargs)
+         else:
+             return self.llm.generate(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
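The core of `prepare_multimodal_inputs` is placeholder splicing: the text before each placeholder token is embedded with the LLM's embedding table, the projected modal tokens are inserted between a learned start and end embedding, and the corresponding label positions are masked with the ignore index. A simplified, standalone illustration of that splicing step (single sample, single modality, no padding or label masking; toy sizes only, not taken from the repository):

```python
import torch

# Simplified sketch of the placeholder-splicing idea used in prepare_multimodal_inputs.
def splice_at_placeholder(input_ids, token_embedding, modal_tokens, placeholder_id, start_embed, end_embed):
    idx = torch.where(input_ids == placeholder_id)[0].item()   # position of the placeholder token
    return torch.cat([
        token_embedding(input_ids[:idx]),                      # text before the placeholder
        start_embed, modal_tokens, end_embed,                  # <start> [modal tokens] <end>
        token_embedding(input_ids[idx + 1:]),                  # text after the placeholder
    ], dim=0)

# Toy numbers: vocab 100, hidden size 8, placeholder id 99, 5 projected modal tokens.
emb = torch.nn.Embedding(100, 8)
ids = torch.tensor([1, 2, 99, 3])
spliced = splice_at_placeholder(ids, emb, torch.randn(5, 8), 99,
                                torch.randn(1, 8), torch.randn(1, 8))
print(spliced.shape)  # torch.Size([10, 8]): 3 text tokens + 2 boundary embeds + 5 modal tokens
```

The real method additionally records the (tag, start, end) spans in `modal_tag_pos_list`, which the MoE-augmented Qwen2 backbone uses to route tokens by modality, and then left- or right-pads the batch to a common length.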
utils.py ADDED
@@ -0,0 +1,181 @@
+ import torch.distributed as dist
+ import ast
+ import re
+ import torch
+ from PIL import Image
+ import math
+
+
+ def model_params_summary(module, out_fn, verbose=True):
+     out_fn("-" * 30)
+     out_fn(f"module name: {module.__class__}")
+     if verbose:
+         out_fn("-" * 30)
+         for n, p in module.named_parameters():
+             out_fn(f"{n}: {'trainable' if p.requires_grad else 'freeze'}")
+     out_fn("-" * 30)
+     out_fn(f"Total params: {sum(p.numel() for p in module.parameters()) / 1e6:.4f}M")
+     out_fn(f"Trainable params: {sum(p.numel() for p in module.parameters() if p.requires_grad) / 1e6:.4f}M")
+     out_fn("-" * 30)
+
+
+ def rank0_print(*args):
+     if dist.is_initialized():
+         if dist.get_rank() == 0:
+             print(f"Rank {dist.get_rank()}: ", *args)
+     else:
+         print(*args)
+
+
+ LLM_DIM_MAPPING = {
+     'Qwen2.5-0.5B': 896,
+     'Qwen2.5-1.5B': 1536,
+     'Qwen2.5-3B': 2048,
+     'Qwen2.5-7B': 3584
+ }
+
+
+ SYSTEM_PROMPT = "You are a multimodal AI assistant named AllSparkv2 capable of understanding and generating content " +\
+                 "in various forms, including text and images. Your primary function is to provide useful and harmless " +\
+                 "information based on user input, assisting with problem-solving, information retrieval, and task completion. "
+
+
+ def select_best_resolution(original_size, possible_resolutions):
+     """
+     Selects the best resolution from a list of possible resolutions based on the original size.
+
+     Args:
+         original_size (tuple): The original size of the image in the format (width, height).
+         possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+
+     Returns:
+         tuple: The best fit resolution in the format (width, height).
+     """
+     original_width, original_height = original_size
+     best_fit = None
+     max_effective_resolution = 0
+     min_wasted_resolution = float("inf")
+
+     for width, height in possible_resolutions:
+         # Calculate the downscaled size to keep the aspect ratio
+         scale = min(width / original_width, height / original_height)
+         downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+
+         # Calculate effective and wasted resolutions
+         effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+         wasted_resolution = (width * height) - effective_resolution
+
+         if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+             max_effective_resolution = effective_resolution
+             min_wasted_resolution = wasted_resolution
+             best_fit = (width, height)
+
+     return best_fit
+
+
+ def resize_and_pad_image(image, target_resolution):
+     """
+     Resize and pad an image to a target resolution while maintaining aspect ratio.
+
+     Args:
+         image (PIL.Image.Image): The input image.
+         target_resolution (tuple): The target resolution (width, height) of the image.
+
+     Returns:
+         PIL.Image.Image: The resized and padded image.
+     """
+     original_width, original_height = image.size
+     target_width, target_height = target_resolution
+
+     # Determine which dimension (width or height) to fill
+     scale_w = target_width / original_width
+     scale_h = target_height / original_height
+
+     if scale_w < scale_h:
+         # Width will be filled completely
+         new_width = target_width
+         new_height = min(math.ceil(original_height * scale_w), target_height)
+     else:
+         # Height will be filled completely
+         new_height = target_height
+         new_width = min(math.ceil(original_width * scale_h), target_width)
+
+     # Resize the image
+     resized_image = image.resize((new_width, new_height))
+
+     # Create a new image with the target size and paste the resized image onto it
+     new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
+     paste_x = (target_width - new_width) // 2
+     paste_y = (target_height - new_height) // 2
+     new_image.paste(resized_image, (paste_x, paste_y))
+
+     return new_image
+
+
+ def divide_to_patches(image, patch_size):
+     """
+     Divides an image into patches of a specified size.
+
+     Args:
+         image (PIL.Image.Image): The input image.
+         patch_size (int): The size of each patch.
+
+     Returns:
+         list: A list of PIL.Image.Image objects representing the patches.
+     """
+     patches = []
+     width, height = image.size
+     for i in range(0, height, patch_size):
+         for j in range(0, width, patch_size):
+             box = (j, i, j + patch_size, i + patch_size)
+             patch = image.crop(box)
+             patches.append(patch)
+
+     return patches
+
+
+ def process_anyres_image(image, processor, grid_pinpoints):
+     """
+     Process an image with variable resolutions.
+
+     Args:
+         image (PIL.Image.Image): The input image to be processed.
+         processor: The image processor object.
+         grid_pinpoints (str): A string representation of a list of possible resolutions.
+
+     Returns:
+         torch.Tensor: A tensor containing the processed image patches.
+     """
+     # Convert grid_pinpoints from string to list
+     if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+         patch_size = min(processor.size.values())
+         assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
+         # Use regex to extract the range from the input string
+         matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+         range_start = tuple(map(int, matches[0]))
+         range_end = tuple(map(int, matches[-1]))
+         # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+         grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
+         # Multiply all elements by patch_size
+         grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+
+     if type(grid_pinpoints) is list:
+         possible_resolutions = grid_pinpoints
+     else:
+         possible_resolutions = ast.literal_eval(grid_pinpoints)
+     best_resolution = select_best_resolution(image.size, possible_resolutions)
+     image_padded = resize_and_pad_image(image, best_resolution)
+
+     patches = divide_to_patches(image_padded, processor.size["height"])
+
+     # FIXME: this seems to be a bug in that it resizes instead of pads,
+     # but it is kept as is for consistency with previous behavior.
+     # TODO: uncomment below to ablate with the padding
+     shortest_edge = min(processor.size.values())
+     image_original_resize = image.resize((shortest_edge, shortest_edge))
+     # image_padded_square = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
+     # image_original_resize = image_padded_square.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
+
+     image_patches = [image_original_resize] + patches
+     image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
+     return torch.stack(image_patches, dim=0)
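To make the anyres logic concrete, here is a small arithmetic check of `select_best_resolution` (not part of the uploaded file). The image size and the "(1x1) to (3x3)" grid at a 384-pixel patch size are illustrative assumptions; `select_best_resolution` is the function from utils.py above.

```python
from utils import select_best_resolution

# Hypothetical 1000x750 image, candidate canvases from a (1x1)..(3x3) grid of 384px tiles.
grids = [(w * 384, h * 384) for w in range(1, 4) for h in range(1, 4)]
best = select_best_resolution((1000, 750), grids)

# (1152, 768) wins: scale = min(1152/1000, 768/750) = 1.024, so the full image area
# stays "effective" while relatively little of the canvas is wasted.
print(best)  # (1152, 768)
```

`process_anyres_image` then resizes and pads the image to that canvas, splits it into processor-sized tiles, and prepends a downsized global view, so the vision tower sees one low-resolution overview plus a grid of high-resolution crops.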