Upload 4 files

Browse files

Files changed (4) hide show

README.md +41 -0
vision_tower/config.json +23 -0
vision_tower/model.safetensors +3 -0
vision_tower/preprocessor_config.json +24 -0

README.md CHANGED Viewed

@@ -1,3 +1,44 @@
 ---
 license: mit
 ---

 ---
 license: mit
 ---
+# Aux-Think: Exploring Reasoning Strategies for Data-Efficient Vision-Language Navigation
+<div align="center" class="authors">
+    <a href="https://scholar.google.com/citations?user=IYLvsCQAAAAJ&hl=en" target="_blank">Shuo Wang</a>,
+    <a href="https://yongcaiwang.github.io/" target="_blank">Yongcai Wang</a>,
+    <text>Wanting Li</text>,
+    <a href="https://scholar.google.com/citations?user=TkwComsAAAAJ&hl=en" target="_blank">Xudong Cai</a>, <br>
+    <text>Yucheng Wang</text>,
+    <text>Maiyue Chen</text>,
+    <text>Kaihui Wang</text>,
+    <a href="https://scholar.google.com/citations?user=HQfc8TEAAAAJ&hl=en" target="_blank">Zhizhong Su</a>,
+    <text>Deying Li</text>,
+    <a href="https://zhaoxinf.github.io/" target="_blank">Zhaoxin Fan</a>
+</div>
+<div align="center" style="line-height: 3;">
+  <a href="https://horizonrobotics.github.io/robot_lab/aux-think" target="_blank" style="margin: 2px;">
+    <img alt="Homepage" src="https://img.shields.io/badge/Homepage-green" style="display: inline-block; vertical-align: middle;"/>
+  </a>
+  <a href="https://arxiv.org/abs/2505.11886" target="_blank" style="margin: 2px;">
+    <img alt="Paper" src="https://img.shields.io/badge/Paper-Arxiv-red" style="display: inline-block; vertical-align: middle;"/>
+  </a>
+</div>
+## Introduction
+Aux-Think internalizes Chain-of-Thought (CoT) reasoning during training only. This enables efficient Vision-Language Navigation (VLN) without explicit reasoning at inference time, while achieving strong performance with minimal data.
+![Aux-Think figure](https://horizonrobotics.github.io/robot_lab/aux-think/stats/x3.png)
+## Citation
+```bibtex
+@article{wang2025think,
+  title={Aux-Think: Exploring Reasoning Strategies for Data-Efficient Vision-Language Navigation},
+  author={Wang, Shuo and Wang, Yongcai and Li, Wanting and Cai, Xudong and Wang, Yucheng and Chen, Maiyue and Wang, Kaihui and Su, Zhizhong and Li, Deying and Fan, Zhaoxin},
+  journal={arXiv preprint arXiv:2505.11886},
+  year={2025}
+}
+```

vision_tower/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "_name_or_path": "",
+  "architectures": [
+    "SiglipVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 1152,
+  "image_size": 448,
+  "intermediate_size": 4304,
+  "layer_norm_eps": 1e-06,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 27,
+  "num_image_tokens": 256,
+  "patch_size": 14,
+  "projection_dim": 2048,
+  "projector_hidden_act": "gelu_fast",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.0",
+  "vision_use_head": false
+}

vision_tower/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36371b8a2cf650d7fda238929dd493117e58e23e1c498ecd807d8fd50a9788cf
+size 826707904

vision_tower/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SiglipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 448,
+    "width": 448
+  }
+}