Upload folder using huggingface_hub
- configuration_internvl_chat.py +1 -5
- modeling_internvl_chat.py +42 -47
configuration_internvl_chat.py
CHANGED
@@ -27,11 +27,10 @@ class InternVLChatConfig(PretrainedConfig):
             use_backbone_lora=0,
             use_llm_lora=0,
             pad2square=False,
-            select_layer=-
+            select_layer=-1,
             force_image_size=None,
             downsample_ratio=0.5,
             template=None,
-            image_fold=False,
             dynamic_image_size=False,
             use_thumbnail=False,
             ps_version='v1',
@@ -62,7 +61,6 @@ class InternVLChatConfig(PretrainedConfig):
         self.force_image_size = force_image_size
         self.downsample_ratio = downsample_ratio
         self.template = template
-        self.image_fold = image_fold
         self.dynamic_image_size = dynamic_image_size
         self.use_thumbnail = use_thumbnail
         self.ps_version = ps_version  # pixel shuffle version
@@ -70,7 +68,6 @@ class InternVLChatConfig(PretrainedConfig):
         self.max_dynamic_patch = max_dynamic_patch
 
         logger.info(f'vision_select_layer: {self.select_layer}')
-        logger.info(f'image_fold: {self.image_fold}')
         logger.info(f'ps_version: {self.ps_version}')
         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
         logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
@@ -93,7 +90,6 @@ class InternVLChatConfig(PretrainedConfig):
         output['force_image_size'] = self.force_image_size
         output['downsample_ratio'] = self.downsample_ratio
         output['template'] = self.template
-        output['image_fold'] = self.image_fold
         output['dynamic_image_size'] = self.dynamic_image_size
         output['use_thumbnail'] = self.use_thumbnail
         output['ps_version'] = self.ps_version
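For reference, a minimal sketch of constructing the config after this commit: image_fold is no longer a field, and select_layer defaults to -1. The keyword values are illustrative, and the import assumes the file is on the Python path; this is a sketch, not part of the commit.

from configuration_internvl_chat import InternVLChatConfig

# Assumes omitted vision/llm sub-configs are defaulted by __init__, as in upstream InternVL.
config = InternVLChatConfig(select_layer=-1, downsample_ratio=0.5,
                            dynamic_image_size=True, use_thumbnail=True)
print(config.select_layer, config.downsample_ratio)  # -1 0.5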
modeling_internvl_chat.py
CHANGED
@@ -23,40 +23,6 @@ from .modeling_internlm2 import InternLM2ForCausalLM
 logger = logging.get_logger(__name__)
 
 
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, C, H, W)
-        window_size (int): window size, assuming square window
-
-    Returns:
-        windows: (num_windows*B, C, window_size, window_size)
-    """
-    B, C, H, W = x.shape
-    assert H % window_size == 0 and W % window_size == 0, 'H and W must be divisible by window_size'
-
-    x = x.view(B, C, H // window_size, window_size, W // window_size, window_size)
-    windows = x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, C, window_size, window_size)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-
-    Returns:
-        x: (B, H * W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H * W, -1)
-    return x
-
-
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
@@ -72,7 +38,6 @@ class InternVLChatModel(PreTrainedModel):
         self.template = config.template
         self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
         self.downsample_ratio = config.downsample_ratio
-        self.image_fold = config.image_fold
         self.ps_version = config.ps_version
 
         logger.info(f'num_image_token: {self.num_image_token}')
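As a side note to the hunk above, num_image_token is the per-image visual token count after pixel shuffling. A worked example with values typical for this model family (448-pixel inputs, 14-pixel patches, downsample_ratio 0.5; the actual numbers depend on the checkpoint):

# (448 // 14) ** 2 = 1024 ViT patches; 0.5 ** 2 = 0.25 keeps one token per 2x2 patch block
num_image_token = int((448 // 14) ** 2 * (0.5 ** 2))  # 256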
@@ -242,10 +207,6 @@ class InternVLChatModel(PreTrainedModel):
         return vit_embeds + noise
 
     def extract_feature(self, pixel_values):
-        if self.image_fold:
-            image_size = pixel_values.size(-1)  # B, C, H, W
-            pixel_values = window_partition(pixel_values, window_size=image_size // self.image_fold)  # 4B, C, H/2, W/2
-
         if self.select_layer == -1:
             vit_embeds = self.vision_model(
                 pixel_values=pixel_values,
@@ -261,21 +222,55 @@ class InternVLChatModel(PreTrainedModel):
         if self.training and self.neftune_alpha is not None:
             vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)
 
-        if self.image_fold:
-            vit_embeds = window_reverse(vit_embeds, window_size=image_size // (self.image_fold * self.patch_size),
-                                        H=image_size // self.patch_size, W=image_size // self.patch_size)
-
-        # if torch.distributed.get_rank() == 0:
-        #     print("before pixel shuffle:", vit_embeds.shape)
         h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
         vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
-        # if torch.distributed.get_rank() == 0:
-        #     print("after pixel shuffle:", vit_embeds.shape)
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
+    def batch_chat(self, tokenizer, pixel_values, image_counts, questions, generation_config, history=None,
+                   return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
+        if history is not None or return_history:
+            print("Now multi-turn chat is not supported in batch_chat.")
+            raise NotImplementedError
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+        if tokenizer.convert_tokens_to_ids('<|im_end|>') != 0:
+            eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>')  # 92542, InternLM2
+        else:
+            eos_token_id = tokenizer.eos_token_id
+
+        from .conversation import get_conv_template
+
+        queries = []
+        image_bs = pixel_values.shape[0]
+        print(f'dynamic ViT batch size: {image_bs}, image_counts: {image_counts}')
+        for idx, image_count in enumerate(image_counts):
+            image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * image_count + IMG_END_TOKEN
+            question = image_token + '\n' + questions[idx]
+            template = get_conv_template(self.template)
+            template.append_message(template.roles[0], question)
+            template.append_message(template.roles[1], None)
+            query = template.get_prompt()
+            queries.append(query)
+        tokenizer.padding_side = 'left'
+        model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
+        input_ids = model_inputs['input_ids'].cuda()
+        attention_mask = model_inputs['attention_mask'].cuda()
+        generation_config['eos_token_id'] = eos_token_id
+
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
+        responses = [response.split('<|im_end|>')[0].strip() for response in responses]  # for InternLM2
+        return responses
+
     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
              IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
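A usage sketch for the new batch_chat method, which answers one question per batch element in a single left-padded generate call. The checkpoint path and image tensors below are placeholders (real pixel_values come from the repo's image preprocessing, stacked along the batch dimension); only the batch_chat call itself reflects the code added above.

import torch
from transformers import AutoModel, AutoTokenizer

path = '<path-to-this-checkpoint>'  # placeholder
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16,
                                  trust_remote_code=True).cuda().eval()

pixel_values = torch.randn(2, 3, 448, 448, dtype=torch.bfloat16).cuda()  # stand-in images
questions = ['Describe image 1.', 'Describe image 2.']
# image_counts says how many of the stacked images belong to each question.
responses = model.batch_chat(tokenizer, pixel_values,
                             image_counts=[1, 1],
                             questions=questions,
                             generation_config=dict(max_new_tokens=128, do_sample=False))
for r in responses:
    print(r)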