use customized code

Browse files

Files changed (4) hide show

got_vision_b.py +0 -10
modeling_GOT.py +74 -142
render_tools.py +0 -25
tokenization_qwen.py +4 -8

got_vision_b.py CHANGED Viewed

@@ -129,7 +129,6 @@ class ImageEncoderViT(nn.Module):
             LayerNorm2d(out_chans),
         )
         self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
         self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)
@@ -145,7 +144,6 @@ class ImageEncoderViT(nn.Module):
         x = self.net_2(x)
         x = self.net_3(x)
         return x
@@ -272,7 +270,6 @@ class Attention(nn.Module):
         return x
 def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
     """
     Partition into non-overlapping windows with padding if needed.
@@ -296,7 +293,6 @@ def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, T
     windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
     return windows, (Hp, Wp)
 def window_unpartition(
     windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
 ) -> torch.Tensor:
@@ -321,7 +317,6 @@ def window_unpartition(
         x = x[:, :H, :W, :].contiguous()
     return x
 def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
     """
     Get relative positional embeddings according to the relative positions of
@@ -354,7 +349,6 @@ def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor
     return rel_pos_resized[relative_coords.long()]
 def add_decomposed_rel_pos(
     attn: torch.Tensor,
     q: torch.Tensor,
@@ -425,8 +419,6 @@ class PatchEmbed(nn.Module):
         x = x.permute(0, 2, 3, 1)
         return x
 def build_GOT_vit_b(checkpoint=None):
     return _build_GOT_vision(
         encoder_embed_dim=768,
@@ -436,7 +428,6 @@ def build_GOT_vit_b(checkpoint=None):
         checkpoint=checkpoint,
     )
 def _build_GOT_vision(
     encoder_embed_dim,
     encoder_depth,
@@ -462,7 +453,6 @@ def _build_GOT_vision(
             window_size=14,
             out_chans=prompt_embed_dim,
         )
     return image_encoder

             LayerNorm2d(out_chans),
         )
         self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
         self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)
         x = self.net_2(x)
         x = self.net_3(x)
         return x
         return x
 def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
     """
     Partition into non-overlapping windows with padding if needed.
     windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
     return windows, (Hp, Wp)
 def window_unpartition(
     windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
 ) -> torch.Tensor:
         x = x[:, :H, :W, :].contiguous()
     return x
 def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
     """
     Get relative positional embeddings according to the relative positions of
     return rel_pos_resized[relative_coords.long()]
 def add_decomposed_rel_pos(
     attn: torch.Tensor,
     q: torch.Tensor,
         x = x.permute(0, 2, 3, 1)
         return x
 def build_GOT_vit_b(checkpoint=None):
     return _build_GOT_vision(
         encoder_embed_dim=768,
         checkpoint=checkpoint,
     )
 def _build_GOT_vision(
     encoder_embed_dim,
     encoder_depth,
             window_size=14,
             out_chans=prompt_embed_dim,
         )
     return image_encoder

modeling_GOT.py CHANGED Viewed

@@ -12,7 +12,6 @@ from .got_vision_b import build_GOT_vit_b
 from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
 import dataclasses
-###
 DEFAULT_IMAGE_TOKEN = "<image>"
 DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
@@ -20,6 +19,15 @@ DEFAULT_IM_START_TOKEN = '<img>'
 DEFAULT_IM_END_TOKEN = '</img>'
 from enum import auto, Enum
 class SeparatorStyle(Enum):
     """Different separator style."""
     SINGLE = auto()
@@ -79,7 +87,6 @@ class Conversation:
         else:
             raise ValueError(f"Invalid style: {self.sep_style}")
     def append_message(self, role, message):
         self.messages.append([role, message])
@@ -94,7 +101,6 @@ class Conversation:
             sep2=self.sep2)
 class KeywordsStoppingCriteria(StoppingCriteria):
     def __init__(self, keywords, tokenizer, input_ids):
         self.keywords = keywords
@@ -116,7 +122,7 @@ class KeywordsStoppingCriteria(StoppingCriteria):
                 if keyword in outputs:
                     return True
         return False
 class GOTImageEvalProcessor:
     def __init__(self, image_size=384, mean=None, std=None):
@@ -140,7 +146,6 @@ class GOTImageEvalProcessor:
         return self.transform(item)
 class GOTConfig(Qwen2Config):
     model_type = "GOT"
@@ -155,7 +160,6 @@ class GOTQwenModel(Qwen2Model):
         self.mm_projector_vary =  nn.Linear(1024, 1024)
     def initialize_vision_modules(
         self,
         vision_tower,
@@ -167,14 +171,12 @@ class GOTQwenModel(Qwen2Model):
         device="cuda"
     ):
         image_processor_high = GOTImageEvalProcessor(image_size=1024)
         self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)
         self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)
         image_token_len = 256
         self.config.vision_tower = vision_tower
@@ -184,13 +186,12 @@ class GOTQwenModel(Qwen2Model):
         self.config.vision_select_layer = vision_select_layer
         self.config.freeze_vision_tower = freeze_vision_tower
         return dict(
             image_processor_high=image_processor_high,
             image_token_len=image_token_len,
         )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -205,7 +206,6 @@ class GOTQwenModel(Qwen2Model):
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
-        # HACK: replace back original embeddings for LLaVA pretraining
         orig_embeds_params = getattr(self, 'orig_embeds_params', None)
         if orig_embeds_params is not None:
             with torch.no_grad():
@@ -214,10 +214,8 @@ class GOTQwenModel(Qwen2Model):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
         vision_tower_high = getattr(self, 'vision_tower_high', None)
         if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
             use_im_start_end = getattr(self.config, "use_im_start_end", -1)
@@ -232,9 +230,9 @@ class GOTQwenModel(Qwen2Model):
             im_start_token = 151857
             im_end_token = 151858
             image_features = []
             for image in images:
                 P, C, H, W = image.shape
                 if P == 1:
@@ -249,7 +247,7 @@ class GOTQwenModel(Qwen2Model):
                     image_patches_features = []
                     for image_patch in image_patches:
                         image_p = torch.stack([image_patch])
                         with torch.set_grad_enabled(False):
                             cnn_feature_p = vision_tower_high(image_p)
                             cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
@@ -258,7 +256,6 @@ class GOTQwenModel(Qwen2Model):
                     image_feature = torch.cat(image_patches_features, dim=1)
                     image_features.append(image_feature)
             dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
             dummy_image_features = dummy_image_features_2
             use_im_start_end = True
@@ -272,7 +269,7 @@ class GOTQwenModel(Qwen2Model):
                 if use_im_start_end:
                     if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
                         raise ValueError("The number of image start tokens and image end tokens should be the same.")
                     image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
                     for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
                         per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
@@ -280,7 +277,7 @@ class GOTQwenModel(Qwen2Model):
                         if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
                             raise ValueError("The image end token should follow the image start token.")
                         cur_input_embeds = torch.cat(
                             (
                                 cur_input_embeds[:image_start_token_pos+1],
@@ -290,7 +287,6 @@ class GOTQwenModel(Qwen2Model):
                             dim=0
                         )
                     new_input_embeds.append(cur_input_embeds)
                 else:
                     raise NotImplementedError
@@ -305,10 +301,8 @@ class GOTQwenModel(Qwen2Model):
         )
 class GOTQwenForCausalLM(Qwen2ForCausalLM):
     config_class = GOTConfig
-    # supports_gradient_checkpointing = True
     def __init__(self, config):
         super(Qwen2ForCausalLM, self).__init__(config)
@@ -317,7 +311,6 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        # Initialize weights and apply final processing
         self.post_init()
     def get_model(self):
@@ -336,7 +329,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         output_hidden_states: Optional[bool] = None,
         images: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -362,18 +355,13 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         logits = self.lm_head(hidden_states)
         logits = logits.float()
-        # logits
         loss = None
         if labels is not None:
-            # Shift so that tokens < n predict n
             shift_logits = logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
             loss_fct = CrossEntropyLoss()
             shift_logits = shift_logits.view(-1, self.config.vocab_size)
             shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
             shift_labels = shift_labels.to(shift_logits.device)
             loss = loss_fct(shift_logits, shift_labels)
@@ -389,63 +377,49 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             attentions=outputs.attentions,
         )
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
-        # Omit tokens covered by past_key_values
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
             else:
-                cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
-            # Keep only the unprocessed tokens:
-            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
-            # input)
-            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
-            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
-            # input_ids based on the past_length.
-            elif past_length < input_ids.shape[1]:
-                input_ids = input_ids[:, past_length:]
-            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-            if (
-                max_cache_length is not None
-                and attention_mask is not None
-                and cache_length + input_ids.shape[1] > max_cache_length
-            ):
-                attention_mask = attention_mask[:, -max_cache_length:]
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-                "images": kwargs.get("images", None),
-            }
-        )
         return model_inputs
     def initialize_vision_tokenizer(
@@ -457,7 +431,6 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
     ):
         config = self.get_model().config
         self.resize_token_embeddings(len(tokenizer))
         config.im_patch_token = 151859
@@ -488,7 +461,6 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         self.disable_torch_init()
         image_processor_high =  GOTImageEvalProcessor(image_size=1024)
         use_im_start_end = True
@@ -501,7 +473,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             image = self.load_image(image_file)
         w, h = image.size
         if ocr_type == 'format':
             qs = 'OCR with format: '
         else:
@@ -533,10 +505,9 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         else:
             qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
         conv_mpt = Conversation(
             system="""<|im_start|>system
-        You should follow the instructions carefully and explain your answers in detail.""",
             # system = None,
             roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
             version="mpt",
@@ -566,7 +537,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         if stream_flag:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_tensor_1.unsqueeze(0).half().cuda()],
@@ -578,7 +549,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     stopping_criteria=[stopping_criteria]
                     )
         else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_tensor_1.unsqueeze(0).half().cuda()],
@@ -589,9 +560,9 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
                     )
         outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
         if outputs.endswith(stop_str):
             outputs = outputs[:-len(stop_str)]
         outputs = outputs.strip()
@@ -599,24 +570,13 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         if render:
             print('==============rendering===============')
-            from .render_tools import svg_to_html, content_mmd_to_html, tik_html, translation_table
             if '**kern' in outputs:
-                import verovio
-                tk = verovio.toolkit()
-                tk.loadData(outputs)
-                tk.setOptions({"pageWidth": 2100, "footer": 'none',
-            'barLineWidth': 0.5, 'beamMaxSlope': 15,
-            'staffLineWidth': 0.2, 'spacingStaff': 6})
-                tk.getPageCount()
-                svg = tk.renderToSVG()
-                svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
-                svg_to_html(svg, save_render_file)
             if ocr_type == 'format' and '**kern' not in outputs:
                 if  '\\begin{tikzpicture}' not in outputs:
                     html_path_2 = save_render_file
                     right_num = outputs.count('\\right')
@@ -625,16 +585,14 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     if right_num != left_num:
                         outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
                     outputs = outputs.replace('"', '``').replace('$', '')
                     outputs_list = outputs.split('\n')
                     gt= ''
                     for out in outputs_list:
                         gt +=  '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
-                    gt = gt[:-2]
                     lines = content_mmd_to_html
                     lines = lines.split("const text =")
@@ -652,7 +610,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                                     out = out[:-1]
                                     if out is None:
                                         break
                                 if out:
                                     if out[-1] != ';':
                                         gt += out[:-1] + ';\n'
@@ -661,7 +619,6 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                             else:
                                 gt += out + '\n'
                     lines = tik_html
                     lines = lines.split("const text =")
                     new_web = lines[0] + gt + lines[1]
@@ -671,7 +628,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         return response_str
     def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
         def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
             best_ratio_diff = float('inf')
             best_ratio = (1, 1)
@@ -685,30 +642,24 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                 elif ratio_diff == best_ratio_diff:
                     if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                         best_ratio = ratio
-            # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
             return best_ratio
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
-        # calculate the existing image aspect ratio
         target_ratios = set(
             (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
             i * j <= max_num and i * j >= min_num)
-        # print(target_ratios)
         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-        # find the closest aspect ratio to the target
         target_aspect_ratio = find_closest_aspect_ratio(
             aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-        # print(target_aspect_ratio)
-        # calculate the target width and height
         target_width = image_size * target_aspect_ratio[0]
         target_height = image_size * target_aspect_ratio[1]
         blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-        # resize the image
         resized_img = image.resize((target_width, target_height))
         processed_images = []
         for i in range(blocks):
@@ -718,7 +669,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                 ((i % (target_width // image_size)) + 1) * image_size,
                 ((i // (target_width // image_size)) + 1) * image_size
             )
-            # split the image
             split_img = resized_img.crop(box)
             processed_images.append(split_img)
         assert len(processed_images) == blocks
@@ -727,40 +678,26 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             processed_images.append(thumbnail_img)
         return processed_images
-    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
-        # Model
         self.disable_torch_init()
         multi_page=False
         image_processor_high =  GOTImageEvalProcessor(image_size=1024)
         use_im_start_end = True
         image_token_len = 256
         image_list = []
-        # if len(image_file_list)>1:
-        #     multi_page = True
         if multi_page:
             qs = 'OCR with format across multi pages: '
-            # only for png files
-            # import glob
-            # from natsort import natsorted
-            # patches = glob.glob(image_file + '/*png')
             patches = image_file
-            # patches = natsorted(patches)
             sub_images = []
             for sub_image in patches:
                 sub_images.append(self.load_image(sub_image))
             ll = len(patches)
-            # print(patches)
-            # print("len ll: ", ll)
         else:
             if ocr_type == 'format':
@@ -778,21 +715,16 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             image_tensor_1 = image_processor_high(image)
             image_list.append(image_tensor_1)
         image_list = torch.stack(image_list)
-        print('====new images batch size======:  \n',image_list.shape)
         if use_im_start_end:
             qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
         else:
             qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
         conv_mpt = Conversation(
             system="""<|im_start|>system
-        You should follow the instructions carefully and explain your answers in detail.""",
             # system = None,
             roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
             version="mpt",
@@ -811,8 +743,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             print(prompt)
         inputs = tokenizer([prompt])
         input_ids = torch.as_tensor(inputs.input_ids).cuda()
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
@@ -820,32 +752,33 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         if stream_flag:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_list.half().cuda()],
                     do_sample=False,
-                    num_beams = 1,
-                    # no_repeat_ngram_size = 20,
                     streamer=streamer,
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
-                    )
         else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_list.half().cuda()],
                     do_sample=False,
-                    num_beams = 1,
-                    # no_repeat_ngram_size = 20,
                     # streamer=streamer,
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
-                    )
         outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
         if outputs.endswith(stop_str):
             outputs = outputs[:-len(stop_str)]
         outputs = outputs.strip()
@@ -861,14 +794,13 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             if right_num != left_num:
                 outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
             outputs = outputs.replace('"', '``').replace('$', '')
             outputs_list = outputs.split('\n')
             gt= ''
             for out in outputs_list:
                 gt +=  '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
             gt = gt[:-2]
             lines = content_mmd_to_html

 from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
 import dataclasses
 DEFAULT_IMAGE_TOKEN = "<image>"
 DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
 DEFAULT_IM_END_TOKEN = '</img>'
 from enum import auto, Enum
def has_bfloat16_support():
    """Return True when the current CUDA device has compute capability >= 8.0.

    The capability threshold is used here as the criterion for choosing
    bfloat16. Returns False when CUDA is not available at all, so the
    function is safe to call on CPU-only machines.
    """
    # Short-circuit keeps get_device_capability() from being called without CUDA.
    return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0)


# Reduced-precision dtype picked once at import time: bfloat16 when the
# capability check passes, float16 otherwise.
SUPPORTED_DTYPE = torch.bfloat16 if has_bfloat16_support() else torch.float16
 class SeparatorStyle(Enum):
     """Different separator style."""
     SINGLE = auto()
         else:
             raise ValueError(f"Invalid style: {self.sep_style}")
     def append_message(self, role, message):
         self.messages.append([role, message])
             sep2=self.sep2)
 class KeywordsStoppingCriteria(StoppingCriteria):
     def __init__(self, keywords, tokenizer, input_ids):
         self.keywords = keywords
                 if keyword in outputs:
                     return True
         return False
 class GOTImageEvalProcessor:
     def __init__(self, image_size=384, mean=None, std=None):
         return self.transform(item)
 class GOTConfig(Qwen2Config):
     model_type = "GOT"
         self.mm_projector_vary =  nn.Linear(1024, 1024)
     def initialize_vision_modules(
         self,
         vision_tower,
         device="cuda"
     ):
         image_processor_high = GOTImageEvalProcessor(image_size=1024)
         self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)
         self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)
         image_token_len = 256
         self.config.vision_tower = vision_tower
         self.config.vision_select_layer = vision_select_layer
         self.config.freeze_vision_tower = freeze_vision_tower
         return dict(
             image_processor_high=image_processor_high,
             image_token_len=image_token_len,
         )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         orig_embeds_params = getattr(self, 'orig_embeds_params', None)
         if orig_embeds_params is not None:
             with torch.no_grad():
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
         vision_tower_high = getattr(self, 'vision_tower_high', None)
         if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
             use_im_start_end = getattr(self.config, "use_im_start_end", -1)
             im_start_token = 151857
             im_end_token = 151858
             image_features = []
             for image in images:
                 P, C, H, W = image.shape
                 if P == 1:
                     image_patches_features = []
                     for image_patch in image_patches:
                         image_p = torch.stack([image_patch])
                         with torch.set_grad_enabled(False):
                             cnn_feature_p = vision_tower_high(image_p)
                             cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
                     image_feature = torch.cat(image_patches_features, dim=1)
                     image_features.append(image_feature)
             dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
             dummy_image_features = dummy_image_features_2
             use_im_start_end = True
                 if use_im_start_end:
                     if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
                         raise ValueError("The number of image start tokens and image end tokens should be the same.")
                     image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
                     for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
                         per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
                         if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
                             raise ValueError("The image end token should follow the image start token.")
                         cur_input_embeds = torch.cat(
                             (
                                 cur_input_embeds[:image_start_token_pos+1],
                             dim=0
                         )
                     new_input_embeds.append(cur_input_embeds)
                 else:
                     raise NotImplementedError
         )
 class GOTQwenForCausalLM(Qwen2ForCausalLM):
     config_class = GOTConfig
     def __init__(self, config):
         super(Qwen2ForCausalLM, self).__init__(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
     def get_model(self):
         output_hidden_states: Optional[bool] = None,
         images: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
         logits = self.lm_head(hidden_states)
         logits = logits.float()
         loss = None
         if labels is not None:
             shift_logits = logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss()
             shift_logits = shift_logits.view(-1, self.config.vocab_size)
             shift_labels = shift_labels.view(-1)
             shift_labels = shift_labels.to(shift_logits.device)
             loss = loss_fct(shift_logits, shift_labels)
             attentions=outputs.attentions,
         )
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         """Build the keyword arguments for a single generation step.

         Args:
             input_ids: Token ids for the sequence so far; trimmed to the
                 tokens not yet covered by the cache when a cache is given.
             past_key_values: ``None``, a ``Cache`` instance, or a legacy
                 tuple cache whose ``[0][0]`` tensor has the cached length
                 on dimension 2.
             attention_mask: Optional mask; an all-ones mask shaped like
                 ``input_ids`` is created when it is omitted.
             inputs_embeds: Forwarded only while there is no cache, i.e. on
                 the first step.
             **kwargs: ``position_ids``, ``images`` and ``use_cache`` are
                 read from here (``use_cache`` defaults to ``True``).

         Returns:
             dict with the keys ``input_ids``, ``inputs_embeds``,
             ``past_key_values``, ``position_ids``, ``attention_mask``,
             ``images`` and ``use_cache``. Exactly one of ``input_ids`` /
             ``inputs_embeds`` is non-None when embeddings are supplied
             without a cache.
         """
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids, dtype=torch.long)

         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
                 # The capacity accessor returns a plain int (or None / a
                 # non-positive value for unbounded caches), not a shape
                 # tuple, so it must not be indexed. Older releases expose
                 # the same value as get_max_length().
                 capacity_fn = getattr(past_key_values, "get_max_cache_shape", None)
                 if capacity_fn is None:
                     capacity_fn = getattr(past_key_values, "get_max_length", None)
                 max_cache_length = capacity_fn() if callable(capacity_fn) else None
                 if not isinstance(max_cache_length, int) or max_cache_length <= 0:
                     max_cache_length = None
             else:
                 cache_length = past_key_values[0][0].shape[2]
                 max_cache_length = None

             # Keep only the tokens the cache has not seen yet. A mask longer
             # than input_ids means part of the input lives only in the cache.
             if attention_mask.shape[1] > input_ids.shape[1]:
                 input_ids = input_ids[:, -(attention_mask.shape[1] - cache_length):]
             elif cache_length < input_ids.shape[1]:
                 input_ids = input_ids[:, cache_length:]

             # Crop the mask when the next step would exceed a bounded cache.
             if max_cache_length is not None and cache_length + input_ids.shape[1] > max_cache_length:
                 attention_mask = attention_mask[:, -max_cache_length:]

         position_ids = kwargs.get("position_ids", None)
         if position_ids is None:
             # Positions count attended tokens; masked slots get a dummy 1.
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             if past_key_values:
                 position_ids = position_ids[:, -input_ids.shape[1]:]

         # Embeddings replace token ids only on the cache-less first step.
         use_embeds = inputs_embeds is not None and past_key_values is None
         model_inputs = {
             "input_ids": None if use_embeds else input_ids,
             "inputs_embeds": inputs_embeds if use_embeds else None,
             "past_key_values": past_key_values,
             "position_ids": position_ids,
             "attention_mask": attention_mask,
             "images": kwargs.get("images", None),
             "use_cache": kwargs.get("use_cache", True),
         }
         return model_inputs
     def initialize_vision_tokenizer(
     ):
         config = self.get_model().config
         self.resize_token_embeddings(len(tokenizer))
         config.im_patch_token = 151859
         self.disable_torch_init()
         image_processor_high =  GOTImageEvalProcessor(image_size=1024)
         use_im_start_end = True
             image = self.load_image(image_file)
         w, h = image.size
         if ocr_type == 'format':
             qs = 'OCR with format: '
         else:
         else:
             qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
         conv_mpt = Conversation(
             system="""<|im_start|>system
+You should follow the instructions carefully and explain your answers in detail.""",
             # system = None,
             roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
             version="mpt",
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         if stream_flag:
+            with torch.autocast("cuda", dtype=SUPPORTED_DTYPE):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_tensor_1.unsqueeze(0).half().cuda()],
                     stopping_criteria=[stopping_criteria]
                     )
         else:
+            with torch.autocast("cuda", dtype=SUPPORTED_DTYPE):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_tensor_1.unsqueeze(0).half().cuda()],
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
                     )
         outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
         if outputs.endswith(stop_str):
             outputs = outputs[:-len(stop_str)]
         outputs = outputs.strip()
         if render:
             print('==============rendering===============')
+            from .render_tools import content_mmd_to_html, tik_html, translation_table
             if '**kern' in outputs:
+                print("Musical notation detected but Verovio rendering is disabled")
             if ocr_type == 'format' and '**kern' not in outputs:
                 if  '\\begin{tikzpicture}' not in outputs:
                     html_path_2 = save_render_file
                     right_num = outputs.count('\\right')
                     if right_num != left_num:
                         outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
                     outputs = outputs.replace('"', '``').replace('$', '')
                     outputs_list = outputs.split('\n')
                     gt= ''
                     for out in outputs_list:
                         gt +=  '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
+                    gt = gt[:-2]
                     lines = content_mmd_to_html
                     lines = lines.split("const text =")
                                     out = out[:-1]
                                     if out is None:
                                         break
                                 if out:
                                     if out[-1] != ';':
                                         gt += out[:-1] + ';\n'
                             else:
                                 gt += out + '\n'
                     lines = tik_html
                     lines = lines.split("const text =")
                     new_web = lines[0] + gt + lines[1]
         return response_str
     def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
         def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
             best_ratio_diff = float('inf')
             best_ratio = (1, 1)
                 elif ratio_diff == best_ratio_diff:
                     if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                         best_ratio = ratio
             return best_ratio
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
         target_ratios = set(
             (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
             i * j <= max_num and i * j >= min_num)
         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
         target_aspect_ratio = find_closest_aspect_ratio(
             aspect_ratio, target_ratios, orig_width, orig_height, image_size)
         target_width = image_size * target_aspect_ratio[0]
         target_height = image_size * target_aspect_ratio[1]
         blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
         resized_img = image.resize((target_width, target_height))
         processed_images = []
         for i in range(blocks):
                 ((i % (target_width // image_size)) + 1) * image_size,
                 ((i // (target_width // image_size)) + 1) * image_size
             )
             split_img = resized_img.crop(box)
             processed_images.append(split_img)
         assert len(processed_images) == blocks
             processed_images.append(thumbnail_img)
         return processed_images
+    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag=False):
         self.disable_torch_init()
         multi_page=False
         image_processor_high =  GOTImageEvalProcessor(image_size=1024)
         use_im_start_end = True
         image_token_len = 256
         image_list = []
         if multi_page:
             qs = 'OCR with format across multi pages: '
             patches = image_file
             sub_images = []
             for sub_image in patches:
                 sub_images.append(self.load_image(sub_image))
             ll = len(patches)
         else:
             if ocr_type == 'format':
             image_tensor_1 = image_processor_high(image)
             image_list.append(image_tensor_1)
         image_list = torch.stack(image_list)
         if use_im_start_end:
             qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
         else:
             qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
         conv_mpt = Conversation(
             system="""<|im_start|>system
+You should follow the instructions carefully and explain your answers in detail.""",
             # system = None,
             roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
             version="mpt",
             print(prompt)
         inputs = tokenizer([prompt])
         input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device)
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         if stream_flag:
+            with torch.autocast("cuda", dtype=SUPPORTED_DTYPE):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_list.half().cuda()],
+                    attention_mask=attention_mask,
                     do_sample=False,
                     streamer=streamer,
+                    num_beams=1,
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
+                )
         else:
+            with torch.autocast("cuda", dtype=SUPPORTED_DTYPE):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_list.half().cuda()],
+                    attention_mask=attention_mask,
                     do_sample=False,
                     # streamer=streamer,
+                    num_beams=1,
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
+                )
         outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
         if outputs.endswith(stop_str):
             outputs = outputs[:-len(stop_str)]
         outputs = outputs.strip()
             if right_num != left_num:
                 outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
             outputs = outputs.replace('"', '``').replace('$', '')
             outputs_list = outputs.split('\n')
             gt= ''
             for out in outputs_list:
                 gt +=  '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
             gt = gt[:-2]
             lines = content_mmd_to_html

render_tools.py CHANGED Viewed

@@ -5,29 +5,6 @@ punctuation_dict = {
 }
 translation_table = str.maketrans(punctuation_dict)
-def svg_to_html(svg_content, output_filename):
-    html_content = f"""
-    <!DOCTYPE html>
-    <html lang="en">
-    <head>
-        <meta charset="UTF-8">
-        <meta name="viewport" content="width=device-width, initial-scale=1.0">
-        <title>SVG Embedded in HTML</title>
-    </head>
-    <body>
-        <svg width="2100" height="15000" xmlns="http://www.w3.org/2000/svg">
-            {svg_content}
-        </svg>
-    </body>
-    </html>
-    """
-    with open(output_filename, 'w') as file:
-        file.write(html_content)
 content_mmd_to_html = """<!DOCTYPE html>
 <html lang="en" data-lt-installed="true"><head>
@@ -71,7 +48,6 @@ content_mmd_to_html = """<!DOCTYPE html>
 """
 tik_html = """
 <!DOCTYPE html>
@@ -92,5 +68,4 @@ const text =
 </html>"""
 # print(tik_html)

 }
 translation_table = str.maketrans(punctuation_dict)
 content_mmd_to_html = """<!DOCTYPE html>
 <html lang="en" data-lt-installed="true"><head>
 """
 tik_html = """
 <!DOCTYPE html>
 </html>"""
 # print(tik_html)

tokenization_qwen.py CHANGED Viewed

@@ -23,9 +23,6 @@ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s
 ENDOFTEXT = "<|endoftext|>"
 IMSTART = "<|im_start|>"
 IMEND = "<|im_end|>"
-# as the default behavior is changed to allow special tokens in
-# regular texts, the surface forms of special tokens need to be
-# as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
 SPECIAL_TOKENS = (
     ENDOFTEXT,
@@ -81,9 +78,9 @@ class QWenTokenizer(PreTrainedTokenizer):
             image_pad_tag
         )
-        self.errors = errors  # how to handle errors in decoding
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
         self.special_tokens = {
             token: index
             for index, token in enumerate(
@@ -113,10 +110,10 @@ class QWenTokenizer(PreTrainedTokenizer):
         self.decoder = {
             v: k for k, v in self.mergeable_ranks.items()
-        }  # type: dict[int, bytes|str]
         self.decoder.update({v: k for k, v in self.special_tokens.items()})
-        self.tokenizer = enc  # type: tiktoken.Encoding
         self.eod_id = self.tokenizer.eot_token
         self.im_start_id = self.special_tokens[IMSTART]
@@ -196,7 +193,6 @@ class QWenTokenizer(PreTrainedTokenizer):
         tokens = []
         text = unicodedata.normalize("NFC", text)
-        # this implementation takes a detour: text -> token id -> token surface forms
         for t in self.tokenizer.encode(
             text, allowed_special=allowed_special, disallowed_special=disallowed_special
         ):

 ENDOFTEXT = "<|endoftext|>"
 IMSTART = "<|im_start|>"
 IMEND = "<|im_end|>"
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
 SPECIAL_TOKENS = (
     ENDOFTEXT,
             image_pad_tag
         )
+        self.errors = errors
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
         self.special_tokens = {
             token: index
             for index, token in enumerate(
         self.decoder = {
             v: k for k, v in self.mergeable_ranks.items()
+        }
         self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.tokenizer = enc
         self.eod_id = self.tokenizer.eot_token
         self.im_start_id = self.special_tokens[IMSTART]
         tokens = []
         text = unicodedata.normalize("NFC", text)
         for t in self.tokenizer.encode(
             text, allowed_special=allowed_special, disallowed_special=disallowed_special
         ):