Commit 725b8ba
Parent(s): 70044fb

fix: image pooling

Signed-off-by: jupyterjazz <[email protected]>

modeling_jina_embeddings_v4.py (+18 -12)
@@ -216,22 +216,21 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         Project the hidden states to single-vector embeddings.
         """
         if self._input_has_image(input_ids[0]):  # got document image
-
-
-
-
-
-
-
-
-
-                .unsqueeze(0)
-            )
+            img_start_positions = torch.where(input_ids == self.config.vision_start_token_id)[1]
+            img_end_positions = torch.where(input_ids == self.config.vision_end_token_id)[1]
+
+            batch_size, seq_len = input_ids.shape
+            position_indices = torch.arange(seq_len, device=input_ids.device).expand(batch_size, -1)
+            image_mask = (position_indices >= img_start_positions.unsqueeze(1)) & (position_indices <= img_end_positions.unsqueeze(1))
+
+            masked_hidden_states = hidden_states * image_mask.unsqueeze(-1)
+            pooled_output = masked_hidden_states.sum(dim=1) / image_mask.sum(dim=1, keepdim=True)
 
         else:  # got query text
             pooled_output = torch.sum(
                 hidden_states * attention_mask.unsqueeze(-1), dim=1
             ) / torch.sum(attention_mask, dim=1, keepdim=True)
+
         single_vec_emb = self.single_vector_projector(pooled_output)
         return torch.nn.functional.normalize(single_vec_emb, dim=-1)
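The new image branch builds a boolean mask over the span between the vision start and end tokens and mean-pools the hidden states over that span, per batch element. A minimal standalone sketch of that masked mean pooling, using the same tensor names as the diff (the helper name and the toy token IDs are made up for illustration; exactly one image span per sequence is assumed, since the torch.where indexing only lines up with batch rows in that case):

import torch

def pool_image_tokens(hidden_states, input_ids, vision_start_token_id, vision_end_token_id):
    # hidden_states: (batch, seq_len, hidden_dim); input_ids: (batch, seq_len)
    # Position of the single vision start/end token in each sequence.
    img_start_positions = torch.where(input_ids == vision_start_token_id)[1]  # (batch,)
    img_end_positions = torch.where(input_ids == vision_end_token_id)[1]      # (batch,)

    batch_size, seq_len = input_ids.shape
    position_indices = torch.arange(seq_len, device=input_ids.device).expand(batch_size, -1)
    # True exactly on the image-token span of each sequence.
    image_mask = (position_indices >= img_start_positions.unsqueeze(1)) & (
        position_indices <= img_end_positions.unsqueeze(1)
    )

    # Mean over the image span only (the mask zeroes out everything else).
    masked_hidden_states = hidden_states * image_mask.unsqueeze(-1)
    return masked_hidden_states.sum(dim=1) / image_mask.sum(dim=1, keepdim=True)

# Toy check with made-up token IDs (10 = vision start, 11 = vision end).
hidden = torch.randn(2, 6, 4)
ids = torch.tensor([[1, 10, 5, 5, 11, 2],
                    [1, 10, 5, 11, 0, 0]])
pooled = pool_image_tokens(hidden, ids, vision_start_token_id=10, vision_end_token_id=11)
assert torch.allclose(pooled[0], hidden[0, 1:5].mean(dim=0))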
|
|
@@ -310,14 +309,19 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
             with torch.no_grad():
                 batch = {k: v.to(self.device) for k, v in batch.items()}
                 with torch.autocast(device_type=torch.device(self.device).type):
+                    for key, value in batch.items():
+                        if hasattr(value, 'shape'):
+                            print(f"{key}: {value.shape}")
+                        else:
+                            print(f"{key}: {type(value)}")
                     embeddings = self(**batch)
+                    print(embeddings.single_vec_emb.shape, embeddings.multi_vec_emb.shape)
                     if vector_type == "single_vector":
                         embeddings = embeddings.single_vec_emb
                         if truncate_dim is not None:
                             embeddings = embeddings[:, :truncate_dim]
                     else:
                         embeddings = embeddings.multi_vec_emb
-
                     results.append(
                         embeddings.cpu()
                         if return_numpy
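Around the new debug prints, each batch is moved to the model's device, run under no_grad and autocast, and then either the single-vector or multi-vector embedding is taken from the output, with optional truncation of the single vectors to truncate_dim. A condensed sketch of that per-batch flow, assuming a model whose forward returns an object with single_vec_emb and multi_vec_emb attributes as in the diff (embed_batch itself is an illustrative helper, not part of the model):

import torch

def embed_batch(model, batch, vector_type="single_vector", truncate_dim=None):
    # Mirrors the loop body above: device transfer, no_grad + autocast forward,
    # then selection of the requested embedding type.
    device = torch.device(model.device)
    with torch.no_grad():
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.autocast(device_type=device.type):
            output = model(**batch)
    if vector_type == "single_vector":
        embeddings = output.single_vec_emb            # (batch, dim)
        if truncate_dim is not None:
            embeddings = embeddings[:, :truncate_dim]  # keep leading dimensions only
    else:
        embeddings = output.multi_vec_emb             # (batch, seq_len, dim)
    return embeddings.cpu()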
@@ -442,6 +446,8 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         encode_kwargs = self._validate_encoding_params(vector_type, truncate_dim)
 
         is_single = len(images) == 1
+        print(is_single)
+        print(len(images))
         embeddings = self._process_batches(
             data=images,
             processor_fn=self.processor.process_images,
|
|
|