Feature Extraction
Transformers
Safetensors
English
Chinese
emova
Omni-modal-LLM
Multi-modal-LLM
Emotional-spoken-dialogue
custom_code
Eval Results
KaiChen1998 committed on
Commit d01bd6b · verified · 1 Parent(s): e76d237

Upload processor

added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>\n' }}{% else %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + '<|im_end|>\n' }}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
3
+ }
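
A minimal sketch of how this template is expected to render a multimodal turn, assuming the processor uploaded in this commit is loaded with trust_remote_code=True and exposes the template through the standard apply_chat_template API; the repository id below is illustrative, not a confirmed path.

from transformers import AutoProcessor

# Hypothetical repository id, for illustration only.
processor = AutoProcessor.from_pretrained("Emova-ollm/emova-example", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# The template emits every <image> placeholder of a turn first, then the text.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# Expected rendering:
# <|im_start|>user
# <image>
# Describe this image.<|im_end|>
# <|im_start|>assistant
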
image_processing_emova.py ADDED
@@ -0,0 +1,469 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Image processor class for Qwen2-VL."""
21
+
22
+ import math
23
+ from typing import Dict, List, Optional, Union
24
+
25
+ import numpy as np
26
+
27
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
28
+ from transformers.image_transforms import (
29
+ convert_to_rgb,
30
+ resize,
31
+ to_channel_dimension_format,
32
+ )
33
+ from .image_utils import (
34
+ OPENAI_CLIP_MEAN,
35
+ OPENAI_CLIP_STD,
36
+ ChannelDimension,
37
+ ImageInput,
38
+ PILImageResampling,
39
+ VideoInput,
40
+ get_image_size,
41
+ infer_channel_dimension_format,
42
+ is_scaled_image,
43
+ is_valid_image,
44
+ make_list_of_images,
45
+ to_numpy_array,
46
+ valid_images,
47
+ validate_preprocess_arguments,
48
+ )
49
+ from transformers.utils import TensorType, is_vision_available, logging
50
+
51
+ logger = logging.get_logger(__name__)
52
+
53
+ if is_vision_available():
54
+ from PIL import Image
55
+
56
+
57
+ def make_batched_images(images) -> List[List[ImageInput]]:
58
+ """
59
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
60
+
61
+ Args:
62
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
63
+ The input image.
64
+
65
+ Returns:
66
+ list: A list of images.
67
+ """
68
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
69
+ return [img for img_list in images for img in img_list]
70
+
71
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
72
+ return images
73
+
74
+ elif is_valid_image(images):
75
+ return [images]
76
+
77
+ raise ValueError(f"Could not make batched images from {images}")
78
+
79
+
80
+ # Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
81
+ def make_batched_videos(videos) -> List[VideoInput]:
82
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
83
+ return videos
84
+
85
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
86
+ if isinstance(videos[0], Image.Image):
87
+ return [videos]
88
+ elif len(videos[0].shape) == 4:
89
+ return [list(video) for video in videos]
90
+
91
+ elif is_valid_image(videos) and len(videos.shape) == 4:
92
+ return [list(videos)]
93
+
94
+ raise ValueError(f"Could not make batched video from {videos}")
95
+
96
+
97
+ def smart_resize(
98
+ height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 4096
99
+ ):
100
+ """Rescales the image so that the following conditions are met:
101
+
102
+ 1. Both dimensions (height and width) are divisible by 'factor'.
103
+
104
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
105
+
106
+ 3. The aspect ratio of the image is maintained as closely as possible.
107
+
108
+ """
109
+ if height < factor or width < factor:
110
+ # print("height, width", height, width)
111
+ if height < width:
112
+ h_bar = factor
113
+ w_bar = round(width / height * factor)
114
+ else:
115
+ h_bar = round(height / width * factor)
116
+ w_bar = factor
117
+ # print("h_bar, w_bar", h_bar, w_bar)
118
+ height, width = h_bar, w_bar
119
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
120
+ elif max(height, width) / min(height, width) > 200:
121
+ raise ValueError(
122
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
123
+ )
124
+ h_bar = round(height / factor) * factor
125
+ w_bar = round(width / factor) * factor
126
+ if h_bar * w_bar > max_pixels:
127
+ beta = math.sqrt((height * width) / max_pixels)
128
+ h_bar = math.floor(height / beta / factor) * factor
129
+ w_bar = math.floor(width / beta / factor) * factor
130
+ elif h_bar * w_bar < min_pixels:
131
+ beta = math.sqrt(min_pixels / (height * width))
132
+ h_bar = math.ceil(height * beta / factor) * factor
133
+ w_bar = math.ceil(width * beta / factor) * factor
134
+ return h_bar, w_bar
135
+
136
+
137
+ class EMOVAImageProcessor(BaseImageProcessor):
138
+ r"""
139
+ Constructs an EMOVA image processor (adapted from Qwen2-VL) that dynamically resizes images based on the original image dimensions.
140
+
141
+ Args:
142
+ do_resize (`bool`, *optional*, defaults to `True`):
143
+ Whether to resize the image's (height, width) dimensions.
144
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
145
+ Resampling filter to use when resizing the image.
146
+ do_rescale (`bool`, *optional*, defaults to `True`):
147
+ Whether to rescale the image by the specified scale `rescale_factor`.
148
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
149
+ Scale factor to use if rescaling the image.
150
+ do_normalize (`bool`, *optional*, defaults to `True`):
151
+ Whether to normalize the image.
152
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
153
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
154
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
155
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
156
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
157
+ Whether to convert the image to RGB.
158
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
159
+ The minimum number of pixels the resized image may contain.
160
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 4096`):
161
+ The maximum number of pixels the resized image may contain.
162
+ patch_size (`int`, *optional*, defaults to 14):
163
+ The spatial patch size of the vision encoder.
164
+ temporal_patch_size (`int`, *optional*, defaults to 2):
165
+ The temporal patch size of the vision encoder.
166
+ merge_size (`int`, *optional*, defaults to 2):
167
+ The merge size used when mapping vision encoder patches to LLM tokens.
168
+ """
169
+
170
+ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
171
+
172
+ def __init__(
173
+ self,
174
+ do_resize: bool = True,
175
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
176
+ do_rescale: bool = True,
177
+ rescale_factor: Union[int, float] = 1 / 255,
178
+ do_normalize: bool = True,
179
+ image_mean: Optional[Union[float, List[float]]] = None,
180
+ image_std: Optional[Union[float, List[float]]] = None,
181
+ do_convert_rgb: bool = True,
182
+ min_pixels: int = 56 * 56,
183
+ max_pixels: int = 28 * 28 * 4096,
184
+ patch_size: int = 14,
185
+ temporal_patch_size: int = 2,
186
+ merge_size: int = 2,
187
+ **kwargs,
188
+ ) -> None:
189
+ super().__init__(**kwargs)
190
+ self.do_resize = do_resize
191
+ self.resample = resample
192
+ self.do_rescale = do_rescale
193
+ self.rescale_factor = rescale_factor
194
+ self.do_normalize = do_normalize
195
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
196
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
197
+ self.min_pixels = min_pixels
198
+ self.max_pixels = max_pixels
199
+ self.patch_size = patch_size
200
+ self.temporal_patch_size = temporal_patch_size
201
+ self.merge_size = merge_size
202
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
203
+ self.do_convert_rgb = do_convert_rgb
204
+
205
+ def _preprocess(
206
+ self,
207
+ images: Union[ImageInput, VideoInput],
208
+ do_resize: bool = None,
209
+ resample: PILImageResampling = None,
210
+ do_rescale: bool = None,
211
+ rescale_factor: float = None,
212
+ do_normalize: bool = None,
213
+ image_mean: Optional[Union[float, List[float]]] = None,
214
+ image_std: Optional[Union[float, List[float]]] = None,
215
+ do_convert_rgb: bool = None,
216
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
217
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
218
+ ):
219
+ """
220
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
221
+
222
+ Args:
223
+ images (`ImageInput`):
224
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
225
+ vision_info (`List[Dict]`, *optional*):
226
+ Optional list of dictionaries containing additional information about vision inputs.
227
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
228
+ Whether to resize the image.
229
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
230
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
231
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
232
+ Whether to rescale the image.
233
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
234
+ Scale factor to use if rescaling the image.
235
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
236
+ Whether to normalize the image.
237
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
238
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
239
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
240
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
241
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
242
+ Whether to convert the image to RGB.
243
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
244
+ The channel dimension format for the output image. Can be one of:
245
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
246
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
247
+ - Unset: Use the channel dimension format of the input image.
248
+ input_data_format (`ChannelDimension` or `str`, *optional*):
249
+ The channel dimension format for the input image. Can be one of:
250
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
251
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
252
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
253
+ """
254
+ # import pdb; pdb.set_trace()
255
+ # print("images", images)
256
+ # for image in images:
257
+ # print("image", image.size)
258
+ images = make_list_of_images(images)
259
+
260
+ if do_convert_rgb:
261
+ images = [convert_to_rgb(image) for image in images]
262
+
263
+ # All transformations expect numpy arrays.
264
+ images = [to_numpy_array(image) for image in images]
265
+
266
+ if is_scaled_image(images[0]) and do_rescale:
267
+ logger.warning_once(
268
+ "It looks like you are trying to rescale already rescaled images. If the input"
269
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
270
+ )
271
+ if input_data_format is None:
272
+ # We assume that all images have the same channel dimension format.
273
+ input_data_format = infer_channel_dimension_format(images[0])
274
+
275
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
276
+ resized_height, resized_width = height, width
277
+ processed_images = []
278
+ for image in images:
279
+ if do_resize:
280
+ resized_height, resized_width = smart_resize(
281
+ height,
282
+ width,
283
+ factor=self.patch_size * self.merge_size,
284
+ min_pixels=self.min_pixels,
285
+ max_pixels=self.max_pixels,
286
+ )
287
+ image = resize(
288
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
289
+ )
290
+
291
+ if do_rescale:
292
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
293
+
294
+ if do_normalize:
295
+ image = self.normalize(
296
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
297
+ )
298
+
299
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
300
+ processed_images.append(image)
301
+
302
+ patches = np.array(processed_images)
303
+ if data_format == ChannelDimension.LAST:
304
+ patches = patches.transpose(0, 3, 1, 2)
305
+ if patches.shape[0] == 1:
306
+ patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
307
+ channel = patches.shape[1]
308
+ grid_t = patches.shape[0] // self.temporal_patch_size
309
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
310
+ patches = patches.reshape(
311
+ grid_t,
312
+ self.temporal_patch_size,
313
+ channel,
314
+ grid_h // self.merge_size,
315
+ self.merge_size,
316
+ self.patch_size,
317
+ grid_w // self.merge_size,
318
+ self.merge_size,
319
+ self.patch_size,
320
+ )
321
+ patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
322
+ flatten_patches = patches.reshape(
323
+ grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
324
+ )
325
+
326
+ return flatten_patches, (grid_t, grid_h, grid_w)
327
+
328
+ def preprocess(
329
+ self,
330
+ images: ImageInput,
331
+ videos: VideoInput = None,
332
+ do_resize: bool = None,
333
+ size: Dict[str, int] = None,
334
+ resample: PILImageResampling = None,
335
+ do_rescale: bool = None,
336
+ rescale_factor: float = None,
337
+ do_normalize: bool = None,
338
+ image_mean: Optional[Union[float, List[float]]] = None,
339
+ image_std: Optional[Union[float, List[float]]] = None,
340
+ do_convert_rgb: bool = None,
341
+ return_tensors: Optional[Union[str, TensorType]] = None,
342
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
343
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
344
+ ):
345
+ """
346
+ Args:
347
+ images (`ImageInput`):
348
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
349
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
350
+ videos (`VideoInput`):
351
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
352
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
353
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
354
+ Whether to resize the image.
355
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
356
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
357
+ the longest edge resized to keep the input aspect ratio.
358
+ resample (`int`, *optional*, defaults to `self.resample`):
359
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
360
+ has an effect if `do_resize` is set to `True`.
361
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
362
+ Whether to rescale the image.
363
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
364
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
365
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
366
+ Whether to normalize the image.
367
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
368
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
369
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
370
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
371
+ `True`.
372
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
373
+ Whether to convert the image to RGB.
374
+ return_tensors (`str` or `TensorType`, *optional*):
375
+ The type of tensors to return. Can be one of:
376
+ - Unset: Return a list of `np.ndarray`.
377
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
378
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
379
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
380
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
381
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
382
+ The channel dimension format for the output image. Can be one of:
383
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
384
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
385
+ - Unset: Use the channel dimension format of the input image.
386
+ input_data_format (`ChannelDimension` or `str`, *optional*):
387
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
388
+ from the input image. Can be one of:
389
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
390
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
391
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
392
+
393
+ """
394
+ do_resize = do_resize if do_resize is not None else self.do_resize
395
+ size = size if size is not None else self.size
396
+ resample = resample if resample is not None else self.resample
397
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
398
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
399
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
400
+ image_mean = image_mean if image_mean is not None else self.image_mean
401
+ image_std = image_std if image_std is not None else self.image_std
402
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
403
+
404
+ if images is not None:
405
+ images = make_batched_images(images)
406
+ if videos is not None:
407
+ videos = make_batched_videos(videos)
408
+
409
+ if images is not None and not valid_images(images):
410
+ raise ValueError(
411
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
412
+ "torch.Tensor, tf.Tensor or jax.ndarray."
413
+ )
414
+
415
+ validate_preprocess_arguments(
416
+ rescale_factor=rescale_factor,
417
+ do_normalize=do_normalize,
418
+ image_mean=image_mean,
419
+ image_std=image_std,
420
+ do_resize=do_resize,
421
+ size=size,
422
+ resample=resample,
423
+ )
424
+
425
+ if images is not None:
426
+ pixel_values, vision_grid_thws = [], []
427
+ for image in images:
428
+ patches, image_grid_thw = self._preprocess(
429
+ image,
430
+ do_resize=do_resize,
431
+ resample=resample,
432
+ do_rescale=do_rescale,
433
+ rescale_factor=rescale_factor,
434
+ do_normalize=do_normalize,
435
+ image_mean=image_mean,
436
+ image_std=image_std,
437
+ data_format=data_format,
438
+ do_convert_rgb=do_convert_rgb,
439
+ input_data_format=input_data_format,
440
+ )
441
+ pixel_values.extend(patches)
442
+ vision_grid_thws.append(image_grid_thw)
443
+ pixel_values = np.array(pixel_values)
444
+ vision_grid_thws = np.array(vision_grid_thws)
445
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
446
+
447
+ if videos is not None:
448
+ pixel_values, vision_grid_thws = [], []
449
+ for images in videos:
450
+ patches, video_grid_thw = self._preprocess(
451
+ images,
452
+ do_resize=do_resize,
453
+ resample=resample,
454
+ do_rescale=do_rescale,
455
+ rescale_factor=rescale_factor,
456
+ do_normalize=do_normalize,
457
+ image_mean=image_mean,
458
+ image_std=image_std,
459
+ data_format=data_format,
460
+ do_convert_rgb=do_convert_rgb,
461
+ input_data_format=input_data_format,
462
+ )
463
+ pixel_values.extend(patches)
464
+ vision_grid_thws.append(video_grid_thw)
465
+ pixel_values = np.array(pixel_values)
466
+ vision_grid_thws = np.array(vision_grid_thws)
467
+ data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
468
+
469
+ return BatchFeature(data=data, tensor_type=return_tensors)
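
A minimal usage sketch of the image processor defined above, assuming the checkpoint containing this file is loaded with trust_remote_code=True so that the auto_map in preprocessor_config.json resolves AutoImageProcessor to EMOVAImageProcessor; the repository id is illustrative. The comments trace the patching arithmetic implemented by smart_resize and _preprocess.

import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

# Hypothetical repository id, for illustration only.
image_processor = AutoImageProcessor.from_pretrained("Emova-ollm/emova-example", trust_remote_code=True)

# A 480x640 RGB image is snapped to 476x644, the nearest size divisible by
# patch_size * merge_size = 28 that stays within [min_pixels, max_pixels].
image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))
out = image_processor(images=image, return_tensors="np")

# Each 14x14 patch is flattened together with a temporal factor of 2
# (a single image is duplicated along the temporal axis before patching).
print(out["pixel_values"].shape)  # (1 * 34 * 46, 3 * 2 * 14 * 14) = (1564, 1176)
print(out["image_grid_thw"])      # [[1 34 46]]  -> (temporal, height, width) grid
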
image_utils.py ADDED
@@ -0,0 +1,806 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import base64
17
+ import os
18
+ from io import BytesIO
19
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import requests
23
+ from packaging import version
24
+
25
+ from transformers.utils import (
26
+ ExplicitEnum,
27
+ is_jax_tensor,
28
+ is_numpy_array,
29
+ is_tf_tensor,
30
+ is_torch_available,
31
+ is_torch_tensor,
32
+ is_torchvision_available,
33
+ is_vision_available,
34
+ logging,
35
+ requires_backends,
36
+ to_numpy,
37
+ )
38
+ from transformers.utils.constants import ( # noqa: F401
39
+ IMAGENET_DEFAULT_MEAN,
40
+ IMAGENET_DEFAULT_STD,
41
+ IMAGENET_STANDARD_MEAN,
42
+ IMAGENET_STANDARD_STD,
43
+ OPENAI_CLIP_MEAN,
44
+ OPENAI_CLIP_STD,
45
+ )
46
+
47
+ if is_vision_available():
48
+ import PIL.Image
49
+ import PIL.ImageOps
50
+
51
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
52
+ PILImageResampling = PIL.Image.Resampling
53
+ else:
54
+ PILImageResampling = PIL.Image
55
+
56
+ if is_torchvision_available():
57
+ from torchvision.transforms import InterpolationMode
58
+
59
+ pil_torch_interpolation_mapping = {
60
+ PILImageResampling.NEAREST: InterpolationMode.NEAREST,
61
+ PILImageResampling.BOX: InterpolationMode.BOX,
62
+ PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
63
+ PILImageResampling.HAMMING: InterpolationMode.HAMMING,
64
+ PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
65
+ PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
66
+ }
67
+
68
+ if TYPE_CHECKING:
69
+ if is_torch_available():
70
+ import torch
71
+
72
+ logger = logging.get_logger(__name__)
73
+
74
+ ImageInput = Union[
75
+ "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
76
+ ] # noqa
77
+
78
+ VideoInput = Union[
79
+ List["PIL.Image.Image"],
80
+ "np.ndarray",
81
+ "torch.Tensor",
82
+ List["np.ndarray"],
83
+ List["torch.Tensor"],
84
+ List[List["PIL.Image.Image"]],
85
+ List[List["np.ndarrray"]],
86
+ List[List["torch.Tensor"]],
87
+ ] # noqa
88
+
89
+
90
+ class ChannelDimension(ExplicitEnum):
91
+ FIRST = "channels_first"
92
+ LAST = "channels_last"
93
+
94
+
95
+ class AnnotationFormat(ExplicitEnum):
96
+ COCO_DETECTION = "coco_detection"
97
+ COCO_PANOPTIC = "coco_panoptic"
98
+
99
+
100
+ class AnnotionFormat(ExplicitEnum):
101
+ COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value
102
+ COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
103
+
104
+
105
+ AnnotationType = Dict[str, Union[int, str, List[Dict]]]
106
+
107
+
108
+ def is_pil_image(img):
109
+ return is_vision_available() and isinstance(img, PIL.Image.Image)
110
+
111
+
112
+ class ImageType(ExplicitEnum):
113
+ PIL = "pillow"
114
+ TORCH = "torch"
115
+ NUMPY = "numpy"
116
+ TENSORFLOW = "tensorflow"
117
+ JAX = "jax"
118
+
119
+
120
+ def get_image_type(image):
121
+ if is_pil_image(image):
122
+ return ImageType.PIL
123
+ if is_torch_tensor(image):
124
+ return ImageType.TORCH
125
+ if is_numpy_array(image):
126
+ return ImageType.NUMPY
127
+ if is_tf_tensor(image):
128
+ return ImageType.TENSORFLOW
129
+ if is_jax_tensor(image):
130
+ return ImageType.JAX
131
+ raise ValueError(f"Unrecognised image type {type(image)}")
132
+
133
+
134
+ def is_valid_image(img):
135
+ return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
136
+
137
+
138
+ def valid_images(imgs):
139
+ # If we have a list of images, make sure every image is valid
140
+ if isinstance(imgs, (list, tuple)):
141
+ for img in imgs:
142
+ if not valid_images(img):
143
+ return False
144
+ # If not a list or tuple, we have been given a single image or batched tensor of images
145
+ elif not is_valid_image(imgs):
146
+ return False
147
+ return True
148
+
149
+
150
+ def is_batched(img):
151
+ if isinstance(img, (list, tuple)):
152
+ return is_valid_image(img[0])
153
+ return False
154
+
155
+
156
+ def is_scaled_image(image: np.ndarray) -> bool:
157
+ """
158
+ Checks to see whether the pixel values have already been rescaled to [0, 1].
159
+ """
160
+ if image.dtype == np.uint8:
161
+ return False
162
+
163
+ # It's possible the image has pixel values in [0, 255] but is of floating type
164
+ return np.min(image) >= 0 and np.max(image) <= 1
165
+
166
+
167
+ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
168
+ """
169
+ Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1.
170
+ If the input is a batch of images, it is converted to a list of images.
171
+
172
+ Args:
173
+ images (`ImageInput`):
174
+ Image or batch of images to turn into a list of images.
175
+ expected_ndims (`int`, *optional*, defaults to 3):
176
+ Expected number of dimensions for a single input image. If the input image has a different number of
177
+ dimensions, an error is raised.
178
+ """
179
+ if is_batched(images):
180
+ return images
181
+
182
+ # Either the input is a single image, in which case we create a list of length 1
183
+ if isinstance(images, PIL.Image.Image):
184
+ # PIL images are never batched
185
+ return [images]
186
+
187
+ if is_valid_image(images):
188
+ if images.ndim == expected_ndims + 1:
189
+ # Batch of images
190
+ images = list(images)
191
+ elif images.ndim == expected_ndims:
192
+ # Single image
193
+ images = [images]
194
+ else:
195
+ raise ValueError(
196
+ f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got"
197
+ f" {images.ndim} dimensions."
198
+ )
199
+ return images
200
+ raise ValueError(
201
+ "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or "
202
+ f"jax.ndarray, but got {type(images)}."
203
+ )
204
+
205
+
206
+ def to_numpy_array(img) -> np.ndarray:
207
+ if not is_valid_image(img):
208
+ raise ValueError(f"Invalid image type: {type(img)}")
209
+
210
+ if is_vision_available() and isinstance(img, PIL.Image.Image):
211
+ return np.array(img)
212
+ return to_numpy(img)
213
+
214
+
215
+ def infer_channel_dimension_format(
216
+ image: np.ndarray, num_channels: Optional[Union[int, Tuple[int, ...]]] = None
217
+ ) -> ChannelDimension:
218
+ """
219
+ Infers the channel dimension format of `image`.
220
+
221
+ Args:
222
+ image (`np.ndarray`):
223
+ The image to infer the channel dimension of.
224
+ num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
225
+ The number of channels of the image.
226
+
227
+ Returns:
228
+ The channel dimension of the image.
229
+ """
230
+ num_channels = num_channels if num_channels is not None else (1, 3)
231
+ num_channels = (num_channels,) if isinstance(num_channels, int) else num_channels
232
+
233
+ if image.ndim == 3:
234
+ first_dim, last_dim = 0, 2
235
+ elif image.ndim == 4:
236
+ first_dim, last_dim = 1, 3
237
+ else:
238
+ raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
239
+
240
+ if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
241
+ logger.warning(
242
+ f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension."
243
+ )
244
+ return ChannelDimension.FIRST
245
+ elif image.shape[first_dim] in num_channels:
246
+ return ChannelDimension.FIRST
247
+ elif image.shape[last_dim] in num_channels:
248
+ return ChannelDimension.LAST
249
+ raise ValueError("Unable to infer channel dimension format")
250
+
251
+
252
+ def get_channel_dimension_axis(
253
+ image: np.ndarray, input_data_format: Optional[Union[ChannelDimension, str]] = None
254
+ ) -> int:
255
+ """
256
+ Returns the channel dimension axis of the image.
257
+
258
+ Args:
259
+ image (`np.ndarray`):
260
+ The image to get the channel dimension axis of.
261
+ input_data_format (`ChannelDimension` or `str`, *optional*):
262
+ The channel dimension format of the image. If `None`, will infer the channel dimension from the image.
263
+
264
+ Returns:
265
+ The channel dimension axis of the image.
266
+ """
267
+ if input_data_format is None:
268
+ input_data_format = infer_channel_dimension_format(image)
269
+ if input_data_format == ChannelDimension.FIRST:
270
+ return image.ndim - 3
271
+ elif input_data_format == ChannelDimension.LAST:
272
+ return image.ndim - 1
273
+ raise ValueError(f"Unsupported data format: {input_data_format}")
274
+
275
+
276
+ def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]:
277
+ """
278
+ Returns the (height, width) dimensions of the image.
279
+
280
+ Args:
281
+ image (`np.ndarray`):
282
+ The image to get the dimensions of.
283
+ channel_dim (`ChannelDimension`, *optional*):
284
+ Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.
285
+
286
+ Returns:
287
+ A tuple of the image's height and width.
288
+ """
289
+ if channel_dim is None:
290
+ channel_dim = infer_channel_dimension_format(image)
291
+
292
+ if channel_dim == ChannelDimension.FIRST:
293
+ return image.shape[-2], image.shape[-1]
294
+ elif channel_dim == ChannelDimension.LAST:
295
+ return image.shape[-3], image.shape[-2]
296
+ else:
297
+ raise ValueError(f"Unsupported data format: {channel_dim}")
298
+
299
+
300
+ def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool:
301
+ if (
302
+ isinstance(annotation, dict)
303
+ and "image_id" in annotation
304
+ and "annotations" in annotation
305
+ and isinstance(annotation["annotations"], (list, tuple))
306
+ and (
307
+ # an image can have no annotations
308
+ len(annotation["annotations"]) == 0 or isinstance(annotation["annotations"][0], dict)
309
+ )
310
+ ):
311
+ return True
312
+ return False
313
+
314
+
315
+ def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool:
316
+ if (
317
+ isinstance(annotation, dict)
318
+ and "image_id" in annotation
319
+ and "segments_info" in annotation
320
+ and "file_name" in annotation
321
+ and isinstance(annotation["segments_info"], (list, tuple))
322
+ and (
323
+ # an image can have no segments
324
+ len(annotation["segments_info"]) == 0 or isinstance(annotation["segments_info"][0], dict)
325
+ )
326
+ ):
327
+ return True
328
+ return False
329
+
330
+
331
+ def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
332
+ return all(is_valid_annotation_coco_detection(ann) for ann in annotations)
333
+
334
+
335
+ def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
336
+ return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
337
+
338
+
339
+ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
340
+ """
341
+ Loads `image` to a PIL Image.
342
+
343
+ Args:
344
+ image (`str` or `PIL.Image.Image`):
345
+ The image to convert to the PIL Image format.
346
+ timeout (`float`, *optional*):
347
+ The timeout value in seconds for the URL request.
348
+
349
+ Returns:
350
+ `PIL.Image.Image`: A PIL Image.
351
+ """
352
+ requires_backends(load_image, ["vision"])
353
+ if isinstance(image, str):
354
+ if image.startswith("http://") or image.startswith("https://"):
355
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
356
+ # like http_huggingface_co.png
357
+ image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
358
+ elif os.path.isfile(image):
359
+ image = PIL.Image.open(image)
360
+ else:
361
+ if image.startswith("data:image/"):
362
+ image = image.split(",")[1]
363
+
364
+ # Try to load as base64
365
+ try:
366
+ b64 = base64.decodebytes(image.encode())
367
+ image = PIL.Image.open(BytesIO(b64))
368
+ except Exception as e:
369
+ raise ValueError(
370
+ f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
371
+ )
372
+ elif isinstance(image, PIL.Image.Image):
373
+ image = image
374
+ else:
375
+ raise TypeError(
376
+ "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
377
+ )
378
+ image = PIL.ImageOps.exif_transpose(image)
379
+ image = image.convert("RGB")
380
+ return image
381
+
382
+
383
+ def validate_preprocess_arguments(
384
+ do_rescale: Optional[bool] = None,
385
+ rescale_factor: Optional[float] = None,
386
+ do_normalize: Optional[bool] = None,
387
+ image_mean: Optional[Union[float, List[float]]] = None,
388
+ image_std: Optional[Union[float, List[float]]] = None,
389
+ do_pad: Optional[bool] = None,
390
+ size_divisibility: Optional[int] = None,
391
+ do_center_crop: Optional[bool] = None,
392
+ crop_size: Optional[Dict[str, int]] = None,
393
+ do_resize: Optional[bool] = None,
394
+ size: Optional[Dict[str, int]] = None,
395
+ resample: Optional["PILImageResampling"] = None,
396
+ ):
397
+ """
398
+ Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
399
+ Raises `ValueError` if an argument incompatibility is caught.
400
+ Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
401
+ sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
402
+ existing arguments when possible.
403
+
404
+ """
405
+ if do_rescale and rescale_factor is None:
406
+ raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.")
407
+
408
+ if do_pad and size_divisibility is None:
409
+ # Here, size_divisor might be passed as the value of size
410
+ raise ValueError(
411
+ "Depending on the model, `size_divisibility`, `size_divisor`, `pad_size` or `size` must be specified if `do_pad` is `True`."
412
+ )
413
+
414
+ if do_normalize and (image_mean is None or image_std is None):
415
+ raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.")
416
+
417
+ if do_center_crop and crop_size is None:
418
+ raise ValueError("`crop_size` must be specified if `do_center_crop` is `True`.")
419
+
420
+ if do_resize and (size is None or resample is None):
421
+ raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.")
422
+
423
+
424
+ # In the future we can add a TF implementation here when we have TF models.
425
+ class ImageFeatureExtractionMixin:
426
+ """
427
+ Mixin that contains utilities for preparing image features.
428
+ """
429
+
430
+ def _ensure_format_supported(self, image):
431
+ if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_torch_tensor(image):
432
+ raise ValueError(
433
+ f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.array` and "
434
+ "`torch.Tensor` are."
435
+ )
436
+
437
+ def to_pil_image(self, image, rescale=None):
438
+ """
439
+ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
440
+ needed.
441
+
442
+ Args:
443
+ image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
444
+ The image to convert to the PIL Image format.
445
+ rescale (`bool`, *optional*):
446
+ Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
447
+ default to `True` if the image type is a floating type, `False` otherwise.
448
+ """
449
+ self._ensure_format_supported(image)
450
+
451
+ if is_torch_tensor(image):
452
+ image = image.numpy()
453
+
454
+ if isinstance(image, np.ndarray):
455
+ if rescale is None:
456
+ # rescale default to the array being of floating type.
457
+ rescale = isinstance(image.flat[0], np.floating)
458
+ # If the channel has been moved to the first dim, we put it back at the end.
459
+ if image.ndim == 3 and image.shape[0] in [1, 3]:
460
+ image = image.transpose(1, 2, 0)
461
+ if rescale:
462
+ image = image * 255
463
+ image = image.astype(np.uint8)
464
+ return PIL.Image.fromarray(image)
465
+ return image
466
+
467
+ def convert_rgb(self, image):
468
+ """
469
+ Converts `PIL.Image.Image` to RGB format.
470
+
471
+ Args:
472
+ image (`PIL.Image.Image`):
473
+ The image to convert.
474
+ """
475
+ self._ensure_format_supported(image)
476
+ if not isinstance(image, PIL.Image.Image):
477
+ return image
478
+
479
+ return image.convert("RGB")
480
+
481
+ def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.ndarray:
482
+ """
483
+ Rescale a numpy image by scale amount
484
+ """
485
+ self._ensure_format_supported(image)
486
+ return image * scale
487
+
488
+ def to_numpy_array(self, image, rescale=None, channel_first=True):
489
+ """
490
+ Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
491
+ dimension.
492
+
493
+ Args:
494
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
495
+ The image to convert to a NumPy array.
496
+ rescale (`bool`, *optional*):
497
+ Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
498
+ default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
499
+ channel_first (`bool`, *optional*, defaults to `True`):
500
+ Whether or not to permute the dimensions of the image to put the channel dimension first.
501
+ """
502
+ self._ensure_format_supported(image)
503
+
504
+ if isinstance(image, PIL.Image.Image):
505
+ image = np.array(image)
506
+
507
+ if is_torch_tensor(image):
508
+ image = image.numpy()
509
+
510
+ rescale = isinstance(image.flat[0], np.integer) if rescale is None else rescale
511
+
512
+ if rescale:
513
+ image = self.rescale(image.astype(np.float32), 1 / 255.0)
514
+
515
+ if channel_first and image.ndim == 3:
516
+ image = image.transpose(2, 0, 1)
517
+
518
+ return image
519
+
520
+ def expand_dims(self, image):
521
+ """
522
+ Expands 2-dimensional `image` to 3 dimensions.
523
+
524
+ Args:
525
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
526
+ The image to expand.
527
+ """
528
+ self._ensure_format_supported(image)
529
+
530
+ # Do nothing if PIL image
531
+ if isinstance(image, PIL.Image.Image):
532
+ return image
533
+
534
+ if is_torch_tensor(image):
535
+ image = image.unsqueeze(0)
536
+ else:
537
+ image = np.expand_dims(image, axis=0)
538
+ return image
539
+
540
+ def normalize(self, image, mean, std, rescale=False):
541
+ """
542
+ Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
543
+ if it's a PIL Image.
544
+
545
+ Args:
546
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
547
+ The image to normalize.
548
+ mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
549
+ The mean (per channel) to use for normalization.
550
+ std (`List[float]` or `np.ndarray` or `torch.Tensor`):
551
+ The standard deviation (per channel) to use for normalization.
552
+ rescale (`bool`, *optional*, defaults to `False`):
553
+ Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
554
+ happen automatically.
555
+ """
556
+ self._ensure_format_supported(image)
557
+
558
+ if isinstance(image, PIL.Image.Image):
559
+ image = self.to_numpy_array(image, rescale=True)
560
+ # If the input image is a PIL image, it automatically gets rescaled. If it's another
561
+ # type it may need rescaling.
562
+ elif rescale:
563
+ if isinstance(image, np.ndarray):
564
+ image = self.rescale(image.astype(np.float32), 1 / 255.0)
565
+ elif is_torch_tensor(image):
566
+ image = self.rescale(image.float(), 1 / 255.0)
567
+
568
+ if isinstance(image, np.ndarray):
569
+ if not isinstance(mean, np.ndarray):
570
+ mean = np.array(mean).astype(image.dtype)
571
+ if not isinstance(std, np.ndarray):
572
+ std = np.array(std).astype(image.dtype)
573
+ elif is_torch_tensor(image):
574
+ import torch
575
+
576
+ if not isinstance(mean, torch.Tensor):
577
+ if isinstance(mean, np.ndarray):
578
+ mean = torch.from_numpy(mean)
579
+ else:
580
+ mean = torch.tensor(mean)
581
+ if not isinstance(std, torch.Tensor):
582
+ if isinstance(std, np.ndarray):
583
+ std = torch.from_numpy(std)
584
+ else:
585
+ std = torch.tensor(std)
586
+
587
+ if image.ndim == 3 and image.shape[0] in [1, 3]:
588
+ return (image - mean[:, None, None]) / std[:, None, None]
589
+ else:
590
+ return (image - mean) / std
591
+
592
+ def resize(self, image, size, resample=None, default_to_square=True, max_size=None):
593
+ """
594
+ Resizes `image`. Enforces conversion of input to PIL.Image.
595
+
596
+ Args:
597
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
598
+ The image to resize.
599
+ size (`int` or `Tuple[int, int]`):
600
+ The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
601
+ matched to this.
602
+
603
+ If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
604
+ `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
605
+ this number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
606
+ resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
607
+ The filter to use for resampling.
608
+ default_to_square (`bool`, *optional*, defaults to `True`):
609
+ How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
610
+ square (`size`,`size`). If set to `False`, will replicate
611
+ [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
612
+ with support for resizing only the smallest edge and providing an optional `max_size`.
613
+ max_size (`int`, *optional*, defaults to `None`):
614
+ The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
615
+ greater than `max_size` after being resized according to `size`, then the image is resized again so
616
+ that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller
617
+ edge may be shorter than `size`. Only used if `default_to_square` is `False`.
618
+
619
+ Returns:
620
+ image: A resized `PIL.Image.Image`.
621
+ """
622
+ resample = resample if resample is not None else PILImageResampling.BILINEAR
623
+
624
+ self._ensure_format_supported(image)
625
+
626
+ if not isinstance(image, PIL.Image.Image):
627
+ image = self.to_pil_image(image)
628
+
629
+ if isinstance(size, list):
630
+ size = tuple(size)
631
+
632
+ if isinstance(size, int) or len(size) == 1:
633
+ if default_to_square:
634
+ size = (size, size) if isinstance(size, int) else (size[0], size[0])
635
+ else:
636
+ width, height = image.size
637
+ # specified size only for the smallest edge
638
+ short, long = (width, height) if width <= height else (height, width)
639
+ requested_new_short = size if isinstance(size, int) else size[0]
640
+
641
+ if short == requested_new_short:
642
+ return image
643
+
644
+ new_short, new_long = requested_new_short, int(requested_new_short * long / short)
645
+
646
+ if max_size is not None:
647
+ if max_size <= requested_new_short:
648
+ raise ValueError(
649
+ f"max_size = {max_size} must be strictly greater than the requested "
650
+ f"size for the smaller edge size = {size}"
651
+ )
652
+ if new_long > max_size:
653
+ new_short, new_long = int(max_size * new_short / new_long), max_size
654
+
655
+ size = (new_short, new_long) if width <= height else (new_long, new_short)
656
+
657
+ return image.resize(size, resample=resample)
658
+
659
+ def center_crop(self, image, size):
660
+ """
661
+ Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
662
+ size given, it will be padded (so the returned result has the size asked).
663
+
664
+ Args:
665
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
666
+ The image to resize.
667
+ size (`int` or `Tuple[int, int]`):
668
+ The size to which to crop the image.
669
+
670
+ Returns:
671
+ new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
672
+ height, width).
673
+ """
674
+ self._ensure_format_supported(image)
675
+
676
+ if not isinstance(size, tuple):
677
+ size = (size, size)
678
+
679
+ # PIL Image.size is (width, height) but NumPy array and torch Tensors have (height, width)
680
+ if is_torch_tensor(image) or isinstance(image, np.ndarray):
681
+ if image.ndim == 2:
682
+ image = self.expand_dims(image)
683
+ image_shape = image.shape[1:] if image.shape[0] in [1, 3] else image.shape[:2]
684
+ else:
685
+ image_shape = (image.size[1], image.size[0])
686
+
687
+ top = (image_shape[0] - size[0]) // 2
688
+ bottom = top + size[0] # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
689
+ left = (image_shape[1] - size[1]) // 2
690
+ right = left + size[1] # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
691
+
692
+ # For PIL Images we have a method to crop directly.
693
+ if isinstance(image, PIL.Image.Image):
694
+ return image.crop((left, top, right, bottom))
695
+
696
+ # Check if image is in (n_channels, height, width) or (height, width, n_channels) format
697
+ channel_first = True if image.shape[0] in [1, 3] else False
698
+
699
+ # Transpose (height, width, n_channels) format images
700
+ if not channel_first:
701
+ if isinstance(image, np.ndarray):
702
+ image = image.transpose(2, 0, 1)
703
+ if is_torch_tensor(image):
704
+ image = image.permute(2, 0, 1)
705
+
706
+ # Check if cropped area is within image boundaries
707
+ if top >= 0 and bottom <= image_shape[0] and left >= 0 and right <= image_shape[1]:
708
+ return image[..., top:bottom, left:right]
709
+
710
+ # Otherwise, we may need to pad if the image is too small. Oh joy...
711
+ new_shape = image.shape[:-2] + (max(size[0], image_shape[0]), max(size[1], image_shape[1]))
712
+ if isinstance(image, np.ndarray):
713
+ new_image = np.zeros_like(image, shape=new_shape)
714
+ elif is_torch_tensor(image):
715
+ new_image = image.new_zeros(new_shape)
716
+
717
+ top_pad = (new_shape[-2] - image_shape[0]) // 2
718
+ bottom_pad = top_pad + image_shape[0]
719
+ left_pad = (new_shape[-1] - image_shape[1]) // 2
720
+ right_pad = left_pad + image_shape[1]
721
+ new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image
722
+
723
+ top += top_pad
724
+ bottom += top_pad
725
+ left += left_pad
726
+ right += left_pad
727
+
728
+ new_image = new_image[
729
+ ..., max(0, top): min(new_image.shape[-2], bottom), max(0, left): min(new_image.shape[-1], right)
730
+ ]
731
+
732
+ return new_image
733
+
734
+ def flip_channel_order(self, image):
735
+ """
736
+ Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
737
+ `image` to a NumPy array if it's a PIL Image.
738
+
739
+ Args:
740
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
741
+ The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
742
+ be first.
743
+ """
744
+ self._ensure_format_supported(image)
745
+
746
+ if isinstance(image, PIL.Image.Image):
747
+ image = self.to_numpy_array(image)
748
+
749
+ return image[::-1, :, :]
750
+
751
+ def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None):
752
+ """
753
+ Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
754
+ counter clockwise around its centre.
755
+
756
+ Args:
757
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
758
+ The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
759
+ rotating.
760
+
761
+ Returns:
762
+ image: A rotated `PIL.Image.Image`.
763
+ """
764
+ resample = resample if resample is not None else PIL.Image.NEAREST
765
+
766
+ self._ensure_format_supported(image)
767
+
768
+ if not isinstance(image, PIL.Image.Image):
769
+ image = self.to_pil_image(image)
770
+
771
+ return image.rotate(
772
+ angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor
773
+ )
774
+
775
+
776
+ def validate_annotations(
777
+ annotation_format: AnnotationFormat,
778
+ supported_annotation_formats: Tuple[AnnotationFormat, ...],
779
+ annotations: List[Dict],
780
+ ) -> None:
781
+ if annotation_format not in supported_annotation_formats:
782
+ raise ValueError(f"Unsupported annotation format: {format} must be one of {supported_annotation_formats}")
783
+
784
+ if annotation_format is AnnotationFormat.COCO_DETECTION:
785
+ if not valid_coco_detection_annotations(annotations):
786
+ raise ValueError(
787
+ "Invalid COCO detection annotations. Annotations must a dict (single image) or list of dicts "
788
+ "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
789
+ "being a list of annotations in the COCO format."
790
+ )
791
+
792
+ if annotation_format is AnnotationFormat.COCO_PANOPTIC:
793
+ if not valid_coco_panoptic_annotations(annotations):
794
+ raise ValueError(
795
+ "Invalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts "
796
+ "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
797
+ "the latter being a list of annotations in the COCO format."
798
+ )
799
+
800
+
801
+ def validate_kwargs(valid_processor_keys: List[str], captured_kwargs: List[str]):
802
+ unused_keys = set(captured_kwargs).difference(set(valid_processor_keys))
803
+ if unused_keys:
804
+ unused_key_str = ", ".join(unused_keys)
805
+ # TODO raise a warning here instead of simply logging?
806
+ logger.warning(f"Unused or unrecognized kwargs: {unused_key_str}.")
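
The helpers above are a vendored copy of the transformers image utilities, so they can be exercised on their own. A small sketch, assuming this file is importable as image_utils (for example, from within the checkpoint directory); the image path is a placeholder.

import numpy as np
from image_utils import ChannelDimension, infer_channel_dimension_format, load_image, make_list_of_images

# load_image accepts a URL, a local path, a base64 string or a PIL image and
# always returns an EXIF-corrected RGB PIL image.
img = load_image("path/to/any_image.jpg")  # placeholder path
arr = np.array(img)  # (H, W, 3) uint8

# Channel-format inference looks for a dimension of size 1 or 3.
assert infer_channel_dimension_format(arr) is ChannelDimension.LAST
assert infer_channel_dimension_format(arr.transpose(2, 0, 1)) is ChannelDimension.FIRST

# A (N, H, W, C) batch array is split into a list of N images.
batch = np.stack([arr, arr])
print(len(make_list_of_images(batch)))  # 2
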
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_emova.EMOVAImageProcessor",
4
+ "AutoProcessor": "processing_emova.EMOVAProcessor"
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_pad": true,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "EMOVAImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "max_pixels": 3211264,
24
+ "merge_size": 2,
25
+ "min_pixels": 3136,
26
+ "patch_size": 14,
27
+ "processor_class": "EMOVAProcessor",
28
+ "resample": 3,
29
+ "rescale_factor": 0.00392156862745098,
30
+ "size": {
31
+ "max_pixels": 3211264,
32
+ "min_pixels": 3136
33
+ },
34
+ "temporal_patch_size": 2
35
+ }
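
For orientation, a sketch of what this configuration resolves to at load time (the repository id is illustrative): "resample": 3 is PIL.Image.Resampling.BICUBIC, "rescale_factor" is 1/255, and the auto_map entries route the auto classes to the custom code uploaded in this commit, which is why trust_remote_code=True is required.

from transformers import AutoProcessor

# Hypothetical repository id, for illustration only.
processor = AutoProcessor.from_pretrained("Emova-ollm/emova-example", trust_remote_code=True)
ip = processor.image_processor

print(type(ip).__name__)             # EMOVAImageProcessor
print(ip.resample)                   # 3 == PIL.Image.Resampling.BICUBIC
print(ip.rescale_factor)             # 0.00392156862745098 == 1 / 255
print(ip.min_pixels, ip.max_pixels)  # 3136 (= 56 * 56), 3211264 (= 28 * 28 * 4096)
print(ip.size)                       # {'min_pixels': 3136, 'max_pixels': 3211264}
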
processing_emova.py ADDED
@@ -0,0 +1,231 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """
21
+ Processor class for EMOVA with the Qwen2-VL vision encoder (qwen2vit).
22
+ """
23
+
24
+ import json
25
+ from typing import List, Union
26
+
27
+ from transformers import AutoProcessor, AutoImageProcessor
28
+
29
+ try:
30
+ from typing import Unpack
31
+ except ImportError:
32
+ from typing_extensions import Unpack
33
+
34
+ from transformers.feature_extraction_utils import BatchFeature
35
+ from .image_utils import ImageInput, VideoInput
36
+ from transformers.processing_utils import (
37
+ ProcessingKwargs,
38
+ ProcessorMixin,
39
+ )
40
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
41
+ from transformers.utils import logging
42
+
43
+ from .configuration_emova import EMOVAConfig
44
+ from .image_processing_emova import EMOVAImageProcessor
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+
49
+ class EMOVAProcessorKwargs(ProcessingKwargs, total=False):
50
+ _defaults = {
51
+ "text_kwargs": {
52
+ "padding": False,
53
+ },
54
+ }
55
+
56
+
57
+ class EMOVAProcessor(ProcessorMixin):
58
+ r"""
59
+ Constructs an EMOVA processor which wraps an EMOVA image processor and a Qwen2 tokenizer into a single processor.
60
+ [`EMOVAProcessor`] offers all the functionalities of [`EMOVAImageProcessor`] and [`Qwen2TokenizerFast`]. See the
61
+ [`~EMOVAProcessor.__call__`] and [`~EMOVAProcessor.decode`] for more information.
62
+ Args:
63
+ image_processor ([`EMOVAImageProcessor`], *optional*):
64
+ The image processor is a required input.
65
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
66
+ The tokenizer is a required input.
67
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
68
+ in a chat into a tokenizable string.
69
+ """
70
+
71
+ attributes = ["image_processor", "tokenizer"]
72
+ valid_kwargs = ["chat_template"]
73
+ image_processor_class = "AutoImageProcessor"
74
+ # image_processor_class = "EMOVAImageProcessor"
75
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
76
+
77
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
78
+ super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)
79
+ self.speech_tokenizer = None
80
+
81
+ def set_speech_tokenizer(self, tokenizer=None):
82
+ if self.speech_tokenizer and tokenizer:
83
+ logger.info('Speech tokenizer is already set; keeping the existing one.')
84
+ return
85
+ self.speech_tokenizer = tokenizer
86
+ logger.info('Setting speech tokenizer!')
87
+
88
+ def prepare_audio_input(self, text, audio, has_image=False):
89
+ if text[0]["role"] == "system":
90
+ system_prompt = text[0]
91
+ valid_index = 1
92
+ else:
93
+ system_prompt = None
94
+ valid_index = 0
95
+ logger.warning("Audio inputs are given, but system prompts are not given.")
96
+ if len(text) > valid_index:
97
+ logger.warning("When audio inputs are given, text inputs except system prompts will be discarded.")
98
+
99
+ audio_chat_format = r'Please recognize the texts, emotion and pitch from the user question speech units and provide the texts, emotion, pitch and speech units for the assistant response. \nEmotion should be chosen from ["neutral", "happy", "sad", "angry", "surprised", "disgusted", "fearful"]. \nPitch should be chosen from ["low", "normal", "high"].\nYour output should be in json format.\nAn output example is:\n{"user question text": "", "user question emotion": "", "user question pitch": "", "assistant response text": "", "assistant response emotion": "", "assistant response pitch": "","assistant response speech": ""}\n\nuser question speech:'
100
+ audio_chat_prompt = audio_chat_format + self.speech_tokenizer.encode(audio)
101
+
102
+ if has_image:
103
+ audio_chat_input = {
104
+ "role": "user",
105
+ "content": [{"type": "image"}, {"type": "text", "text": audio_chat_prompt}],
106
+ }
107
+ else:
108
+ audio_chat_input = {
109
+ "role": "user",
110
+ "content": [{"type": "text", "text": audio_chat_prompt}],
111
+ }
112
+ return [system_prompt, audio_chat_input] if system_prompt else [audio_chat_input]
113
+
114
+ def prepare_audio_output(self, output):
115
+ try:
116
+ if output.startswith('{"{"'):
117
+ return self.prepare_audio_output(output[2:])
118
+ if output.startswith("{"):
119
+ if output.endswith("|>"):
120
+ output += "\"}"
121
+ elif output.endswith("\""):
122
+ output += "}"
123
+ info_dict = json.loads(output)
124
+ content_unit = info_dict['assistant response speech'].strip()
125
+ emotion = info_dict['assistant response emotion'] if 'assistant response emotion' in info_dict else "neutral"
126
+ speed = info_dict['assistant response speed'] if 'assistant response speed' in info_dict else "normal"
127
+ pitch = info_dict['assistant response pitch'] if 'assistant response pitch' in info_dict else "normal"
128
+ except Exception:
129
+ content_unit = output.strip()
130
+ emotion = 'neutral'
131
+ speed = "normal"
132
+ pitch = "normal"
133
+ return content_unit, emotion, speed, pitch
134
+
135
+ def __call__(
136
+ self,
137
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
138
+ images: ImageInput = None,
139
+ audios: Union[str, List[str]] = None,
140
+ **kwargs: Unpack[EMOVAProcessorKwargs],
141
+ ) -> BatchFeature:
142
+ """
143
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
144
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
145
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
146
+ EMOVAImageProcessor's [`~EMOVAImageProcessor.__call__`] if `vision_infos` is not `None`.
147
+
148
+ Args:
149
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
150
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
151
+ tensor. Both channels-first and channels-last formats are supported.
152
+ text (`str`, `List[str]`, `List[List[str]]`):
153
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
154
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
155
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
156
+ audios (`str`, `List[str]`): Paths to the audio input(s).
157
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
158
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
159
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
160
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
161
+ If set, will return tensors of a particular framework. Acceptable values are:
162
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
163
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
164
+ - `'np'`: Return NumPy `np.ndarray` objects.
165
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
166
+
167
+ Returns:
168
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
169
+
170
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
171
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
172
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
173
+ `None`).
174
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
175
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
176
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
177
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
178
+ """
179
+ output_kwargs = self._merge_kwargs(
180
+ EMOVAProcessorKwargs,
181
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
182
+ **kwargs,
183
+ )
184
+ if images is not None:
185
+ image_inputs = self.image_processor(images=images, videos=None, **output_kwargs["images_kwargs"])
186
+ image_grid_thw = image_inputs.pop("image_grid_thw")
187
+ image_inputs['image_sizes'] = image_grid_thw
188
+ else:
189
+ image_inputs = {}
190
+ image_sizes = None
191
+
192
+ if audios is not None:
193
+ audios = [audios] if not isinstance(audios, list) else audios
194
+ text = [text] if not isinstance(text[0], list) else text
195
+ assert len(audios) == len(text), "Audio inputs should correspond with text inputs."
196
+ assert self.speech_tokenizer, "Audio inputs are given, while speech tokenizer is not set. Call `EMOVAProcessor.set_speech_tokenizer()` before processing audio inputs."
197
+ text = [self.prepare_audio_input(txt, audio, has_image=images is not None) for txt, audio in zip(text, audios)]
198
+
199
+ if not isinstance(text, list):
200
+ text = [text]
201
+
202
+ _ = output_kwargs["text_kwargs"].pop("padding_side", None)
203
+ try:
204
+ text = self.apply_chat_template(text, add_generation_prompt=True, padding=True)
205
+ except Exception as e:
206
+ logger.info(f'Could not apply the chat template ({e}); assuming the input texts already have it applied.')
207
+
208
+
209
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
210
+
211
+ return BatchFeature(data={**text_inputs, **image_inputs})
212
+
213
+ def batch_decode(self, sequences, output_wav_prefix='output', *args, **kwargs):
214
+ return [self.decode(seq, output_wav_file="{}_{}.wav".format(output_wav_prefix, i), *args, **kwargs)
215
+ for i, seq in enumerate(sequences)]
216
+
217
+ def decode(self, *args, speaker='female', output_wav_file='output.wav', **kwargs):
218
+ output = self.tokenizer.decode(*args, **kwargs)
219
+ if '<|speech_' not in output:
220
+ return output
221
+ content_unit, emotion, speed, pitch = self.prepare_audio_output(output)
222
+ gender = speaker.lower()
223
+ condition = f'gender-{gender}_emotion-{emotion}_speed-{speed}_pitch-{pitch}'
224
+ self.speech_tokenizer.decode(content_unit, condition=condition, output_wav_file=output_wav_file)
225
+ return output_wav_file
226
+
227
+ @property
228
+ def model_input_names(self):
229
+ tokenizer_input_names = self.tokenizer.model_input_names
230
+ image_processor_input_names = self.image_processor.model_input_names
231
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
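Putting the pieces together, an untested end-to-end sketch of `EMOVAProcessor` — the repo id and image path are placeholders, and the message layout follows the chat template shipped in this commit:

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("<org>/<emova-repo>", trust_remote_code=True)  # hypothetical repo id

messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "What is shown in this image?"},
    ]},
]
inputs = processor(text=messages, images=Image.open("example.jpg"), return_tensors="pt")
# inputs holds input_ids / attention_mask from the tokenizer, plus pixel_values and
# image_sizes (the image_grid_thw returned by the image processor, renamed in __call__).

For spoken output, `set_speech_tokenizer(...)` has to be called first: per the `decode` method above, a wav file is synthesized only when the generated text contains `<|speech_...|>` units and a speech tokenizer is available.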
processor_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_emova.EMOVAProcessor"
4
+ },
5
+ "chat_template": "{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>\n' }}{% else %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + '<|im_end|>\n' }}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
6
+ "processor_class": "EMOVAProcessor"
7
+ }
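For reference, the chat template above renders a one-image user turn as follows (with `add_generation_prompt=True`); note that all images of a turn are emitted before its text:

<|im_start|>user
<image>
What is shown in this image?<|im_end|>
<|im_start|>assistant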
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<image>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|im_end|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<pad>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
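A quick sanity-check sketch for the map above — the repo id is hypothetical; every entry is registered as an indivisible special token, and `<|im_end|>` doubles as the EOS token:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<org>/<emova-repo>")  # hypothetical repo id
print(tokenizer.eos_token)                          # '<|im_end|>'
print(tokenizer.convert_tokens_to_ids("<image>"))   # a single id, never split into sub-tokens
print(tokenizer.tokenize("<|vision_start|><|image_pad|><|vision_end|>"))  # exactly three tokens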
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff