Upload processor
- added_tokens.json +0 -0
- chat_template.json +3 -0
- image_processing_emova.py +469 -0
- image_utils.py +806 -0
- merges.txt +0 -0
- preprocessor_config.json +35 -0
- processing_emova.py +231 -0
- processor_config.json +7 -0
- special_tokens_map.json +32 -0
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- vocab.json +0 -0
added_tokens.json
ADDED
The diff for this file is too large to render.
See raw diff
chat_template.json
ADDED
@@ -0,0 +1,3 @@
{
  "chat_template": "{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>\n' }}{% else %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + '<|im_end|>\n' }}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
}
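For reference, a short sketch of how this template expands at generation time, rendered with plain Jinja2 rather than through a processor (the file path and the example messages are assumptions, not part of this upload):

# Hedged sketch: render chat_template.json with Jinja2 directly.
import json
from jinja2 import Template

with open("chat_template.json") as f:  # path assumed; adjust to where the file lives
    template = Template(json.load(f)["chat_template"])

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
print(template.render(messages=messages, add_generation_prompt=True))
# Expected to produce, roughly:
# <|im_start|>user
# <image>
# Describe this image.<|im_end|>
# <|im_start|>assistant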
image_processing_emova.py
ADDED
@@ -0,0 +1,469 @@
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Qwen2-VL."""

import math
from typing import Dict, List, Optional, Union

import numpy as np

from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
    convert_to_rgb,
    resize,
    to_channel_dimension_format,
)
from .image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    VideoInput,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    is_valid_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from transformers.utils import TensorType, is_vision_available, logging

logger = logging.get_logger(__name__)

if is_vision_available():
    from PIL import Image


def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A list of images.
    """
    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        return [img for img_list in images for img in img_list]

    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
        return images

    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")


# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos

    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
        if isinstance(videos[0], Image.Image):
            return [videos]
        elif len(videos[0].shape) == 4:
            return [list(video) for video in videos]

    elif is_valid_image(videos) and len(videos.shape) == 4:
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")


def smart_resize(
    height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 4096
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    """
    if height < factor or width < factor:
        # print("height, width", height, width)
        if height < width:
            h_bar = factor
            w_bar = round(width / height * factor)
        else:
            h_bar = round(height / width * factor)
            w_bar = factor
        # print("h_bar, w_bar", h_bar, w_bar)
        height, width = h_bar, w_bar
        # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
    elif max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar


class EMOVAImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Qwen2-VL image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 4096,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        # import pdb; pdb.set_trace()
        # print("images", images)
        # for image in images:
        #     print("image", image.size)
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = resize(
                    image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                )

            if do_rescale:
                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            processed_images.append(image)

        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h // self.merge_size,
            self.merge_size,
            self.patch_size,
            grid_w // self.merge_size,
            self.merge_size,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
        )

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}

        return BatchFeature(data=data, tensor_type=return_tensors)
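A minimal usage sketch for the processor above (assumptions: image_processing_emova.py and image_utils.py sit together in a local package named emova_processor so the relative import resolves, and the input is a synthetic image):

# Hedged usage sketch; the package name "emova_processor" is an assumption.
import numpy as np
from PIL import Image
from emova_processor.image_processing_emova import EMOVAImageProcessor, smart_resize

# smart_resize snaps (height, width) to multiples of patch_size * merge_size = 28
# while keeping height * width inside [min_pixels, max_pixels].
print(smart_resize(480, 640, factor=28))  # (476, 644), both divisible by 28

processor = EMOVAImageProcessor()
image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))
out = processor(images=image, return_tensors="np")

# Each row of pixel_values is one flattened patch of length
# channels * temporal_patch_size * patch_size**2 = 3 * 2 * 14 * 14 = 1176.
print(out["pixel_values"].shape)  # (grid_t * grid_h * grid_w, 1176) = (1564, 1176)
print(out["image_grid_thw"])      # [[1, 34, 46]] for the 476 x 644 resize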
image_utils.py
ADDED
@@ -0,0 +1,806 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2021 The HuggingFace Inc. team.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
import base64
|
17 |
+
import os
|
18 |
+
from io import BytesIO
|
19 |
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
|
20 |
+
|
21 |
+
import numpy as np
|
22 |
+
import requests
|
23 |
+
from packaging import version
|
24 |
+
|
25 |
+
from transformers.utils import (
|
26 |
+
ExplicitEnum,
|
27 |
+
is_jax_tensor,
|
28 |
+
is_numpy_array,
|
29 |
+
is_tf_tensor,
|
30 |
+
is_torch_available,
|
31 |
+
is_torch_tensor,
|
32 |
+
is_torchvision_available,
|
33 |
+
is_vision_available,
|
34 |
+
logging,
|
35 |
+
requires_backends,
|
36 |
+
to_numpy,
|
37 |
+
)
|
38 |
+
from transformers.utils.constants import ( # noqa: F401
|
39 |
+
IMAGENET_DEFAULT_MEAN,
|
40 |
+
IMAGENET_DEFAULT_STD,
|
41 |
+
IMAGENET_STANDARD_MEAN,
|
42 |
+
IMAGENET_STANDARD_STD,
|
43 |
+
OPENAI_CLIP_MEAN,
|
44 |
+
OPENAI_CLIP_STD,
|
45 |
+
)
|
46 |
+
|
47 |
+
if is_vision_available():
|
48 |
+
import PIL.Image
|
49 |
+
import PIL.ImageOps
|
50 |
+
|
51 |
+
if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
|
52 |
+
PILImageResampling = PIL.Image.Resampling
|
53 |
+
else:
|
54 |
+
PILImageResampling = PIL.Image
|
55 |
+
|
56 |
+
if is_torchvision_available():
|
57 |
+
from torchvision.transforms import InterpolationMode
|
58 |
+
|
59 |
+
pil_torch_interpolation_mapping = {
|
60 |
+
PILImageResampling.NEAREST: InterpolationMode.NEAREST,
|
61 |
+
PILImageResampling.BOX: InterpolationMode.BOX,
|
62 |
+
PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
|
63 |
+
PILImageResampling.HAMMING: InterpolationMode.HAMMING,
|
64 |
+
PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
|
65 |
+
PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
|
66 |
+
}
|
67 |
+
|
68 |
+
if TYPE_CHECKING:
|
69 |
+
if is_torch_available():
|
70 |
+
import torch
|
71 |
+
|
72 |
+
logger = logging.get_logger(__name__)
|
73 |
+
|
74 |
+
ImageInput = Union[
|
75 |
+
"PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
|
76 |
+
] # noqa
|
77 |
+
|
78 |
+
VideoInput = Union[
|
79 |
+
List["PIL.Image.Image"],
|
80 |
+
"np.ndarray",
|
81 |
+
"torch.Tensor",
|
82 |
+
List["np.ndarray"],
|
83 |
+
List["torch.Tensor"],
|
84 |
+
List[List["PIL.Image.Image"]],
|
85 |
+
List[List["np.ndarrray"]],
|
86 |
+
List[List["torch.Tensor"]],
|
87 |
+
] # noqa
|
88 |
+
|
89 |
+
|
90 |
+
class ChannelDimension(ExplicitEnum):
|
91 |
+
FIRST = "channels_first"
|
92 |
+
LAST = "channels_last"
|
93 |
+
|
94 |
+
|
95 |
+
class AnnotationFormat(ExplicitEnum):
|
96 |
+
COCO_DETECTION = "coco_detection"
|
97 |
+
COCO_PANOPTIC = "coco_panoptic"
|
98 |
+
|
99 |
+
|
100 |
+
class AnnotionFormat(ExplicitEnum):
|
101 |
+
COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value
|
102 |
+
COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
|
103 |
+
|
104 |
+
|
105 |
+
AnnotationType = Dict[str, Union[int, str, List[Dict]]]
|
106 |
+
|
107 |
+
|
108 |
+
def is_pil_image(img):
|
109 |
+
return is_vision_available() and isinstance(img, PIL.Image.Image)
|
110 |
+
|
111 |
+
|
112 |
+
class ImageType(ExplicitEnum):
|
113 |
+
PIL = "pillow"
|
114 |
+
TORCH = "torch"
|
115 |
+
NUMPY = "numpy"
|
116 |
+
TENSORFLOW = "tensorflow"
|
117 |
+
JAX = "jax"
|
118 |
+
|
119 |
+
|
120 |
+
def get_image_type(image):
|
121 |
+
if is_pil_image(image):
|
122 |
+
return ImageType.PIL
|
123 |
+
if is_torch_tensor(image):
|
124 |
+
return ImageType.TORCH
|
125 |
+
if is_numpy_array(image):
|
126 |
+
return ImageType.NUMPY
|
127 |
+
if is_tf_tensor(image):
|
128 |
+
return ImageType.TENSORFLOW
|
129 |
+
if is_jax_tensor(image):
|
130 |
+
return ImageType.JAX
|
131 |
+
raise ValueError(f"Unrecognised image type {type(image)}")
|
132 |
+
|
133 |
+
|
134 |
+
def is_valid_image(img):
|
135 |
+
return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
|
136 |
+
|
137 |
+
|
138 |
+
def valid_images(imgs):
|
139 |
+
# If we have an list of images, make sure every image is valid
|
140 |
+
if isinstance(imgs, (list, tuple)):
|
141 |
+
for img in imgs:
|
142 |
+
if not valid_images(img):
|
143 |
+
return False
|
144 |
+
# If not a list of tuple, we have been given a single image or batched tensor of images
|
145 |
+
elif not is_valid_image(imgs):
|
146 |
+
return False
|
147 |
+
return True
|
148 |
+
|
149 |
+
|
150 |
+
def is_batched(img):
|
151 |
+
if isinstance(img, (list, tuple)):
|
152 |
+
return is_valid_image(img[0])
|
153 |
+
return False
|
154 |
+
|
155 |
+
|
156 |
+
def is_scaled_image(image: np.ndarray) -> bool:
|
157 |
+
"""
|
158 |
+
Checks to see whether the pixel values have already been rescaled to [0, 1].
|
159 |
+
"""
|
160 |
+
if image.dtype == np.uint8:
|
161 |
+
return False
|
162 |
+
|
163 |
+
# It's possible the image has pixel values in [0, 255] but is of floating type
|
164 |
+
return np.min(image) >= 0 and np.max(image) <= 1
|
165 |
+
|
166 |
+
|
167 |
+
def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
|
168 |
+
"""
|
169 |
+
Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1.
|
170 |
+
If the input is a batch of images, it is converted to a list of images.
|
171 |
+
|
172 |
+
Args:
|
173 |
+
images (`ImageInput`):
|
174 |
+
Image of images to turn into a list of images.
|
175 |
+
expected_ndims (`int`, *optional*, defaults to 3):
|
176 |
+
Expected number of dimensions for a single input image. If the input image has a different number of
|
177 |
+
dimensions, an error is raised.
|
178 |
+
"""
|
179 |
+
if is_batched(images):
|
180 |
+
return images
|
181 |
+
|
182 |
+
# Either the input is a single image, in which case we create a list of length 1
|
183 |
+
if isinstance(images, PIL.Image.Image):
|
184 |
+
# PIL images are never batched
|
185 |
+
return [images]
|
186 |
+
|
187 |
+
if is_valid_image(images):
|
188 |
+
if images.ndim == expected_ndims + 1:
|
189 |
+
# Batch of images
|
190 |
+
images = list(images)
|
191 |
+
elif images.ndim == expected_ndims:
|
192 |
+
# Single image
|
193 |
+
images = [images]
|
194 |
+
else:
|
195 |
+
raise ValueError(
|
196 |
+
f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got"
|
197 |
+
f" {images.ndim} dimensions."
|
198 |
+
)
|
199 |
+
return images
|
200 |
+
raise ValueError(
|
201 |
+
"Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or "
|
202 |
+
f"jax.ndarray, but got {type(images)}."
|
203 |
+
)
|
204 |
+
|
205 |
+
|
206 |
+
def to_numpy_array(img) -> np.ndarray:
|
207 |
+
if not is_valid_image(img):
|
208 |
+
raise ValueError(f"Invalid image type: {type(img)}")
|
209 |
+
|
210 |
+
if is_vision_available() and isinstance(img, PIL.Image.Image):
|
211 |
+
return np.array(img)
|
212 |
+
return to_numpy(img)
|
213 |
+
|
214 |
+
|
215 |
+
def infer_channel_dimension_format(
|
216 |
+
image: np.ndarray, num_channels: Optional[Union[int, Tuple[int, ...]]] = None
|
217 |
+
) -> ChannelDimension:
|
218 |
+
"""
|
219 |
+
Infers the channel dimension format of `image`.
|
220 |
+
|
221 |
+
Args:
|
222 |
+
image (`np.ndarray`):
|
223 |
+
The image to infer the channel dimension of.
|
224 |
+
num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
|
225 |
+
The number of channels of the image.
|
226 |
+
|
227 |
+
Returns:
|
228 |
+
The channel dimension of the image.
|
229 |
+
"""
|
230 |
+
num_channels = num_channels if num_channels is not None else (1, 3)
|
231 |
+
num_channels = (num_channels,) if isinstance(num_channels, int) else num_channels
|
232 |
+
|
233 |
+
if image.ndim == 3:
|
234 |
+
first_dim, last_dim = 0, 2
|
235 |
+
elif image.ndim == 4:
|
236 |
+
first_dim, last_dim = 1, 3
|
237 |
+
else:
|
238 |
+
raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
|
239 |
+
|
240 |
+
if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
|
241 |
+
logger.warning(
|
242 |
+
f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension."
|
243 |
+
)
|
244 |
+
return ChannelDimension.FIRST
|
245 |
+
elif image.shape[first_dim] in num_channels:
|
246 |
+
return ChannelDimension.FIRST
|
247 |
+
elif image.shape[last_dim] in num_channels:
|
248 |
+
return ChannelDimension.LAST
|
249 |
+
raise ValueError("Unable to infer channel dimension format")
|
250 |
+
|
251 |
+
|
252 |
+
def get_channel_dimension_axis(
|
253 |
+
image: np.ndarray, input_data_format: Optional[Union[ChannelDimension, str]] = None
|
254 |
+
) -> int:
|
255 |
+
"""
|
256 |
+
Returns the channel dimension axis of the image.
|
257 |
+
|
258 |
+
Args:
|
259 |
+
image (`np.ndarray`):
|
260 |
+
The image to get the channel dimension axis of.
|
261 |
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
262 |
+
The channel dimension format of the image. If `None`, will infer the channel dimension from the image.
|
263 |
+
|
264 |
+
Returns:
|
265 |
+
The channel dimension axis of the image.
|
266 |
+
"""
|
267 |
+
if input_data_format is None:
|
268 |
+
input_data_format = infer_channel_dimension_format(image)
|
269 |
+
if input_data_format == ChannelDimension.FIRST:
|
270 |
+
return image.ndim - 3
|
271 |
+
elif input_data_format == ChannelDimension.LAST:
|
272 |
+
return image.ndim - 1
|
273 |
+
raise ValueError(f"Unsupported data format: {input_data_format}")
|
274 |
+
|
275 |
+
|
276 |
+
def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]:
|
277 |
+
"""
|
278 |
+
Returns the (height, width) dimensions of the image.
|
279 |
+
|
280 |
+
Args:
|
281 |
+
image (`np.ndarray`):
|
282 |
+
The image to get the dimensions of.
|
283 |
+
channel_dim (`ChannelDimension`, *optional*):
|
284 |
+
Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.
|
285 |
+
|
286 |
+
Returns:
|
287 |
+
A tuple of the image's height and width.
|
288 |
+
"""
|
289 |
+
if channel_dim is None:
|
290 |
+
channel_dim = infer_channel_dimension_format(image)
|
291 |
+
|
292 |
+
if channel_dim == ChannelDimension.FIRST:
|
293 |
+
return image.shape[-2], image.shape[-1]
|
294 |
+
elif channel_dim == ChannelDimension.LAST:
|
295 |
+
return image.shape[-3], image.shape[-2]
|
296 |
+
else:
|
297 |
+
raise ValueError(f"Unsupported data format: {channel_dim}")
|
298 |
+
|
299 |
+
|
300 |
+
def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool:
|
301 |
+
if (
|
302 |
+
isinstance(annotation, dict)
|
303 |
+
and "image_id" in annotation
|
304 |
+
and "annotations" in annotation
|
305 |
+
and isinstance(annotation["annotations"], (list, tuple))
|
306 |
+
and (
|
307 |
+
# an image can have no annotations
|
308 |
+
len(annotation["annotations"]) == 0 or isinstance(annotation["annotations"][0], dict)
|
309 |
+
)
|
310 |
+
):
|
311 |
+
return True
|
312 |
+
return False
|
313 |
+
|
314 |
+
|
315 |
+
def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool:
|
316 |
+
if (
|
317 |
+
isinstance(annotation, dict)
|
318 |
+
and "image_id" in annotation
|
319 |
+
and "segments_info" in annotation
|
320 |
+
and "file_name" in annotation
|
321 |
+
and isinstance(annotation["segments_info"], (list, tuple))
|
322 |
+
and (
|
323 |
+
# an image can have no segments
|
324 |
+
len(annotation["segments_info"]) == 0 or isinstance(annotation["segments_info"][0], dict)
|
325 |
+
)
|
326 |
+
):
|
327 |
+
return True
|
328 |
+
return False
|
329 |
+
|
330 |
+
|
331 |
+
def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
|
332 |
+
return all(is_valid_annotation_coco_detection(ann) for ann in annotations)
|
333 |
+
|
334 |
+
|
335 |
+
def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
|
336 |
+
return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
|
337 |
+
|
338 |
+
|
339 |
+
def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
|
340 |
+
"""
|
341 |
+
Loads `image` to a PIL Image.
|
342 |
+
|
343 |
+
Args:
|
344 |
+
image (`str` or `PIL.Image.Image`):
|
345 |
+
The image to convert to the PIL Image format.
|
346 |
+
timeout (`float`, *optional*):
|
347 |
+
The timeout value in seconds for the URL request.
|
348 |
+
|
349 |
+
Returns:
|
350 |
+
`PIL.Image.Image`: A PIL Image.
|
351 |
+
"""
|
352 |
+
requires_backends(load_image, ["vision"])
|
353 |
+
if isinstance(image, str):
|
354 |
+
if image.startswith("http://") or image.startswith("https://"):
|
355 |
+
# We need to actually check for a real protocol, otherwise it's impossible to use a local file
|
356 |
+
# like http_huggingface_co.png
|
357 |
+
image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
|
358 |
+
elif os.path.isfile(image):
|
359 |
+
image = PIL.Image.open(image)
|
360 |
+
else:
|
361 |
+
if image.startswith("data:image/"):
|
362 |
+
image = image.split(",")[1]
|
363 |
+
|
364 |
+
# Try to load as base64
|
365 |
+
try:
|
366 |
+
b64 = base64.decodebytes(image.encode())
|
367 |
+
image = PIL.Image.open(BytesIO(b64))
|
368 |
+
except Exception as e:
|
369 |
+
raise ValueError(
|
370 |
+
f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
|
371 |
+
)
|
372 |
+
elif isinstance(image, PIL.Image.Image):
|
373 |
+
image = image
|
374 |
+
else:
|
375 |
+
raise TypeError(
|
376 |
+
"Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
|
377 |
+
)
|
378 |
+
image = PIL.ImageOps.exif_transpose(image)
|
379 |
+
image = image.convert("RGB")
|
380 |
+
return image
|
381 |
+
|
382 |
+
|
383 |
+
def validate_preprocess_arguments(
|
384 |
+
do_rescale: Optional[bool] = None,
|
385 |
+
rescale_factor: Optional[float] = None,
|
386 |
+
do_normalize: Optional[bool] = None,
|
387 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
388 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
389 |
+
do_pad: Optional[bool] = None,
|
390 |
+
size_divisibility: Optional[int] = None,
|
391 |
+
do_center_crop: Optional[bool] = None,
|
392 |
+
crop_size: Optional[Dict[str, int]] = None,
|
393 |
+
do_resize: Optional[bool] = None,
|
394 |
+
size: Optional[Dict[str, int]] = None,
|
395 |
+
resample: Optional["PILImageResampling"] = None,
|
396 |
+
):
|
397 |
+
"""
|
398 |
+
Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
|
399 |
+
Raises `ValueError` if arguments incompatibility is caught.
|
400 |
+
Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
|
401 |
+
sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
|
402 |
+
existing arguments when possible.
|
403 |
+
|
404 |
+
"""
|
405 |
+
if do_rescale and rescale_factor is None:
|
406 |
+
raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.")
|
407 |
+
|
408 |
+
if do_pad and size_divisibility is None:
|
409 |
+
# Here, size_divisor might be passed as the value of size
|
410 |
+
raise ValueError(
|
411 |
+
"Depending on the model, `size_divisibility`, `size_divisor`, `pad_size` or `size` must be specified if `do_pad` is `True`."
|
412 |
+
)
|
413 |
+
|
414 |
+
if do_normalize and (image_mean is None or image_std is None):
|
415 |
+
raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.")
|
416 |
+
|
417 |
+
if do_center_crop and crop_size is None:
|
418 |
+
raise ValueError("`crop_size` must be specified if `do_center_crop` is `True`.")
|
419 |
+
|
420 |
+
if do_resize and (size is None or resample is None):
|
421 |
+
raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.")
|
422 |
+
|
423 |
+
|
424 |
+
# In the future we can add a TF implementation here when we have TF models.
|
425 |
+
class ImageFeatureExtractionMixin:
|
426 |
+
"""
|
427 |
+
Mixin that contain utilities for preparing image features.
|
428 |
+
"""
|
429 |
+
|
430 |
+
def _ensure_format_supported(self, image):
|
431 |
+
if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_torch_tensor(image):
|
432 |
+
raise ValueError(
|
433 |
+
f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.array` and "
|
434 |
+
"`torch.Tensor` are."
|
435 |
+
)
|
436 |
+
|
437 |
+
def to_pil_image(self, image, rescale=None):
|
438 |
+
"""
|
439 |
+
Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
|
440 |
+
needed.
|
441 |
+
|
442 |
+
Args:
|
443 |
+
image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
|
444 |
+
The image to convert to the PIL Image format.
|
445 |
+
rescale (`bool`, *optional*):
|
446 |
+
Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
|
447 |
+
default to `True` if the image type is a floating type, `False` otherwise.
|
448 |
+
"""
|
449 |
+
self._ensure_format_supported(image)
|
450 |
+
|
451 |
+
if is_torch_tensor(image):
|
452 |
+
image = image.numpy()
|
453 |
+
|
454 |
+
if isinstance(image, np.ndarray):
|
455 |
+
if rescale is None:
|
456 |
+
# rescale default to the array being of floating type.
|
457 |
+
rescale = isinstance(image.flat[0], np.floating)
|
458 |
+
# If the channel as been moved to first dim, we put it back at the end.
|
459 |
+
if image.ndim == 3 and image.shape[0] in [1, 3]:
|
460 |
+
image = image.transpose(1, 2, 0)
|
461 |
+
if rescale:
|
462 |
+
image = image * 255
|
463 |
+
image = image.astype(np.uint8)
|
464 |
+
return PIL.Image.fromarray(image)
|
465 |
+
return image
|
466 |
+
|
467 |
+
def convert_rgb(self, image):
|
468 |
+
"""
|
469 |
+
Converts `PIL.Image.Image` to RGB format.
|
470 |
+
|
471 |
+
Args:
|
472 |
+
image (`PIL.Image.Image`):
|
473 |
+
The image to convert.
|
474 |
+
"""
|
475 |
+
self._ensure_format_supported(image)
|
476 |
+
if not isinstance(image, PIL.Image.Image):
|
477 |
+
return image
|
478 |
+
|
479 |
+
return image.convert("RGB")
|
480 |
+
|
481 |
+
def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.ndarray:
|
482 |
+
"""
|
483 |
+
Rescale a numpy image by scale amount
|
484 |
+
"""
|
485 |
+
self._ensure_format_supported(image)
|
486 |
+
return image * scale
|
487 |
+
|
488 |
+
def to_numpy_array(self, image, rescale=None, channel_first=True):
|
489 |
+
"""
|
490 |
+
Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
|
491 |
+
dimension.
|
492 |
+
|
493 |
+
Args:
|
494 |
+
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
|
495 |
+
The image to convert to a NumPy array.
|
496 |
+
rescale (`bool`, *optional*):
|
497 |
+
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
|
498 |
+
default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
|
499 |
+
channel_first (`bool`, *optional*, defaults to `True`):
|
500 |
+
Whether or not to permute the dimensions of the image to put the channel dimension first.
|
501 |
+
"""
|
502 |
+
self._ensure_format_supported(image)
|
503 |
+
|
504 |
+
if isinstance(image, PIL.Image.Image):
|
505 |
+
image = np.array(image)
|
506 |
+
|
507 |
+
if is_torch_tensor(image):
|
508 |
+
image = image.numpy()
|
509 |
+
|
510 |
+
rescale = isinstance(image.flat[0], np.integer) if rescale is None else rescale
|
511 |
+
|
512 |
+
if rescale:
|
513 |
+
image = self.rescale(image.astype(np.float32), 1 / 255.0)
|
514 |
+
|
515 |
+
if channel_first and image.ndim == 3:
|
516 |
+
image = image.transpose(2, 0, 1)
|
517 |
+
|
518 |
+
return image
|
519 |
+
|
520 |
+
def expand_dims(self, image):
|
521 |
+
"""
|
522 |
+
Expands 2-dimensional `image` to 3 dimensions.
|
523 |
+
|
524 |
+
Args:
|
525 |
+
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
|
526 |
+
The image to expand.
|
527 |
+
"""
|
528 |
+
self._ensure_format_supported(image)
|
529 |
+
|
530 |
+
# Do nothing if PIL image
|
531 |
+
if isinstance(image, PIL.Image.Image):
|
532 |
+
return image
|
533 |
+
|
534 |
+
if is_torch_tensor(image):
|
535 |
+
image = image.unsqueeze(0)
|
536 |
+
else:
|
537 |
+
image = np.expand_dims(image, axis=0)
|
538 |
+
return image
|
539 |
+
|
540 |
+
def normalize(self, image, mean, std, rescale=False):
|
541 |
+
"""
|
542 |
+
Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
|
543 |
+
if it's a PIL Image.
|
544 |
+
|
545 |
+
Args:
|
546 |
+
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
|
547 |
+
The image to normalize.
|
548 |
+
mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
|
549 |
+
The mean (per channel) to use for normalization.
|
550 |
+
std (`List[float]` or `np.ndarray` or `torch.Tensor`):
|
551 |
+
The standard deviation (per channel) to use for normalization.
|
552 |
+
rescale (`bool`, *optional*, defaults to `False`):
|
553 |
+
Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
|
554 |
+
happen automatically.
|
555 |
+
"""
|
556 |
+
self._ensure_format_supported(image)
|
557 |
+
|
558 |
+
if isinstance(image, PIL.Image.Image):
|
559 |
+
image = self.to_numpy_array(image, rescale=True)
|
560 |
+
# If the input image is a PIL image, it automatically gets rescaled. If it's another
|
561 |
+
# type it may need rescaling.
|
562 |
+
elif rescale:
|
563 |
+
if isinstance(image, np.ndarray):
|
564 |
+
image = self.rescale(image.astype(np.float32), 1 / 255.0)
|
565 |
+
elif is_torch_tensor(image):
|
566 |
+
image = self.rescale(image.float(), 1 / 255.0)
|
567 |
+
|
568 |
+
if isinstance(image, np.ndarray):
|
569 |
+
if not isinstance(mean, np.ndarray):
|
570 |
+
mean = np.array(mean).astype(image.dtype)
|
571 |
+
if not isinstance(std, np.ndarray):
|
572 |
+
std = np.array(std).astype(image.dtype)
|
573 |
+
elif is_torch_tensor(image):
|
574 |
+
import torch
|
575 |
+
|
576 |
+
if not isinstance(mean, torch.Tensor):
|
577 |
+
if isinstance(mean, np.ndarray):
|
578 |
+
mean = torch.from_numpy(mean)
|
579 |
+
else:
|
580 |
+
mean = torch.tensor(mean)
|
581 |
+
if not isinstance(std, torch.Tensor):
|
582 |
+
if isinstance(std, np.ndarray):
|
583 |
+
std = torch.from_numpy(std)
|
584 |
+
else:
|
585 |
+
std = torch.tensor(std)
|
586 |
+
|
587 |
+
if image.ndim == 3 and image.shape[0] in [1, 3]:
|
588 |
+
return (image - mean[:, None, None]) / std[:, None, None]
|
589 |
+
else:
|
590 |
+
return (image - mean) / std
|
591 |
+
|
592 |
+
def resize(self, image, size, resample=None, default_to_square=True, max_size=None):
|
593 |
+
"""
|
594 |
+
Resizes `image`. Enforces conversion of input to PIL.Image.
|
595 |
+
|
596 |
+
Args:
|
597 |
+
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
|
598 |
+
The image to resize.
|
599 |
+
size (`int` or `Tuple[int, int]`):
|
600 |
+
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
|
601 |
+
matched to this.
|
602 |
+
|
603 |
+
If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
|
604 |
+
`size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
|
605 |
+
this number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
|
606 |
+
resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
607 |
+
The filter to user for resampling.
|
608 |
+
default_to_square (`bool`, *optional*, defaults to `True`):
|
609 |
+
How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
|
610 |
+
square (`size`,`size`). If set to `False`, will replicate
|
611 |
+
[`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
|
612 |
+
with support for resizing only the smallest edge and providing an optional `max_size`.
|
613 |
+
max_size (`int`, *optional*, defaults to `None`):
|
614 |
+
The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
|
615 |
+
greater than `max_size` after being resized according to `size`, then the image is resized again so
|
616 |
+
that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller
|
617 |
+
edge may be shorter than `size`. Only used if `default_to_square` is `False`.
|
618 |
+
|
619 |
+
Returns:
|
620 |
+
image: A resized `PIL.Image.Image`.
|
621 |
+
"""
|
        resample = resample if resample is not None else PILImageResampling.BILINEAR

        self._ensure_format_supported(image)

        if not isinstance(image, PIL.Image.Image):
            image = self.to_pil_image(image)

        if isinstance(size, list):
            size = tuple(size)

        if isinstance(size, int) or len(size) == 1:
            if default_to_square:
                size = (size, size) if isinstance(size, int) else (size[0], size[0])
            else:
                width, height = image.size
                # specified size only for the smallest edge
                short, long = (width, height) if width <= height else (height, width)
                requested_new_short = size if isinstance(size, int) else size[0]

                if short == requested_new_short:
                    return image

                new_short, new_long = requested_new_short, int(requested_new_short * long / short)

                if max_size is not None:
                    if max_size <= requested_new_short:
                        raise ValueError(
                            f"max_size = {max_size} must be strictly greater than the requested "
                            f"size for the smaller edge size = {size}"
                        )
                    if new_long > max_size:
                        new_short, new_long = int(max_size * new_short / new_long), max_size

                size = (new_short, new_long) if width <= height else (new_long, new_short)

        return image.resize(size, resample=resample)

    def center_crop(self, image, size):
        """
        Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
        size given, it will be padded (so the returned result has the size asked).

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
                The image to crop.
            size (`int` or `Tuple[int, int]`):
                The size to which the image will be cropped.

        Returns:
            new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
            height, width).
        """
        self._ensure_format_supported(image)

        if not isinstance(size, tuple):
            size = (size, size)

        # PIL Image.size is (width, height) but NumPy array and torch Tensors have (height, width)
        if is_torch_tensor(image) or isinstance(image, np.ndarray):
            if image.ndim == 2:
                image = self.expand_dims(image)
            image_shape = image.shape[1:] if image.shape[0] in [1, 3] else image.shape[:2]
        else:
            image_shape = (image.size[1], image.size[0])

        top = (image_shape[0] - size[0]) // 2
        bottom = top + size[0]  # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
        left = (image_shape[1] - size[1]) // 2
        right = left + size[1]  # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.

        # For PIL Images we have a method to crop directly.
        if isinstance(image, PIL.Image.Image):
            return image.crop((left, top, right, bottom))

        # Check if image is in (n_channels, height, width) or (height, width, n_channels) format
        channel_first = True if image.shape[0] in [1, 3] else False

        # Transpose (height, width, n_channels) format images
        if not channel_first:
            if isinstance(image, np.ndarray):
                image = image.transpose(2, 0, 1)
            if is_torch_tensor(image):
                image = image.permute(2, 0, 1)

        # Check if cropped area is within image boundaries
        if top >= 0 and bottom <= image_shape[0] and left >= 0 and right <= image_shape[1]:
            return image[..., top:bottom, left:right]

        # Otherwise, we may need to pad if the image is too small. Oh joy...
        new_shape = image.shape[:-2] + (max(size[0], image_shape[0]), max(size[1], image_shape[1]))
        if isinstance(image, np.ndarray):
            new_image = np.zeros_like(image, shape=new_shape)
        elif is_torch_tensor(image):
            new_image = image.new_zeros(new_shape)

        top_pad = (new_shape[-2] - image_shape[0]) // 2
        bottom_pad = top_pad + image_shape[0]
        left_pad = (new_shape[-1] - image_shape[1]) // 2
        right_pad = left_pad + image_shape[1]
        new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image

        top += top_pad
        bottom += top_pad
        left += left_pad
        right += left_pad

        new_image = new_image[
            ..., max(0, top): min(new_image.shape[-2], bottom), max(0, left): min(new_image.shape[-1], right)
        ]

        return new_image

    def flip_channel_order(self, image):
        """
        Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
        `image` to a NumPy array if it's a PIL Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
                be first.
        """
        self._ensure_format_supported(image)

        if isinstance(image, PIL.Image.Image):
            image = self.to_numpy_array(image)

        return image[::-1, :, :]

    def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None):
        """
        Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
        counter clockwise around its centre.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
                rotating.

        Returns:
            image: A rotated `PIL.Image.Image`.
        """
        resample = resample if resample is not None else PIL.Image.NEAREST

        self._ensure_format_supported(image)

        if not isinstance(image, PIL.Image.Image):
            image = self.to_pil_image(image)

        return image.rotate(
            angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor
        )


def validate_annotations(
    annotation_format: AnnotationFormat,
    supported_annotation_formats: Tuple[AnnotationFormat, ...],
    annotations: List[Dict],
) -> None:
    if annotation_format not in supported_annotation_formats:
        raise ValueError(
            f"Unsupported annotation format: {annotation_format} must be one of {supported_annotation_formats}"
        )

    if annotation_format is AnnotationFormat.COCO_DETECTION:
        if not valid_coco_detection_annotations(annotations):
            raise ValueError(
                "Invalid COCO detection annotations. Annotations must be a dict (single image) or list of dicts "
                "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
                "being a list of annotations in the COCO format."
            )

    if annotation_format is AnnotationFormat.COCO_PANOPTIC:
        if not valid_coco_panoptic_annotations(annotations):
            raise ValueError(
                "Invalid COCO panoptic annotations. Annotations must be a dict (single image) or list of dicts "
                "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
                "the latter being a list of annotations in the COCO format."
            )


def validate_kwargs(valid_processor_keys: List[str], captured_kwargs: List[str]):
    unused_keys = set(captured_kwargs).difference(set(valid_processor_keys))
    if unused_keys:
        unused_key_str = ", ".join(unused_keys)
        # TODO raise a warning here instead of simply logging?
        logger.warning(f"Unused or unrecognized kwargs: {unused_key_str}.")
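For illustration, the smallest-edge rule used by `resize` above (the `default_to_square=False` branch) can be exercised in isolation. The function below is a minimal standalone sketch; `smallest_edge_resize` and the example dimensions are made up for illustration and are not part of the uploaded file.

def smallest_edge_resize(width, height, size, max_size=None):
    # Match the smaller edge to `size` and scale the longer edge proportionally,
    # mirroring the arithmetic in `resize` above.
    short, long = (width, height) if width <= height else (height, width)
    new_short, new_long = size, int(size * long / short)
    if max_size is not None and new_long > max_size:
        # Cap the longer edge at `max_size`; the shorter edge may then end up below `size`.
        new_short, new_long = int(max_size * new_short / new_long), max_size
    return (new_short, new_long) if width <= height else (new_long, new_short)

print(smallest_edge_resize(640, 480, size=256))                # (341, 256)
print(smallest_edge_resize(640, 480, size=256, max_size=320))  # (320, 240)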
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
preprocessor_config.json
ADDED
@@ -0,0 +1,35 @@
{
  "auto_map": {
    "AutoImageProcessor": "image_processing_emova.EMOVAImageProcessor",
    "AutoProcessor": "processing_emova.EMOVAProcessor"
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "EMOVAImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "max_pixels": 3211264,
  "merge_size": 2,
  "min_pixels": 3136,
  "patch_size": 14,
  "processor_class": "EMOVAProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "max_pixels": 3211264,
    "min_pixels": 3136
  },
  "temporal_patch_size": 2
}
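This configuration wires `AutoImageProcessor` and `AutoProcessor` to the custom classes above via `auto_map`, so loading requires `trust_remote_code=True`. A minimal loading sketch follows; the repo id is a placeholder, not taken from this upload.

from transformers import AutoImageProcessor, AutoProcessor

repo_id = "your-namespace/emova-checkpoint"  # placeholder repo id for illustration

# auto_map routes to image_processing_emova.EMOVAImageProcessor and
# processing_emova.EMOVAProcessor, which live inside the repo, hence trust_remote_code=True.
image_processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

# size.min_pixels / size.max_pixels bound the dynamic resolution:
# 3136 = 56 * 56 pixels and 3211264 = 1792 * 1792 pixels.
print(image_processor.size)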
processing_emova.py
ADDED
@@ -0,0 +1,231 @@
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for EMOVA with qwen2vit.
"""

import json
from typing import List, Union

from transformers import AutoProcessor, AutoImageProcessor

try:
    from typing import Unpack
except ImportError:
    from typing_extensions import Unpack

from transformers.feature_extraction_utils import BatchFeature
from .image_utils import ImageInput, VideoInput
from transformers.processing_utils import (
    ProcessingKwargs,
    ProcessorMixin,
)
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils import logging

from .configuration_emova import EMOVAConfig
from .image_processing_emova import EMOVAImageProcessor

logger = logging.get_logger(__name__)


class EMOVAProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
    }


class EMOVAProcessor(ProcessorMixin):
    r"""
    Constructs an EMOVA processor which wraps an EMOVA image processor and a Qwen2 tokenizer into a single processor.
    [`EMOVAProcessor`] offers all the functionalities of [`EMOVAImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~EMOVAProcessor.__call__`] and [`~EMOVAProcessor.decode`] for more information.

    Args:
        image_processor ([`EMOVAImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    # image_processor_class = "EMOVAImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)
        self.speech_tokenizer = None

    def set_speech_tokenizer(self, tokenizer=None):
        if self.speech_tokenizer and tokenizer:
            logger.info('Speech tokenizer is already set; keeping the existing one.')
            return
        self.speech_tokenizer = tokenizer
        logger.info('Setting speech tokenizer!')

    def prepare_audio_input(self, text, audio, has_image=False):
        if text[0]["role"] == "system":
            system_prompt = text[0]
            valid_index = 1
        else:
            system_prompt = None
            valid_index = 0
            logger.warning("Audio inputs are given, but system prompts are not given.")
        if len(text) > valid_index:
            logger.warning("When audio inputs are given, text inputs except system prompts will be discarded.")

        audio_chat_format = r'Please recognize the texts, emotion and pitch from the user question speech units and provide the texts, emotion, pitch and speech units for the assistant response. \nEmotion should be chosen from ["neutral", "happy", "sad", "angry", "surprised", "disgusted", "fearful"]. \nPitch should be chosen from ["low", "normal", "high"].\nYour output should be in json format.\nAn output example is:\n{"user question text": "", "user question emotion": "", "user question pitch": "", "assistant response text": "", "assistant response emotion": "", "assistant response pitch": "","assistant response speech": ""}\n\nuser question speech:'
        audio_chat_prompt = audio_chat_format + self.speech_tokenizer.encode(audio)

        if has_image:
            audio_chat_input = {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": audio_chat_prompt}],
            }
        else:
            audio_chat_input = {
                "role": "user",
                "content": [{"type": "text", "text": audio_chat_prompt}],
            }
        return [system_prompt, audio_chat_input] if system_prompt else [audio_chat_input]

    def prepare_audio_output(self, output):
        try:
            if output.startswith('{"{"'):
                return self.prepare_audio_output(output[2:])
            if output.startswith("{"):
                if output.endswith("|>"):
                    output += "\"}"
                elif output.endswith("\""):
                    output += "}"
            info_dict = json.loads(output)
            content_unit = info_dict['assistant response speech'].strip()
            emotion = info_dict['assistant response emotion'] if 'assistant response emotion' in info_dict else "neutral"
            speed = info_dict['assistant response speed'] if 'assistant response speed' in info_dict else "normal"
            pitch = info_dict['assistant response pitch'] if 'assistant response pitch' in info_dict else "normal"
        except:
            content_unit = output.strip()
            emotion = 'neutral'
            speed = "normal"
            pitch = "normal"
        return content_unit, emotion, speed, pitch

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        audios: Union[str, List[str]] = None,
        **kwargs: Unpack[EMOVAProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
        EMOVAImageProcessor's [`~EMOVAImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audios (`str`, `List[str]`): Paths to the audio input(s).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            EMOVAProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(images=images, videos=None, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs.pop("image_grid_thw")
            image_inputs['image_sizes'] = image_grid_thw
        else:
            image_inputs = {}
            image_sizes = None

        if audios is not None:
            audios = [audios] if not isinstance(audios, list) else audios
            text = [text] if not isinstance(text[0], list) else text
            assert len(audios) == len(text), "Audio inputs should correspond with text inputs."
            assert self.speech_tokenizer, "Audio inputs are given, while speech tokenizer is not set. Call `EMOVAProcessor.set_speech_tokenizer()` before processing audio inputs."
            text = [self.prepare_audio_input(txt, audio, has_image=images is not None) for txt, audio in zip(text, audios)]

        if not isinstance(text, list):
            text = [text]

        _ = output_kwargs["text_kwargs"].pop("padding_side", None)
        try:
            text = self.apply_chat_template(text, add_generation_prompt=True, padding=True)
        except Exception as e:
            logger.info('Input texts appear to already have the chat template applied; tokenizing them as-is.')

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**text_inputs, **image_inputs})

    def batch_decode(self, sequences, output_wav_prefix='output', *args, **kwargs):
        return [self.decode(seq, output_wav_file="{}_{}.wav".format(output_wav_prefix, i), *args, **kwargs)
                for i, seq in enumerate(sequences)]

    def decode(self, *args, speaker='female', output_wav_file='output.wav', **kwargs):
        output = self.tokenizer.decode(*args, **kwargs)
        if '<|speech_' not in output:
            return output
        content_unit, emotion, speed, pitch = self.prepare_audio_output(output)
        gender = speaker.lower()
        condition = f'gender-{gender}_emotion-{emotion}_speed-{speed}_pitch-{pitch}'
        self.speech_tokenizer.decode(content_unit, condition=condition, output_wav_file=output_wav_file)
        return output_wav_file

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
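A rough usage sketch for `EMOVAProcessor.__call__`, under the same placeholder repo id; the exact input shapes and file names are assumptions, but `__call__` applies the chat template with `add_generation_prompt=True` and merges tokenizer and image-processor outputs into one `BatchFeature`.

from PIL import Image
from transformers import AutoProcessor

# Placeholder repo id; any checkpoint exposing processing_emova.EMOVAProcessor via auto_map works.
processor = AutoProcessor.from_pretrained("your-namespace/emova-checkpoint", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]},
]
image = Image.open("example.jpg")  # hypothetical local image

# Returns input_ids / attention_mask from the tokenizer plus the image features
# (pixel values and image_sizes, renamed from image_grid_thw) from the image processor.
inputs = processor(text=messages, images=[image], return_tensors="pt")
print(list(inputs.keys()))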
processor_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "auto_map": {
    "AutoProcessor": "processing_emova.EMOVAProcessor"
  },
  "chat_template": "{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>\n' }}{% else %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + '<|im_end|>\n' }}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "processor_class": "EMOVAProcessor"
}
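The `chat_template` above renders structured messages into the Qwen-style `<|im_start|>` / `<|im_end|>` format, emitting every `<image>` placeholder of a turn before its text. A small sketch of the expected rendering (placeholder repo id, as before):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("your-namespace/emova-checkpoint", trust_remote_code=True)  # placeholder

messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What is in this image?"}]},
]

# Expected output, following the template above:
# '<|im_start|>user\n<image>\nWhat is in this image?<|im_end|>\n<|im_start|>assistant\n'
print(repr(processor.apply_chat_template(messages, add_generation_prompt=True)))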
special_tokens_map.json
ADDED
@@ -0,0 +1,32 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>",
    "<image>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff