Adding all project files
This commit is contained in:
parent 6c9e127bdc
commit cd4316ad0f
42289 changed files with 8009643 additions and 0 deletions
@@ -0,0 +1,31 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_llava_onevision import *
    from .image_processing_llava_onevision import *
    from .image_processing_llava_onevision_fast import *
    from .modeling_llava_onevision import *
    from .processing_llava_onevision import *
    from .video_processing_llava_onevision import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
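Illustrative note (not part of the committed file): a minimal sketch of how the lazy module set up above is expected to behave, assuming a standard transformers installation where this package layout is importable.

# Attribute access on the package resolves through _LazyModule, so the heavy submodules
# (configuration, image processing, modeling, ...) are only imported on first use.
from transformers.models.llava_onevision import LlavaOnevisionConfig  # triggers the lazy import

config = LlavaOnevisionConfig()
print(type(config).__name__)  # LlavaOnevisionConfig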
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,196 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from ...configuration_utils import PretrainedConfig
from ...utils import (
    logging,
)
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class LlavaOnevisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlavaOnevisionForConditionalGeneration`]. It is used to instantiate a
    LLaVA-OneVision model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)
    model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`):
            The config object or dictionary of the vision backbone.
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
            The config object or dictionary of the text backbone.
        image_token_index (`int`, *optional*, defaults to 151646):
            The image token index to encode the image prompt.
        video_token_index (`int`, *optional*, defaults to 151647):
            The video token index to encode the video prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
            If `"full"`, the full vision features are used.
        vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -1):
            The index of the layer to select the vision feature. If multiple indices are provided,
            the vision feature of the corresponding indices will be concatenated to form the
            vision features.
        vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
            Aspect ratio used when processing image features. The default value is "anyres_max_9".
        image_grid_pinpoints (`List`, *optional*):
            A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the multimodal projector.

    Example:

    ```python
    >>> from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionConfig, SiglipVisionConfig, Qwen2Config

    >>> # Initializing a Siglip vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Qwen2 config
    >>> text_config = Qwen2Config()

    >>> # Initializing a Llava-Onevision llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration
    >>> configuration = LlavaOnevisionConfig(vision_config, text_config)

    >>> # Initializing a model from the llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration
    >>> model = LlavaOnevisionForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llava_onevision"
    attribute_map = {
        "image_token_id": "image_token_index",
        "video_token_id": "video_token_index",
    }
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_index=151646,
        video_token_index=151647,
        projector_hidden_act="gelu",
        vision_feature_select_strategy="full",
        vision_feature_layer=-1,
        vision_aspect_ratio="anyres_max_9",
        image_grid_pinpoints=None,
        tie_word_embeddings=False,
        multimodal_projector_bias=True,
        **kwargs,
    ):
        self.image_token_index = image_token_index
        self.video_token_index = video_token_index
        self.projector_hidden_act = projector_hidden_act
        self.multimodal_projector_bias = multimodal_projector_bias

        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'. "
                f"Got: {vision_feature_select_strategy}"
            )

        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
        self.vision_aspect_ratio = vision_aspect_ratio
        image_grid_pinpoints = (
            image_grid_pinpoints
            if image_grid_pinpoints is not None
            else [
                [384, 384],
                [384, 768],
                [384, 1152],
                [384, 1536],
                [384, 1920],
                [384, 2304],
                [768, 384],
                [768, 768],
                [768, 1152],
                [768, 1536],
                [768, 1920],
                [768, 2304],
                [1152, 384],
                [1152, 768],
                [1152, 1152],
                [1152, 1536],
                [1152, 1920],
                [1152, 2304],
                [1536, 384],
                [1536, 768],
                [1536, 1152],
                [1536, 1536],
                [1536, 1920],
                [1536, 2304],
                [1920, 384],
                [1920, 768],
                [1920, 1152],
                [1920, 1536],
                [1920, 1920],
                [1920, 2304],
                [2304, 384],
                [2304, 768],
                [2304, 1152],
                [2304, 1536],
                [2304, 1920],
                [2304, 2304],
            ]
        )
        self.image_grid_pinpoints = image_grid_pinpoints

        if isinstance(vision_config, dict):
            vision_config["model_type"] = (
                vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
            )
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["siglip_vision_model"](
                hidden_size=1152,
                intermediate_size=4304,
                patch_size=14,
                image_size=384,
                num_hidden_layers=26,
                num_attention_heads=16,
                vision_use_head=False,
            )

        self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2"
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["qwen2"]()

        self.text_config = text_config

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


__all__ = ["LlavaOnevisionConfig"]
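Illustrative note (not part of the diff): a minimal sketch of how the sub-config handling in `__init__` above behaves, assuming a transformers installation where `LlavaOnevisionConfig` is importable; the particular kwarg values are made up for illustration.

from transformers import LlavaOnevisionConfig

# Sub-configs may be passed as plain dicts; a missing "model_type" falls back to the defaults
# chosen in __init__ above ("siglip_vision_model" for vision, "qwen2" for text).
config = LlavaOnevisionConfig(
    vision_config={"hidden_size": 1152, "image_size": 384},
    text_config={"hidden_size": 1024},
)
print(config.vision_config.model_type)    # "siglip_vision_model"
print(config.text_config.model_type)      # "qwen2"
print(len(config.image_grid_pinpoints))   # 36 default anyres pinpoints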
@@ -0,0 +1,784 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for LLaVa-Onevision."""

from collections.abc import Iterable
from typing import Optional, Union

import numpy as np

from ...image_processing_utils import (
    BaseImageProcessor,
    BatchFeature,
    get_patch_output_size,
    get_size_dict,
    select_best_resolution,
)
from ...image_transforms import (
    PaddingMode,
    convert_to_rgb,
    pad,
    resize,
    to_channel_dimension_format,
)
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_flat_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging


logger = logging.get_logger(__name__)


if is_vision_available():
    from PIL import Image


# Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> list[np.array]:
    """
    Divides an image into patches of a specified size.

    Args:
        image (`np.array`):
            The input image.
        patch_size (`int`):
            The size of each patch.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        list: A list of np.array representing the patches.
    """
    patches = []
    height, width = get_image_size(image, channel_dim=input_data_format)
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            if input_data_format == ChannelDimension.LAST:
                patch = image[i : i + patch_size, j : j + patch_size]
            else:
                patch = image[:, i : i + patch_size, j : j + patch_size]
            patches.append(patch)

    return patches


# Copied from transformers.models.llava_next.image_processing_llava_next.expand_to_square
def expand_to_square(image: np.array, background_color, input_data_format) -> np.array:
    """
    Expands an image to a square by adding a background color.
    """

    height, width = get_image_size(image, channel_dim=input_data_format)
    if width == height:
        return image
    elif width > height:
        result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color
        result[(width - height) // 2 : (width - height) // 2 + height, :] = image
        return result
    else:
        result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color
        result[:, (height - width) // 2 : (height - width) // 2 + width] = image
        return result


class LlavaOnevisionImageProcessor(BaseImageProcessor):
    r"""
    Constructs a LLaVa-Onevision image processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the image after resizing. If given as `{"shortest_edge": int}`, the shortest edge of the image is
            resized to that value, with the longest edge resized to keep the input aspect ratio. Can be overridden by
            `size` in the `preprocess` method.
        image_grid_pinpoints (`List` *optional*, defaults to a grid of multiples of 384 from `[384, 384]` to `[2304, 2304]`):
            A list of possible resolutions to use for processing high resolution images. The best resolution is selected
            based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
            method. Not used for processing videos.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
            number of patches in the batch. Padding will be applied to the bottom and right with zeros.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """

    model_input_names = ["pixel_values_videos"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict[str, int]] = None,
        image_grid_pinpoints: Optional[list] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_pad: Optional[bool] = True,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 384, "width": 384}
        size = get_size_dict(size, default_to_square=False)
        image_grid_pinpoints = (
            image_grid_pinpoints
            if image_grid_pinpoints is not None
            else [
                [384, 384],
                [384, 768],
                [384, 1152],
                [384, 1536],
                [384, 1920],
                [384, 2304],
                [768, 384],
                [768, 768],
                [768, 1152],
                [768, 1536],
                [768, 1920],
                [768, 2304],
                [1152, 384],
                [1152, 768],
                [1152, 1152],
                [1152, 1536],
                [1152, 1920],
                [1152, 2304],
                [1536, 384],
                [1536, 768],
                [1536, 1152],
                [1536, 1536],
                [1536, 1920],
                [1536, 2304],
                [1920, 384],
                [1920, 768],
                [1920, 1152],
                [1920, 1536],
                [1920, 1920],
                [1920, 2304],
                [2304, 384],
                [2304, 768],
                [2304, 1152],
                [2304, 1536],
                [2304, 1920],
                [2304, 2304],
            ]
        )

        self.do_resize = do_resize
        self.size = size
        self.image_grid_pinpoints = image_grid_pinpoints
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_pad = do_pad
        self.do_convert_rgb = do_convert_rgb

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.pad
    def pad(
        self,
        image: np.ndarray,
        padding: Union[int, tuple[int, int], Iterable[tuple[int, int]]],
        mode: PaddingMode = PaddingMode.CONSTANT,
        constant_values: Union[float, Iterable[float]] = 0.0,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
        dimension or in the (`num_patches`) dimension. In the second case an iterable of tuples is expected
        as input.

        Args:
            image (`np.ndarray`):
                The image to pad.
            padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
                Padding to apply to the edges of the height, width axes. Can be one of three formats:
                - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
                - `((before, after),)` yields same before and after pad for height and width.
                - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
            mode (`PaddingMode`):
                The padding mode to use. Can be one of:
                - `"constant"`: pads with a constant value.
                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                  vector along each axis.
                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            `np.ndarray`: The padded image.

        """

        # call the general `pad` if padding on `height/width`, otherwise it's the `num_patches` dim
        if isinstance(padding, int) or len(padding) != 4:
            return pad(image, padding, mode, constant_values, data_format, input_data_format)

        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        if mode == PaddingMode.CONSTANT:
            image = np.pad(image, padding, mode="constant", constant_values=constant_values)
        elif mode == PaddingMode.REFLECT:
            image = np.pad(image, padding, mode="reflect")
        elif mode == PaddingMode.REPLICATE:
            image = np.pad(image, padding, mode="edge")
        elif mode == PaddingMode.SYMMETRIC:
            image = np.pad(image, padding, mode="symmetric")
        else:
            raise ValueError(f"Invalid padding mode: {mode}")
        image = (
            to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
        )
        return image

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._resize_for_patching
    def _resize_for_patching(
        self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
    ) -> np.array:
        """
        Resizes an image to a target resolution while maintaining aspect ratio.

        Args:
            image (np.array):
                The input image.
            target_resolution (tuple):
                The target resolution (height, width) of the image.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            np.array: The resized and padded image.
        """
        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

        # Resize the image
        resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)

        return resized_image

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._get_padding_size
    def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
        original_height, original_width = original_resolution
        target_height, target_width = target_resolution
        paste_x, r_x = divmod(target_width - original_width, 2)
        paste_y, r_y = divmod(target_height - original_height, 2)
        return (paste_y, paste_y + r_y), (paste_x, paste_x + r_x)

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_patching
    def _pad_for_patching(
        self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
    ) -> np.array:
        """
        Pad an image to a target resolution while maintaining aspect ratio.
        """
        new_resolution = get_patch_output_size(image, target_resolution, input_data_format)
        padding = self._get_padding_size(new_resolution, target_resolution)

        padded_image = self.pad(image, padding=padding)

        return padded_image

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.get_image_patches
    def get_image_patches(
        self,
        image: np.array,
        grid_pinpoints,
        size: tuple,
        patch_size: int,
        resample: PILImageResampling,
        data_format: ChannelDimension,
        input_data_format: ChannelDimension,
    ) -> list[np.array]:
        """
        Process an image with variable resolutions by dividing it into patches.

        Args:
            image (np.array):
                The input image to be processed.
            grid_pinpoints (List):
                A list of possible resolutions to choose from.
            size (`tuple`):
                Size to resize the original image to.
            patch_size (`int`):
                Size of the patches to divide the image into.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            data_format (`ChannelDimension` or `str`):
                The channel dimension format for the output image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            list[np.array]: A list of NumPy arrays containing the processed image patches.
        """
        if not isinstance(grid_pinpoints, list):
            raise TypeError("grid_pinpoints must be a list of possible resolutions.")

        possible_resolutions = grid_pinpoints

        image_size = get_image_size(image, channel_dim=input_data_format)
        best_resolution = select_best_resolution(image_size, possible_resolutions)
        resized_image = self._resize_for_patching(
            image, best_resolution, resample=resample, input_data_format=input_data_format
        )
        padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)

        patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format)

        # make sure that all patches are in the input data format
        patches = [
            to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format)
            for patch in patches
        ]

        resized_original_image = resize(
            image,
            size=size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )

        image_patches = [resized_original_image] + patches

        return image_patches

    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_batching
    def _pad_for_batching(
        self,
        pixel_values: list[np.ndarray],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pads images on the `num_of_patches` dimension with zeros to form a batch with the same number of patches.

        Args:
            pixel_values (`list[np.ndarray]`):
                An array of pixel values of each image of shape (`batch_size`, `num_patches`, `image_in_3D`)
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            list[`np.ndarray`]: The padded images.
        """
        max_patch = max(len(x) for x in pixel_values)
        pixel_values = [
            self.pad(
                image,
                padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0), (0, 0)),
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in pixel_values
        ]

        return pixel_values

    # Copied from transformers.models.llava.image_processing_llava.LlavaImageProcessor.pad_to_square
    def pad_to_square(
        self,
        image: np.ndarray,
        background_color: Union[int, tuple[int, int, int]] = 0,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.array:
        """
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing multi-channel images. If passed as integer
                in multi-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            `np.ndarray`: The padded image.
        """
        height, width = get_image_size(image, input_data_format)
        num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]

        if height == width:
            image = (
                to_channel_dimension_format(image, data_format, input_data_format)
                if data_format is not None
                else image
            )
            return image

        max_dim = max(height, width)

        # Ensure background_color is the correct shape
        if isinstance(background_color, int):
            background_color = [background_color]
        elif len(background_color) != num_channels:
            raise ValueError(
                f"background_color must have no more than {num_channels} elements to match the number of channels"
            )

        if input_data_format == ChannelDimension.FIRST:
            result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[i, :, :] = color
            if width > height:
                start = (max_dim - height) // 2
                result[:, start : start + height, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, :, start : start + width] = image
        else:
            result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
            for i, color in enumerate(background_color):
                result[:, :, i] = color
            if width > height:
                start = (max_dim - height) // 2
                result[start : start + height, :, :] = image
            else:
                start = (max_dim - width) // 2
                result[:, start : start + width, :] = image

        image = (
            to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result
        )
        return image

    def _preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> Image.Image:
        """
        Args:
            images (`ImageInput`):
                Batch of frames (one video) to preprocess. Expects a batch of frames with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        if do_resize:
            images = [
                resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
                for image in images
            ]

        if do_rescale:
            images = [
                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
                for image in images
            ]

        if do_normalize:
            images = [
                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
                for image in images
            ]

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
        ]

        return images

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        image_grid_pinpoints: Optional[list] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_pad: Optional[bool] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            image_grid_pinpoints (`List` *optional*, defaults to `self.image_grid_pinpoints`):
                A list of possible resolutions to use for processing high resolution images. The best resolution is
                selected based on the original size of the image.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
                number of patches in the batch. Padding will be applied to the bottom and right with zeros.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        size = get_size_dict(size, default_to_square=False)
        image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_pad = do_pad if do_pad is not None else self.do_pad
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
            # if the first element is a list, we assume that all elements are lists
            batch_num_images = [len(x) for x in images]
        elif isinstance(images, (tuple, list)):
            # treat this as a single-image case for backward compatibility
            batch_num_images = [1] * len(images)
        else:
            batch_num_images = [1]
        # only single image patching is supported
        need_patching = [n == 1 for n in batch_num_images for _ in range(n)]

        images = make_flat_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        size_tuple = (
            (size["height"], size["width"])
            if "height" in size and "width" in size
            else (size["shortest_edge"], size["shortest_edge"])
        )

        new_images = []
        image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
        for i, image in enumerate(images):
            if need_patching[i]:
                # convert image into a list of patches
                # we intentionally use the same data format as the input data format
                image_patches = self.get_image_patches(
                    image,
                    image_grid_pinpoints,
                    size=size_tuple,
                    patch_size=size_tuple[0],
                    resample=resample,
                    data_format=input_data_format,
                    input_data_format=input_data_format,
                )
            else:
                padded_image = self.pad_to_square(
                    image=image,
                    background_color=tuple(int(x * 255) for x in self.image_mean),
                    input_data_format=input_data_format,
                )
                image_patches = [padded_image]

            # preprocess patches
            pixel_values = self._preprocess(
                image_patches,
                do_resize=do_resize,
                size=size_tuple,
                resample=resample,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            pixel_values = np.array(pixel_values)
            new_images.append(pixel_values)

        if do_pad:
            processed_images = self._pad_for_batching(new_images)

        return BatchFeature(
            data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
            tensor_type=return_tensors,
        )


__all__ = ["LlavaOnevisionImageProcessor"]
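Illustrative note (not part of the diff): a minimal usage sketch for the slow image processor above, assuming `transformers`, `numpy`, and `Pillow` are installed; the dummy image and its dimensions are made up for illustration.

import numpy as np
from PIL import Image

from transformers import LlavaOnevisionImageProcessor

# Uses the 384x384 base size and the anyres grid defined in __init__ above.
processor = LlavaOnevisionImageProcessor()

# A dummy 600x900 RGB image; any PIL image, NumPy array, or torch tensor should work.
image = Image.fromarray(np.random.randint(0, 255, (600, 900, 3), dtype=np.uint8))

batch = processor(images=image, return_tensors="np")
# pixel_values: (batch, num_patches, channels, height, width); image_sizes: original (height, width)
print(batch["pixel_values"].shape, batch["image_sizes"])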
@ -0,0 +1,355 @@
|
|||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/llava_onevision/modular_llava_onevision.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_llava_onevision.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
DefaultFastImageProcessorKwargs,
|
||||
divide_to_patches,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
get_image_size,
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TensorType, auto_docstring, is_torchvision_v2_available
|
||||
|
||||
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
else:
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
image_grid_pinpoints (`list[list[int]]`, *optional*):
|
||||
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
|
||||
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
|
||||
method.
|
||||
do_pad (`bool`, *optional*):
|
||||
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
|
||||
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
|
||||
"""
|
||||
|
||||
image_grid_pinpoints: Optional[list[list[int]]]
|
||||
do_pad: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = OPENAI_CLIP_MEAN
|
||||
image_std = OPENAI_CLIP_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
default_to_square = False
|
||||
crop_size = None
|
||||
do_resize = True
|
||||
do_center_crop = None
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_convert_rgb = True
|
||||
do_pad = True
|
||||
image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]] # fmt: skip
|
||||
valid_kwargs = LlavaOnevisionFastImageProcessorKwargs
|
||||
model_input_names = ["pixel_values_videos"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature:
|
||||
if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
|
||||
# if the first element is a list, we assume that all elements are lists
|
||||
batch_num_images = [len(x) for x in images]
|
||||
elif isinstance(images, (tuple, list)):
|
||||
# treat this as a single-image case for backward compatibility
|
||||
batch_num_images = [1] * len(images)
|
||||
else:
|
||||
batch_num_images = [1]
|
||||
kwargs["batch_num_images"] = batch_num_images
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _resize_for_patching(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
target_resolution: tuple,
|
||||
interpolation: "F.InterpolationMode",
|
||||
input_data_format: ChannelDimension,
|
||||
) -> "torch.Tensor":
|
||||
"""
|
||||
Resizes an image to a target resolution while maintaining aspect ratio.
|
||||
|
||||
Args:
|
||||
image ("torch.Tensor"):
|
||||
The input image.
|
||||
target_resolution (tuple):
|
||||
The target resolution (height, width) of the image.
|
||||
interpolation (`InterpolationMode`):
|
||||
Resampling filter to use if resizing the image.
|
||||
input_data_format (`ChannelDimension` or `str`):
|
||||
The channel dimension format of the input image.
|
||||
|
||||
Returns:
|
||||
"torch.Tensor": The resized and padded image.
|
||||
"""
|
||||
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
|
||||
|
||||
# Resize the image
|
||||
resized_image = self.resize(
|
||||
image=image,
|
||||
size=SizeDict(height=new_height, width=new_width),
|
||||
interpolation=interpolation,
|
||||
)
|
||||
|
||||
return resized_image
|
||||
|
||||
def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
|
||||
original_height, original_width = original_resolution
|
||||
target_height, target_width = target_resolution
|
||||
paste_x, r_x = divmod(target_width - original_width, 2)
|
||||
paste_y, r_y = divmod(target_height - original_height, 2)
|
||||
return [paste_x, paste_y, paste_x + r_x, paste_y + r_y]
|
||||
|
||||
def _pad_for_patching(
|
||||
self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
|
||||
) -> "torch.Tensor":
|
||||
"""
|
||||
Pad an image to a target resolution while maintaining aspect ratio.
|
||||
"""
|
||||
new_resolution = get_patch_output_size(image, target_resolution, input_data_format)
|
||||
padding = self._get_padding_size(new_resolution, target_resolution)
|
||||
|
||||
padded_image = F.pad(image, padding=padding)
|
||||
|
||||
return padded_image
|
||||
|
||||
def _get_image_patches(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
grid_pinpoints,
|
||||
size: tuple,
|
||||
patch_size: int,
|
||||
interpolation: "F.InterpolationMode",
|
||||
) -> list["torch.Tensor"]:
|
||||
"""
|
||||
Process an image with variable resolutions by dividing it into patches.
|
||||
|
||||
Args:
|
||||
image ("torch.Tensor"):
|
||||
The input image to be processed.
|
||||
grid_pinpoints (List):
|
||||
A string representation of a list of possible resolutions.
|
||||
size (`tuple`):
|
||||
Size to resize the original image to.
|
||||
patch_size (`int`):
|
||||
Size of the patches to divide the image into.
|
||||
interpolation (`"InterpolationMode"`):
|
||||
Resampling filter to use if resizing the image.
|
||||
|
||||
Returns:
|
||||
list["torch.Tensor"]: A list of NumPy arrays containing the processed image patches.
|
||||
"""
|
||||
if not isinstance(grid_pinpoints, list):
|
||||
raise TypeError("grid_pinpoints must be a list of possible resolutions.")
|
||||
|
||||
possible_resolutions = grid_pinpoints
|
||||
|
||||
image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)
|
||||
best_resolution = select_best_resolution(image_size, possible_resolutions)
|
||||
resized_image = self._resize_for_patching(
|
||||
image, best_resolution, interpolation=interpolation, input_data_format=ChannelDimension.FIRST
|
||||
)
|
||||
padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=ChannelDimension.FIRST)
|
||||
patches = divide_to_patches(padded_image, patch_size=patch_size)
|
||||
resized_original_image = F.resize(image, size=size, interpolation=interpolation)
|
||||
|
||||
image_patches = [resized_original_image] + patches
|
||||
|
||||
return image_patches
|
||||
|
||||
def _pad_for_batching(
|
||||
self,
|
||||
pixel_values: list["torch.Tensor"],
|
||||
) -> list["torch.Tensor"]:
|
||||
"""
|
||||
Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.
|
||||
|
||||
Args:
|
||||
pixel_values (`list[torch.Tensor]`):
|
||||
An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)
|
||||
|
||||
Returns:
|
||||
list[`torch.Tensor`]: The padded images.
|
||||
"""
|
||||
max_patch = max(len(x) for x in pixel_values)
|
||||
pixel_values = [
|
||||
torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]])
|
||||
for image in pixel_values
|
||||
]
|
||||
|
||||
return pixel_values
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
do_resize: bool,
|
||||
size: SizeDict,
|
||||
image_grid_pinpoints: list[list[int]],
|
||||
interpolation: Optional["F.InterpolationMode"],
|
||||
do_center_crop: bool,
|
||||
crop_size: SizeDict,
|
||||
do_rescale: bool,
|
||||
rescale_factor: float,
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
do_pad: bool,
|
||||
batch_num_images: list[int],
|
||||
disable_grouping: Optional[bool],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
processed_images = []
|
||||
image_sizes = []
|
||||
|
||||
# only single image patching is supported
|
||||
need_patching = [n == 1 for n in batch_num_images for _ in range(n)]
|
||||
|
||||
# Determine the size tuple
|
||||
if size and size.height and size.width:
|
||||
size_tuple = (size.height, size.width)
|
||||
else:
|
||||
size_tuple = (size.shortest_edge, size.shortest_edge)
|
||||
|
||||
# Determine the patch size
|
||||
if crop_size and crop_size.height:
|
||||
patch_size = crop_size.height
|
||||
elif size and size.height:
|
||||
patch_size = size.height
|
||||
else:
|
||||
patch_size = size.shortest_edge
|
||||
|
||||
for i, image in enumerate(images):
|
||||
if need_patching[i]:
|
||||
image_patches = self._get_image_patches(
|
||||
image,
|
||||
image_grid_pinpoints,
|
||||
size=size_tuple,
|
||||
patch_size=patch_size,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
else:
|
||||
padded_image = self.pad_to_square(
|
||||
images=image, background_color=tuple(int(x * 255) for x in self.image_mean)
|
||||
)
|
||||
image_patches = [padded_image]
|
||||
|
||||
# Group images by size for batched processing
|
||||
processed_image_patches_grouped = {}
|
||||
grouped_image_patches, grouped_image_patches_index = group_images_by_shape(
|
||||
image_patches, disable_grouping=disable_grouping
|
||||
)
|
||||
for shape, stacked_image_patches in grouped_image_patches.items():
|
||||
if do_resize:
|
||||
stacked_image_patches = self.resize(
|
||||
image=stacked_image_patches,
|
||||
size=size,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
if do_center_crop:
|
||||
stacked_image_patches = self.center_crop(stacked_image_patches, crop_size)
|
||||
# Fused rescale and normalize
|
||||
stacked_image_patches = self.rescale_and_normalize(
|
||||
stacked_image_patches, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
processed_image_patches_grouped[shape] = stacked_image_patches
|
||||
processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index)
|
||||
processed_image_patches = (
|
||||
torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches
|
||||
)
|
||||
processed_images.append(processed_image_patches)
|
||||
image_sizes.append(get_image_size(image, ChannelDimension.FIRST))
|
||||
|
||||
if do_pad:
|
||||
processed_images = self._pad_for_batching(processed_images)
|
||||
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
|
||||
return BatchFeature(
|
||||
data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
|
||||
tensor_type=return_tensors,
|
||||
)
|
||||
|
||||
# Copied from transformers.models.llava.image_processing_llava_fast.LlavaImageProcessorFast.pad_to_square
|
||||
def pad_to_square(
|
||||
self,
|
||||
images: "torch.Tensor",
|
||||
background_color: Union[int, tuple[int, int, int]] = 0,
|
||||
) -> "torch.Tensor":
|
||||
"""
|
||||
Pads an image to a square based on the longest edge.
|
||||
|
||||
Args:
|
||||
images (`np.ndarray`):
|
||||
The images to pad.
|
||||
background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
|
||||
The color to use for the padding. Can be an integer for single channel or a
|
||||
tuple of integers for multi-channel images. If passed as an integer
|
||||
in multi-channel mode, it defaults to `0` for the remaining channels.
|
||||
Returns:
|
||||
`torch.Tensor`: The padded images.
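
Example (an illustrative sketch; assumes default construction and the torchvision v2 backend):

```python
>>> import torch
>>> processor = LlavaOnevisionImageProcessorFast()
>>> image = torch.zeros(3, 300, 500)
>>> processor.pad_to_square(image, background_color=(127, 127, 127)).shape
torch.Size([3, 500, 500])
```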
|
||||
"""
|
||||
height, width = get_image_size(images, ChannelDimension.FIRST)
|
||||
|
||||
if height == width:
|
||||
return images
|
||||
|
||||
num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0]
|
||||
if isinstance(background_color, int):
|
||||
background_color = [background_color] + [0] * (num_channels - 1)
|
||||
elif len(background_color) != num_channels:
|
||||
raise ValueError(
|
||||
f"background_color must have no more than {num_channels} elements to match the number of channels"
|
||||
)
|
||||
|
||||
max_dim = max(height, width)
|
||||
paste_x_left = (max_dim - width) // 2
|
||||
paste_y_left = (max_dim - height) // 2
|
||||
paste_x_right = max_dim - width - paste_x_left
|
||||
paste_y_right = max_dim - height - paste_y_left
|
||||
padded_images = F.pad(
|
||||
images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color
|
||||
)
|
||||
|
||||
return padded_images
|
||||
|
||||
|
||||
__all__ = ["LlavaOnevisionImageProcessorFast"]
|
|
@@ -0,0 +1,999 @@
|
|||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/llava_onevision/modular_llava_onevision.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_llava_onevision.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache
|
||||
from ...generation import GenerationMixin
|
||||
from ...image_processing_utils import select_best_resolution
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
TransformersKwargs,
|
||||
auto_docstring,
|
||||
can_return_tuple,
|
||||
is_torchdynamo_compiling,
|
||||
logging,
|
||||
)
|
||||
from ..auto import AutoModel
|
||||
from .configuration_llava_onevision import LlavaOnevisionConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Llava outputs, with hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
video_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
|
||||
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
||||
video_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for LlavaOnevision causal language model (or autoregressive) outputs.
|
||||
"""
|
||||
)
|
||||
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
video_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
|
||||
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: Optional[torch.FloatTensor] = None
|
||||
past_key_values: Optional[list[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[tuple[torch.FloatTensor]] = None
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
||||
video_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
class LlavaOnevisionPooler(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
|
||||
mode = config.spatial_pool_mode
|
||||
stride = config.spatial_pool_stride
|
||||
out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size)
|
||||
self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2
|
||||
|
||||
if mode == "average":
|
||||
self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)
|
||||
elif mode == "max":
|
||||
self.pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
|
||||
elif mode == "conv":
|
||||
self.pool = nn.Conv2d(
|
||||
in_channels=config.vision_config.hidden_size,
|
||||
out_channels=out_channels,
|
||||
kernel_size=stride,
|
||||
stride=stride,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown pooling mode: {mode}. Has to be one of [`average`, `max`, `conv`]")
|
||||
|
||||
def forward(self, image_features):
|
||||
ori_width = int(math.sqrt(image_features.shape[1] * self.image_size // self.image_size))
|
||||
ori_height = int(ori_width * self.image_size // self.image_size)
|
||||
|
||||
batch_size, _, dim = image_features.shape
|
||||
image_features_spatial = image_features.view(batch_size, ori_height, ori_height, dim).permute(0, 3, 1, 2)
|
||||
image_features_spatial_pool = self.pool(image_features_spatial)
|
||||
|
||||
return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous()
|
||||
|
||||
|
||||
class LlavaOnevisionMultiModalProjector(nn.Module):
|
||||
def __init__(self, config: LlavaOnevisionConfig):
|
||||
super().__init__()
|
||||
# We have hidden_size * the number of vision feature layers
|
||||
num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer)
|
||||
self.linear_1 = nn.Linear(
|
||||
config.vision_config.hidden_size * num_feature_layers,
|
||||
config.text_config.hidden_size,
|
||||
bias=config.multimodal_projector_bias,
|
||||
)
|
||||
self.act = ACT2FN[config.projector_hidden_act]
|
||||
self.linear_2 = nn.Linear(
|
||||
config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
|
||||
)
|
||||
|
||||
def forward(self, image_features):
|
||||
hidden_states = self.linear_1(image_features)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states = self.linear_2(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
|
||||
"""
|
||||
Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
|
||||
|
||||
Args:
|
||||
image_size (`tuple`):
|
||||
The size of the input image in the format (height, width).
|
||||
grid_pinpoints (`List`):
|
||||
A list containing possible resolutions. Each item in the list should be a tuple or list
|
||||
of the form `(height, width)`.
|
||||
patch_size (`int`):
|
||||
The size of each image patch.
|
||||
|
||||
Returns:
|
||||
tuple: The shape of the image patch grid in the format (height, width).
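
Example (an illustrative sketch; the resolutions below are hypothetical and chosen so the best fit equals the image size):

```python
>>> get_anyres_image_grid_shape((768, 1152), [[384, 384], [384, 768], [768, 1152]], 384)
(2, 3)
```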
|
||||
"""
|
||||
if not isinstance(grid_pinpoints, list):
|
||||
raise TypeError("grid_pinpoints should be a list of tuples or lists")
|
||||
|
||||
# ! VERY IMPORTANT: if image_size is a tensor, it must be converted to a list/tuple, otherwise the calculation will be wrong
|
||||
if not isinstance(image_size, (list, tuple)):
|
||||
if not isinstance(image_size, (torch.Tensor, np.ndarray)):
|
||||
raise TypeError(
|
||||
f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor"
|
||||
)
|
||||
image_size = image_size.tolist()
|
||||
|
||||
height, width = select_best_resolution(image_size, grid_pinpoints)
|
||||
return height // patch_size, width // patch_size
|
||||
|
||||
|
||||
def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
|
||||
"""
|
||||
Calculate the number of patches after the preprocessing for images of any resolution.
|
||||
|
||||
Args:
|
||||
image_size (`torch.LongTensor` or `np.ndarray` or `tuple[int, int]`):
|
||||
The size of the input image in the format (height, width).
|
||||
grid_pinpoints (`List`):
|
||||
A list containing possible resolutions. Each item in the list should be a tuple or list
|
||||
of the form `(height, width)`.
|
||||
patch_size (`int`):
|
||||
The size of each image patch.
|
||||
|
||||
Returns:
|
||||
int: the number of patches
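
Example (an illustrative sketch, using the same hypothetical resolutions as above; 2 * 3 tiles plus the base patch):

```python
>>> image_size_to_num_patches((768, 1152), [[384, 384], [384, 768], [768, 1152]], patch_size=384)
7
```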
|
||||
"""
|
||||
if not isinstance(grid_pinpoints, list):
|
||||
raise TypeError("grid_pinpoints should be a list of tuples or lists")
|
||||
|
||||
# ! VERY IMPORTANT: if image_size is a tensor, it must be converted to a list/tuple, otherwise the calculation will be wrong
|
||||
if not isinstance(image_size, (list, tuple)):
|
||||
if not isinstance(image_size, (torch.Tensor, np.ndarray)):
|
||||
raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}")
|
||||
image_size = image_size.tolist()
|
||||
|
||||
best_resolution = select_best_resolution(image_size, grid_pinpoints)
|
||||
height, width = best_resolution
|
||||
num_patches = 0
|
||||
# consider changing this to ceil(height / patch_size) * ceil(width / patch_size) + 1
|
||||
for i in range(0, height, patch_size):
|
||||
for j in range(0, width, patch_size):
|
||||
num_patches += 1
|
||||
# add the base patch
|
||||
num_patches += 1
|
||||
return num_patches
|
||||
|
||||
|
||||
def unpad_image(tensor, original_size):
|
||||
"""
|
||||
Unpads a PyTorch tensor of a padded and resized image.
|
||||
|
||||
Args:
|
||||
tensor (`torch.Tensor`):
|
||||
The image tensor, assumed to be of shape (num_channels, height, width).
|
||||
original_size (`tuple`):
|
||||
The original size of the image (height, width).
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: The unpadded image tensor.
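
Example (an illustrative sketch; a square 336x336 feature map is cropped back to the original 400x600 aspect ratio):

```python
>>> import torch
>>> padded = torch.zeros(3, 336, 336)
>>> unpad_image(padded, original_size=(400, 600)).shape
torch.Size([3, 224, 336])
```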
|
||||
"""
|
||||
if not isinstance(original_size, (list, tuple)):
|
||||
if not isinstance(original_size, (torch.Tensor, np.ndarray)):
|
||||
raise TypeError(
|
||||
f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
|
||||
)
|
||||
original_size = original_size.tolist()
|
||||
original_height, original_width = original_size
|
||||
current_height, current_width = tensor.shape[1:]
|
||||
|
||||
original_aspect_ratio = original_width / original_height
|
||||
current_aspect_ratio = current_width / current_height
|
||||
|
||||
if original_aspect_ratio > current_aspect_ratio:
|
||||
scale_factor = current_width / original_width
|
||||
new_height = int(round(original_height * scale_factor, 7))
|
||||
padding = (current_height - new_height) // 2
|
||||
unpadded_tensor = tensor[:, padding : current_height - padding, :]
|
||||
else:
|
||||
scale_factor = current_height / original_height
|
||||
new_width = int(round(original_width * scale_factor, 7))
|
||||
padding = (current_width - new_width) // 2
|
||||
unpadded_tensor = tensor[:, :, padding : current_width - padding]
|
||||
|
||||
return unpadded_tensor
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class LlavaOnevisionPreTrainedModel(PreTrainedModel):
|
||||
config: LlavaOnevisionConfig
|
||||
base_model_prefix = ""
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["LlamaDecoderLayer"]
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
|
||||
_supports_flash_attn = True
|
||||
_supports_sdpa = True
|
||||
|
||||
_can_compile_fullgraph = True
|
||||
_supports_flex_attn = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)
|
||||
|
||||
if isinstance(module, nn.Linear):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, LlavaOnevisionModel):
|
||||
embed_std = 1 / math.sqrt(self.config.text_config.hidden_size)
|
||||
module.image_newline.data.normal_(mean=0.0, std=embed_std)
|
||||
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The LLaVA-Onevision model, which consists of a vision backbone and a language model, without a language modeling head.
|
||||
"""
|
||||
)
|
||||
class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
|
||||
_checkpoint_conversion_mapping = {"language_model.model": "language_model"}
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.vision_tower = AutoModel.from_config(config.vision_config)
|
||||
|
||||
self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
|
||||
embed_std = 1 / math.sqrt(config.text_config.hidden_size)
|
||||
self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std)
|
||||
|
||||
self.vocab_size = config.text_config.vocab_size
|
||||
self.language_model = AutoModel.from_config(config.text_config)
|
||||
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.language_model.get_input_embeddings()
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"):
|
||||
"""
|
||||
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
|
||||
|
||||
Args:
|
||||
image_features (`list[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
|
||||
List of image feature tensors, each containing all the visual features of all patches.
|
||||
image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
|
||||
Actual image size of each image (H, W).
|
||||
image_newline (`torch.Tensor` of shape `(embed_dim)`)
|
||||
New line embedding vector.
|
||||
vision_aspect_ratio (`str`, *optional*, "anyres_max_9"):
|
||||
Aspect ratio used when processing image features. The default value is "anyres_max_9".
|
||||
Returns:
|
||||
image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
|
||||
feature_lens (`list[int]`)
|
||||
token length of each image in image_features
|
||||
"""
|
||||
new_image_features = []
|
||||
feature_lens = []
|
||||
for image_idx, image_feature in enumerate(image_features):
|
||||
if image_feature.shape[0] > 1:
|
||||
base_image_feature = image_feature[0]
|
||||
image_feature = image_feature[1:]
|
||||
height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
|
||||
if height * width != base_image_feature.shape[0]:
|
||||
raise ValueError("The number of patches is not consistent with the image size.")
|
||||
num_patch_height, num_patch_width = get_anyres_image_grid_shape(
|
||||
image_sizes[image_idx],
|
||||
self.config.image_grid_pinpoints,
|
||||
self.config.vision_config.image_size,
|
||||
)
|
||||
image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
|
||||
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
|
||||
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
|
||||
image_feature = unpad_image(image_feature, image_sizes[image_idx])
|
||||
max_num_patches = int(vision_aspect_ratio.strip("anyres_max_"))
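# e.g. vision_aspect_ratio="anyres_max_9" caps the unpadded feature grid at roughly 9 base-patch areas;
# larger grids are bilinearly downscaled below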
|
||||
channels, curr_height, curr_width = image_feature.shape
|
||||
ratio = math.sqrt(curr_height * curr_width / (max_num_patches * height**2))
|
||||
if ratio > 1.1:
|
||||
image_feature = image_feature[None]
|
||||
image_feature = nn.functional.interpolate(
|
||||
image_feature, [int(curr_height // ratio), int(curr_width // ratio)], mode="bilinear"
|
||||
)[0]
|
||||
if image_newline is not None:
|
||||
image_feature = torch.cat(
|
||||
(
|
||||
image_feature,
|
||||
image_newline[:, None, None]
|
||||
.expand(*image_feature.shape[:-1], 1)
|
||||
.to(image_feature.device, image_feature.dtype),
|
||||
),
|
||||
dim=-1,
|
||||
)
|
||||
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
|
||||
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
|
||||
else:
|
||||
image_feature = image_feature[0]
|
||||
if image_newline is not None:
|
||||
image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
|
||||
image_feature = image_feature.flatten(0, 1)
|
||||
new_image_features.append(image_feature)
|
||||
feature_lens.append(image_feature.size(0))
|
||||
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
|
||||
return new_image_features, feature_lens
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
image_sizes: torch.Tensor,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
vision_aspect_ratio: Optional[str] = None,
|
||||
batch_num_images: Optional[torch.LongTensor] = None,
|
||||
):
|
||||
"""
|
||||
Obtains image last hidden states from the vision tower and applies multimodal projection.
|
||||
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, channels, height, width)`)
|
||||
The tensors corresponding to the input images.
|
||||
image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
|
||||
Actual image size of each image (H, W).
|
||||
vision_feature_layer (`Union[int, list[int]]`):
|
||||
The index of the layer to select the vision feature. If multiple indices are provided,
|
||||
the vision feature of the corresponding indices will be concatenated to form the
|
||||
vision features.
|
||||
vision_feature_select_strategy (`str`):
|
||||
The feature selection strategy used to select the vision feature from the vision backbone.
|
||||
Can be one of `"default"` or `"full"`
|
||||
batch_num_images (`torch.LongTensor`, *optional*):
|
||||
Number of images in each sample.
|
||||
Returns:
|
||||
image_features (list[`torch.Tensor`]): List of image feature tensors, each containing all the visual features of all patches
|
||||
and of shape `(num_patches, image_length, embed_dim)`.
|
||||
"""
|
||||
vision_feature_layer = (
|
||||
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
|
||||
)
|
||||
vision_feature_select_strategy = (
|
||||
vision_feature_select_strategy
|
||||
if vision_feature_select_strategy is not None
|
||||
else self.config.vision_feature_select_strategy
|
||||
)
|
||||
vision_aspect_ratio = (
|
||||
vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
|
||||
)
|
||||
|
||||
# ! infer image_num_patches from image_sizes
|
||||
if batch_num_images is None:
|
||||
# treat this as a single-image case for backward compatibility
|
||||
need_patching = [True] * len(image_sizes)
|
||||
else:
|
||||
need_patching = [n == 1 for n in batch_num_images for _ in range(n)]
|
||||
image_num_patches = [
|
||||
image_size_to_num_patches(
|
||||
image_size=imsize,
|
||||
grid_pinpoints=self.config.image_grid_pinpoints,
|
||||
patch_size=self.config.vision_config.image_size,
|
||||
)
|
||||
if should_patch
|
||||
else 1
|
||||
for imsize, should_patch in zip(image_sizes, need_patching)
|
||||
]
|
||||
if pixel_values.dim() == 5:
|
||||
# stacked if input is (batch_size, num_patches, num_channels, height, width)
|
||||
_pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)]
|
||||
pixel_values = torch.cat(_pixel_values_list, dim=0)
|
||||
elif pixel_values.dim() != 4:
|
||||
# otherwise has to be stacked from list of (num_patches, num_channels, height, width)
|
||||
raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
|
||||
|
||||
image_features = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||
# If we have one vision feature layer, return the corresponding hidden states,
|
||||
# otherwise, select the hidden states of each feature layer and concatenate them
|
||||
if isinstance(vision_feature_layer, int):
|
||||
selected_image_feature = image_features.hidden_states[vision_feature_layer]
|
||||
else:
|
||||
hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
|
||||
selected_image_feature = torch.cat(hs_pool, dim=-1)
|
||||
|
||||
if vision_feature_select_strategy == "default":
|
||||
selected_image_feature = selected_image_feature[:, 1:]
|
||||
elif vision_feature_select_strategy == "full":
|
||||
selected_image_feature = selected_image_feature
|
||||
image_features = self.multi_modal_projector(selected_image_feature)
|
||||
image_features = torch.split(image_features, image_num_patches, dim=0)
|
||||
|
||||
image_features, feature_lens = self.pack_image_features(
|
||||
image_features,
|
||||
image_sizes,
|
||||
image_newline=self.image_newline,
|
||||
vision_aspect_ratio=vision_aspect_ratio,
|
||||
)
|
||||
return image_features
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
pixel_values: torch.FloatTensor = None,
|
||||
image_sizes: Optional[torch.LongTensor] = None,
|
||||
pixel_values_videos: torch.FloatTensor = None,
|
||||
image_sizes_videos: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
vision_aspect_ratio: Optional[str] = None,
|
||||
batch_num_images: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> Union[tuple, LlavaOnevisionModelOutputWithPast]:
|
||||
r"""
|
||||
image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*):
|
||||
The sizes of the videos in the batch, being (height, width) for each frame in the video.
|
||||
vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
|
||||
Aspect ratio used when processing image features. The default value is "anyres_max_9".
|
||||
batch_num_images (`torch.LongTensor`, *optional*):
|
||||
Number of images in each sample.
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
vision_feature_layer = (
|
||||
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
|
||||
)
|
||||
vision_feature_select_strategy = (
|
||||
vision_feature_select_strategy
|
||||
if vision_feature_select_strategy is not None
|
||||
else self.config.vision_feature_select_strategy
|
||||
)
|
||||
vision_aspect_ratio = (
|
||||
vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
|
||||
)
|
||||
|
||||
if (input_ids is None) ^ (inputs_embeds is not None):
|
||||
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||
|
||||
# Images are processed with Anyres
|
||||
if pixel_values is not None:
|
||||
image_features = self.get_image_features(
|
||||
pixel_values,
|
||||
image_sizes,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
batch_num_images=batch_num_images,
|
||||
)
|
||||
image_features = torch.cat(image_features, dim=0)
|
||||
|
||||
if input_ids is None:
|
||||
special_image_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_image_mask = special_image_mask.all(-1)
|
||||
else:
|
||||
special_image_mask = input_ids == self.config.image_token_id
|
||||
|
||||
n_image_tokens = (special_image_mask).sum()
|
||||
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
|
||||
if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
|
||||
n_image_features = image_features.shape[0]
|
||||
raise ValueError(
|
||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||
)
|
||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||
|
||||
# Video are simply embedded and further pooled to decrease seq len
|
||||
if pixel_values_videos is not None:
|
||||
video_features = self.get_video_features(
|
||||
pixel_values_videos,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
)
|
||||
image_newline = (
|
||||
self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device)
|
||||
)
|
||||
video_features = torch.cat((video_features, image_newline), dim=1)
|
||||
video_features = video_features.flatten(0, 1)
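# a single image_newline token is appended per video before flattening to (batch_size * seq_len, dim),
# so the features can be scattered directly into the text embedding sequence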
|
||||
|
||||
if input_ids is None:
|
||||
special_video_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_video_mask = special_video_mask.all(-1)
|
||||
else:
|
||||
special_video_mask = input_ids == self.config.video_token_id
|
||||
|
||||
n_video_tokens = (special_video_mask).sum()
|
||||
special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
|
||||
if not is_torchdynamo_compiling() and inputs_embeds[special_video_mask].numel() != video_features.numel():
|
||||
n_video_features = video_features.shape[0]
|
||||
raise ValueError(
|
||||
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
||||
)
|
||||
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)
|
||||
|
||||
outputs = self.language_model(
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=True,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return LlavaOnevisionModelOutputWithPast(
|
||||
last_hidden_state=outputs.last_hidden_state,
|
||||
past_key_values=outputs.past_key_values,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
image_hidden_states=image_features if pixel_values is not None else None,
|
||||
video_hidden_states=video_features if pixel_values_videos is not None else None,
|
||||
)
|
||||
|
||||
def get_video_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
vision_feature_layer: Union[int, list[int]],
|
||||
vision_feature_select_strategy: str,
|
||||
):
|
||||
"""
|
||||
Obtains video last hidden states from the vision tower, then applies multimodal projection and pooling.
|
||||
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`)
|
||||
The tensors corresponding to the input video.
|
||||
vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -2):
|
||||
The index of the layer to select the vision feature. If multiple indices are provided,
|
||||
the vision feature of the corresponding indices will be concatenated to form the
|
||||
vision features.
|
||||
vision_feature_select_strategy (`str`):
|
||||
The feature selection strategy used to select the vision feature from the vision backbone.
|
||||
Can be one of `"default"` or `"full"`
|
||||
Returns:
|
||||
video_features (list[`torch.Tensor`]): List of video feature tensors, each containing all the visual features of all patches
|
||||
and of shape `(num_videos, video_length, embed_dim)`.
|
||||
"""
|
||||
batch_size, frames, channels, height, width = pixel_values.shape
|
||||
pixel_values = pixel_values.view(batch_size * frames, channels, height, width)
|
||||
video_features = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||
|
||||
# If we have one vision feature layer, return the corresponding hidden states,
|
||||
# otherwise, select the hidden states of each feature layer and concatenate them
|
||||
if isinstance(vision_feature_layer, int):
|
||||
selected_video_feature = video_features.hidden_states[vision_feature_layer]
|
||||
else:
|
||||
hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
|
||||
selected_video_feature = torch.cat(hs_pool, dim=-1)
|
||||
|
||||
if vision_feature_select_strategy == "default":
|
||||
selected_video_feature = selected_video_feature[:, 1:]
|
||||
elif vision_feature_select_strategy == "full":
|
||||
selected_video_feature = selected_video_feature
|
||||
video_features = self.multi_modal_projector(selected_video_feature)
|
||||
|
||||
video_features = self.apply_pooling(video_features)
|
||||
video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1)
|
||||
|
||||
return video_features
|
||||
|
||||
def apply_pooling(self, image_features):
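# Bilinearly downsample each frame's (height, width) feature grid to roughly half per side, reducing the number of video tokens per frame.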
|
||||
height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
|
||||
batch_frames, seq_len, dim = image_features.shape
|
||||
image_features = image_features.view(batch_frames, height, width, -1)
|
||||
image_features = image_features.permute(0, 3, 1, 2).contiguous()
|
||||
|
||||
height, width = image_features.shape[2:]
|
||||
scaled_shape = [math.ceil(height / 2), math.ceil(width / 2)]
|
||||
image_features = nn.functional.interpolate(image_features, size=scaled_shape, mode="bilinear")
|
||||
|
||||
image_features = image_features.permute(0, 2, 3, 1)
|
||||
image_features = image_features.view(batch_frames, -1, dim)
|
||||
return image_features
|
||||
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The LLaVA-Onevision model, which consists of a vision backbone and a language model.
|
||||
"""
|
||||
)
|
||||
class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
|
||||
_checkpoint_conversion_mapping = {
|
||||
"^language_model.model": "model.language_model",
|
||||
"^vision_tower": "model.vision_tower",
|
||||
"^multi_modal_projector": "model.multi_modal_projector",
|
||||
"^image_newline": "model.image_newline",
|
||||
"^language_model.lm_head": "lm_head",
|
||||
}
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config: LlavaOnevisionConfig):
|
||||
super().__init__(config)
|
||||
self.model = LlavaOnevisionModel(config)
|
||||
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.model.get_input_embeddings()
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.model.set_input_embeddings(value)
|
||||
|
||||
def get_output_embeddings(self) -> nn.Module:
|
||||
return self.lm_head
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model.get_decoder()
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
|
||||
return self.model.pack_image_features(
|
||||
image_features=image_features,
|
||||
image_sizes=image_sizes,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
image_newline=image_newline,
|
||||
)
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
image_sizes: torch.Tensor,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
):
|
||||
return self.model.get_image_features(
|
||||
pixel_values=pixel_values,
|
||||
image_sizes=image_sizes,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
)
|
||||
|
||||
# Make modules available through the conditional class for BC (backward compatibility)
|
||||
@property
|
||||
def language_model(self):
|
||||
return self.model.language_model
|
||||
|
||||
@property
|
||||
def vision_tower(self):
|
||||
return self.model.vision_tower
|
||||
|
||||
@property
|
||||
def multi_modal_projector(self):
|
||||
return self.model.multi_modal_projector
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
pixel_values: torch.FloatTensor = None,
|
||||
image_sizes: Optional[torch.LongTensor] = None,
|
||||
pixel_values_videos: torch.FloatTensor = None,
|
||||
image_sizes_videos: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
vision_aspect_ratio: Optional[str] = None,
|
||||
batch_num_images: Optional[torch.LongTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
logits_to_keep: Union[int, torch.Tensor] = 0,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> Union[tuple, LlavaOnevisionCausalLMOutputWithPast]:
|
||||
r"""
|
||||
image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*):
|
||||
The sizes of the videos in the batch, being (height, width) for each frame in the video.
|
||||
vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
|
||||
Aspect ratio used when processing image features. The default value is "anyres_max_9".
|
||||
batch_num_images (`torch.LongTensor`, *optional*):
|
||||
Number of images in each sample.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
>>> import torch
|
||||
>>> from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration
|
||||
|
||||
>>> model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype="float16", device_map="cuda:0")
|
||||
>>> processor = LlavaOnevisionProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
|
||||
|
||||
>>> conversation = [
|
||||
... {
|
||||
... "role": "user",
|
||||
... "content": [
|
||||
... {"type": "text", "text": "What is shown in this image?"},
|
||||
... {"type": "image"},
|
||||
... ],
|
||||
... },
|
||||
... ]
|
||||
>>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
|
||||
|
||||
>>> image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
||||
>>> inputs = processor(text=prompt, images=raw_image, return_tensors='pt').to(0, torch.float16)
|
||||
|
||||
>>> output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
>>> processor.batch_decode(output, skip_special_tokens=True)[0]
|
||||
"user\n\nWhat is shown in this image?\nassistant\ncat"
|
||||
```"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
vision_feature_layer = (
|
||||
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
|
||||
)
|
||||
vision_feature_select_strategy = (
|
||||
vision_feature_select_strategy
|
||||
if vision_feature_select_strategy is not None
|
||||
else self.config.vision_feature_select_strategy
|
||||
)
|
||||
vision_aspect_ratio = (
|
||||
vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
|
||||
)
|
||||
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
pixel_values_videos=pixel_values_videos,
|
||||
image_sizes=image_sizes,
|
||||
image_sizes_videos=image_sizes_videos,
|
||||
vision_aspect_ratio=vision_aspect_ratio,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
batch_num_images=batch_num_images,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=True,
|
||||
cache_position=cache_position,
|
||||
logits_to_keep=logits_to_keep,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
||||
logits = self.lm_head(hidden_states[:, slice_indices, :])
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(
|
||||
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
|
||||
)
|
||||
|
||||
return LlavaOnevisionCausalLMOutputWithPast(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
past_key_values=outputs.past_key_values,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
image_hidden_states=outputs.image_hidden_states,
|
||||
video_hidden_states=outputs.video_hidden_states,
|
||||
)
|
||||
|
||||
def prepare_inputs_for_generation(
|
||||
self,
|
||||
input_ids,
|
||||
past_key_values=None,
|
||||
inputs_embeds=None,
|
||||
pixel_values=None,
|
||||
image_sizes=None,
|
||||
pixel_values_videos=None,
|
||||
image_sizes_videos=None,
|
||||
attention_mask=None,
|
||||
cache_position=None,
|
||||
logits_to_keep=None,
|
||||
**kwargs,
|
||||
):
|
||||
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
|
||||
|
||||
model_inputs = super().prepare_inputs_for_generation(
|
||||
input_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
cache_position=cache_position,
|
||||
logits_to_keep=logits_to_keep,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if cache_position[0] == 0:
|
||||
# If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
|
||||
# Otherwise we need pixel values to be passed to model
|
||||
model_inputs["pixel_values"] = pixel_values
|
||||
model_inputs["image_sizes"] = image_sizes
|
||||
model_inputs["pixel_values_videos"] = pixel_values_videos
|
||||
model_inputs["image_sizes_videos"] = image_sizes_videos
|
||||
|
||||
return model_inputs
|
||||
|
||||
@staticmethod
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
|
||||
`(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with a static cache, the mask should be as long as the static cache,
|
||||
to account for the 0 padding, i.e. the part of the cache that is not yet filled.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`int`):
|
||||
Batch size.
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
return causal_mask
|
||||
|
||||
def get_video_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
):
|
||||
return self.model.get_video_features(
|
||||
pixel_values=pixel_values,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["LlavaOnevisionModel", "LlavaOnevisionForConditionalGeneration", "LlavaOnevisionPreTrainedModel"]
|
|
@@ -0,0 +1,791 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers.models.llava_next.image_processing_llava_next_fast import LlavaNextImageProcessorFast
|
||||
from transformers.models.llava_next_video.modeling_llava_next_video import (
|
||||
LlavaNextVideoCausalLMOutputWithPast,
|
||||
LlavaNextVideoForConditionalGeneration,
|
||||
LlavaNextVideoModel,
|
||||
LlavaNextVideoModelOutputWithPast,
|
||||
LlavaNextVideoPreTrainedModel,
|
||||
TransformersKwargs,
|
||||
get_anyres_image_grid_shape,
|
||||
image_size_to_num_patches,
|
||||
unpad_image,
|
||||
)
|
||||
|
||||
from ...cache_utils import Cache
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images
|
||||
from ...image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
get_image_size,
|
||||
)
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
can_return_tuple,
|
||||
is_torchdynamo_compiling,
|
||||
is_torchvision_available,
|
||||
is_torchvision_v2_available,
|
||||
logging,
|
||||
)
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
else:
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
image_grid_pinpoints (`list[list[int]]`, *optional*):
|
||||
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
|
||||
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
|
||||
method.
|
||||
do_pad (`bool`, *optional*):
|
||||
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
|
||||
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
|
||||
"""
|
||||
|
||||
image_grid_pinpoints: Optional[list[list[int]]]
|
||||
do_pad: Optional[bool]
|
||||
|
||||
|
||||
class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = OPENAI_CLIP_MEAN
|
||||
image_std = OPENAI_CLIP_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
crop_size = None
|
||||
default_to_square = False
|
||||
do_resize = True
|
||||
do_center_crop = None
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_convert_rgb = True
|
||||
do_pad = True
|
||||
image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]] # fmt: skip
|
||||
model_input_names = ["pixel_values_videos"]
|
||||
|
||||
# Copied from transformers.models.llava.image_processing_llava_fast.LlavaImageProcessorFast.pad_to_square
|
||||
def pad_to_square(
|
||||
self,
|
||||
images: "torch.Tensor",
|
||||
background_color: Union[int, tuple[int, int, int]] = 0,
|
||||
) -> "torch.Tensor":
|
||||
"""
|
||||
Pads an image to a square based on the longest edge.
|
||||
|
||||
Args:
|
||||
images (`np.ndarray`):
|
||||
The images to pad.
|
||||
background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
|
||||
The color to use for the padding. Can be an integer for single channel or a
|
||||
tuple of integers for multi-channel images. If passed as an integer
|
||||
in multi-channel mode, it defaults to `0` for the remaining channels.
|
||||
Returns:
|
||||
`torch.Tensor`: The padded images.
|
||||
"""
|
||||
height, width = get_image_size(images, ChannelDimension.FIRST)
|
||||
|
||||
if height == width:
|
||||
return images
|
||||
|
||||
num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0]
|
||||
if isinstance(background_color, int):
|
||||
background_color = [background_color] + [0] * (num_channels - 1)
|
||||
elif len(background_color) != num_channels:
|
||||
raise ValueError(
|
||||
f"background_color must have no more than {num_channels} elements to match the number of channels"
|
||||
)
|
||||
|
||||
max_dim = max(height, width)
|
||||
paste_x_left = (max_dim - width) // 2
|
||||
paste_y_left = (max_dim - height) // 2
|
||||
paste_x_right = max_dim - width - paste_x_left
|
||||
paste_y_right = max_dim - height - paste_y_left
|
||||
padded_images = F.pad(
|
||||
images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color
|
||||
)
|
||||
|
||||
return padded_images
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature:
|
||||
if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
|
||||
# if the first element is a list, we assume that all elements are lists
|
||||
batch_num_images = [len(x) for x in images]
|
||||
elif isinstance(images, (tuple, list)):
|
||||
# treat this as a single-image case for backward compatibility
|
||||
batch_num_images = [1] * len(images)
|
||||
else:
|
||||
batch_num_images = [1]
|
||||
kwargs["batch_num_images"] = batch_num_images
|
||||
return super().preprocess(images, **kwargs)
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
do_resize: bool,
|
||||
size: SizeDict,
|
||||
image_grid_pinpoints: list[list[int]],
|
||||
interpolation: Optional["F.InterpolationMode"],
|
||||
do_center_crop: bool,
|
||||
crop_size: SizeDict,
|
||||
do_rescale: bool,
|
||||
rescale_factor: float,
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
do_pad: bool,
|
||||
batch_num_images: list[int],
|
||||
disable_grouping: Optional[bool],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
processed_images = []
|
||||
image_sizes = []
|
||||
|
||||
# anyres patching is only applied to samples that contain a single image
|
||||
need_patching = [n == 1 for n in batch_num_images for _ in range(n)]
|
||||
|
||||
# Determine the size tuple
|
||||
if size and size.height and size.width:
|
||||
size_tuple = (size.height, size.width)
|
||||
else:
|
||||
size_tuple = (size.shortest_edge, size.shortest_edge)
|
||||
|
||||
# Determine the patch size
|
||||
if crop_size and crop_size.height:
|
||||
patch_size = crop_size.height
|
||||
elif size and size.height:
|
||||
patch_size = size.height
|
||||
else:
|
||||
patch_size = size.shortest_edge
|
||||
|
||||
for i, image in enumerate(images):
|
||||
if need_patching[i]:
|
||||
image_patches = self._get_image_patches(
|
||||
image,
|
||||
image_grid_pinpoints,
|
||||
size=size_tuple,
|
||||
patch_size=patch_size,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
else:
|
||||
padded_image = self.pad_to_square(
|
||||
images=image, background_color=tuple(int(x * 255) for x in self.image_mean)
|
||||
)
|
||||
image_patches = [padded_image]
|
||||
|
||||
# Group images by size for batched processing
|
||||
processed_image_patches_grouped = {}
|
||||
grouped_image_patches, grouped_image_patches_index = group_images_by_shape(
|
||||
image_patches, disable_grouping=disable_grouping
|
||||
)
|
||||
for shape, stacked_image_patches in grouped_image_patches.items():
|
||||
if do_resize:
|
||||
stacked_image_patches = self.resize(
|
||||
image=stacked_image_patches,
|
||||
size=size,
|
||||
interpolation=interpolation,
|
||||
)
|
||||
if do_center_crop:
|
||||
stacked_image_patches = self.center_crop(stacked_image_patches, crop_size)
|
||||
# Fused rescale and normalize
|
||||
stacked_image_patches = self.rescale_and_normalize(
|
||||
stacked_image_patches, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
processed_image_patches_grouped[shape] = stacked_image_patches
|
||||
processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index)
|
||||
processed_image_patches = (
|
||||
torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches
|
||||
)
|
||||
processed_images.append(processed_image_patches)
|
||||
image_sizes.append(get_image_size(image, ChannelDimension.FIRST))
|
||||
|
||||
if do_pad:
|
||||
processed_images = self._pad_for_batching(processed_images)
|
||||
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
|
||||
return BatchFeature(
|
||||
data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
|
||||
tensor_type=return_tensors,
|
||||
)
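# Note (editorial, hedged): with `return_tensors="pt"` and `do_pad=True`, the returned BatchFeature
# is expected to hold
#   - pixel_values:     (num_images, max_num_patches, channels, height, width)
#   - image_sizes:      one (height, width) pair per input image
#   - batch_num_images: number of images per sample, as computed in `preprocess`
# Multi-image samples skip anyres patching and are only padded to a square, so each contributes a
# single patch.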
|
||||
|
||||
|
||||
class LlavaOnevisionModelOutputWithPast(LlavaNextVideoModelOutputWithPast):
|
||||
pass
|
||||
|
||||
|
||||
class LlavaOnevisionCausalLMOutputWithPast(LlavaNextVideoCausalLMOutputWithPast):
|
||||
pass
|
||||
|
||||
|
||||
class LlavaOnevisionPreTrainedModel(LlavaNextVideoPreTrainedModel):
|
||||
pass
|
||||
|
||||
|
||||
class LlavaOnevisionModel(LlavaNextVideoModel):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
del self.vision_resampler
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"):
|
||||
"""
|
||||
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
|
||||
|
||||
Args:
|
||||
image_features (`list[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
|
||||
List of image feature tensors, each containing the visual features of all patches.
|
||||
image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
|
||||
Actual image size of each image (H, W).
|
||||
image_newline (`torch.Tensor` of shape `(embed_dim)`)
|
||||
New line embedding vector.
|
||||
vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
Aspect ratio used when processing image features.
|
||||
Returns:
|
||||
image_features (`list[torch.Tensor]` of length num_images)
List of packed image feature tensors, each of shape `(feat_len, embed_dim)`.
feature_lens (`torch.LongTensor` of shape `(num_images,)`)
Token length of each image in image_features.
|
||||
"""
|
||||
new_image_features = []
|
||||
feature_lens = []
|
||||
for image_idx, image_feature in enumerate(image_features):
|
||||
if image_feature.shape[0] > 1:
|
||||
base_image_feature = image_feature[0]
|
||||
image_feature = image_feature[1:]
|
||||
height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
|
||||
if height * width != base_image_feature.shape[0]:
|
||||
raise ValueError("The number of patches is not consistent with the image size.")
|
||||
num_patch_height, num_patch_width = get_anyres_image_grid_shape(
|
||||
image_sizes[image_idx],
|
||||
self.config.image_grid_pinpoints,
|
||||
self.config.vision_config.image_size,
|
||||
)
|
||||
image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
|
||||
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
|
||||
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
|
||||
image_feature = unpad_image(image_feature, image_sizes[image_idx])
|
||||
max_num_patches = int(vision_aspect_ratio.strip("anyres_max_"))
|
||||
channels, curr_height, curr_width = image_feature.shape
|
||||
ratio = math.sqrt(curr_height * curr_width / (max_num_patches * height**2))
|
||||
if ratio > 1.1:
|
||||
image_feature = image_feature[None]
|
||||
image_feature = nn.functional.interpolate(
|
||||
image_feature, [int(curr_height // ratio), int(curr_width // ratio)], mode="bilinear"
|
||||
)[0]
|
||||
if image_newline is not None:
|
||||
image_feature = torch.cat(
|
||||
(
|
||||
image_feature,
|
||||
image_newline[:, None, None]
|
||||
.expand(*image_feature.shape[:-1], 1)
|
||||
.to(image_feature.device, image_feature.dtype),
|
||||
),
|
||||
dim=-1,
|
||||
)
|
||||
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
|
||||
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
|
||||
else:
|
||||
image_feature = image_feature[0]
|
||||
if image_newline is not None:
|
||||
image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
|
||||
image_feature = image_feature.flatten(0, 1)
|
||||
new_image_features.append(image_feature)
|
||||
feature_lens.append(image_feature.size(0))
|
||||
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
|
||||
return new_image_features, feature_lens
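# Hedged sketch of the packing logic above: for an anyres image split into a grid of patches plus
# the base patch, the packed feature is the base patch features followed by the unpadded spatial
# grid (with one `image_newline` embedding appended per row), flattened to `(feat_len, embed_dim)`.
# `feature_lens[i]` then records `feat_len` for image `i` so the features can later be scattered
# into the text sequence.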
|
||||
|
||||
def apply_pooling(self, image_features):
|
||||
height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
|
||||
batch_frames, seq_len, dim = image_features.shape
|
||||
image_features = image_features.view(batch_frames, height, width, -1)
|
||||
image_features = image_features.permute(0, 3, 1, 2).contiguous()
|
||||
|
||||
height, width = image_features.shape[2:]
|
||||
scaled_shape = [math.ceil(height / 2), math.ceil(width / 2)]
|
||||
image_features = nn.functional.interpolate(image_features, size=scaled_shape, mode="bilinear")
|
||||
|
||||
image_features = image_features.permute(0, 2, 3, 1)
|
||||
image_features = image_features.view(batch_frames, -1, dim)
|
||||
return image_features
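# Worked example (illustrative, assuming the default SigLIP backbone with image_size=384 and
# patch_size=14): the vision tower yields 384 // 14 = 27, i.e. 27 * 27 = 729 tokens per frame.
# Bilinear interpolation to ceil(27 / 2) = 14 per side reduces this to 14 * 14 = 196 tokens per
# frame before the frames are concatenated along the sequence dimension.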
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
image_sizes: torch.Tensor,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
vision_aspect_ratio: Optional[str] = None,
|
||||
batch_num_images: Optional[torch.LongTensor] = None,
|
||||
):
|
||||
"""
|
||||
Obtains image last hidden states from the vision tower and applies multimodal projection.
|
||||
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, channels, height, width)`)
|
||||
The tensors corresponding to the input images.
|
||||
image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
|
||||
Actual image size of each image (H, W).
|
||||
vision_feature_layer (`Union[int, list[int]]`):
|
||||
The index of the layer to select the vision feature. If multiple indices are provided,
|
||||
the vision feature of the corresponding indices will be concatenated to form the
|
||||
vision features.
|
||||
vision_feature_select_strategy (`str`):
|
||||
The feature selection strategy used to select the vision feature from the vision backbone.
|
||||
Can be one of `"default"` or `"full"`
|
||||
batch_num_images (`torch.LongTensor`, *optional*):
|
||||
Number of images in each sample.
|
||||
Returns:
|
||||
image_features (list[`torch.Tensor`]): List of image feature tensors, each containing the visual
features of all patches, of shape `(num_patches, image_length, embed_dim)`.
|
||||
"""
|
||||
vision_feature_layer = (
|
||||
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
|
||||
)
|
||||
vision_feature_select_strategy = (
|
||||
vision_feature_select_strategy
|
||||
if vision_feature_select_strategy is not None
|
||||
else self.config.vision_feature_select_strategy
|
||||
)
|
||||
vision_aspect_ratio = (
|
||||
vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
|
||||
)
|
||||
|
||||
# ! infer image_num_patches from image_sizes
|
||||
if batch_num_images is None:
|
||||
# treat this as a single-image case for backward compatibility
|
||||
need_patching = [True] * len(image_sizes)
|
||||
else:
|
||||
need_patching = [n == 1 for n in batch_num_images for _ in range(n)]
|
||||
image_num_patches = [
|
||||
image_size_to_num_patches(
|
||||
image_size=imsize,
|
||||
grid_pinpoints=self.config.image_grid_pinpoints,
|
||||
patch_size=self.config.vision_config.image_size,
|
||||
)
|
||||
if should_patch
|
||||
else 1
|
||||
for imsize, should_patch in zip(image_sizes, need_patching)
|
||||
]
|
||||
if pixel_values.dim() == 5:
|
||||
# stacked if input is (batch_size, num_patches, num_channels, height, width)
|
||||
_pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)]
|
||||
pixel_values = torch.cat(_pixel_values_list, dim=0)
|
||||
elif pixel_values.dim() != 4:
|
||||
# otherwise has to be stacked from list of (num_patches, num_channels, height, width)
|
||||
raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
|
||||
|
||||
image_features = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||
# If we have one vision feature layer, return the corresponding hidden states,
|
||||
# otherwise, select the hidden states of each feature layer and concatenate them
|
||||
if isinstance(vision_feature_layer, int):
|
||||
selected_image_feature = image_features.hidden_states[vision_feature_layer]
|
||||
else:
|
||||
hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
|
||||
selected_image_feature = torch.cat(hs_pool, dim=-1)
|
||||
|
||||
if vision_feature_select_strategy == "default":
|
||||
selected_image_feature = selected_image_feature[:, 1:]
|
||||
elif vision_feature_select_strategy == "full":
|
||||
selected_image_feature = selected_image_feature
|
||||
image_features = self.multi_modal_projector(selected_image_feature)
|
||||
image_features = torch.split(image_features, image_num_patches, dim=0)
|
||||
|
||||
image_features, feature_lens = self.pack_image_features(
|
||||
image_features,
|
||||
image_sizes,
|
||||
image_newline=self.image_newline,
|
||||
vision_aspect_ratio=vision_aspect_ratio,
|
||||
)
|
||||
return image_features
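# Hedged summary of the flow above: pixel values are flattened over patches, encoded by the vision
# tower, projected by `multi_modal_projector`, split back into per-image patch groups via
# `image_num_patches`, and finally packed by `pack_image_features` into one `(feat_len, embed_dim)`
# tensor per image. Images belonging to multi-image samples (`need_patching[i] == False`) use a
# single patch, so their `feat_len` is roughly `num_image_tokens` plus one newline embedding.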
|
||||
|
||||
def get_video_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
vision_feature_layer: Union[int, list[int]],
|
||||
vision_feature_select_strategy: str,
|
||||
):
|
||||
"""
|
||||
Obtains video last hidden states from the vision tower, then applies multimodal projection and pooling.
|
||||
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`)
|
||||
The tensors corresponding to the input video.
|
||||
vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -2):
|
||||
The index of the layer to select the vision feature. If multiple indices are provided,
|
||||
the vision feature of the corresponding indices will be concatenated to form the
|
||||
vision features.
|
||||
vision_feature_select_strategy (`str`):
|
||||
The feature selection strategy used to select the vision feature from the vision backbone.
|
||||
Can be one of `"default"` or `"full"`
|
||||
Returns:
|
||||
video_features (list[`torch.Tensor`]): List of video feature tensors, each containing the visual
features of all patches, of shape `(num_videos, video_length, embed_dim)`.
|
||||
"""
|
||||
batch_size, frames, channels, height, width = pixel_values.shape
|
||||
pixel_values = pixel_values.view(batch_size * frames, channels, height, width)
|
||||
video_features = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||
|
||||
# If we have one vision feature layer, return the corresponding hidden states,
|
||||
# otherwise, select the hidden states of each feature layer and concatenate them
|
||||
if isinstance(vision_feature_layer, int):
|
||||
selected_video_feature = video_features.hidden_states[vision_feature_layer]
|
||||
else:
|
||||
hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
|
||||
selected_video_feature = torch.cat(hs_pool, dim=-1)
|
||||
|
||||
if vision_feature_select_strategy == "default":
|
||||
selected_video_feature = selected_video_feature[:, 1:]
|
||||
elif vision_feature_select_strategy == "full":
|
||||
selected_video_feature = selected_video_feature
|
||||
video_features = self.multi_modal_projector(selected_video_feature)
|
||||
|
||||
video_features = self.apply_pooling(video_features)
|
||||
video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1)
|
||||
|
||||
return video_features
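# Hedged shape sketch for the method above, assuming the default 384px / patch-14 SigLIP tower:
#
#     pixel_values_videos: (batch_size, frames, 3, 384, 384)
#     after pooling:       (batch_size * frames, 196, embed_dim)
#     after reshape:       (batch_size, frames * 196, embed_dim)
#
# The caller (`forward`) appends one `image_newline` embedding per video, giving
# frames * 196 + 1 video tokens, which matches the processor's `num_video_tokens` computation.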
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
pixel_values: torch.FloatTensor = None,
|
||||
image_sizes: Optional[torch.LongTensor] = None,
|
||||
pixel_values_videos: torch.FloatTensor = None,
|
||||
image_sizes_videos: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
vision_aspect_ratio: Optional[str] = None,
|
||||
batch_num_images: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> Union[tuple, LlavaOnevisionModelOutputWithPast]:
|
||||
r"""
|
||||
image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*):
|
||||
The sizes of the videos in the batch, being (height, width) for each frame in the video.
|
||||
vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
|
||||
Aspect ratio used when processing image features. The default value is "anyres_max_9".
|
||||
batch_num_images (`torch.LongTensor`, *optional*):
|
||||
Number of images in each sample.
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
vision_feature_layer = (
|
||||
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
|
||||
)
|
||||
vision_feature_select_strategy = (
|
||||
vision_feature_select_strategy
|
||||
if vision_feature_select_strategy is not None
|
||||
else self.config.vision_feature_select_strategy
|
||||
)
|
||||
vision_aspect_ratio = (
|
||||
vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
|
||||
)
|
||||
|
||||
if (input_ids is None) ^ (inputs_embeds is not None):
|
||||
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||
|
||||
# Images are processed with Anyres
|
||||
if pixel_values is not None:
|
||||
image_features = self.get_image_features(
|
||||
pixel_values,
|
||||
image_sizes,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
batch_num_images=batch_num_images,
|
||||
)
|
||||
image_features = torch.cat(image_features, dim=0)
|
||||
|
||||
if input_ids is None:
|
||||
special_image_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_image_mask = special_image_mask.all(-1)
|
||||
else:
|
||||
special_image_mask = input_ids == self.config.image_token_id
|
||||
|
||||
n_image_tokens = (special_image_mask).sum()
|
||||
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
|
||||
if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
|
||||
n_image_features = image_features.shape[0]
|
||||
raise ValueError(
|
||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||
)
|
||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||
|
||||
# Video are simply embedded and further pooled to decrease seq len
|
||||
if pixel_values_videos is not None:
|
||||
video_features = self.get_video_features(
|
||||
pixel_values_videos,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
)
|
||||
image_newline = (
|
||||
self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device)
|
||||
)
|
||||
video_features = torch.cat((video_features, image_newline), dim=1)
|
||||
video_features = video_features.flatten(0, 1)
|
||||
|
||||
if input_ids is None:
|
||||
special_video_mask = inputs_embeds == self.get_input_embeddings()(
|
||||
torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
|
||||
)
|
||||
special_video_mask = special_video_mask.all(-1)
|
||||
else:
|
||||
special_video_mask = input_ids == self.config.video_token_id
|
||||
|
||||
n_video_tokens = (special_video_mask).sum()
|
||||
special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
||||
|
||||
if not is_torchdynamo_compiling() and inputs_embeds[special_video_mask].numel() != video_features.numel():
|
||||
n_video_features = video_features.shape[0]
|
||||
raise ValueError(
|
||||
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
||||
)
|
||||
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)
|
||||
|
||||
outputs = self.language_model(
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=True,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return LlavaOnevisionModelOutputWithPast(
|
||||
last_hidden_state=outputs.last_hidden_state,
|
||||
past_key_values=outputs.past_key_values,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
image_hidden_states=image_features if pixel_values is not None else None,
|
||||
video_hidden_states=video_features if pixel_values_videos is not None else None,
|
||||
)
|
||||
|
||||
|
||||
class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
pixel_values: torch.FloatTensor = None,
|
||||
image_sizes: Optional[torch.LongTensor] = None,
|
||||
pixel_values_videos: torch.FloatTensor = None,
|
||||
image_sizes_videos: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
vision_feature_layer: Optional[Union[int, list[int]]] = None,
|
||||
vision_feature_select_strategy: Optional[str] = None,
|
||||
vision_aspect_ratio: Optional[str] = None,
|
||||
batch_num_images: Optional[torch.LongTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
logits_to_keep: Union[int, torch.Tensor] = 0,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> Union[tuple, LlavaOnevisionCausalLMOutputWithPast]:
|
||||
r"""
|
||||
image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*):
|
||||
The sizes of the videos in the batch, being (height, width) for each frame in the video.
|
||||
vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
|
||||
Aspect ratio used when processing image features. The default value is "anyres_max_9".
|
||||
batch_num_images (`torch.LongTensor`, *optional*):
|
||||
Number of images in each sample.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
>>> import torch
|
||||
>>> from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration
|
||||
|
||||
>>> model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype="float16", device_map="cuda:0")
|
||||
>>> processor = LlavaOnevisionProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
|
||||
|
||||
>>> conversation = [
|
||||
... {
|
||||
... "role": "user",
|
||||
... "content": [
|
||||
... {"type": "text", "text": "What is shown in this image?"},
|
||||
... {"type": "image"},
|
||||
... ],
|
||||
... },
|
||||
... ]
|
||||
>>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
|
||||
|
||||
>>> image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
||||
>>> inputs = processor(text=prompt, images=raw_image, return_tensors='pt').to(0, torch.float16)
|
||||
|
||||
>>> output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||
>>> processor.batch_decode(output, skip_special_tokens=True)[0]
|
||||
"user\n\nWhat is shown in this image?\nassistant\ncat"
|
||||
```"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
vision_feature_layer = (
|
||||
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
|
||||
)
|
||||
vision_feature_select_strategy = (
|
||||
vision_feature_select_strategy
|
||||
if vision_feature_select_strategy is not None
|
||||
else self.config.vision_feature_select_strategy
|
||||
)
|
||||
vision_aspect_ratio = (
|
||||
vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
|
||||
)
|
||||
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
pixel_values_videos=pixel_values_videos,
|
||||
image_sizes=image_sizes,
|
||||
image_sizes_videos=image_sizes_videos,
|
||||
vision_aspect_ratio=vision_aspect_ratio,
|
||||
vision_feature_layer=vision_feature_layer,
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
batch_num_images=batch_num_images,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=True,
|
||||
cache_position=cache_position,
|
||||
logits_to_keep=logits_to_keep,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
||||
logits = self.lm_head(hidden_states[:, slice_indices, :])
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(
|
||||
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
|
||||
)
|
||||
|
||||
return LlavaOnevisionCausalLMOutputWithPast(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
past_key_values=outputs.past_key_values,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
image_hidden_states=outputs.image_hidden_states,
|
||||
video_hidden_states=outputs.video_hidden_states,
|
||||
)
|
||||
|
||||
def prepare_inputs_for_generation(
|
||||
self,
|
||||
input_ids,
|
||||
past_key_values=None,
|
||||
inputs_embeds=None,
|
||||
pixel_values=None,
|
||||
image_sizes=None,
|
||||
pixel_values_videos=None,
|
||||
image_sizes_videos=None,
|
||||
attention_mask=None,
|
||||
cache_position=None,
|
||||
logits_to_keep=None,
|
||||
**kwargs,
|
||||
):
|
||||
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
|
||||
|
||||
model_inputs = super().prepare_inputs_for_generation(
|
||||
input_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
cache_position=cache_position,
|
||||
logits_to_keep=logits_to_keep,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if cache_position[0] == 0:
|
||||
# If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
|
||||
# Otherwise we need pixel values to be passed to model
|
||||
model_inputs["pixel_values"] = pixel_values
|
||||
model_inputs["image_sizes"] = image_sizes
|
||||
model_inputs["pixel_values_videos"] = pixel_values_videos
|
||||
model_inputs["image_sizes_videos"] = image_sizes_videos
|
||||
|
||||
return model_inputs
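# Hedged note on the override above: pixel inputs are only forwarded on the prefill step
# (cache_position[0] == 0). During cached decoding the image/video placeholder tokens are no
# longer part of `input_ids`, so re-passing pixel values would be wasted work.
#
#     # step 0 (prefill):   model_inputs contains pixel_values / pixel_values_videos
#     # steps 1..N (decode): model_inputs omits them, relying on past_key_values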
|
||||
|
||||
|
||||
__all__ = [
|
||||
"LlavaOnevisionImageProcessorFast",
|
||||
"LlavaOnevisionModel",
|
||||
"LlavaOnevisionForConditionalGeneration",
|
||||
"LlavaOnevisionPreTrainedModel",
|
||||
]
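# Illustrative end-to-end sketch (editorial, not part of the module): multi-image inputs are passed
# as a nested list so the processor can emit `batch_num_images`, which in turn disables anyres
# patching for those samples. The message/prompt variables below are assumptions.
#
#     from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
#     processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
#     model = LlavaOnevisionForConditionalGeneration.from_pretrained(
#         "llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype="float16", device_map="auto"
#     )
#     inputs = processor(images=[[img_a, img_b]], text=prompt_with_two_image_tokens, return_tensors="pt")
#     out = model.generate(**inputs.to(model.device), max_new_tokens=32)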
|
|
@ -0,0 +1,355 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Processor class for LLaVa-Onevision.
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...image_processing_utils import select_best_resolution
|
||||
from ...image_utils import ImageInput, get_image_size, to_numpy_array
|
||||
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import logging
|
||||
from ...video_utils import VideoInput
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False):
|
||||
# see processing_utils.ProcessingKwargs documentation for usage.
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
"return_mm_token_type_ids": False,
|
||||
},
|
||||
"image_kwargs": {},
|
||||
"videos_kwargs": {},
|
||||
}
|
||||
|
||||
|
||||
class LlavaOnevisionProcessor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs a LLaVa-Onevision processor which wraps a LLaVa-Onevision video processor, a LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.
|
||||
|
||||
[`LlavaOnevisionProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaOnevisionImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~LlavaOnevisionVideoProcessor.__call__`], [`~LlavaOnevisionProcessor.__call__`] and [`~LlavaOnevisionProcessor.decode`] for more information.
|
||||
|
||||
Args:
|
||||
image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
|
||||
The image processor is a required input.
|
||||
tokenizer ([`LlamaTokenizerFast`], *optional*):
|
||||
The tokenizer is a required input.
|
||||
video_processor ([`LlavaOnevisionVideoProcessor`], *optional*):
|
||||
The video processor is a required input.
|
||||
num_image_tokens (`int`, *optional*):
|
||||
Number of image tokens for one image that will be returned by the vision tower.
|
||||
vision_feature_select_strategy (`str`, *optional*):
|
||||
The feature selection strategy used to select the vision feature from the vision backbone.
|
||||
Should be the same as in the model's config.
|
||||
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
|
||||
in a chat into a tokenizable string.
|
||||
image_token (`str`, *optional*, defaults to `"<image>"`):
|
||||
Special token used to denote image location.
|
||||
video_token (`str`, *optional*, defaults to `"<video>"`):
|
||||
Special token used to denote video location.
|
||||
vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
|
||||
Aspect ratio used when processing image features. The default value is "anyres_max_9".
|
||||
"""
|
||||
|
||||
attributes = ["image_processor", "tokenizer", "video_processor"]
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
video_processor_class = "AutoVideoProcessor"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_processor=None,
|
||||
tokenizer=None,
|
||||
video_processor=None,
|
||||
num_image_tokens=None,
|
||||
vision_feature_select_strategy=None,
|
||||
chat_template=None,
|
||||
image_token="<image>",
|
||||
video_token="<video>",
|
||||
vision_aspect_ratio="anyres_max_9",
|
||||
**kwargs,
|
||||
):
|
||||
self.num_image_tokens = num_image_tokens
|
||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
|
||||
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
|
||||
self.image_token_id = (
|
||||
tokenizer.image_token_id
|
||||
if getattr(tokenizer, "image_token_id", None)
|
||||
else tokenizer.convert_tokens_to_ids(self.image_token)
|
||||
)
|
||||
self.video_token_id = (
|
||||
tokenizer.video_token_id
|
||||
if getattr(tokenizer, "video_token_id", None)
|
||||
else tokenizer.convert_tokens_to_ids(self.video_token)
|
||||
)
|
||||
self.vision_aspect_ratio = vision_aspect_ratio
|
||||
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
images: ImageInput = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
audio=None,
|
||||
videos: VideoInput = None,
|
||||
**kwargs: Unpack[LlavaOnevisionProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
|
||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||
of the above two methods for more information.
|
||||
|
||||
Args:
|
||||
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
||||
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
|
||||
tensor. Both channels-first and channels-last formats are supported.
|
||||
text (`str`, `list[str]`, `list[list[str]]`):
|
||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
||||
The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor.
|
||||
|
||||
Returns:
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
|
||||
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
|
||||
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
|
||||
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
|
||||
`None`).
|
||||
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
|
||||
- **pixel_values_videos** -- Pixel values of a video input to be fed to a model. Returned when `videos` is not `None`.
|
||||
- **image_sizes** -- Size of each image that will be used to unpad an image. Returned when `images` is not `None`.
|
||||
"""
|
||||
|
||||
output_kwargs = self._merge_kwargs(
|
||||
LlavaOnevisionProcessorKwargs,
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(text, str):
|
||||
text = [text]
|
||||
elif not isinstance(text, list) and not isinstance(text[0], str):
|
||||
raise TypeError("Invalid input text. Please provide a string, or a list of strings")
|
||||
|
||||
image_inputs = video_inputs = {}
|
||||
|
||||
if images is not None:
|
||||
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
|
||||
|
||||
batch_num_images = iter(image_inputs["batch_num_images"])
|
||||
image_sizes = iter(image_inputs["image_sizes"])
|
||||
height, width = get_image_size(
|
||||
to_numpy_array(image_inputs["pixel_values"][0][0]),
|
||||
channel_dim=output_kwargs["images_kwargs"].get("data_format"),
|
||||
)
|
||||
text, num_image_tokens = self._expand_image_tokens(
|
||||
text, image_sizes, height, width, self.image_token, batch_num_images
|
||||
)
|
||||
|
||||
if videos is not None:
|
||||
video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
|
||||
|
||||
one_video = video_inputs.get("pixel_values_videos")[0]
|
||||
if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
|
||||
one_video = np.array(one_video)
|
||||
else:
|
||||
one_video = to_numpy_array(one_video)
|
||||
height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
|
||||
num_frames = one_video.shape[0] # frame dim is always after batch dim
|
||||
patches_height_width = int(math.sqrt(self.num_image_tokens))
|
||||
pooled_height_width = math.ceil(patches_height_width / 2)
|
||||
num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1 # +1 for newline token
|
||||
text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
|
||||
|
||||
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
||||
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
|
||||
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
||||
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
||||
|
||||
if return_mm_token_type_ids:
|
||||
array_ids = np.array(text_inputs["input_ids"])
|
||||
mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
|
||||
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
||||
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
||||
|
||||
return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
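# Hedged sketch of the token expansion performed above, assuming `num_image_tokens=729` (the
# default 384px / patch-14 SigLIP setting) and a 16-frame video:
#
#     patches_height_width = int(math.sqrt(729))  # 27
#     pooled_height_width  = math.ceil(27 / 2)    # 14
#     num_video_tokens     = 16 * 14 * 14 + 1     # 3137, +1 for the newline token
#
# Each "<video>" placeholder in the text is therefore repeated 3137 times before tokenization,
# while "<image>" placeholders are expanded per image via `_expand_image_tokens`.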
|
||||
|
||||
def _expand_image_tokens(
|
||||
self,
|
||||
text: list[TextInput],
|
||||
image_sizes: Iterable[Union[list[int], int]],
|
||||
height: int,
|
||||
width: int,
|
||||
special_token: str,
|
||||
batch_num_images: Iterable[int],
|
||||
):
|
||||
prompt_strings = []
|
||||
max_num_vision_tokens = 0
|
||||
for sample in text:
|
||||
if special_token in sample:
|
||||
is_multi_image = next(batch_num_images) != 1
|
||||
else:
|
||||
is_multi_image = False
|
||||
while special_token in sample:
|
||||
if is_multi_image:
|
||||
num_image_tokens = self.num_image_tokens + 1 # one for image_newline
|
||||
else:
|
||||
original_size = next(image_sizes)
|
||||
if not isinstance(original_size, (list, tuple)):
|
||||
# cast to list to avoid numerical precision errors when calculating unpadding
|
||||
original_size = original_size.tolist()
|
||||
orig_height, orig_width = original_size
|
||||
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
|
||||
max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens)
|
||||
if self.vision_feature_select_strategy == "default":
|
||||
num_image_tokens -= 1
|
||||
sample = sample.replace(special_token, "<placeholder>" * num_image_tokens, 1)
|
||||
prompt_strings.append(sample)
|
||||
text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
|
||||
return text, max_num_vision_tokens
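# Hedged note on the expansion above: samples that contain several images (multi-image mode) use a
# fixed `self.num_image_tokens + 1` tokens per image (base grid plus one newline), whereas
# single-image samples consume the anyres-dependent count from `_get_number_of_features`. With the
# "default" feature-selection strategy one token is subtracted to account for the dropped first
# position.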
|
||||
|
||||
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
|
||||
image_grid_pinpoints = self.image_processor.image_grid_pinpoints
|
||||
|
||||
height_best_resolution, width_best_resolution = select_best_resolution(
|
||||
[orig_height, orig_width], image_grid_pinpoints
|
||||
)
|
||||
scale_height, scale_width = height_best_resolution // height, width_best_resolution // width
|
||||
|
||||
patches_height = patches_width = int(math.sqrt(self.num_image_tokens))
|
||||
unpadded_features, newline_features = self._get_unpadded_features(
|
||||
orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
|
||||
)
|
||||
|
||||
# The base patch covers the entire image (no CLS for SigLIP)
|
||||
base_features = self.num_image_tokens
|
||||
num_image_tokens = unpadded_features + newline_features + base_features
|
||||
return num_image_tokens
|
||||
|
||||
# Adapted from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_unpadded_features
|
||||
def _get_unpadded_features(self, height, width, patches_height, patches_width, scale_height, scale_width):
|
||||
"""
|
||||
Get number of features for a given image with height/width. LLaVA-NeXT is different from LLaVA
|
||||
because it divides each image into patches depending on its resolution. Therefore we need to calculate how many
|
||||
patches an image is divided into and get the number of features from that.
|
||||
"""
|
||||
current_height = patches_height * scale_height
|
||||
current_width = patches_width * scale_width
|
||||
|
||||
original_aspect_ratio = width / height
|
||||
current_aspect_ratio = current_width / current_height
|
||||
if original_aspect_ratio > current_aspect_ratio:
|
||||
new_height = int(round(height * (current_width / width), 7))
|
||||
padding = (current_height - new_height) // 2
|
||||
current_height -= padding * 2
|
||||
else:
|
||||
new_width = int(round(width * (current_height / height), 7))
|
||||
padding = (current_width - new_width) // 2
|
||||
current_width -= padding * 2
|
||||
|
||||
unpadded_features = current_height * current_width
|
||||
newline_features = current_height
|
||||
|
||||
max_num_patches = int(self.vision_aspect_ratio.strip("anyres_max_"))
|
||||
ratio = math.sqrt(current_height * current_width / (max_num_patches * patches_height**2))
|
||||
if ratio > 1.1:
|
||||
unpadded_features = int(current_height // ratio) * int(current_width // ratio)
|
||||
newline_features = int(current_height // ratio)
|
||||
|
||||
return (unpadded_features, newline_features)
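# Worked example (illustrative; the chosen grid resolution is an assumption, not taken from the
# actual image_grid_pinpoints): orig_height=600, orig_width=800, patches_height=patches_width=27,
# and a best resolution of 768x1152, i.e. scale_height=2, scale_width=3.
#
#     current_height, current_width = 54, 81
#     original_aspect_ratio = 800 / 600 ~ 1.33 < 81 / 54 = 1.5   -> the width side is unpadded
#     new_width = int(round(800 * 54 / 600, 7)) = 72; padding = (81 - 72) // 2 = 4
#     current_width = 81 - 2 * 4 = 73
#     unpadded_features = 54 * 73 = 3942, newline_features = 54
#     ratio = sqrt(3942 / (9 * 27**2)) ~ 0.78 <= 1.1             -> no extra downscaling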
|
||||
|
||||
def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
|
||||
"""
|
||||
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
|
||||
Args:
|
||||
image_sizes (list[list[int]], *optional*):
The input sizes formatted as (height, width), one per image.
video_sizes (list[list[int]], *optional*):
The input sizes formatted as (num_frames, height, width), one per video.
audio_lengths (list[int], *optional*):
The input lengths, one per audio.
|
||||
Returns:
|
||||
dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
|
||||
to a list containing the number of placeholder tokens required. If the model doesn't accept
|
||||
a certain modality or no input sizes are provided, the dict value is set to an empty list.
|
||||
"""
|
||||
vision_data = {}
|
||||
if image_sizes is not None:
|
||||
images_kwargs = LlavaOnevisionProcessorKwargs._defaults.get("images_kwargs", {})
|
||||
images_kwargs.update(kwargs)
|
||||
|
||||
size = images_kwargs.get("size", None) or self.image_processor.size
|
||||
size = (
|
||||
(size["shortest_edge"], size["shortest_edge"])
|
||||
if "shortest_edge" in size
|
||||
else (min(size["height"], size["width"]), min(size["height"], size["width"]))
|
||||
)
|
||||
processed_height, processed_width = size
|
||||
|
||||
batch_num_image_tokens = []
|
||||
num_image_patches = [1] * len(image_sizes)  # llava-ov doesn't batch pixels as Idefics does, thus `1` patch
|
||||
for image_size in image_sizes:
|
||||
orig_height, orig_width = image_size
|
||||
num_image_tokens = self._get_number_of_features(
|
||||
orig_height, orig_width, processed_height, processed_width
|
||||
)
|
||||
if self.vision_feature_select_strategy == "default":
|
||||
num_image_tokens -= 1
|
||||
batch_num_image_tokens.append(num_image_tokens)
|
||||
vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})
|
||||
|
||||
return MultiModalData(**vision_data)
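# Hedged usage sketch: given two images of size (600, 800) and (384, 384), the method above returns
# something like
#
#     MultiModalData(num_image_tokens=[n0, n1], num_image_patches=[1, 1])
#
# where each n_i comes from `_get_number_of_features` (minus one token under the "default"
# feature-selection strategy). The exact values depend on `image_grid_pinpoints`, so none are
# hard-coded here.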
|
||||
|
||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
||||
def batch_decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
|
||||
refer to the docstring of this method for more information.
|
||||
"""
|
||||
return self.tokenizer.batch_decode(*args, **kwargs)
|
||||
|
||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
|
||||
def decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
|
||||
the docstring of this method for more information.
|
||||
"""
|
||||
return self.tokenizer.decode(*args, **kwargs)
|
||||
|
||||
@property
|
||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
|
||||
def model_input_names(self):
|
||||
tokenizer_input_names = self.tokenizer.model_input_names
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
|
||||
|
||||
__all__ = ["LlavaOnevisionProcessor"]
|
|
@ -0,0 +1,58 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Video processor class for LLaVa-Onevision."""
|
||||
|
||||
from ...image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
)
|
||||
from ...processing_utils import Unpack, VideosKwargs
|
||||
from ...utils import is_vision_available
|
||||
from ...utils.import_utils import requires
|
||||
from ...video_processing_utils import (
|
||||
BaseVideoProcessor,
|
||||
)
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from ...image_utils import PILImageResampling
|
||||
|
||||
|
||||
class LlavaOnevisionFastVideoProcessorInitKwargs(VideosKwargs): ...
|
||||
|
||||
|
||||
@requires(backends=("torchvision",))
|
||||
class LlavaOnevisionVideoProcessor(BaseVideoProcessor):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = OPENAI_CLIP_MEAN
|
||||
image_std = OPENAI_CLIP_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
rescale_factor = 1 / 255
|
||||
default_to_square = False
|
||||
crop_size = None
|
||||
do_resize = True
|
||||
do_center_crop = None
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_convert_rgb = True
|
||||
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
|
||||
valid_kwargs = LlavaOnevisionFastVideoProcessorInitKwargs
|
||||
model_input_names = ["pixel_values_videos"]
|
||||
|
||||
def __init__(self, **kwargs: Unpack[LlavaOnevisionFastVideoProcessorInitKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
__all__ = ["LlavaOnevisionVideoProcessor"]
|