# coding=utf-8
# Copyright 2025 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union

import numpy as np

from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...video_utils import VideoInput, make_batched_videos


class InternVLImagesKwargs(ImagesKwargs, total=False):
    # Extra, optional image-processing knobs specific to InternVL; `total=False`
    # means callers may omit any of them.
    # crop_to_patches: whether to split each image into multiple patches.
    crop_to_patches: Optional[bool]
    # min_patches / max_patches: bounds on the number of patches produced per
    # image when cropping is enabled.
    min_patches: Optional[int]
    max_patches: Optional[int]


class InternVLProcessorKwargs(ProcessingKwargs, total=False):
    # Merged kwargs schema for the processor; images accept the InternVL-specific
    # options declared above.
    images_kwargs: InternVLImagesKwargs

    # Defaults applied by `ProcessorMixin` when the caller does not override them.
    _defaults = {
        "text_kwargs": {
            "padding_side": "left",
            "return_mm_token_type_ids": False,
        },
        "images_kwargs": {
            # Patch-cropping is on by default for InternVL.
            "crop_to_patches": True,
        },
        "videos_kwargs": {},
    }


class InternVLProcessor(ProcessorMixin):
    r"""
    Constructs a InternVL processor which wraps a [`AutoImageProcessor`] and
    [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~InternVLProcessor.__call__`] and [`~InternVLProcessor.decode`] for
    more information.

    Args:
        image_processor ([`AutoImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
            The tokenizer is a required input.
        video_processor ([`AutoVideoProcessor`], *optional*):
            The video processor is a required input.
        image_seq_length (`int`, *optional*, defaults to 256):
            The number of image token to use per image patch. it should be set so that:
            image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2)
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
    """

    # Sub-processors managed by ProcessorMixin (positional order matters for
    # `super().__init__` below).
    attributes = ["image_processor", "tokenizer", "video_processor"]
    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        video_processor=None,
        image_seq_length: int = 256,
        chat_template=None,
        **kwargs,
    ):
        # Number of placeholder tokens emitted per image patch.
        self.image_seq_length = image_seq_length
        # Special tokens (and their ids) are read off the tokenizer, which is
        # expected to define them; a None tokenizer would raise AttributeError here.
        self.start_image_token = tokenizer.start_image_token
        self.end_image_token = tokenizer.end_image_token
        self.start_image_token_id = tokenizer.start_image_token_id
        self.end_image_token_id = tokenizer.end_image_token_id
        self.image_token = tokenizer.context_image_token
        self.video_token = tokenizer.video_token
        self.image_token_id = tokenizer.context_image_token_id
        # All token ids that mark image content (content + start/end delimiters).
        self.image_ids = [self.image_token_id, self.start_image_token_id, self.end_image_token_id]

        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template, **kwargs)

    def _insert_media_placeholders(
        self,
        text: list[str],
        image_pixel_values,
        video_pixel_values,
        image_num_patches: list[int],
        video_num_patches: list[int],
        image_num_patches_indices: np.ndarray,
        video_num_patches_indices: np.ndarray,
        video_patch_indices: np.ndarray,
    ):
        """
        Processes interleaved text with and