# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for SmolVLM.
"""

from datetime import timedelta
from typing import TYPE_CHECKING, Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import AllKwargsForChatTemplate, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, TextInput
from ...utils import is_num2words_available, is_vision_available, logging
from ...video_utils import VideoInput


# The video prompt constants are imported only when `is_vision_available()`
# reports True. This guarded import previously appeared twice in a row; one
# copy is enough.
if is_vision_available():
    from .video_processing_smolvlm import (
        DEFAULT_MEDIA_OUTTRO,
        DEFAULT_VIDEO_INTRO,
        FRAME_TIMESTAMP_MESSAGE,
    )

# Imported for type checkers only, so it is never loaded at runtime.
if TYPE_CHECKING:
    from ...tokenization_utils_base import PreTokenizedInput

logger = logging.get_logger(__name__)


# `num2words` is optional: the name is bound to None when
# `is_num2words_available()` is False, so it always exists at module level.
if is_num2words_available():
    from num2words import num2words
else:
    num2words = None

# NOTE(review): the DEFAULT_CHAT_TEMPLATE assignment below is cut off at the
# end of this line in the reviewed text (the string continues on the following
# line), so the fragment is kept verbatim on its original comment line rather
# than reconstructed. Restore it as a real assignment together with its
# continuation. The empty `{{ '' }}` in the image branch looks like a
# placeholder token that was lost -- TODO confirm against the upstream file.
# The correct chat template to be used for videos after #38105 DEFAULT_CHAT_TEMPLATE = "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '' }}{% elif line['type'] == 'video' 
%}{{ '