Adding all project files

parent 6c9e127bdc
commit cd4316ad0f

42289 changed files with 8009643 additions and 0 deletions
@@ -0,0 +1,30 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_mobilenet_v1 import *
    from .feature_extraction_mobilenet_v1 import *
    from .image_processing_mobilenet_v1 import *
    from .image_processing_mobilenet_v1_fast import *
    from .modeling_mobilenet_v1 import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
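This `__init__` only wires up lazy imports, so the classes defined in the submodules below are re-exported through the top-level `transformers` namespace. A minimal, hedged sketch of what that enables (assumes a `transformers` install that includes this model):

```python
# Hedged sketch: relies only on the lazy re-exports defined in the __init__ above.
from transformers import MobileNetV1Config, MobileNetV1Model

config = MobileNetV1Config(depth_multiplier=0.5)  # a smaller "alpha=0.5" variant
model = MobileNetV1Model(config)
print(model.config.depth_multiplier)  # 0.5
```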
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,126 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MobileNetV1 model configuration"""

from collections import OrderedDict
from collections.abc import Mapping

from packaging import version

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class MobileNetV1Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MobileNetV1Model`]. It is used to instantiate a
    MobileNetV1 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MobileNetV1
    [google/mobilenet_v1_1.0_224](https://huggingface.co/google/mobilenet_v1_1.0_224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        depth_multiplier (`float`, *optional*, defaults to 1.0):
            Shrinks or expands the number of channels in each layer. Default is 1.0, which starts the network with 32
            channels. This is sometimes also called "alpha" or "width multiplier".
        min_depth (`int`, *optional*, defaults to 8):
            All layers will have at least this many channels.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu6"`):
            The non-linear activation function (function or string) in the convolution layers.
        tf_padding (`bool`, *optional*, defaults to `True`):
            Whether to use TensorFlow padding rules on the convolution layers.
        classifier_dropout_prob (`float`, *optional*, defaults to 0.999):
            The dropout ratio for attached classifiers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 0.001):
            The epsilon used by the layer normalization layers.

    Example:

    ```python
    >>> from transformers import MobileNetV1Config, MobileNetV1Model

    >>> # Initializing a "mobilenet_v1_1.0_224" style configuration
    >>> configuration = MobileNetV1Config()

    >>> # Initializing a model from the "mobilenet_v1_1.0_224" style configuration
    >>> model = MobileNetV1Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "mobilenet_v1"

    def __init__(
        self,
        num_channels=3,
        image_size=224,
        depth_multiplier=1.0,
        min_depth=8,
        hidden_act="relu6",
        tf_padding=True,
        classifier_dropout_prob=0.999,
        initializer_range=0.02,
        layer_norm_eps=0.001,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if depth_multiplier <= 0:
            raise ValueError("depth_multiplier must be greater than zero.")

        self.num_channels = num_channels
        self.image_size = image_size
        self.depth_multiplier = depth_multiplier
        self.min_depth = min_depth
        self.hidden_act = hidden_act
        self.tf_padding = tf_padding
        self.classifier_dropout_prob = classifier_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps


class MobileNetV1OnnxConfig(OnnxConfig):
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict([("pixel_values", {0: "batch"})])

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        if self.task == "image-classification":
            return OrderedDict([("logits", {0: "batch"})])
        else:
            return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})])

    @property
    def atol_for_validation(self) -> float:
        return 1e-4


__all__ = ["MobileNetV1Config", "MobileNetV1OnnxConfig"]
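The only behavioural pieces in the configuration above are the `depth_multiplier` check and the ONNX config. A hedged sketch of how the multiplier shapes channel widths, mirroring the `max(int(depth * depth_multiplier), min_depth)` rule that the modeling file further down applies (the `block_width` helper is just an illustration, not part of the library):

```python
# Hedged sketch: channel width per block under a given depth_multiplier,
# following the max(int(depth * multiplier), min_depth) rule used by MobileNetV1Model.
def block_width(base_depth: int, depth_multiplier: float, min_depth: int = 8) -> int:
    return max(int(base_depth * depth_multiplier), min_depth)

for alpha in (1.0, 0.5, 0.25):
    print(alpha, [block_width(d, alpha) for d in (32, 64, 128, 256, 512, 1024)])
# 1.0  -> [32, 64, 128, 256, 512, 1024]
# 0.5  -> [16, 32, 64, 128, 256, 512]
# 0.25 -> [8, 16, 32, 64, 128, 256]
```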
@@ -0,0 +1,38 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature extractor class for MobileNetV1."""

import warnings

from ...utils import logging
from ...utils.import_utils import requires
from .image_processing_mobilenet_v1 import MobileNetV1ImageProcessor


logger = logging.get_logger(__name__)


@requires(backends=("vision",))
class MobileNetV1FeatureExtractor(MobileNetV1ImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        warnings.warn(
            "The class MobileNetV1FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use MobileNetV1ImageProcessor instead.",
            FutureWarning,
        )
        super().__init__(*args, **kwargs)


__all__ = ["MobileNetV1FeatureExtractor"]
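Since the feature extractor above is only a deprecation shim over the image processor, existing code keeps working but now emits a `FutureWarning`. A hedged sketch of the migration path (assumes the vision backend, i.e. Pillow, is installed):

```python
# Hedged sketch: the deprecated alias behaves exactly like the image processor it subclasses.
import warnings
from transformers import MobileNetV1FeatureExtractor, MobileNetV1ImageProcessor

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = MobileNetV1FeatureExtractor()  # emits FutureWarning at construction
assert any(issubclass(w.category, FutureWarning) for w in caught)

processor = MobileNetV1ImageProcessor()  # preferred replacement
assert isinstance(legacy, MobileNetV1ImageProcessor)
```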
@@ -0,0 +1,307 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for MobileNetV1."""

from typing import Optional, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
    get_resize_output_image_size,
    resize,
    to_channel_dimension_format,
)
from ...image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)


@requires(backends=("vision",))
class MobileNetV1ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a MobileNetV1 image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`dict[str, int]`, *optional*, defaults to `{"shortest_edge": 256}`):
            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
            is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the
            `preprocess` method.
        crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
            Can be overridden by the `crop_size` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_center_crop: bool = True,
        crop_size: Optional[dict[str, int]] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"shortest_edge": 256}
        size = get_size_dict(size, default_to_square=False)
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        crop_size = get_size_dict(crop_size)
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD

    # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
    def resize(
        self,
        image: np.ndarray,
        size: dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
        resized to keep the input aspect ratio.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]`):
                Size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        default_to_square = True
        if "shortest_edge" in size:
            size = size["shortest_edge"]
            default_to_square = False
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

        output_size = get_resize_output_image_size(
            image,
            size=size,
            default_to_square=default_to_square,
            input_data_format=input_data_format,
        )
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    @filter_out_non_signature_kwargs()
    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_center_crop: Optional[bool] = None,
        crop_size: Optional[dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
                an effect if `do_resize` is set to `True`.
            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
                Whether to center crop the image.
            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        size = get_size_dict(size, default_to_square=False)
        resample = resample if resample is not None else self.resample
        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
        crop_size = crop_size if crop_size is not None else self.crop_size
        crop_size = get_size_dict(crop_size)
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )
        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_center_crop=do_center_crop,
            crop_size=crop_size,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        all_images = []
        for image in images:
            if do_resize:
                image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)

            if do_center_crop:
                image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)

            if do_rescale:
                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )

            all_images.append(image)
        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            for image in all_images
        ]

        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)


__all__ = ["MobileNetV1ImageProcessor"]
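A hedged end-to-end sketch of the preprocessing pipeline documented above (resize the shortest edge to 256, center crop to 224, rescale, normalize); the random array stands in for a real image and `torch` is assumed to be installed for the `"pt"` return type:

```python
# Hedged sketch: exercises MobileNetV1ImageProcessor.preprocess on dummy data.
import numpy as np
from transformers import MobileNetV1ImageProcessor

processor = MobileNetV1ImageProcessor()
image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)  # HWC uint8 "photo"

batch = processor(images=image, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224]) after resize + center crop
```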
@@ -0,0 +1,43 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for MobileNetV1."""

from ...image_processing_utils_fast import (
    BaseImageProcessorFast,
    DefaultFastImageProcessorKwargs,
    Unpack,
)
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling
from ...utils import auto_docstring


@auto_docstring
class MobileNetV1ImageProcessorFast(BaseImageProcessorFast):
    resample = PILImageResampling.BILINEAR
    image_mean = IMAGENET_STANDARD_MEAN
    image_std = IMAGENET_STANDARD_STD
    size = {"shortest_edge": 256}
    default_to_square = False
    crop_size = {"height": 224, "width": 224}
    do_resize = True
    do_center_crop = True
    do_rescale = True
    do_normalize = True

    def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> None:
        super().__init__(**kwargs)


__all__ = ["MobileNetV1ImageProcessorFast"]
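The fast variant only overrides class-level defaults; all the actual resize/crop/rescale/normalize logic comes from `BaseImageProcessorFast`, which operates on torch tensors. A hedged sketch of using it directly on tensor inputs (exact input handling depends on `BaseImageProcessorFast`, so treat the shapes as an assumption):

```python
# Hedged sketch: the fast processor shares defaults with the slow one but works on torch tensors.
import torch
from transformers import MobileNetV1ImageProcessorFast

fast_processor = MobileNetV1ImageProcessorFast()
images = [torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8) for _ in range(2)]  # CHW uint8

batch = fast_processor(images=images, return_tensors="pt")
print(batch["pixel_values"].shape)  # expected: torch.Size([2, 3, 224, 224])
```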
@@ -0,0 +1,434 @@
# coding=utf-8
# Copyright 2022 Apple Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch MobileNetV1 model."""

from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_mobilenet_v1 import MobileNetV1Config


logger = logging.get_logger(__name__)


def _build_tf_to_pytorch_map(model, config, tf_weights=None):
    """
    A map of modules from TF to PyTorch.
    """

    tf_to_pt_map = {}

    if isinstance(model, MobileNetV1ForImageClassification):
        backbone = model.mobilenet_v1
    else:
        backbone = model

    prefix = "MobilenetV1/Conv2d_0/"
    tf_to_pt_map[prefix + "weights"] = backbone.conv_stem.convolution.weight
    tf_to_pt_map[prefix + "BatchNorm/beta"] = backbone.conv_stem.normalization.bias
    tf_to_pt_map[prefix + "BatchNorm/gamma"] = backbone.conv_stem.normalization.weight
    tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = backbone.conv_stem.normalization.running_mean
    tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = backbone.conv_stem.normalization.running_var

    for i in range(13):
        tf_index = i + 1
        pt_index = i * 2

        pointer = backbone.layer[pt_index]
        prefix = f"MobilenetV1/Conv2d_{tf_index}_depthwise/"
        tf_to_pt_map[prefix + "depthwise_weights"] = pointer.convolution.weight
        tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias
        tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight
        tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean
        tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var

        pointer = backbone.layer[pt_index + 1]
        prefix = f"MobilenetV1/Conv2d_{tf_index}_pointwise/"
        tf_to_pt_map[prefix + "weights"] = pointer.convolution.weight
        tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias
        tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight
        tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean
        tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var

    if isinstance(model, MobileNetV1ForImageClassification):
        prefix = "MobilenetV1/Logits/Conv2d_1c_1x1/"
        tf_to_pt_map[prefix + "weights"] = model.classifier.weight
        tf_to_pt_map[prefix + "biases"] = model.classifier.bias

    return tf_to_pt_map


def load_tf_weights_in_mobilenet_v1(model, config, tf_checkpoint_path):
    """Load TensorFlow checkpoints in a PyTorch model."""
    try:
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_checkpoint_path)
    tf_weights = {}
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_checkpoint_path, name)
        tf_weights[name] = array

    # Build TF to PyTorch weights loading map
    tf_to_pt_map = _build_tf_to_pytorch_map(model, config, tf_weights)

    for name, pointer in tf_to_pt_map.items():
        logger.info(f"Importing {name}")
        if name not in tf_weights:
            logger.info(f"{name} not in tf pre-trained weights, skipping")
            continue

        array = tf_weights[name]

        if "depthwise_weights" in name:
            logger.info("Transposing depthwise")
            array = np.transpose(array, (2, 3, 0, 1))
        elif "weights" in name:
            logger.info("Transposing")
            if len(pointer.shape) == 2:  # copying into linear layer
                array = array.squeeze().transpose()
            else:
                array = np.transpose(array, (3, 2, 0, 1))

        if pointer.shape != array.shape:
            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")

        logger.info(f"Initialize PyTorch weight {name} {array.shape}")
        pointer.data = torch.from_numpy(array)

        tf_weights.pop(name, None)
        tf_weights.pop(name + "/RMSProp", None)
        tf_weights.pop(name + "/RMSProp_1", None)
        tf_weights.pop(name + "/ExponentialMovingAverage", None)

    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
    return model


def apply_tf_padding(features: torch.Tensor, conv_layer: nn.Conv2d) -> torch.Tensor:
    """
    Apply TensorFlow-style "SAME" padding to a convolution layer. See the notes at:
    https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2
    """
    in_height, in_width = features.shape[-2:]
    stride_height, stride_width = conv_layer.stride
    kernel_height, kernel_width = conv_layer.kernel_size

    if in_height % stride_height == 0:
        pad_along_height = max(kernel_height - stride_height, 0)
    else:
        pad_along_height = max(kernel_height - (in_height % stride_height), 0)

    if in_width % stride_width == 0:
        pad_along_width = max(kernel_width - stride_width, 0)
    else:
        pad_along_width = max(kernel_width - (in_width % stride_width), 0)

    pad_left = pad_along_width // 2
    pad_right = pad_along_width - pad_left
    pad_top = pad_along_height // 2
    pad_bottom = pad_along_height - pad_top

    padding = (pad_left, pad_right, pad_top, pad_bottom)
    return nn.functional.pad(features, padding, "constant", 0.0)


class MobileNetV1ConvLayer(nn.Module):
    def __init__(
        self,
        config: MobileNetV1Config,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: Optional[int] = 1,
        groups: Optional[int] = 1,
        bias: bool = False,
        use_normalization: Optional[bool] = True,
        use_activation: Optional[bool or str] = True,
    ) -> None:
        super().__init__()
        self.config = config

        if in_channels % groups != 0:
            raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
        if out_channels % groups != 0:
            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")

        padding = 0 if config.tf_padding else int((kernel_size - 1) / 2)

        self.convolution = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=bias,
            padding_mode="zeros",
        )

        if use_normalization:
            self.normalization = nn.BatchNorm2d(
                num_features=out_channels,
                eps=config.layer_norm_eps,
                momentum=0.9997,
                affine=True,
                track_running_stats=True,
            )
        else:
            self.normalization = None

        if use_activation:
            if isinstance(use_activation, str):
                self.activation = ACT2FN[use_activation]
            elif isinstance(config.hidden_act, str):
                self.activation = ACT2FN[config.hidden_act]
            else:
                self.activation = config.hidden_act
        else:
            self.activation = None

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        if self.config.tf_padding:
            features = apply_tf_padding(features, self.convolution)
        features = self.convolution(features)
        if self.normalization is not None:
            features = self.normalization(features)
        if self.activation is not None:
            features = self.activation(features)
        return features


@auto_docstring
class MobileNetV1PreTrainedModel(PreTrainedModel):
    config: MobileNetV1Config
    load_tf_weights = load_tf_weights_in_mobilenet_v1
    base_model_prefix = "mobilenet_v1"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.BatchNorm2d):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class MobileNetV1Model(MobileNetV1PreTrainedModel):
    def __init__(self, config: MobileNetV1Config, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        depth = 32
        out_channels = max(int(depth * config.depth_multiplier), config.min_depth)

        self.conv_stem = MobileNetV1ConvLayer(
            config,
            in_channels=config.num_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=2,
        )

        strides = [1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1]

        self.layer = nn.ModuleList()
        for i in range(13):
            in_channels = out_channels

            if strides[i] == 2 or i == 0:
                depth *= 2
                out_channels = max(int(depth * config.depth_multiplier), config.min_depth)

            self.layer.append(
                MobileNetV1ConvLayer(
                    config,
                    in_channels=in_channels,
                    out_channels=in_channels,
                    kernel_size=3,
                    stride=strides[i],
                    groups=in_channels,
                )
            )

            self.layer.append(
                MobileNetV1ConvLayer(
                    config,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=1,
                )
            )

        self.pooler = nn.AdaptiveAvgPool2d((1, 1)) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.conv_stem(pixel_values)

        all_hidden_states = () if output_hidden_states else None

        for i, layer_module in enumerate(self.layer):
            hidden_states = layer_module(hidden_states)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        last_hidden_state = hidden_states

        if self.pooler is not None:
            pooled_output = torch.flatten(self.pooler(last_hidden_state), start_dim=1)
        else:
            pooled_output = None

        if not return_dict:
            return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None)

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=all_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """
)
class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel):
    def __init__(self, config: MobileNetV1Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mobilenet_v1 = MobileNetV1Model(config)

        last_hidden_size = self.mobilenet_v1.layer[-1].convolution.out_channels

        # Classifier head
        self.dropout = nn.Dropout(config.classifier_dropout_prob, inplace=True)
        self.classifier = nn.Linear(last_hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilenet_v1(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(self.dropout(pooled_output))

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )


__all__ = [
    "MobileNetV1ForImageClassification",
    "MobileNetV1Model",
    "MobileNetV1PreTrainedModel",
    "load_tf_weights_in_mobilenet_v1",
]
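Putting the image processor and the classification model above together, a hedged inference sketch against the published checkpoint named in the configuration docstring (network access plus `torch` and Pillow are assumed; `cat.jpg` is a placeholder path):

```python
# Hedged sketch: end-to-end inference with the classes added in this commit.
import torch
from PIL import Image
from transformers import MobileNetV1ForImageClassification, MobileNetV1ImageProcessor

processor = MobileNetV1ImageProcessor.from_pretrained("google/mobilenet_v1_1.0_224")
model = MobileNetV1ForImageClassification.from_pretrained("google/mobilenet_v1_1.0_224")

image = Image.open("cat.jpg")  # placeholder path; any RGB image works
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
predicted_class = logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
```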