Adding all project files

Martina Burlando 2025-08-02 02:00:33 +02:00
parent 6c9e127bdc
commit cd4316ad0f
42289 changed files with 8009643 additions and 0 deletions


@@ -0,0 +1,30 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_mobilenet_v1 import *
from .feature_extraction_mobilenet_v1 import *
from .image_processing_mobilenet_v1 import *
from .image_processing_mobilenet_v1_fast import *
from .modeling_mobilenet_v1 import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
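This `__init__` registers a `_LazyModule` so that submodule imports are deferred until an attribute is first accessed. A rough sketch of the underlying idea, using the PEP 562 module-level `__getattr__` hook rather than the real `_LazyModule` internals:

```python
# Simplified sketch of lazy loading via PEP 562; the real _LazyModule instead
# derives its attribute map from define_import_structure(_file).
import importlib

_SUBMODULES = {"MobileNetV1Config": "configuration_mobilenet_v1"}  # hand-written for illustration

def __getattr__(name):
    if name in _SUBMODULES:
        module = importlib.import_module(f".{_SUBMODULES[name]}", __name__)
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```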


@@ -0,0 +1,126 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MobileNetV1 model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
class MobileNetV1Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MobileNetV1Model`]. It is used to instantiate a
MobileNetV1 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MobileNetV1
[google/mobilenet_v1_1.0_224](https://huggingface.co/google/mobilenet_v1_1.0_224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
depth_multiplier (`float`, *optional*, defaults to 1.0):
Shrinks or expands the number of channels in each layer. A value of 1.0 starts the network with 32
channels. This is sometimes also called "alpha" or the "width multiplier".
min_depth (`int`, *optional*, defaults to 8):
All layers will have at least this many channels.
hidden_act (`str` or `function`, *optional*, defaults to `"relu6"`):
The non-linear activation function (function or string) in the Transformer encoder and convolution layers.
tf_padding (`bool`, *optional*, defaults to `True`):
Whether to use TensorFlow padding rules on the convolution layers.
classifier_dropout_prob (`float`, *optional*, defaults to 0.999):
The dropout ratio for attached classifiers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 0.001):
The epsilon used by the normalization layers (batch normalization in this model).
Example:
```python
>>> from transformers import MobileNetV1Config, MobileNetV1Model
>>> # Initializing a "mobilenet_v1_1.0_224" style configuration
>>> configuration = MobileNetV1Config()
>>> # Initializing a model from the "mobilenet_v1_1.0_224" style configuration
>>> model = MobileNetV1Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "mobilenet_v1"
def __init__(
self,
num_channels=3,
image_size=224,
depth_multiplier=1.0,
min_depth=8,
hidden_act="relu6",
tf_padding=True,
classifier_dropout_prob=0.999,
initializer_range=0.02,
layer_norm_eps=0.001,
**kwargs,
):
super().__init__(**kwargs)
if depth_multiplier <= 0:
raise ValueError("depth_multiplier must be greater than zero.")
self.num_channels = num_channels
self.image_size = image_size
self.depth_multiplier = depth_multiplier
self.min_depth = min_depth
self.hidden_act = hidden_act
self.tf_padding = tf_padding
self.classifier_dropout_prob = classifier_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
class MobileNetV1OnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict([("pixel_values", {0: "batch"})])
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "image-classification":
return OrderedDict([("logits", {0: "batch"})])
else:
return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})])
@property
def atol_for_validation(self) -> float:
return 1e-4
__all__ = ["MobileNetV1Config", "MobileNetV1OnnxConfig"]


@@ -0,0 +1,38 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature extractor class for MobileNetV1."""
import warnings
from ...utils import logging
from ...utils.import_utils import requires
from .image_processing_mobilenet_v1 import MobileNetV1ImageProcessor
logger = logging.get_logger(__name__)
@requires(backends=("vision",))
class MobileNetV1FeatureExtractor(MobileNetV1ImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class MobileNetV1FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use MobileNetV1ImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
__all__ = ["MobileNetV1FeatureExtractor"]


@@ -0,0 +1,307 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for MobileNetV1."""
from typing import Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, logging
from ...utils.import_utils import requires
logger = logging.get_logger(__name__)
@requires(backends=("vision",))
class MobileNetV1ImageProcessor(BaseImageProcessor):
r"""
Constructs a MobileNetV1 image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`dict[str, int]`, *optional*, defaults to `{"shortest_edge": 256}`):
Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the
`preprocess` method.
crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
Can be overridden by the `crop_size` parameter in the `preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Optional[dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_center_crop: bool = True,
crop_size: Optional[dict[str, int]] = None,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 256}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
def resize(
self,
image: np.ndarray,
size: dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
resized to keep the input aspect ratio.
Args:
image (`np.ndarray`):
Image to resize.
size (`dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
default_to_square = True
if "shortest_edge" in size:
size = size["shortest_edge"]
default_to_square = False
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
output_size = get_resize_output_image_size(
image,
size=size,
default_to_square=default_to_square,
input_data_format=input_data_format,
)
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
@filter_out_non_signature_kwargs()
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Optional[dict[str, int]] = None,
resample: Optional[PILImageResampling] = None,
do_center_crop: Optional[bool] = None,
crop_size: Optional[dict[str, int]] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio.
resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
`PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values to [0, 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use if `do_normalize` is set to `True`.
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use if `do_normalize` is set to `True`.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size)
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
images = make_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_resize=do_resize,
size=size,
resample=resample,
)
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_rescale and is_scaled_image(images[0]):
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
all_images = []
for image in images:
if do_resize:
image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
if do_center_crop:
image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
if do_normalize:
image = self.normalize(
image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
)
all_images.append(image)
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in all_images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
__all__ = ["MobileNetV1ImageProcessor"]


@@ -0,0 +1,43 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for MobileNetV1."""
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
Unpack,
)
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling
from ...utils import auto_docstring
@auto_docstring
class MobileNetV1ImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
image_mean = IMAGENET_STANDARD_MEAN
image_std = IMAGENET_STANDARD_STD
size = {"shortest_edge": 256}
default_to_square = False
crop_size = {"height": 224, "width": 224}
do_resize = True
do_center_crop = True
do_rescale = True
do_normalize = True
def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> None:
super().__init__(**kwargs)
__all__ = ["MobileNetV1ImageProcessorFast"]


@@ -0,0 +1,434 @@
# coding=utf-8
# Copyright 2022 Apple Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch MobileNetV1 model."""
from typing import Optional, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_mobilenet_v1 import MobileNetV1Config
logger = logging.get_logger(__name__)
def _build_tf_to_pytorch_map(model, config, tf_weights=None):
"""
A map of modules from TF to PyTorch.
"""
tf_to_pt_map = {}
if isinstance(model, MobileNetV1ForImageClassification):
backbone = model.mobilenet_v1
else:
backbone = model
prefix = "MobilenetV1/Conv2d_0/"
tf_to_pt_map[prefix + "weights"] = backbone.conv_stem.convolution.weight
tf_to_pt_map[prefix + "BatchNorm/beta"] = backbone.conv_stem.normalization.bias
tf_to_pt_map[prefix + "BatchNorm/gamma"] = backbone.conv_stem.normalization.weight
tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = backbone.conv_stem.normalization.running_mean
tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = backbone.conv_stem.normalization.running_var
for i in range(13):
tf_index = i + 1
pt_index = i * 2
pointer = backbone.layer[pt_index]
prefix = f"MobilenetV1/Conv2d_{tf_index}_depthwise/"
tf_to_pt_map[prefix + "depthwise_weights"] = pointer.convolution.weight
tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias
tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight
tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean
tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var
pointer = backbone.layer[pt_index + 1]
prefix = f"MobilenetV1/Conv2d_{tf_index}_pointwise/"
tf_to_pt_map[prefix + "weights"] = pointer.convolution.weight
tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias
tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight
tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean
tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var
if isinstance(model, MobileNetV1ForImageClassification):
prefix = "MobilenetV1/Logits/Conv2d_1c_1x1/"
tf_to_pt_map[prefix + "weights"] = model.classifier.weight
tf_to_pt_map[prefix + "biases"] = model.classifier.bias
return tf_to_pt_map
def load_tf_weights_in_mobilenet_v1(model, config, tf_checkpoint_path):
"""Load TensorFlow checkpoints in a PyTorch model."""
try:
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
# Load weights from TF model
init_vars = tf.train.list_variables(tf_checkpoint_path)
tf_weights = {}
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_checkpoint_path, name)
tf_weights[name] = array
# Build TF to PyTorch weights loading map
tf_to_pt_map = _build_tf_to_pytorch_map(model, config, tf_weights)
for name, pointer in tf_to_pt_map.items():
logger.info(f"Importing {name}")
if name not in tf_weights:
logger.info(f"{name} not in tf pre-trained weights, skipping")
continue
array = tf_weights[name]
if "depthwise_weights" in name:
logger.info("Transposing depthwise")
array = np.transpose(array, (2, 3, 0, 1))
elif "weights" in name:
logger.info("Transposing")
if len(pointer.shape) == 2: # copying into linear layer
array = array.squeeze().transpose()
else:
array = np.transpose(array, (3, 2, 0, 1))
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
logger.info(f"Initialize PyTorch weight {name} {array.shape}")
pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None)
tf_weights.pop(name + "/RMSProp", None)
tf_weights.pop(name + "/RMSProp_1", None)
tf_weights.pop(name + "/ExponentialMovingAverage", None)
logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
return model
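A hedged conversion sketch; the checkpoint path below is a placeholder, and `num_labels=1001` reflects the extra background class in the original TF-Slim checkpoints (an assumption worth verifying against your checkpoint):

```python
# Hypothetical conversion sketch: a TensorFlow install and a local TF-Slim
# checkpoint are assumed; the path is a placeholder, not a real file.
from transformers import MobileNetV1Config, MobileNetV1ForImageClassification
from transformers.models.mobilenet_v1.modeling_mobilenet_v1 import load_tf_weights_in_mobilenet_v1

config = MobileNetV1Config(num_labels=1001)  # 1000 ImageNet classes + background
model = MobileNetV1ForImageClassification(config)
model = load_tf_weights_in_mobilenet_v1(model, config, "/path/to/mobilenet_v1_1.0_224.ckpt")
```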
def apply_tf_padding(features: torch.Tensor, conv_layer: nn.Conv2d) -> torch.Tensor:
"""
Apply TensorFlow-style "SAME" padding to a convolution layer. See the notes at:
https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2
"""
in_height, in_width = features.shape[-2:]
stride_height, stride_width = conv_layer.stride
kernel_height, kernel_width = conv_layer.kernel_size
if in_height % stride_height == 0:
pad_along_height = max(kernel_height - stride_height, 0)
else:
pad_along_height = max(kernel_height - (in_height % stride_height), 0)
if in_width % stride_width == 0:
pad_along_width = max(kernel_width - stride_width, 0)
else:
pad_along_width = max(kernel_width - (in_width % stride_width), 0)
pad_left = pad_along_width // 2
pad_right = pad_along_width - pad_left
pad_top = pad_along_height // 2
pad_bottom = pad_along_height - pad_top
padding = (pad_left, pad_right, pad_top, pad_bottom)
return nn.functional.pad(features, padding, "constant", 0.0)
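Worked numbers for the rule above: a 224x224 input through a 3x3, stride-2 convolution needs one total pixel of padding per axis, split asymmetrically. PyTorch's symmetric `padding=` argument cannot express this, which is why the conv layer below constructs its `nn.Conv2d` with `padding=0` and pads functionally:

```python
# Worked example of the "SAME" rule: 224x224 input, 3x3 kernel, stride 2.
in_size, stride, kernel = 224, 2, 3
if in_size % stride == 0:
    pad_along = max(kernel - stride, 0)
else:
    pad_along = max(kernel - (in_size % stride), 0)
pad_before = pad_along // 2          # 0
pad_after = pad_along - pad_before   # 1
print(pad_before, pad_after)         # 0 1 -> nn.functional.pad(x, (0, 1, 0, 1))
```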
class MobileNetV1ConvLayer(nn.Module):
def __init__(
self,
config: MobileNetV1Config,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: Optional[int] = 1,
groups: Optional[int] = 1,
bias: bool = False,
use_normalization: Optional[bool] = True,
use_activation: Union[bool, str] = True,
) -> None:
super().__init__()
self.config = config
if in_channels % groups != 0:
raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
if out_channels % groups != 0:
raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")
padding = 0 if config.tf_padding else int((kernel_size - 1) / 2)
self.convolution = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
bias=bias,
padding_mode="zeros",
)
if use_normalization:
self.normalization = nn.BatchNorm2d(
num_features=out_channels,
eps=config.layer_norm_eps,
momentum=0.9997,
affine=True,
track_running_stats=True,
)
else:
self.normalization = None
if use_activation:
if isinstance(use_activation, str):
self.activation = ACT2FN[use_activation]
elif isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
else:
self.activation = None
def forward(self, features: torch.Tensor) -> torch.Tensor:
if self.config.tf_padding:
features = apply_tf_padding(features, self.convolution)
features = self.convolution(features)
if self.normalization is not None:
features = self.normalization(features)
if self.activation is not None:
features = self.activation(features)
return features
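`MobileNetV1Model` stacks these layers in depthwise/pointwise pairs: a 3x3 convolution with `groups=in_channels` followed by a 1x1 projection. The same pair expressed with bare `nn.Conv2d` (normalization and activation omitted for brevity):

```python
# Sketch: the depthwise-separable pair the model builds from MobileNetV1ConvLayer.
import torch
from torch import nn

depthwise = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=64, bias=False)
pointwise = nn.Conv2d(64, 128, kernel_size=1, bias=False)
x = torch.randn(1, 64, 56, 56)
print(pointwise(depthwise(x)).shape)  # torch.Size([1, 128, 56, 56])
```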
@auto_docstring
class MobileNetV1PreTrainedModel(PreTrainedModel):
config: MobileNetV1Config
load_tf_weights = load_tf_weights_in_mobilenet_v1
base_model_prefix = "mobilenet_v1"
main_input_name = "pixel_values"
supports_gradient_checkpointing = False
_no_split_modules = []
def _init_weights(self, module: nn.Module) -> None:
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.BatchNorm2d):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@auto_docstring
class MobileNetV1Model(MobileNetV1PreTrainedModel):
def __init__(self, config: MobileNetV1Config, add_pooling_layer: bool = True):
r"""
add_pooling_layer (bool, *optional*, defaults to `True`):
Whether to add a pooling layer
"""
super().__init__(config)
self.config = config
depth = 32
out_channels = max(int(depth * config.depth_multiplier), config.min_depth)
self.conv_stem = MobileNetV1ConvLayer(
config,
in_channels=config.num_channels,
out_channels=out_channels,
kernel_size=3,
stride=2,
)
strides = [1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1]
self.layer = nn.ModuleList()
for i in range(13):
in_channels = out_channels
if strides[i] == 2 or i == 0:
depth *= 2
out_channels = max(int(depth * config.depth_multiplier), config.min_depth)
self.layer.append(
MobileNetV1ConvLayer(
config,
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=strides[i],
groups=in_channels,
)
)
self.layer.append(
MobileNetV1ConvLayer(
config,
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
)
)
self.pooler = nn.AdaptiveAvgPool2d((1, 1)) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.conv_stem(pixel_values)
all_hidden_states = () if output_hidden_states else None
for layer_module in self.layer:
hidden_states = layer_module(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
last_hidden_state = hidden_states
if self.pooler is not None:
pooled_output = torch.flatten(self.pooler(last_hidden_state), start_dim=1)
else:
pooled_output = None
if not return_dict:
return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None)
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=all_hidden_states,
)
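A shape check for the default 1.0/224 configuration: the stem and four of the thirteen blocks have stride 2, so 224x224 inputs are reduced 32x spatially and end at 1024 channels:

```python
# Shape check with a randomly initialized model and dummy input.
import torch
from transformers import MobileNetV1Config, MobileNetV1Model

model = MobileNetV1Model(MobileNetV1Config()).eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.last_hidden_state.shape)  # torch.Size([1, 1024, 7, 7])
print(out.pooler_output.shape)      # torch.Size([1, 1024])
```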
@auto_docstring(
custom_intro="""
MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
"""
)
class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel):
def __init__(self, config: MobileNetV1Config) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.mobilenet_v1 = MobileNetV1Model(config)
last_hidden_size = self.mobilenet_v1.layer[-1].convolution.out_channels
# Classifier head
self.dropout = nn.Dropout(config.classifier_dropout_prob, inplace=True)
self.classifier = nn.Linear(last_hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
# Initialize weights and apply final processing
self.post_init()
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilenet_v1(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(self.dropout(pooled_output))
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
)
__all__ = [
"MobileNetV1ForImageClassification",
"MobileNetV1Model",
"MobileNetV1PreTrainedModel",
"load_tf_weights_in_mobilenet_v1",
]
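An end-to-end sketch with the checkpoint named in the configuration docstring; downloading the weights and the torch/vision backends are assumed, and the blank PIL image is just a stand-in for a real photo:

```python
# End-to-end classification sketch using google/mobilenet_v1_1.0_224.
import torch
from PIL import Image
from transformers import AutoImageProcessor, MobileNetV1ForImageClassification

processor = AutoImageProcessor.from_pretrained("google/mobilenet_v1_1.0_224")
model = MobileNetV1ForImageClassification.from_pretrained("google/mobilenet_v1_1.0_224")

image = Image.new("RGB", (640, 480))  # stand-in for a real image
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```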