# coding=utf-8 # Copyright 2024 the Fast authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch TextNet model.""" from typing import Any, Optional, Union import torch import torch.nn as nn from torch import Tensor from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import PreTrainedModel from transformers.activations import ACT2CLS from transformers.modeling_outputs import ( BackboneOutput, BaseModelOutputWithNoAttention, BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) from transformers.models.textnet.configuration_textnet import TextNetConfig from transformers.utils import logging from transformers.utils.backbone_utils import BackboneMixin from ...utils import auto_docstring logger = logging.get_logger(__name__) class TextNetConvLayer(nn.Module): def __init__(self, config: TextNetConfig): super().__init__() self.kernel_size = config.stem_kernel_size self.stride = config.stem_stride self.activation_function = config.stem_act_func padding = ( (config.kernel_size[0] // 2, config.kernel_size[1] // 2) if isinstance(config.stem_kernel_size, tuple) else config.stem_kernel_size // 2 ) self.conv = nn.Conv2d( config.stem_num_channels, config.stem_out_channels, kernel_size=config.stem_kernel_size, stride=config.stem_stride, padding=padding, bias=False, ) self.batch_norm = nn.BatchNorm2d(config.stem_out_channels, config.batch_norm_eps) self.activation = nn.Identity() if self.activation_function is not None: self.activation = ACT2CLS[self.activation_function]() def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.conv(hidden_states) hidden_states = self.batch_norm(hidden_states) return self.activation(hidden_states) class TextNetRepConvLayer(nn.Module): r""" This layer supports re-parameterization by combining multiple convolutional branches (e.g., main convolution, vertical, horizontal, and identity branches) during training. At inference time, these branches can be collapsed into a single convolution for efficiency, as per the re-parameterization paradigm. The "Rep" in the name stands for "re-parameterization" (introduced by RepVGG). """ def __init__(self, config: TextNetConfig, in_channels: int, out_channels: int, kernel_size: int, stride: int): super().__init__() self.num_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride padding = ((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2) self.activation_function = nn.ReLU() self.main_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=False, ) self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps) vertical_padding = ((kernel_size[0] - 1) // 2, 0) horizontal_padding = (0, (kernel_size[1] - 1) // 2) if kernel_size[1] != 1: self.vertical_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(kernel_size[0], 1), stride=stride, padding=vertical_padding, bias=False, ) self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps) else: self.vertical_conv, self.vertical_batch_norm = None, None if kernel_size[0] != 1: self.horizontal_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_channels, kernel_size=(1, kernel_size[1]), stride=stride, padding=horizontal_padding, bias=False, ) self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps) else: self.horizontal_conv, self.horizontal_batch_norm = None, None self.rbr_identity = ( nn.BatchNorm2d(num_features=in_channels, eps=config.batch_norm_eps) if out_channels == in_channels and stride == 1 else None ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: main_outputs = self.main_conv(hidden_states) main_outputs = self.main_batch_norm(main_outputs) # applies a convolution with a vertical kernel if self.vertical_conv is not None: vertical_outputs = self.vertical_conv(hidden_states) vertical_outputs = self.vertical_batch_norm(vertical_outputs) main_outputs = main_outputs + vertical_outputs # applies a convolution with a horizontal kernel if self.horizontal_conv is not None: horizontal_outputs = self.horizontal_conv(hidden_states) horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs) main_outputs = main_outputs + horizontal_outputs if self.rbr_identity is not None: id_out = self.rbr_identity(hidden_states) main_outputs = main_outputs + id_out return self.activation_function(main_outputs) class TextNetStage(nn.Module): def __init__(self, config: TextNetConfig, depth: int): super().__init__() kernel_size = config.conv_layer_kernel_sizes[depth] stride = config.conv_layer_strides[depth] num_layers = len(kernel_size) stage_in_channel_size = config.hidden_sizes[depth] stage_out_channel_size = config.hidden_sizes[depth + 1] in_channels = [stage_in_channel_size] + [stage_out_channel_size] * (num_layers - 1) out_channels = [stage_out_channel_size] * num_layers stage = [] for stage_config in zip(in_channels, out_channels, kernel_size, stride): stage.append(TextNetRepConvLayer(config, *stage_config)) self.stage = nn.ModuleList(stage) def forward(self, hidden_state): for block in self.stage: hidden_state = block(hidden_state) return hidden_state class TextNetEncoder(nn.Module): def __init__(self, config: TextNetConfig): super().__init__() stages = [] num_stages = len(config.conv_layer_kernel_sizes) for stage_ix in range(num_stages): stages.append(TextNetStage(config, stage_ix)) self.stages = nn.ModuleList(stages) def forward( self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> BaseModelOutputWithNoAttention: hidden_states = [hidden_state] for stage in self.stages: hidden_state = stage(hidden_state) hidden_states.append(hidden_state) if not return_dict: output = (hidden_state,) return output + (hidden_states,) if output_hidden_states else output return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states) @auto_docstring class TextNetPreTrainedModel(PreTrainedModel): config: TextNetConfig base_model_prefix = "textnet" main_input_name = "pixel_values" def _init_weights(self, module): if isinstance(module, (nn.Linear, nn.Conv2d)): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.BatchNorm2d): module.weight.data.fill_(1.0) if module.bias is not None: module.bias.data.zero_() @auto_docstring class TextNetModel(TextNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.stem = TextNetConvLayer(config) self.encoder = TextNetEncoder(config) self.pooler = nn.AdaptiveAvgPool2d((2, 2)) self.post_init() @auto_docstring def forward( self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None ) -> Union[tuple[Any, list[Any]], tuple[Any], BaseModelOutputWithPoolingAndNoAttention]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) hidden_state = self.stem(pixel_values) encoder_outputs = self.encoder( hidden_state, output_hidden_states=output_hidden_states, return_dict=return_dict ) last_hidden_state = encoder_outputs[0] pooled_output = self.pooler(last_hidden_state) if not return_dict: output = (last_hidden_state, pooled_output) return output + (encoder_outputs[1],) if output_hidden_states else output return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs[1] if output_hidden_states else None, ) @auto_docstring( custom_intro=""" TextNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. """ ) class TextNetForImageClassification(TextNetPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.textnet = TextNetModel(config) self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) self.flatten = nn.Flatten() self.fc = nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity() # classification head self.classifier = nn.ModuleList([self.avg_pool, self.flatten]) # initialize weights and apply final processing self.post_init() @auto_docstring def forward( self, pixel_values: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> ImageClassifierOutputWithNoAttention: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). Examples: ```python >>> import torch >>> import requests >>> from transformers import TextNetForImageClassification, TextNetImageProcessor >>> from PIL import Image >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> processor = TextNetImageProcessor.from_pretrained("czczup/textnet-base") >>> model = TextNetForImageClassification.from_pretrained("czczup/textnet-base") >>> inputs = processor(images=image, return_tensors="pt") >>> with torch.no_grad(): ... outputs = model(**inputs) >>> outputs.logits.shape torch.Size([1, 2]) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.textnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) last_hidden_state = outputs[0] for layer in self.classifier: last_hidden_state = layer(last_hidden_state) logits = self.fc(last_hidden_state) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return (loss,) + output if loss is not None else output return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @auto_docstring( custom_intro=""" TextNet backbone, to be used with frameworks like DETR and MaskFormer. """ ) class TextNetBackbone(TextNetPreTrainedModel, BackboneMixin): def __init__(self, config): super().__init__(config) super()._init_backbone(config) self.textnet = TextNetModel(config) self.num_features = config.hidden_sizes # initialize weights and apply final processing self.post_init() @auto_docstring def forward( self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None ) -> Union[tuple[tuple], BackboneOutput]: r""" Examples: ```python >>> import torch >>> import requests >>> from PIL import Image >>> from transformers import AutoImageProcessor, AutoBackbone >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> processor = AutoImageProcessor.from_pretrained("czczup/textnet-base") >>> model = AutoBackbone.from_pretrained("czczup/textnet-base") >>> inputs = processor(image, return_tensors="pt") >>> with torch.no_grad(): >>> outputs = model(**inputs) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) outputs = self.textnet(pixel_values, output_hidden_states=True, return_dict=return_dict) hidden_states = outputs.hidden_states if return_dict else outputs[2] feature_maps = () for idx, stage in enumerate(self.stage_names): if stage in self.out_features: feature_maps += (hidden_states[idx],) if not return_dict: output = (feature_maps,) if output_hidden_states: hidden_states = outputs.hidden_states if return_dict else outputs[2] output += (hidden_states,) return output return BackboneOutput( feature_maps=feature_maps, hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=None, ) __all__ = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel", "TextNetForImageClassification"]