# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/eomt/modular_eomt.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_eomt.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections.abc
import math
from dataclasses import dataclass
from typing import Callable, Optional, Union
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from ...activations import ACT2FN
from ...file_utils import ModelOutput, is_scipy_available, requires_backends
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import auto_docstring, can_return_tuple, is_accelerate_available
from .configuration_eomt import EomtConfig
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
if is_accelerate_available():
from accelerate import PartialState
from accelerate.utils import reduce
@dataclass
@auto_docstring(
custom_intro="""
Class for outputs of [`EomtForUniversalSegmentation`].
This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
[`~EomtImageProcessor.post_process_instance_segmentation`] or
[`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please see
[`~EomtImageProcessor`] for details regarding usage.
"""
)
class EomtForUniversalSegmentationOutput(ModelOutput):
r"""
loss (`torch.Tensor`, *optional*):
The computed loss, returned when labels are present.
class_queries_logits (`torch.FloatTensor`):
A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
query. Note the `+ 1` is needed because we incorporate the null class.
masks_queries_logits (`torch.FloatTensor`):
A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
query.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Last hidden states (final token embeddings) of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the softmax from the self-attention layers, which process the query tokens together with the image tokens.
patch_offsets (`list[torch.Tensor]`, *optional*):
list of tuples indicating the image index and the start and end positions of patches for semantic segmentation.
"""
loss: Optional[torch.FloatTensor] = None
class_queries_logits: Optional[torch.FloatTensor] = None
masks_queries_logits: Optional[torch.FloatTensor] = None
last_hidden_state: Optional[torch.FloatTensor] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
patch_offsets: Optional[list[torch.Tensor]] = None
# Adapted from https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/point_features.py
def sample_point(
input_features: torch.Tensor, point_coordinates: torch.Tensor, add_dim=False, **kwargs
) -> torch.Tensor:
"""
A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.
Args:
input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
A tensor that contains features map on a height * width grid
point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width, 2)):
A tensor that contains [0, 1] * [0, 1] normalized point coordinates
add_dim (`bool`):
boolean value to keep track of added dimension
Returns:
point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
height_grid, width_grid)):
A tensor that contains features for points in `point_coordinates`.
"""
if point_coordinates.dim() == 3:
add_dim = True
point_coordinates = point_coordinates.unsqueeze(2)
# use nn.functional.grid_sample to get features for points in `point_coordinates` via bilinear interpolation
point_features = torch.nn.functional.grid_sample(input_features, 2.0 * point_coordinates - 1.0, **kwargs)
if add_dim:
point_features = point_features.squeeze(3)
return point_features
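# Illustrative usage sketch (added for documentation; not called by the model). Toy shapes only:
# `sample_point` maps [0, 1]-normalized coordinates to the [-1, 1] range expected by
# `torch.nn.functional.grid_sample` and squeezes the helper grid dimension it adds for 3D inputs.
def _example_sample_point() -> None:
    features = torch.rand(2, 8, 16, 16)  # (batch_size, channels, height, width)
    points = torch.rand(2, 32, 2)  # 32 normalized (x, y) points per image
    sampled = sample_point(features, points, align_corners=False)
    assert sampled.shape == (2, 8, 32)  # (batch_size, channels, num_points)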
def pair_wise_dice_loss(inputs: Tensor, labels: Tensor) -> Tensor:
"""
A pair wise version of the dice loss, see `dice_loss` for usage.
Args:
inputs (`torch.Tensor`):
A tensor representing a mask
labels (`torch.Tensor`):
A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
(0 for the negative class and 1 for the positive class).
Returns:
`torch.Tensor`: The computed loss between each pair.
"""
inputs = inputs.sigmoid().flatten(1)
numerator = 2 * torch.matmul(inputs, labels.T)
# using broadcasting to get a [num_queries, NUM_CLASSES] matrix
denominator = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :]
loss = 1 - (numerator + 1) / (denominator + 1)
return loss
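# Illustrative sketch (added for documentation; not called by the model): with Q query masks and T
# target masks sampled at the same P points, the pair-wise dice loss yields a (Q, T) cost matrix.
def _example_pair_wise_dice_loss() -> None:
    query_mask_logits = torch.randn(5, 100)  # (num_queries, num_points)
    target_masks = torch.randint(0, 2, (3, 100)).float()  # (num_targets, num_points), binary
    cost = pair_wise_dice_loss(query_mask_logits, target_masks)
    assert cost.shape == (5, 3)  # one dice cost per (query, target) pair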
def pair_wise_sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
r"""
A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.
Args:
inputs (`torch.Tensor`):
A tensor representing a mask.
labels (`torch.Tensor`):
A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
(0 for the negative class and 1 for the positive class).
Returns:
loss (`torch.Tensor`): The computed loss between each pair.
"""
height_and_width = inputs.shape[1]
criterion = nn.BCEWithLogitsLoss(reduction="none")
cross_entropy_loss_pos = criterion(inputs, torch.ones_like(inputs))
cross_entropy_loss_neg = criterion(inputs, torch.zeros_like(inputs))
loss_pos = torch.matmul(cross_entropy_loss_pos / height_and_width, labels.T)
loss_neg = torch.matmul(cross_entropy_loss_neg / height_and_width, (1 - labels).T)
loss = loss_pos + loss_neg
return loss
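# Illustrative sketch (added for documentation; not called by the model): the pair-wise BCE cost has
# the same (num_queries, num_targets) layout as the dice cost above; it is assembled from the
# per-point BCE against an all-ones and an all-zeros target, weighted by the binary target mask.
def _example_pair_wise_sigmoid_cross_entropy_loss() -> None:
    query_mask_logits = torch.randn(5, 100)
    target_masks = torch.randint(0, 2, (3, 100)).float()
    cost = pair_wise_sigmoid_cross_entropy_loss(query_mask_logits, target_masks)
    assert cost.shape == (5, 3)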
# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/matcher.py
class EomtHungarianMatcher(nn.Module):
"""This class computes an assignment between the labels and the predictions of the network.
For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
"""
def __init__(
self, cost_class: float = 1.0, cost_mask: float = 1.0, cost_dice: float = 1.0, num_points: int = 12544
):
"""Creates the matcher
Params:
cost_class (`float`, *optional*, defaults to 1.0):
Relative weight of the classification error in the matching cost.
cost_mask (`float`, *optional*, defaults to 1.0):
This is the relative weight of the focal loss of the binary mask in the matching cost.
cost_dice (`float`, *optional*, defaults to 1.0):
This is the relative weight of the dice loss of the binary mask in the matching cost.
num_points (`int`, *optional*, defaults to 12544):
No. of points to sample on which the mask loss will be calculated. The same set of K points are
uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
matching.
"""
super().__init__()
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
raise ValueError("All costs can't be 0")
self.num_points = num_points
self.cost_class = cost_class
self.cost_mask = cost_mask
self.cost_dice = cost_dice
@torch.no_grad()
def forward(
self,
masks_queries_logits: torch.Tensor,
class_queries_logits: torch.Tensor,
mask_labels: torch.Tensor,
class_labels: torch.Tensor,
) -> list[tuple[Tensor]]:
"""
Params:
masks_queries_logits (`torch.Tensor`):
A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
class_queries_logits (`torch.Tensor`):
A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
class_labels (`torch.Tensor`):
A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
target) containing the class labels.
mask_labels (`torch.Tensor`):
A tensor of dim `num_target_boxes, height, width` containing the target masks.
Returns:
matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected labels (in order)
For each batch element, it holds:
len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
"""
indices: list[tuple[np.array]] = []
# iterate through batch size
batch_size = masks_queries_logits.shape[0]
for i in range(batch_size):
pred_probs = class_queries_logits[i].softmax(-1)
pred_mask = masks_queries_logits[i]
# Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it with 1 - proba[target class]. The 1 is a constant that doesn't change the matching, so it can be omitted.
cost_class = -pred_probs[:, class_labels[i]]
target_mask = mask_labels[i].to(pred_mask)
target_mask = target_mask[:, None]
pred_mask = pred_mask[:, None]
# Sample ground truth and predicted masks
point_coordinates = torch.rand(1, self.num_points, 2, device=pred_mask.device)
target_coordinates = point_coordinates.repeat(target_mask.shape[0], 1, 1)
target_mask = sample_point(target_mask, target_coordinates, align_corners=False).squeeze(1)
pred_coordinates = point_coordinates.repeat(pred_mask.shape[0], 1, 1)
pred_mask = sample_point(pred_mask, pred_coordinates, align_corners=False).squeeze(1)
# compute the cross entropy loss between each pair of masks -> shape (num_queries, num_targets)
cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_mask, target_mask)
# Compute the dice loss between each pair of masks -> shape (num_queries, num_targets)
cost_dice = pair_wise_dice_loss(pred_mask, target_mask)
# final cost matrix
cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice
# eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible``
cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10))
cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10))
cost_matrix = torch.nan_to_num(cost_matrix, 0)
# do the assignment using the hungarian algorithm in scipy
assigned_indices: tuple[np.array] = linear_sum_assignment(cost_matrix.cpu())
indices.append(assigned_indices)
# It could be stacked in one tensor
matched_indices = [
(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices
]
return matched_indices
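# Illustrative sketch of a matcher call (added for documentation; not executed by the model and
# requires scipy). Shapes are toy values, not tied to any real configuration:
def _example_hungarian_matcher() -> None:
    matcher = EomtHungarianMatcher(cost_class=1.0, cost_mask=1.0, cost_dice=1.0, num_points=64)
    masks_queries_logits = torch.randn(2, 10, 32, 32)  # (batch_size, num_queries, height, width)
    class_queries_logits = torch.randn(2, 10, 5)  # (batch_size, num_queries, num_labels + 1)
    mask_labels = [
        torch.randint(0, 2, (3, 32, 32)).float(),
        torch.randint(0, 2, (1, 32, 32)).float(),
    ]
    class_labels = [torch.tensor([0, 2, 3]), torch.tensor([1])]
    matched = matcher(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
    # One (query_indices, target_indices) pair per image, each of length min(num_queries, num_targets).
    assert len(matched) == 2 and matched[0][0].shape == (3,) and matched[1][0].shape == (1,)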
def dice_loss(inputs: Tensor, labels: Tensor, num_masks: int) -> Tensor:
r"""
Compute the DICE loss, similar to generalized IoU for masks, defined as:
$$ \mathcal{L}_{\text{dice}}(x, y) = 1 - \frac{2 |x \cap y|}{|x| + |y| + 1} $$
In practice, since `labels` is a binary mask (only 0s and 1s), the dice loss can be computed as follows:
$$ \mathcal{L}_{\text{dice}}(x, y) = 1 - \frac{2 \sum_i x_i y_i + 1}{\sum_i x_i + \sum_i y_i + 1} $$
Args:
inputs (`torch.Tensor`):
A tensor representing a mask.
labels (`torch.Tensor`):
A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
(0 for the negative class and 1 for the positive class).
num_masks (`int`):
The number of masks present in the current batch, used for normalization.
Returns:
`torch.Tensor`: The computed loss.
"""
probs = inputs.sigmoid().flatten(1)
numerator = 2 * (probs * labels).sum(-1)
denominator = probs.sum(-1) + labels.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
loss = loss.sum() / num_masks
return loss
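# Illustrative sanity check (added for documentation; not called by the model): with large positive
# logits exactly where the target is 1 and large negative logits elsewhere, the dice loss is ~0.
def _example_dice_loss() -> None:
    labels = torch.randint(0, 2, (4, 100)).float()  # (num_masks, num_points)
    logits = (labels * 2 - 1) * 20.0  # +20 where the label is 1, -20 where it is 0
    assert dice_loss(logits, labels, num_masks=4).item() < 1e-2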
def sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor, num_masks: int) -> torch.Tensor:
r"""
Args:
inputs (`torch.Tensor`):
A float tensor of arbitrary shape.
labels (`torch.Tensor`):
A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
(0 for the negative class and 1 for the positive class).
Returns:
loss (`torch.Tensor`): The computed loss.
"""
criterion = nn.BCEWithLogitsLoss(reduction="none")
cross_entropy_loss = criterion(inputs, labels)
loss = cross_entropy_loss.mean(1).sum() / num_masks
return loss
# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/criterion.py
class EomtLoss(nn.Module):
def __init__(self, config: EomtConfig, weight_dict: dict[str, float]):
"""
The EoMT loss. The loss is computed very similarly to DETR. The process happens in two steps: 1) we
compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
of matched ground-truth / prediction (supervise class and mask)
Args:
config (`EomtConfig`):
The configuration of the EoMT model, which also contains the loss-calculation-specific parameters.
weight_dict (`dict[str, float]`):
A dictionary of weights to be applied to the different losses.
"""
super().__init__()
requires_backends(self, ["scipy"])
self.num_labels = config.num_labels
self.weight_dict = weight_dict
# Weight to apply to the null class
self.eos_coef = config.no_object_weight
empty_weight = torch.ones(self.num_labels + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer("empty_weight", empty_weight)
# pointwise mask loss parameters
self.num_points = config.train_num_points
self.oversample_ratio = config.oversample_ratio
self.importance_sample_ratio = config.importance_sample_ratio
self.matcher = EomtHungarianMatcher(
cost_class=config.class_weight,
cost_dice=config.dice_weight,
cost_mask=config.mask_weight,
num_points=self.num_points,
)
def _max_by_axis(self, sizes: list[list[int]]) -> list[int]:
maxes = sizes[0]
for sublist in sizes[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
# Adapted from nested_tensor_from_tensor_list() in original implementation
def _pad_images_to_max_in_batch(self, tensors: list[Tensor]) -> tuple[Tensor, Tensor]:
# get the maximum size in the batch
max_size = self._max_by_axis([list(tensor.shape) for tensor in tensors])
# compute final size
batch_shape = [len(tensors)] + max_size
batch_size, _, height, width = batch_shape
dtype = tensors[0].dtype
device = tensors[0].device
padded_tensors = torch.zeros(batch_shape, dtype=dtype, device=device)
padding_masks = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
# pad the tensors to the size of the biggest one
for tensor, padded_tensor, padding_mask in zip(tensors, padded_tensors, padding_masks):
padded_tensor[: tensor.shape[0], : tensor.shape[1], : tensor.shape[2]].copy_(tensor)
padding_mask[: tensor.shape[1], : tensor.shape[2]] = False
return padded_tensors, padding_masks
def loss_labels(
self, class_queries_logits: Tensor, class_labels: list[Tensor], indices: tuple[np.array]
) -> dict[str, Tensor]:
"""Compute the losses related to the labels using cross entropy.
Args:
class_queries_logits (`torch.Tensor`):
A tensor of shape `batch_size, num_queries, num_labels`
class_labels (`list[torch.Tensor]`):
List of class labels of shape `(labels)`.
indices (`tuple[np.array]`):
The indices computed by the Hungarian matcher.
Returns:
`dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
- **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
"""
pred_logits = class_queries_logits
batch_size, num_queries, _ = pred_logits.shape
criterion = nn.CrossEntropyLoss(weight=self.empty_weight)
idx = self._get_predictions_permutation_indices(indices) # shape of (batch_size, num_queries)
target_classes_o = torch.cat(
[target[j] for target, (_, j) in zip(class_labels, indices)]
) # shape of (batch_size, num_queries)
target_classes = torch.full(
(batch_size, num_queries), fill_value=self.num_labels, dtype=torch.int64, device=pred_logits.device
)
target_classes[idx] = target_classes_o
# Transpose pred_logits (batch_size, num_queries, num_labels) -> (batch_size, num_labels, num_queries) as expected by CrossEntropyLoss
pred_logits_transposed = pred_logits.transpose(1, 2)
loss_ce = criterion(pred_logits_transposed, target_classes)
losses = {"loss_cross_entropy": loss_ce}
return losses
def loss_masks(
self,
masks_queries_logits: torch.Tensor,
mask_labels: list[torch.Tensor],
indices: tuple[np.array],
num_masks: int,
) -> dict[str, torch.Tensor]:
"""Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.
Args:
masks_queries_logits (`torch.Tensor`):
A tensor of shape `(batch_size, num_queries, height, width)`.
mask_labels (`torch.Tensor`):
List of mask labels of shape `(labels, height, width)`.
indices (`tuple[np.array]`):
The indices computed by the Hungarian matcher.
num_masks (`int`):
The number of masks, used for normalization.
Returns:
losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
- **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth
masks.
- **loss_dice** -- The loss computed using dice loss on the predicted and ground truth
masks.
"""
src_idx = self._get_predictions_permutation_indices(indices)
tgt_idx = self._get_targets_permutation_indices(indices)
# shape (batch_size * num_queries, height, width)
pred_masks = masks_queries_logits[src_idx]
# shape (batch_size, num_queries, height, width)
# pad all and stack the targets to the num_labels dimension
target_masks, _ = self._pad_images_to_max_in_batch(mask_labels)
target_masks = target_masks[tgt_idx]
# No need to upsample predictions as we are using normalized coordinates
pred_masks = pred_masks[:, None]
target_masks = target_masks[:, None]
# Sample point coordinates
with torch.no_grad():
point_coordinates = self.sample_points_using_uncertainty(
pred_masks,
lambda logits: self.calculate_uncertainty(logits),
self.num_points,
self.oversample_ratio,
self.importance_sample_ratio,
)
point_labels = sample_point(target_masks, point_coordinates, align_corners=False).squeeze(1)
point_logits = sample_point(pred_masks, point_coordinates, align_corners=False).squeeze(1)
losses = {
"loss_mask": sigmoid_cross_entropy_loss(point_logits, point_labels, num_masks),
"loss_dice": dice_loss(point_logits, point_labels, num_masks),
}
del pred_masks
del target_masks
return losses
def _get_predictions_permutation_indices(self, indices):
# Permute predictions following indices
batch_indices = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
predictions_indices = torch.cat([src for (src, _) in indices])
return batch_indices, predictions_indices
def _get_targets_permutation_indices(self, indices):
# Permute labels following indices
batch_indices = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
target_indices = torch.cat([tgt for (_, tgt) in indices])
return batch_indices, target_indices
def calculate_uncertainty(self, logits: torch.Tensor) -> torch.Tensor:
"""
In the EoMT paper, uncertainty is estimated as the L1 distance between 0.0 and the logit prediction in
`logits` for the foreground class.
Args:
logits (`torch.Tensor`):
A tensor of shape (R, 1, ...), where R is the total number of predicted masks across all images in the
batch. The values are mask logits.
Returns:
scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
uncertain locations having the highest uncertainty score.
"""
uncertainty_scores = -(torch.abs(logits))
return uncertainty_scores
def sample_points_using_uncertainty(
self,
logits: torch.Tensor,
uncertainty_function,
num_points: int,
oversample_ratio: int,
importance_sample_ratio: float,
) -> torch.Tensor:
"""
This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
prediction as input.
Args:
logits (`torch.Tensor`):
Logit predictions for P points.
uncertainty_function:
A function that takes logit predictions for P points and returns their uncertainties.
num_points (`int`):
The number of points P to sample.
oversample_ratio (`int`):
Oversampling parameter.
importance_sample_ratio (`float`):
Ratio of points that are sampled via importance sampling.
Returns:
point_coordinates (`torch.Tensor`):
Coordinates for P sampled points.
"""
num_boxes = logits.shape[0]
num_points_sampled = int(num_points * oversample_ratio)
# Get random point coordinates
point_coordinates = torch.rand(num_boxes, num_points_sampled, 2, device=logits.device)
# Get sampled prediction value for the point coordinates
point_logits = sample_point(logits, point_coordinates, align_corners=False)
# Calculate the uncertainties based on the sampled prediction values of the points
point_uncertainties = uncertainty_function(point_logits)
num_uncertain_points = int(importance_sample_ratio * num_points)
num_random_points = num_points - num_uncertain_points
idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
shift = num_points_sampled * torch.arange(num_boxes, dtype=torch.long, device=logits.device)
idx += shift[:, None]
point_coordinates = point_coordinates.view(-1, 2)[idx.view(-1), :].view(num_boxes, num_uncertain_points, 2)
if num_random_points > 0:
point_coordinates = torch.cat(
[point_coordinates, torch.rand(num_boxes, num_random_points, 2, device=logits.device)],
dim=1,
)
return point_coordinates
def forward(
self,
masks_queries_logits: torch.Tensor,
class_queries_logits: torch.Tensor,
mask_labels: list[torch.Tensor],
class_labels: list[torch.Tensor],
auxiliary_predictions: Optional[dict[str, torch.Tensor]] = None,
) -> dict[str, torch.Tensor]:
"""
This performs the loss computation.
Args:
masks_queries_logits (`torch.Tensor`):
A tensor of shape `(batch_size, num_queries, height, width)`.
class_queries_logits (`torch.Tensor`):
A tensor of shape `(batch_size, num_queries, num_labels)`.
mask_labels (`torch.Tensor`):
List of mask labels of shape `(labels, height, width)`.
class_labels (`list[torch.Tensor]`):
List of class labels of shape `(labels)`.
auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
the inner layers of the EomtMaskedAttentionDecoder.
Returns:
losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
- **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
- **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
masks.
- **loss_dice** -- The loss computed using dice loss on the predicted and ground truth
masks.
if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains an additional
set of losses for each auxiliary prediction.
"""
# retrieve the matching between the outputs of the last layer and the labels
indices = self.matcher(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
# compute the average number of target masks for normalization purposes
num_masks = self.get_num_masks(class_labels, device=class_labels[0].device)
# get all the losses
losses: dict[str, Tensor] = {
**self.loss_masks(masks_queries_logits, mask_labels, indices, num_masks),
**self.loss_labels(class_queries_logits, class_labels, indices),
}
# in case of auxiliary losses, we repeat this process with the output of each intermediate layer.
if auxiliary_predictions is not None:
for idx, aux_outputs in enumerate(auxiliary_predictions):
masks_queries_logits = aux_outputs["masks_queries_logits"]
class_queries_logits = aux_outputs["class_queries_logits"]
loss_dict = self.forward(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
loss_dict = {f"{key}_{idx}": value for key, value in loss_dict.items()}
losses.update(loss_dict)
return losses
def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor:
"""
Computes the average number of target masks across the batch, for normalization purposes.
"""
num_masks = sum([len(classes) for classes in class_labels])
num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device)
world_size = 1
if is_accelerate_available():
if PartialState._shared_state != {}:
num_masks = reduce(num_masks)
world_size = PartialState().num_processes
num_masks = torch.clamp(num_masks / world_size, min=1)
return num_masks
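# Illustrative sketch of the uncertainty heuristic used by `EomtLoss.sample_points_using_uncertainty`
# (added for documentation; not called by the model): uncertainty is -|logit|, so points whose mask
# logits sit close to the decision boundary (logit ~ 0) score highest and are kept by the top-k step,
# while the remaining points are drawn uniformly at random.
def _example_point_uncertainty() -> None:
    point_logits = torch.tensor([[-4.0, -0.1, 0.05, 3.0]])
    uncertainty = -point_logits.abs()  # mirrors `calculate_uncertainty`
    most_uncertain = torch.topk(uncertainty, k=2, dim=1).indices.sort().values
    assert most_uncertain.tolist() == [[1, 2]]  # the two logits closest to zero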
class EomtPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
f" Expected {self.num_channels} but got {num_channels}."
)
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
return embeddings
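# Illustrative sketch (added for documentation; not called by the model): the patch projection is a
# strided convolution, so an image of size (H, W) with patch size p yields (H // p) * (W // p) tokens.
# The config below is a minimal stand-in exposing only the attributes this module reads, not a real
# `EomtConfig`.
def _example_patch_embeddings() -> None:
    from types import SimpleNamespace
    toy_config = SimpleNamespace(image_size=64, patch_size=16, num_channels=3, hidden_size=32)
    patch_embeddings = EomtPatchEmbeddings(toy_config)
    tokens = patch_embeddings(torch.rand(2, 3, 64, 64))
    assert tokens.shape == (2, (64 // 16) ** 2, 32)  # (batch_size, num_patches, hidden_size)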
class EomtEmbeddings(nn.Module):
"""
Construct the CLS token, mask token, position and patch embeddings.
"""
def __init__(self, config: EomtConfig) -> None:
super().__init__()
self.config = config
self.patch_size = config.patch_size
self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
self.patch_embeddings = EomtPatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.num_prefix_tokens = 1 + config.num_register_tokens # 1 for [CLS]
self.position_embeddings = nn.Embedding(num_patches, config.hidden_size)
self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, _, _, _ = pixel_values.shape
target_dtype = self.patch_embeddings.projection.weight.dtype
embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
register_tokens = self.register_tokens.expand(batch_size, -1, -1)
embeddings = embeddings + self.position_embeddings(self.position_ids)
embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1)
embeddings = self.dropout(embeddings)
return embeddings
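# Illustrative sketch (added for documentation; not called by the model): the embeddings prepend one
# [CLS] token and `num_register_tokens` register tokens, so the sequence length becomes
# 1 + num_register_tokens + num_patches. Again a minimal stand-in config, not a real `EomtConfig`.
def _example_embeddings() -> None:
    from types import SimpleNamespace
    toy_config = SimpleNamespace(
        image_size=64, patch_size=16, num_channels=3, hidden_size=32,
        num_register_tokens=4, hidden_dropout_prob=0.0,
    )
    embeddings = EomtEmbeddings(toy_config)
    hidden_states = embeddings(torch.rand(2, 3, 64, 64))
    assert hidden_states.shape == (2, 1 + 4 + (64 // 16) ** 2, 32)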
def eager_attention_forward(
module: nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.Tensor],
scaling: float,
dropout: float = 0.0,
**kwargs,
):
attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
attn_output = torch.matmul(attn_weights, value)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output, attn_weights
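# Illustrative sketch (added for documentation; not called by the model): eager attention over toy
# shapes. Inputs are laid out as (batch_size, num_heads, seq_len, head_dim); the output comes back
# transposed to (batch_size, seq_len, num_heads, head_dim) and is reshaped by the caller.
def _example_eager_attention() -> None:
    query = torch.rand(2, 4, 10, 8)
    key = torch.rand(2, 4, 10, 8)
    value = torch.rand(2, 4, 10, 8)
    # Only `module.training` is read by `eager_attention_forward`, so any nn.Module works here.
    attn_output, attn_weights = eager_attention_forward(
        nn.Identity(), query, key, value, None, scaling=8**-0.5
    )
    assert attn_output.shape == (2, 10, 4, 8) and attn_weights.shape == (2, 4, 10, 10)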
class EomtAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.is_causal = False
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
**kwargs,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
batch_size, seq_length, embed_dim = hidden_states.shape
queries = self.q_proj(hidden_states)
keys = self.k_proj(hidden_states)
values = self.v_proj(hidden_states)
queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
queries,
keys,
values,
attention_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
)
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights
class EomtLayerScale(nn.Module):
def __init__(self, config) -> None:
super().__init__()
self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
return hidden_state * self.lambda1
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
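# Illustrative sketch (added for documentation; not called by the model): stochastic depth is the
# identity in eval mode; in training mode it zeroes whole samples at random and rescales survivors
# by 1 / keep_prob so the expected value is unchanged.
def _example_drop_path() -> None:
    hidden_states = torch.ones(8, 4)
    assert torch.equal(drop_path(hidden_states, drop_prob=0.5, training=False), hidden_states)
    dropped = drop_path(hidden_states, drop_prob=0.5, training=True)
    assert set(dropped.unique().tolist()) <= {0.0, 2.0}  # each row is either dropped or scaled by 2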
class EomtDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return f"p={self.drop_prob}"
class EomtMLP(nn.Module):
def __init__(self, config) -> None:
super().__init__()
in_features = out_features = config.hidden_size
hidden_features = int(config.hidden_size * config.mlp_ratio)
self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.fc1(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.fc2(hidden_state)
return hidden_state
class EomtSwiGLUFFN(nn.Module):
def __init__(self, config) -> None:
super().__init__()
in_features = out_features = config.hidden_size
hidden_features = int(config.hidden_size * config.mlp_ratio)
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.weights_in(hidden_state)
x1, x2 = hidden_state.chunk(2, dim=-1)
hidden = nn.functional.silu(x1) * x2
return self.weights_out(hidden)
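# Illustrative sketch (added for documentation; not called by the model): the SwiGLU feed-forward
# projects to twice the (rounded) intermediate width, splits the result in half, and gates one half
# with the SiLU of the other. Minimal stand-in config, not a real `EomtConfig`.
def _example_swiglu_ffn() -> None:
    from types import SimpleNamespace
    ffn = EomtSwiGLUFFN(SimpleNamespace(hidden_size=32, mlp_ratio=4))
    assert ffn(torch.rand(2, 10, 32)).shape == (2, 10, 32)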
class EomtLayer(GradientCheckpointingLayer):
"""This corresponds to the Block class in the original implementation."""
def __init__(self, config: EomtConfig) -> None:
super().__init__()
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.attention = EomtAttention(config)
self.layer_scale1 = EomtLayerScale(config)
self.drop_path = EomtDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
if config.use_swiglu_ffn:
self.mlp = EomtSwiGLUFFN(config)
else:
self.mlp = EomtMLP(config)
self.layer_scale2 = EomtLayerScale(config)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.norm1(hidden_states), # in Eomt, layernorm is applied before self-attention
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
attention_output = self.layer_scale1(attention_output)
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
# first residual connection
hidden_states = self.drop_path(attention_output) + hidden_states
# in Eomt, layernorm is also applied after self-attention
layer_output = self.norm2(hidden_states)
layer_output = self.mlp(layer_output)
layer_output = self.layer_scale2(layer_output)
# second residual connection
layer_output = self.drop_path(layer_output) + hidden_states
outputs = (layer_output,) + outputs
return outputs
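# Illustrative sketch (added for documentation; not called by the model): a single pre-norm block,
# i.e. hidden = hidden + drop_path(scale1(attn(norm1(hidden)))) followed by
# output = hidden + drop_path(scale2(mlp(norm2(hidden)))). Minimal stand-in config, not a real
# `EomtConfig`.
def _example_eomt_layer() -> None:
    from types import SimpleNamespace
    toy_config = SimpleNamespace(
        hidden_size=32, num_attention_heads=4, attention_dropout=0.0, _attn_implementation="eager",
        layer_norm_eps=1e-6, layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False,
        mlp_ratio=4, hidden_act="gelu",
    )
    layer = EomtLayer(toy_config)
    layer_output, attn_weights = layer(torch.rand(2, 10, 32), output_attentions=True)
    assert layer_output.shape == (2, 10, 32) and attn_weights.shape == (2, 4, 10, 10)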
class EomtLayerNorm2d(nn.LayerNorm):
def __init__(self, num_channels, eps=1e-6, affine=True):
super().__init__(num_channels, eps=eps, elementwise_affine=affine)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = hidden_state.permute(0, 2, 3, 1)
hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps)
hidden_state = hidden_state.permute(0, 3, 1, 2)
return hidden_state
class EomtScaleLayer(nn.Module):
def __init__(self, config: EomtConfig):
super().__init__()
hidden_size = config.hidden_size
self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2)
self.activation = ACT2FN[config.hidden_act]
self.conv2 = nn.Conv2d(
hidden_size,
hidden_size,
kernel_size=3,
padding=1,
groups=hidden_size,
bias=False,
)
self.layernorm2d = EomtLayerNorm2d(hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.conv1(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = self.layernorm2d(hidden_states)
return hidden_states
class EomtScaleBlock(nn.Module):
def __init__(self, config: EomtConfig):
super().__init__()
self.num_blocks = config.num_upscale_blocks
self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)])
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
for block in self.block:
hidden_states = block(hidden_states)
return hidden_states
class EomtMaskHead(nn.Module):
def __init__(self, config: EomtConfig):
super().__init__()
hidden_size = config.hidden_size
self.fc1 = nn.Linear(hidden_size, hidden_size)
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.fc3 = nn.Linear(hidden_size, hidden_size)
self.activation = ACT2FN[config.hidden_act]
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.activation(self.fc1(hidden_states))
hidden_states = self.activation(self.fc2(hidden_states))
hidden_states = self.fc3(hidden_states)
return hidden_states
@auto_docstring
class EomtPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: EomtConfig
base_model_prefix = "eomt"
main_input_name = "pixel_values"
supports_gradient_checkpointing = False
_no_split_modules = ["EomtLayer"]
_supports_sdpa = True
_supports_flash_attn = True
def _init_weights(self, module: nn.Module) -> None:
std = self.config.initializer_range
if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
if module.bias is not None:
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
nn.init.uniform_(module.bias, -bound, bound)
elif isinstance(module, nn.LayerNorm):
module.weight.data.fill_(1.0)
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=1)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, EomtLayerScale):
if hasattr(module, "lambda1"):
module.lambda1.data.fill_(self.config.layerscale_value)
elif isinstance(module, EomtEmbeddings):
module.cls_token.data = nn.init.trunc_normal_(
module.cls_token.data.to(torch.float32), mean=0.0, std=std
).to(module.cls_token.dtype)
module.register_tokens.data.zero_()
@auto_docstring(
custom_intro="""
The EoMT Model with head on top for instance/semantic/panoptic segmentation.
"""
)
class EomtForUniversalSegmentation(EomtPreTrainedModel):
main_input_name = "pixel_values"
def __init__(self, config: EomtConfig) -> None:
super().__init__(config)
self.config = config
self.num_hidden_layers = config.num_hidden_layers
self.embeddings = EomtEmbeddings(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.query = nn.Embedding(config.num_queries, config.hidden_size)
self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)])
self.upscale_block = EomtScaleBlock(config)
self.mask_head = EomtMaskHead(config)
self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1)
self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)
self.weight_dict: dict[str, float] = {
"loss_cross_entropy": config.class_weight,
"loss_mask": config.mask_weight,
"loss_dice": config.dice_weight,
}
self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict)
self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks))
self.post_init()
def get_loss_dict(
self,
masks_queries_logits: Tensor,
class_queries_logits: Tensor,
mask_labels: Tensor,
class_labels: Tensor,
auxiliary_predictions: dict[str, Tensor],
) -> dict[str, Tensor]:
loss_dict: dict[str, Tensor] = self.criterion(
masks_queries_logits=masks_queries_logits,
class_queries_logits=class_queries_logits,
mask_labels=mask_labels,
class_labels=class_labels,
auxiliary_predictions=auxiliary_predictions,
)
# weight each loss by `self.weight_dict[<LOSS_NAME>]` including auxiliary losses
for key, weight in self.weight_dict.items():
for loss_key, loss in loss_dict.items():
if key in loss_key:
loss *= weight
return loss_dict
def get_loss(self, loss_dict: dict[str, Tensor]) -> Tensor:
return sum(loss_dict.values())
@auto_docstring
@can_return_tuple
def forward(
self,
pixel_values: Tensor,
mask_labels: Optional[list[Tensor]] = None,
class_labels: Optional[list[Tensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
patch_offsets: Optional[list[Tensor]] = None,
) -> EomtForUniversalSegmentationOutput:
r"""
mask_labels (`list[torch.Tensor]`, *optional*):
list of mask labels of shape `(num_labels, height, width)` to be fed to a model
class_labels (`list[torch.LongTensor]`, *optional*):
list of target class labels of shape `(num_labels,)` to be fed to a model. They identify the
labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` is `class_labels[i][j]`.
patch_offsets (`list[torch.Tensor]`, *optional*):
list of tuples indicating the image index and the start and end positions of patches for semantic segmentation.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
masks_queries_logits_per_layer, class_queries_logits_per_layer = (), ()
attention_mask = None
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
hidden_states = self.embeddings(pixel_values)
for idx, layer_module in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
if idx == self.num_hidden_layers - self.config.num_blocks:
query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device)
hidden_states = torch.cat((query, hidden_states), dim=1)
if idx >= self.num_hidden_layers - self.config.num_blocks and (
self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0
):
norm_hidden_states = self.layernorm(hidden_states)
masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states)
masks_queries_logits_per_layer += (masks_queries_logits,)
class_queries_logits_per_layer += (class_queries_logits,)
attention_mask = torch.ones(
hidden_states.shape[0],
hidden_states.shape[1],
hidden_states.shape[1],
device=hidden_states.device,
dtype=torch.bool,
)
interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear")
interpolated_logits = interpolated_logits.view(
interpolated_logits.size(0), interpolated_logits.size(1), -1
)
num_query_tokens = self.config.num_queries
encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens
# Set attention mask for queries to focus on encoder tokens based on interpolated logits
attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0
# Disable attention mask for random query tokens.
attention_mask = self._disable_attention_mask(
attention_mask,
prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks],
num_query_tokens=num_query_tokens,
encoder_start_tokens=encoder_start_tokens,
device=attention_mask.device,
)
# Expand attention mask to 4d mask.
attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1)
attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9)
layer_outputs = layer_module(hidden_states, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions += (layer_outputs[1],)
sequence_output = self.layernorm(hidden_states)
if output_hidden_states:
all_hidden_states += (sequence_output,)
masks_queries_logits, class_queries_logits = self.predict(sequence_output)
masks_queries_logits_per_layer += (masks_queries_logits,)
class_queries_logits_per_layer += (class_queries_logits,)
loss = None
if mask_labels is not None and class_labels is not None:
loss = 0.0
for masks_queries_logits, class_queries_logits in zip(
masks_queries_logits_per_layer, class_queries_logits_per_layer
):
loss_dict = self.get_loss_dict(
masks_queries_logits=masks_queries_logits,
class_queries_logits=class_queries_logits,
mask_labels=mask_labels,
class_labels=class_labels,
auxiliary_predictions=None,
)
loss += self.get_loss(loss_dict)
return EomtForUniversalSegmentationOutput(
loss=loss,
masks_queries_logits=masks_queries_logits,
class_queries_logits=class_queries_logits,
last_hidden_state=sequence_output,
hidden_states=all_hidden_states,
attentions=all_attentions,
patch_offsets=patch_offsets,
)
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
def predict(self, logits: torch.Tensor):
query_tokens = logits[:, : self.config.num_queries, :]
class_logits = self.class_predictor(query_tokens)
prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :]
prefix_tokens = prefix_tokens.transpose(1, 2)
prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size)
query_tokens = self.mask_head(query_tokens)
prefix_tokens = self.upscale_block(prefix_tokens)
mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens)
return mask_logits, class_logits
@staticmethod
def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device):
if prob < 1:
# Generate random queries to disable based on the probs
random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob
# Disable attention to the query tokens, considering the prefix tokens
attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1
return attn_mask
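# Illustrative sketch (added for documentation; not called by the model) of how the boolean mask
# built in `forward` becomes an additive attention bias: positions a query may attend to keep a
# uniform finite value, everything else is pushed to -1e9 before the softmax. In the model, rows
# belonging to the query tokens are restricted to patch positions whose interpolated mask logits are
# positive, and `_disable_attention_mask` re-opens full attention for a random subset of queries
# according to `attn_mask_probs` (mask annealing).
def _example_attention_mask_bias() -> None:
    bool_mask = torch.tensor([[True, False, True]])
    additive = bool_mask.float().masked_fill(~bool_mask, -1e9)
    assert additive[0, 1].item() == -1e9 and additive[0, 0].item() == 1.0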
__all__ = ["EomtPreTrainedModel", "EomtForUniversalSegmentation"]
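# Minimal end-to-end sketch (added for documentation; not executed on import). It builds a tiny,
# randomly initialized model rather than loading a checkpoint, and assumes the `EomtConfig`
# constructor accepts the attribute names used throughout this file; scipy must be installed because
# the loss module is instantiated in `__init__`.
def _example_tiny_forward_pass() -> None:
    config = EomtConfig(
        image_size=64, patch_size=16, hidden_size=32, num_hidden_layers=2,
        num_attention_heads=4, num_queries=8, num_blocks=1,
    )
    model = EomtForUniversalSegmentation(config).eval()
    with torch.no_grad():
        outputs = model(pixel_values=torch.rand(1, 3, 64, 64))
    # One class logit vector per query (including the null class) and one mask per query.
    assert outputs.class_queries_logits.shape == (1, config.num_queries, config.num_labels + 1)
    assert outputs.masks_queries_logits.shape[:2] == (1, config.num_queries)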