# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn

from ..image_transforms import center_to_corners_format
from ..utils import is_scipy_available
from .loss_for_object_detection import HungarianMatcher, ImageLoss, _set_aux_loss, generalized_box_iou


if is_scipy_available():
    from scipy.optimize import linear_sum_assignment

# Similar to the one used in `DeformableDetr` but we reduce with sum and normalize by num_boxes
# instead of mean.
def sigmoid_focal_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_boxes: int,
    alpha: float = 0.25,
    gamma: float = 2,
):
    """
    Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`):
            A tensor storing the binary classification label for each element in `inputs` (0 for the negative class
            and 1 for the positive class).
        num_boxes (`int`):
            The total number of boxes in the batch.
        alpha (`float`, *optional*, defaults to 0.25):
            Optional weighting factor in the range (0, 1) to balance positive vs. negative examples.
        gamma (`float`, *optional*, defaults to 2):
            Exponent of the modulating factor (1 - p_t) to balance easy vs. hard examples.

    Returns:
        Loss tensor
    """
    prob = inputs.sigmoid()
    ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # add modulating factor
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    return loss.sum() / num_boxes
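
# Illustrative sketch (not part of the library); the tensor names below are made up for the example.
# Since both alpha_t and (1 - p_t) ** gamma are at most 1, the focal reduction above can never exceed
# the equivalent plain BCE reduction, which a quick check like this would confirm:
#
#     dummy_logits = torch.randn(3, 5)
#     dummy_targets = torch.randint(0, 2, (3, 5)).float()
#     focal = sigmoid_focal_loss(dummy_logits, dummy_targets, num_boxes=3)
#     bce = nn.functional.binary_cross_entropy_with_logits(
#         dummy_logits, dummy_targets, reduction="sum"
#     ) / 3
#     assert focal <= bce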


class GroundingDinoHungarianMatcher(HungarianMatcher):
    @torch.no_grad()
    def forward(self, outputs, targets):
        """
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
                * "label_maps": Tuple of tensors of dim [num_classes, hidden_dim].
            targets (`list[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        batch_size, num_queries = outputs["logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, hidden_dim]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
        label_maps = outputs["label_maps"]

        # First take the label map for each class in each batch and then concatenate them
        label_maps = torch.cat([label_map[target["class_labels"]] for label_map, target in zip(label_maps, targets)])
        # Normalize label maps based on number of tokens per class
        label_maps = label_maps / label_maps.sum(dim=-1, keepdim=True)

        # Also concat the target labels and boxes
        target_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost.
        alpha = 0.25
        gamma = 2.0
        neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
        # Compute the classification cost by taking pos and neg cost in the appropriate index
        class_cost = (pos_cost_class - neg_cost_class) @ label_maps.t()

        # Compute the L1 cost between boxes
        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)

        # Compute the giou cost between boxes
        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))

        # Final cost matrix
        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
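
# Illustrative sketch (not part of the library): the matcher returns one (pred_idx, target_idx) pair
# per image, so matched predictions and ground-truth boxes could be gathered as below. `matcher`,
# `outputs` and `targets` are hypothetical objects following the shapes documented in `forward`.
#
#     indices = matcher(outputs, targets)
#     for image_idx, (pred_idx, target_idx) in enumerate(indices):
#         matched_pred_boxes = outputs["pred_boxes"][image_idx][pred_idx]  # [num_matches, 4]
#         matched_gt_boxes = targets[image_idx]["boxes"][target_idx]  # [num_matches, 4]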


class GroundingDinoImageLoss(ImageLoss):
    """
    This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we
    compute the Hungarian assignment between the ground-truth boxes and the outputs of the model, and 2) we supervise
    each pair of matched ground-truth / prediction (supervise class and box).

    Args:
        matcher (`GroundingDinoHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`list[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    """

    def __init__(self, matcher, focal_alpha, losses):
        nn.Module.__init__(self)
        self.matcher = matcher
        self.focal_alpha = focal_alpha
        self.losses = losses

    def _get_target_classes_one_hot(self, outputs, targets, indices):
        """
        Create one_hot based on the matching indices
        """
        logits = outputs["logits"]
        # Add offsets to class_labels to select the correct label map
        class_labels = torch.cat(
            [
                target["class_labels"][J] + len(outputs["label_maps"][i]) if i > 0 else target["class_labels"][J]
                for i, (target, (_, J)) in enumerate(zip(targets, indices))
            ]
        )
        label_maps = torch.cat(outputs["label_maps"], dim=0)

        idx = self._get_source_permutation_idx(indices)
        target_classes_onehot = torch.zeros_like(logits, device=logits.device, dtype=torch.long)
        target_classes_onehot[idx] = label_maps[class_labels].to(torch.long)

        return target_classes_onehot
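
    # Note (illustrative, not part of the library): unlike closed-set DETR heads, the "one-hot"
    # target built above is per text token. For each matched query, the row copied from
    # `label_maps` marks the token positions of that query's class phrase with 1s, so the focal
    # loss in `loss_labels` is applied token-wise rather than against a single class id.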

    def loss_labels(self, outputs, targets, indices, num_boxes):
        """
        Classification loss (binary focal loss). Targets dicts must contain the key "class_labels" containing a
        tensor of dim [nb_target_boxes].
        """
        if "logits" not in outputs:
            raise KeyError("No logits were found in the outputs")
        if "text_mask" not in outputs:
            raise KeyError("No text_mask was found in the outputs")

        target_classes_onehot = self._get_target_classes_one_hot(outputs, targets, indices)
        source_logits = outputs["logits"]
        text_mask = outputs["text_mask"]

        # Select only valid logits
        source_logits = torch.masked_select(source_logits, text_mask)
        target_classes_onehot = torch.masked_select(target_classes_onehot, text_mask)

        target_classes_onehot = target_classes_onehot.float()
        loss_ce = sigmoid_focal_loss(
            inputs=source_logits,
            targets=target_classes_onehot,
            num_boxes=num_boxes,
            alpha=self.focal_alpha,
            gamma=2,
        )

        losses = {"loss_ce": loss_ce}

        return losses
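
# Illustrative sketch (not part of the library): shapes assumed by `loss_labels`, with hypothetical
# sizes. `logits` and `text_mask` share the text-token dimension, and each entry of `label_maps`
# marks which text tokens belong to a given class phrase.
#
#     outputs["logits"]:     [batch_size, num_queries, hidden_dim]  (hidden_dim = text-token dimension)
#     outputs["text_mask"]:  boolean, broadcastable to the shape of "logits"
#     outputs["label_maps"]: tuple of [num_classes_i, hidden_dim] tensors, one per image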


def GroundingDinoForObjectDetectionLoss(
    logits,
    labels,
    device,
    pred_boxes,
    config,
    label_maps,
    text_mask,
    outputs_class=None,
    outputs_coord=None,
    encoder_logits=None,
    encoder_pred_boxes=None,
):
    # First: create the matcher
    matcher = GroundingDinoHungarianMatcher(
        class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost
    )
    # Second: create the criterion
    losses = ["labels", "boxes", "cardinality"]
    criterion = GroundingDinoImageLoss(
        matcher=matcher,
        focal_alpha=config.focal_alpha,
        losses=losses,
    )
    criterion.to(device)
    # Third: compute the losses, based on outputs and labels
    outputs_loss = {}
    outputs_loss["logits"] = logits
    outputs_loss["pred_boxes"] = pred_boxes
    outputs_loss["label_maps"] = label_maps
    outputs_loss["text_mask"] = text_mask

    auxiliary_outputs = None
    if config.auxiliary_loss:
        auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord)
        for aux_output in auxiliary_outputs:
            aux_output["label_maps"] = label_maps
            aux_output["text_mask"] = text_mask
        outputs_loss["auxiliary_outputs"] = auxiliary_outputs

    loss_dict = criterion(outputs_loss, labels)

    if config.two_stage:
        encoder_outputs_loss = {
            "logits": encoder_logits,
            "pred_boxes": encoder_pred_boxes,
            "label_maps": label_maps,
            "text_mask": text_mask,
        }
        encoder_loss_dict = criterion(encoder_outputs_loss, labels)
        encoder_loss_dict = {k + "_enc": v for k, v in encoder_loss_dict.items()}
        loss_dict.update(encoder_loss_dict)
    # Fourth: compute total loss, as a weighted sum of the various losses
    weight_dict = {
        "loss_ce": 2.0,
        "loss_bbox": config.bbox_loss_coefficient,
        "loss_giou": config.giou_loss_coefficient,
    }

    if config.two_stage:
        enc_weight_dict = {k + "_enc": v for k, v in weight_dict.items()}
        weight_dict.update(enc_weight_dict)

    if config.auxiliary_loss:
        aux_weight_dict = {}
        for i in range(config.decoder_layers - 1):
            aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
        weight_dict.update(aux_weight_dict)

    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
    return loss, loss_dict, auxiliary_outputs
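
# Illustrative sketch (not part of the library): a model forward pass could delegate to this
# function roughly as below when `labels` are provided. The `model_outputs` attributes here are
# hypothetical placeholders standing in for the corresponding Grounding DINO model outputs.
#
#     loss, loss_dict, auxiliary_outputs = GroundingDinoForObjectDetectionLoss(
#         logits=model_outputs.logits,
#         labels=labels,
#         device=model_outputs.logits.device,
#         pred_boxes=model_outputs.pred_boxes,
#         config=self.config,
#         label_maps=label_maps,
#         text_mask=text_mask,
#         outputs_class=model_outputs.outputs_class,
#         outputs_coord=model_outputs.outputs_coord,
#         encoder_logits=model_outputs.encoder_logits,
#         encoder_pred_boxes=model_outputs.encoder_pred_boxes,
#     )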