# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC
from functools import partial
from typing import Optional

import torch
import torch.nn as nn

from .cache_utils import Cache
from .modeling_outputs import (
    BaseModelOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from .models.auto import AutoModel
from .processing_utils import Unpack
from .utils import TransformersKwargs, auto_docstring, can_return_tuple, logging


logger = logging.get_logger(__name__)


class GradientCheckpointingLayer(nn.Module):
    """Base class for layers with gradient checkpointing.

    This class enables gradient checkpointing functionality for a layer. By default, gradient checkpointing is
    disabled (`gradient_checkpointing = False`). When `model.set_gradient_checkpointing()` is called, gradient
    checkpointing is enabled by setting `gradient_checkpointing = True` and assigning a checkpointing function to
    `_gradient_checkpointing_func`.

    Important:

        When using gradient checkpointing with `use_reentrant=True`, inputs that require gradients (e.g. hidden
        states) must be passed as positional arguments (`*args`) rather than keyword arguments to properly
        propagate gradients.

        Example:

        ```python
        >>> # Correct - hidden_states passed as positional arg
        >>> out = self.layer(hidden_states, attention_mask=attention_mask)

        >>> # Incorrect - hidden_states passed as keyword arg
        >>> out = self.layer(hidden_states=hidden_states, attention_mask=attention_mask)
        ```
    """

    gradient_checkpointing = False

    def __call__(self, *args, **kwargs):
        if self.gradient_checkpointing and self.training:
            do_warn = False
            layer_name = self.__class__.__name__
            message = f"Caching is incompatible with gradient checkpointing in {layer_name}. Setting"

            if "use_cache" in kwargs and kwargs["use_cache"]:
                kwargs["use_cache"] = False
                message += " `use_cache=False`,"
                do_warn = True

            # different names for the same thing in different layers
            if "past_key_value" in kwargs and kwargs["past_key_value"] is not None:
                kwargs["past_key_value"] = None
                message += " `past_key_value=None`,"
                do_warn = True

            if "past_key_values" in kwargs and kwargs["past_key_values"] is not None:
                kwargs["past_key_values"] = None
                message += " `past_key_values=None`,"
                do_warn = True

            if "layer_past" in kwargs and kwargs["layer_past"] is not None:
                kwargs["layer_past"] = None
                message += " `layer_past=None`,"
                do_warn = True

            # warn if anything was changed
            if do_warn:
                message = message.rstrip(",") + "."
                logger.warning(message)

            return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
        return super().__call__(*args, **kwargs)
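

# Illustrative usage sketch (an assumption, not part of the public API): a minimal layer that opts into
# checkpointing by subclassing `GradientCheckpointingLayer`. On a full model the checkpointing function is
# assigned by the gradient-checkpointing setup described in the docstring above; here it is wired up by hand
# so the sketch stays self-contained. `ToyLayer` and all shapes below are made up for demonstration.
def _example_gradient_checkpointing_layer():
    from torch.utils.checkpoint import checkpoint

    class ToyLayer(GradientCheckpointingLayer):
        def __init__(self, hidden_size: int):
            super().__init__()
            self.linear = nn.Linear(hidden_size, hidden_size)

        def forward(self, hidden_states, attention_mask=None, **kwargs):
            del attention_mask, kwargs  # unused in this toy layer
            return self.linear(hidden_states)

    layer = ToyLayer(hidden_size=8).train()
    # Checkpointing is only active in training mode and once these two attributes are set.
    layer.gradient_checkpointing = True
    layer._gradient_checkpointing_func = partial(checkpoint, use_reentrant=False)

    hidden_states = torch.randn(2, 4, 8, requires_grad=True)
    # Pass `hidden_states` positionally so gradients propagate through the checkpointed call.
    out = layer(hidden_states, attention_mask=None)
    out.sum().backward()
    return hidden_states.grad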


@auto_docstring
class GenericForSequenceClassification(ABC):
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Similar to `self.model = AutoModel.from_config(config)`, but allows changing the base model name if needed in the child class
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SequenceClassifierOutputWithPast:
        transformer_outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds`."
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
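

# The pooling above relies on `(token_indices * non_pad_mask).argmax(-1)` returning the index of the
# rightmost non-pad token, which covers both left- and right-padded batches. A minimal sketch of that
# index arithmetic (illustrative only; `pad_token_id=0` and the token values are made-up numbers):
def _example_last_non_pad_token_pooling():
    pad_token_id = 0
    input_ids = torch.tensor(
        [
            [5, 6, 7, 0, 0],  # right-padded -> last real token at index 2
            [0, 0, 8, 9, 10],  # left-padded  -> last real token at index 4
        ]
    )
    non_pad_mask = (input_ids != pad_token_id).to(torch.int32)
    token_indices = torch.arange(input_ids.shape[-1], dtype=torch.int32)
    last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
    return last_non_pad_token  # tensor([2, 4])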


@auto_docstring
class GenericForQuestionAnswering(ABC):
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        # Similar to `self.model = AutoModel.from_config(config)`, but allows changing the base model name if needed in the child class
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return getattr(self, self.base_model_prefix).embed_tokens

    def set_input_embeddings(self, value):
        getattr(self, self.base_model_prefix).embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> QuestionAnsweringModelOutput:
        outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
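

# Shape sketch (illustrative only; batch size, sequence length, and hidden size are made-up values):
# the 2-unit `qa_outputs` projection above yields one start score and one end score per token, which
# `split(1, dim=-1)` separates into two `(batch_size, seq_len)` tensors.
def _example_qa_logit_split():
    batch_size, seq_len, hidden_size = 2, 7, 16
    sequence_output = torch.randn(batch_size, seq_len, hidden_size)
    qa_outputs = nn.Linear(hidden_size, 2)

    logits = qa_outputs(sequence_output)  # (2, 7, 2)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)  # (2, 7): one start score per token
    end_logits = end_logits.squeeze(-1)  # (2, 7): one end score per token
    return start_logits, end_logits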


@auto_docstring
class GenericForTokenClassification(ABC):
    base_model_prefix = "model"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Similar to `self.model = AutoModel.from_config(config)`, but allows changing the base model name if needed in the child class
        setattr(self, self.base_model_prefix, AutoModel.from_config(config))

        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs,
    ) -> TokenClassifierOutput:
        outputs: BaseModelOutputWithPast = getattr(self, self.base_model_prefix)(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
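

# Illustrative usage sketch (an assumption, not an excerpt from any model file): the generic heads in this
# module are intended to be combined with a model's `PreTrainedModel` subclass, which supplies `post_init`,
# `config`, and the loss functions they rely on. The Llama backbone and the tiny config below are arbitrary
# choices to keep the sketch self-contained and cheap to run; any base model registered with `AutoModel`
# follows the same pattern.
def _example_generic_token_classification():
    # Local imports keep this sketch inert when the module itself is imported.
    from .models.llama.configuration_llama import LlamaConfig
    from .models.llama.modeling_llama import LlamaPreTrainedModel

    class ToyLlamaForTokenClassification(GenericForTokenClassification, LlamaPreTrainedModel):
        pass  # `__init__`, `forward`, the dropout, and the `score` head all come from the generic mixin

    config = LlamaConfig(
        vocab_size=100,
        hidden_size=32,
        intermediate_size=64,
        num_hidden_layers=1,
        num_attention_heads=2,
        num_key_value_heads=2,
        num_labels=5,
    )
    model = ToyLlamaForTokenClassification(config).eval()
    input_ids = torch.randint(0, config.vocab_size, (2, 9))
    outputs = model(input_ids)  # TokenClassifierOutput with `logits` of shape (2, 9, 5)
    return outputs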