team-10/venv/Lib/site-packages/transformers/models/dpr/modeling_dpr.py
2025-08-02 02:00:33 +02:00

592 lines
22 KiB
Python

# coding=utf-8
# Copyright 2018 DPR Authors, The Hugging Face Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DPR model for Open Domain Question Answering."""
from dataclasses import dataclass
from typing import Optional, Union
import torch
from torch import Tensor, nn
from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
auto_docstring,
logging,
)
from ..bert.modeling_bert import BertModel
from .configuration_dpr import DPRConfig
logger = logging.get_logger(__name__)
##########
# Outputs
##########
@dataclass
@auto_docstring(
custom_intro="""
Class for outputs of [`DPRQuestionEncoder`].
"""
)
class DPRContextEncoderOutput(ModelOutput):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
"""
pooler_output: torch.FloatTensor
hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
@dataclass
@auto_docstring(
custom_intro="""
Class for outputs of [`DPRQuestionEncoder`].
"""
)
class DPRQuestionEncoderOutput(ModelOutput):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
"""
pooler_output: torch.FloatTensor
hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
@dataclass
@auto_docstring(
custom_intro="""
Class for outputs of [`DPRQuestionEncoder`].
"""
)
class DPRReaderOutput(ModelOutput):
r"""
start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
Logits of the start index of the span for each passage.
end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
Logits of the end index of the span for each passage.
relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
question, compared to all the other passages.
"""
start_logits: torch.FloatTensor
end_logits: Optional[torch.FloatTensor] = None
relevance_logits: Optional[torch.FloatTensor] = None
hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
@auto_docstring
class DPRPreTrainedModel(PreTrainedModel):
_supports_sdpa = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class DPREncoder(DPRPreTrainedModel):
base_model_prefix = "bert_model"
def __init__(self, config: DPRConfig):
super().__init__(config)
self.bert_model = BertModel(config, add_pooling_layer=False)
if self.bert_model.config.hidden_size <= 0:
raise ValueError("Encoder hidden_size can't be zero")
self.projection_dim = config.projection_dim
if self.projection_dim > 0:
self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: Tensor,
attention_mask: Optional[Tensor] = None,
token_type_ids: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
) -> Union[BaseModelOutputWithPooling, tuple[Tensor, ...]]:
outputs = self.bert_model(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
pooled_output = sequence_output[:, 0, :]
if self.projection_dim > 0:
pooled_output = self.encode_proj(pooled_output)
if not return_dict:
return (sequence_output, pooled_output) + outputs[2:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@property
def embeddings_size(self) -> int:
if self.projection_dim > 0:
return self.encode_proj.out_features
return self.bert_model.config.hidden_size
class DPRSpanPredictor(DPRPreTrainedModel):
base_model_prefix = "encoder"
def __init__(self, config: DPRConfig):
super().__init__(config)
self.encoder = DPREncoder(config)
self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)
self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: Tensor,
attention_mask: Tensor,
inputs_embeds: Optional[Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
) -> Union[DPRReaderOutput, tuple[Tensor, ...]]:
# notations: N - number of questions in a batch, M - number of passages per questions, L - sequence length
n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
# feed encoder
outputs = self.encoder(
input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
# compute logits
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
relevance_logits = self.qa_classifier(sequence_output[:, 0, :])
# resize
start_logits = start_logits.view(n_passages, sequence_length)
end_logits = end_logits.view(n_passages, sequence_length)
relevance_logits = relevance_logits.view(n_passages)
if not return_dict:
return (start_logits, end_logits, relevance_logits) + outputs[2:]
return DPRReaderOutput(
start_logits=start_logits,
end_logits=end_logits,
relevance_logits=relevance_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
##################
# PreTrainedModel
##################
class DPRPretrainedContextEncoder(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: DPRConfig
load_tf_weights = None
base_model_prefix = "ctx_encoder"
class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: DPRConfig
load_tf_weights = None
base_model_prefix = "question_encoder"
class DPRPretrainedReader(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config: DPRConfig
load_tf_weights = None
base_model_prefix = "span_predictor"
###############
# Actual Models
###############
@auto_docstring(
custom_intro="""
The bare DPRContextEncoder transformer outputting pooler outputs as context representations.
"""
)
class DPRContextEncoder(DPRPretrainedContextEncoder):
def __init__(self, config: DPRConfig):
super().__init__(config)
self.config = config
self.ctx_encoder = DPREncoder(config)
# Initialize weights and apply final processing
self.post_init()
@auto_docstring
def forward(
self,
input_ids: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
token_type_ids: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[DPRContextEncoderOutput, tuple[Tensor, ...]]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs (for a pair title+text for example):
```
tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
```
(b) For single sequences (for a question for example):
```
tokens: [CLS] the dog is hairy . [SEP]
token_type_ids: 0 0 0 0 0 0 0
```
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
Examples:
```python
>>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
>>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
>>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
>>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
>>> embeddings = model(input_ids).pooler_output
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = (
torch.ones(input_shape, device=device)
if input_ids is None
else (input_ids != self.config.pad_token_id)
)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
outputs = self.ctx_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return outputs[1:]
return DPRContextEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
@auto_docstring(
custom_intro="""
The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.
"""
)
class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
def __init__(self, config: DPRConfig):
super().__init__(config)
self.config = config
self.question_encoder = DPREncoder(config)
# Initialize weights and apply final processing
self.post_init()
@auto_docstring
def forward(
self,
input_ids: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
token_type_ids: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[DPRQuestionEncoderOutput, tuple[Tensor, ...]]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs (for a pair title+text for example):
```
tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
```
(b) For single sequences (for a question for example):
```
tokens: [CLS] the dog is hairy . [SEP]
token_type_ids: 0 0 0 0 0 0 0
```
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
Examples:
```python
>>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
>>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
>>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
>>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
>>> embeddings = model(input_ids).pooler_output
```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = (
torch.ones(input_shape, device=device)
if input_ids is None
else (input_ids != self.config.pad_token_id)
)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
outputs = self.question_encoder(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return outputs[1:]
return DPRQuestionEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
@auto_docstring(
custom_intro="""
The bare DPRReader transformer outputting span predictions.
"""
)
class DPRReader(DPRPretrainedReader):
def __init__(self, config: DPRConfig):
super().__init__(config)
self.config = config
self.span_predictor = DPRSpanPredictor(config)
# Initialize weights and apply final processing
self.post_init()
@auto_docstring
def forward(
self,
input_ids: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
inputs_embeds: Optional[Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[DPRReaderOutput, tuple[Tensor, ...]]:
r"""
input_ids (`tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
and 2) the passages titles and 3) the passages texts To match pretraining, DPR `input_ids` sequence should
be formatted with [CLS] and [SEP] with the format:
`[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
rather than the left.
Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.
[What are input IDs?](../glossary#input-ids)
inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
Examples:
```python
>>> from transformers import DPRReader, DPRReaderTokenizer
>>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
>>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
>>> encoded_inputs = tokenizer(
... questions=["What is love ?"],
... titles=["Haddaway"],
... texts=["'What Is Love' is a song recorded by the artist Haddaway"],
... return_tensors="pt",
... )
>>> outputs = model(**encoded_inputs)
>>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits
>>> relevance_logits = outputs.relevance_logits
```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
return self.span_predictor(
input_ids,
attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
__all__ = [
"DPRContextEncoder",
"DPRPretrainedContextEncoder",
"DPRPreTrainedModel",
"DPRPretrainedQuestionEncoder",
"DPRPretrainedReader",
"DPRQuestionEncoder",
"DPRReader",
]