# coding=utf-8
# Copyright 2018 DPR Authors, The Hugging Face Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DPR model for Open Domain Question Answering."""

from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import Tensor, nn

from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    auto_docstring,
    logging,
)
from ..bert.modeling_bert import BertModel
from .configuration_dpr import DPRConfig


logger = logging.get_logger(__name__)


##########
# Outputs
##########


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRContextEncoder`].
    """
)
class DPRContextEncoderOutput(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
        The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
        This output is to be used to embed contexts for nearest-neighbor queries with question embeddings.
    """

    pooler_output: torch.FloatTensor
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRQuestionEncoder`].
    """
)
class DPRQuestionEncoderOutput(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
        The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
        This output is to be used to embed questions for nearest-neighbor queries with context embeddings.
    """

    pooler_output: torch.FloatTensor
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRReader`].
    """
)
class DPRReaderOutput(ModelOutput):
    r"""
    start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
        Logits of the start index of the span for each passage.
    end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
        Logits of the end index of the span for each passage.
    relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
        Outputs of the QA classifier of the DPRReader that correspond to the scores of each passage to answer the
        question, compared to all the other passages.
    """

    start_logits: torch.FloatTensor
    end_logits: Optional[torch.FloatTensor] = None
    relevance_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@auto_docstring
class DPRPreTrainedModel(PreTrainedModel):
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class DPREncoder(DPRPreTrainedModel):
    base_model_prefix = "bert_model"

    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.bert_model = BertModel(config, add_pooling_layer=False)
        if self.bert_model.config.hidden_size <= 0:
            raise ValueError("Encoder hidden_size can't be zero")
        self.projection_dim = config.projection_dim
        if self.projection_dim > 0:
            self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
    ) -> Union[BaseModelOutputWithPooling, tuple[Tensor, ...]]:
        outputs = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        # use the representation of the first ([CLS]) token as the sequence-level embedding
        pooled_output = sequence_output[:, 0, :]

        if self.projection_dim > 0:
            pooled_output = self.encode_proj(pooled_output)

        if not return_dict:
            return (sequence_output, pooled_output) + outputs[2:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @property
    def embeddings_size(self) -> int:
        if self.projection_dim > 0:
            return self.encode_proj.out_features
        return self.bert_model.config.hidden_size


class DPRSpanPredictor(DPRPreTrainedModel):
    base_model_prefix = "encoder"

    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.encoder = DPREncoder(config)
        self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)
        self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
    ) -> Union[DPRReaderOutput, tuple[Tensor, ...]]:
        # notations: N - number of questions in a batch, M - number of passages per question, L - sequence length
        n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
        # feed encoder
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        # compute logits
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        relevance_logits = self.qa_classifier(sequence_output[:, 0, :])

        # resize
        start_logits = start_logits.view(n_passages, sequence_length)
        end_logits = end_logits.view(n_passages, sequence_length)
        relevance_logits = relevance_logits.view(n_passages)

        if not return_dict:
            return (start_logits, end_logits, relevance_logits) + outputs[2:]

        return DPRReaderOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            relevance_logits=relevance_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


##################
# PreTrainedModel
##################


class DPRPretrainedContextEncoder(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: DPRConfig
    load_tf_weights = None
    base_model_prefix = "ctx_encoder"


class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: DPRConfig
    load_tf_weights = None
    base_model_prefix = "question_encoder"


class DPRPretrainedReader(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: DPRConfig
    load_tf_weights = None
    base_model_prefix = "span_predictor"


###############
# Actual Models
###############


@auto_docstring(
    custom_intro="""
    The bare DPRContextEncoder transformer outputting pooler outputs as context representations.
    """
)
class DPRContextEncoder(DPRPretrainedContextEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.ctx_encoder = DPREncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRContextEncoderOutput, tuple[Tensor, ...]]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
            formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs (for a pair title+text for example):

            ```
            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            token_type_ids:   0   0    0    0    0     0      0   0   1  1  1   1  1   1
            ```

            (b) For single sequences (for a question for example):

            ```
            tokens:         [CLS] the dog is hairy . [SEP]
            token_type_ids:   0    0   0   0   0   0   0
            ```

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)

        Examples:

        ```python
        >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        outputs = self.ctx_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs[1:]
        return DPRContextEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


@auto_docstring(
    custom_intro="""
    The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.
    """
)
class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.question_encoder = DPREncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRQuestionEncoderOutput, tuple[Tensor, ...]]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
            formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs (for a pair title+text for example):

            ```
            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            token_type_ids:   0   0    0    0    0     0      0   0   1  1  1   1  1   1
            ```

            (b) For single sequences (for a question for example):

            ```
            tokens:         [CLS] the dog is hairy . [SEP]
            token_type_ids:   0    0   0   0   0   0   0
            ```

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
            [What are input IDs?](../glossary#input-ids)

        Examples:

        ```python
        >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        outputs = self.question_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs[1:]
        return DPRQuestionEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


@auto_docstring(
    custom_intro="""
    The bare DPRReader transformer outputting span predictions.
    """
)
class DPRReader(DPRPretrainedReader):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.span_predictor = DPRSpanPredictor(config)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[DPRReaderOutput, tuple[Tensor, ...]]:
        r"""
        input_ids (`tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question,
            2) the passages titles, and 3) the passages texts. To match pretraining, DPR `input_ids` sequence should
            be formatted with [CLS] and [SEP] with the format:

            `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.

            [What are input IDs?](../glossary#input-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.

        Examples:

        ```python
        >>> from transformers import DPRReader, DPRReaderTokenizer

        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> encoded_inputs = tokenizer(
        ...     questions=["What is love ?"],
        ...     titles=["Haddaway"],
        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
        ...     return_tensors="pt",
        ... )
        >>> outputs = model(**encoded_inputs)
        >>> start_logits = outputs.start_logits
        >>> end_logits = outputs.end_logits
        >>> relevance_logits = outputs.relevance_logits
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)

        return self.span_predictor(
            input_ids,
            attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = [
    "DPRContextEncoder",
    "DPRPretrainedContextEncoder",
    "DPRPreTrainedModel",
    "DPRPretrainedQuestionEncoder",
    "DPRPretrainedReader",
    "DPRQuestionEncoder",
    "DPRReader",
]
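

# A minimal retrieval sketch, illustrative only and not part of this module's API: it shows how the question and
# context encoders defined above are typically combined, scoring passages by the dot product between their
# `pooler_output` embeddings (the nearest-neighbor usage described in the output classes' docstrings). Checkpoint
# names are the same ones used in the docstring examples. The block never runs on import; because this file uses
# relative imports, copy it into a standalone script rather than executing the module directly.
if __name__ == "__main__":
    from transformers import (
        DPRContextEncoder,
        DPRContextEncoderTokenizer,
        DPRQuestionEncoder,
        DPRQuestionEncoderTokenizer,
    )

    q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

    question = "Who recorded the song What Is Love?"
    passages = [
        "'What Is Love' is a song recorded by the artist Haddaway.",
        "The dog is hairy.",
    ]

    with torch.no_grad():
        # Embed the question and the candidate passages with their respective encoders.
        question_emb = q_encoder(**q_tokenizer(question, return_tensors="pt")).pooler_output
        passage_emb = ctx_encoder(**ctx_tokenizer(passages, padding=True, return_tensors="pt")).pooler_output

    # DPR retrieval score: inner product between question and passage embeddings, shape (1, n_passages).
    scores = question_emb @ passage_emb.T
    print(scores)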