Adding all project files

This commit is contained in:
Martina Burlando 2025-08-02 02:00:33 +02:00
parent 6c9e127bdc
commit cd4316ad0f
42289 changed files with 8009643 additions and 0 deletions

View file

@ -0,0 +1,28 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ....utils import _LazyModule
from ....utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_xlm_prophetnet import *
from .modeling_xlm_prophetnet import *
from .tokenization_xlm_prophetnet import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View file

@ -0,0 +1,181 @@
# coding=utf-8
# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XLM-ProphetNet model configuration"""
from typing import Callable, Optional, Union
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
class XLMProphetNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XLMProphetNetModel`]. It is used to instantiate a
XLMProphetNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the XLMProphetNet
[microsoft/xprophetnet-large-wiki100-cased](https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
activation_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for activations inside the fully connected layer.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`XLMProphetNetModel`].
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
num_encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
num_encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the `intermediate` (often named feed-forward) layer in decoder.
num_decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
num_decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
add_cross_attention (`bool`, *optional*, defaults to `True`):
Whether cross-attention layers should be added to the model.
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether this is an encoder/decoder model.
pad_token_id (`int`, *optional*, defaults to 1)
Padding token id.
bos_token_id (`int`, *optional*, defaults to 0)
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2)
End of stream token id.
ngram (`int`, *optional*, defaults to 2)
Number of future tokens to predict. Set to 1 to be same as traditional Language model to predict next first
token.
num_buckets (`int`, *optional*, defaults to 32)
The number of buckets to use for each attention layer. This is for relative position calculation. See the
[T5 paper](see https://huggingface.co/papers/1910.10683) for more details.
relative_max_distance (`int`, *optional*, defaults to 128)
Relative distances greater than this number will be put into the last same bucket. This is for relative
position calculation. See the [T5 paper](see https://huggingface.co/papers/1910.10683) for more details.
disable_ngram_loss (`bool`, *optional*, defaults to `False`):
Whether be trained predicting only the next first token.
eps (`float`, *optional*, defaults to 0.0):
Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label
smoothing is performed.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
"""
model_type = "xlm-prophetnet"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "num_encoder_attention_heads",
}
def __init__(
self,
activation_dropout: Optional[float] = 0.1,
activation_function: Optional[Union[str, Callable]] = "gelu",
vocab_size: Optional[int] = 30522,
hidden_size: Optional[int] = 1024,
encoder_ffn_dim: Optional[int] = 4096,
num_encoder_layers: Optional[int] = 12,
num_encoder_attention_heads: Optional[int] = 16,
decoder_ffn_dim: Optional[int] = 4096,
num_decoder_layers: Optional[int] = 12,
num_decoder_attention_heads: Optional[int] = 16,
attention_dropout: Optional[float] = 0.1,
dropout: Optional[float] = 0.1,
max_position_embeddings: Optional[int] = 512,
init_std: Optional[float] = 0.02,
is_encoder_decoder: Optional[bool] = True,
add_cross_attention: Optional[bool] = True,
decoder_start_token_id: Optional[int] = 0,
ngram: Optional[int] = 2,
num_buckets: Optional[int] = 32,
relative_max_distance: Optional[int] = 128,
disable_ngram_loss: Optional[bool] = False,
eps: Optional[float] = 0.0,
use_cache: Optional[bool] = True,
pad_token_id: Optional[int] = 0,
bos_token_id: Optional[int] = 1,
eos_token_id: Optional[int] = 2,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.encoder_ffn_dim = encoder_ffn_dim
self.num_encoder_layers = num_encoder_layers
self.num_encoder_attention_heads = num_encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.num_decoder_layers = num_decoder_layers
self.num_decoder_attention_heads = num_decoder_attention_heads
self.max_position_embeddings = max_position_embeddings
self.init_std = init_std # Normal(0, this parameter)
self.activation_function = activation_function
# parameters for xlmprophetnet
self.ngram = ngram
self.num_buckets = num_buckets
self.relative_max_distance = relative_max_distance
self.disable_ngram_loss = disable_ngram_loss
self.eps = eps
# 3 Types of Dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.dropout = dropout
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@property
def num_hidden_layers(self) -> int:
return self.num_encoder_layers + self.num_decoder_layers
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and"
" `num_decoder_layers`."
)
__all__ = ["XLMProphetNetConfig"]

View file

@ -0,0 +1,322 @@
# coding=utf-8
# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from shutil import copyfile
from typing import Any, Optional
from ....tokenization_utils import PreTrainedTokenizer
from ....utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = ""
VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
class XLMProphetNetTokenizer(PreTrainedTokenizer):
"""
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"[SEP]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"[SEP]"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="[SEP]",
eos_token="[SEP]",
sep_token="[SEP]",
unk_token="[UNK]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs: Optional[dict[str, Any]] = None,
**kwargs,
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
try:
import sentencepiece as spm
except ImportError:
logger.warning(
"You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
" pip install sentencepiece"
)
raise
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
# Original fairseq vocab and spm vocab must be "aligned":
# Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
# -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
# fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's' | '▁de' | '-'
# spm | '<unk>' | '<s>' | '</s>' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a'
# put special tokens and [unused] tokens into the vocab
self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
for i in range(10):
tok = f"[unused{i}]"
self.fairseq_tokens_to_ids[tok] = 5 + i
# The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
self.fairseq_offset = 12
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
# TODO ArthurZ fairseq_ids_to_tokens should be removed
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sentencepiece as spm
except ImportError:
logger.warning(
"You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
" pip install sentencepiece"
)
raise
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def get_special_tokens_mask(
self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
) -> list[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`list[int]`):
List of IDs.
token_ids_1 (`list[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
) -> list[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLMProphetNet
does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`list[int]`):
List of IDs.
token_ids_1 (`list[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`list[int]`: List of zeros.
"""
sep = [self.sep_token_id]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0]
return len(token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
@property
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> str:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
# Need to return unknown token if the SP model returned 0
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
) -> list[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A XLMProphetNet sequence has the following format:
- single sequence: `X [SEP]`
- pair of sequences: `A [SEP] B [SEP]`
Args:
token_ids_0 (`list[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (`list[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
sep = [self.sep_token_id]
return token_ids_0 + sep + token_ids_1 + sep
__all__ = ["XLMProphetNetTokenizer"]