Adding all project files

parent 6c9e127bdc
commit cd4316ad0f

42289 changed files with 8009643 additions and 0 deletions
venv/Lib/site-packages/transformers/models/evolla/__init__.py
@@ -0,0 +1,28 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_evolla import *
    from .modeling_evolla import *
    from .processing_evolla import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
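The `_LazyModule` registration above keeps the package import cheap: importing `transformers.models.evolla` does not eagerly pull in the torch-heavy `modeling_evolla` module, and attributes are only resolved on first access. A minimal sketch of that behaviour, assuming a transformers build that ships these files:

```python
# Sketch only: assumes an installed transformers that includes the evolla files above.
from transformers.models import evolla

# The package object is a _LazyModule; no model code has been imported yet.
print(type(evolla).__name__)  # "_LazyModule"

# Attribute access triggers the real import of configuration_evolla and resolves the class.
config_cls = evolla.EvollaConfig
print(config_cls.__module__)  # "transformers.models.evolla.configuration_evolla"
```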
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
venv/Lib/site-packages/transformers/models/evolla/configuration_evolla.py
@@ -0,0 +1,279 @@
# coding=utf-8
# Copyright 2025 Westlake Representational Learning Lab (Fajie Yuan Lab) team and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evolla model configuration"""

from ...configuration_utils import PretrainedConfig
from ...modeling_rope_utils import rope_config_validation
from ...utils import logging


logger = logging.get_logger(__name__)


class SaProtConfig(PretrainedConfig):
    r"""This is the configuration class to store the configuration of an [`EvollaSaProtProteinEncoder`]. It is used to
    instantiate a SaProt model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 446):
            Vocabulary size of the protein sequence model. Defines the number of different tokens that can be
            represented by the `input_ids` passed when calling [`EvollaModel`].
        mask_token_id (`int`, *optional*, defaults to 4):
            The id of the *mask* token in the protein sequence model.
        pad_token_id (`int`, *optional*, defaults to 1):
            The id of the *padding* token in the protein sequence model.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimensionality of the protein sequence model layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 33):
            Number of hidden layers in the protein sequence model.
        num_attention_heads (`int`, *optional*, defaults to 20):
            Number of attention heads for each attention layer in the protein sequence model.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the intermediate layers in the protein sequence model.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the hidden layers in the protein sequence model.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities in the protein sequence model.
        max_position_embeddings (`int`, *optional*, defaults to 1026):
            The maximum sequence length that the protein sequence model might ever be used with. Typically set this to
            something large just in case (e.g., 512 or 1024 or 2048).
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value for the layer normalization layer in the protein sequence model.
        position_embedding_type (`str`, *optional*, defaults to `"rotary"`):
            The type of position embedding to use in the protein sequence model. Currently only `"rotary"` is supported.
        emb_layer_norm_before (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization before the position embedding in the protein sequence model.
        token_dropout (`bool`, *optional*, defaults to `True`):
            Whether to apply dropout to the tokens in the protein sequence model."""

    def __init__(
        self,
        vocab_size=446,
        mask_token_id=4,
        pad_token_id=1,
        hidden_size=1280,
        num_hidden_layers=33,
        num_attention_heads=20,
        intermediate_size=5120,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=1026,
        initializer_range=0.02,
        layer_norm_eps=1e-05,
        position_embedding_type="rotary",
        use_cache=True,
        emb_layer_norm_before=False,
        token_dropout=True,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.emb_layer_norm_before = emb_layer_norm_before
        self.token_dropout = token_dropout


class EvollaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`EvollaModel`]. It is used to instantiate an
    Evolla model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Evolla-10B.

    e.g. [westlake-repl/Evolla-10B-hf](https://huggingface.co/westlake-repl/Evolla-10B-hf)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        protein_encoder_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`SaProtConfig`].
        vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the Evolla llama model. Defines the number of different tokens that can be represented
            by the `input_ids` passed when calling [`EvollaModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the llama layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimensionality of the intermediate layers in the llama model.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the llama model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the llama model.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            Number of key-value heads for each attention layer in the llama model.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the llama model. If string, `"gelu"`, `"relu"`,
            `"selu"` and `"silu"` are supported.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value for the RMS-norm layer in the llama model.
        rope_theta (`float`, *optional*, defaults to 500000.0):
            The base period of the rotary position embeddings (RoPE) in the llama model.
        rope_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings in the llama model.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the attention layer.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention layer.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layer.
        aligner_ffn_mult (`int`, *optional*, defaults to 4):
            The FFN multiplier for the aligner layer.
        aligner_enable_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the aligner layer.
        aligner_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities in the aligner layer.
        aligner_num_add_layers (`int`, *optional*, defaults to 8):
            The number of additional layers for the aligner layer.
        resampler_depth (`int`, *optional*, defaults to 6):
            The depth of the resampler layer in the llama model.
        resampler_dim_head (`int`, *optional*, defaults to 64):
            The dimension of the heads in the resampler layer in the llama model.
        resampler_heads (`int`, *optional*, defaults to 8):
            The number of heads in the resampler layer in the llama model.
        resampler_num_latents (`int`, *optional*, defaults to 64):
            The number of latents in the resampler layer in the llama model.
        resampler_ff_mult (`int`, *optional*, defaults to 4):
            The FFN multiplier for the resampler layer.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        pad_token_id (`int`, *optional*):
            The id of the *padding* token.
        bos_token_id (`int`, *optional*, defaults to 128000):
            The id of the *beginning-of-sequence* token.
        eos_token_id (`int`, *optional*, defaults to 128009):
            The id of the *end-of-sequence* token.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to tie the input and output word embeddings.

    Example:

    ```python
    >>> from transformers import EvollaModel, EvollaConfig

    >>> # Initializing an Evolla evolla-10b style configuration
    >>> configuration = EvollaConfig()

    >>> # Initializing a model from the evolla-10b style configuration
    >>> model = EvollaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "EvollaModel"
    sub_configs = {"protein_encoder_config": SaProtConfig}

    def __init__(
        self,
        protein_encoder_config=None,
        vocab_size=128256,  # llama vocab size
        hidden_size=4096,  # llama hidden size
        intermediate_size=14336,  # llama intermediate size
        num_hidden_layers=32,  # llama num layers
        num_attention_heads=32,  # llama num heads
        num_key_value_heads=8,  # llama num key-value heads
        hidden_act="silu",  # llama activation function
        max_position_embeddings=8192,  # llama rope max length
        rms_norm_eps=1e-05,
        rope_theta=500000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        aligner_ffn_mult=4,
        aligner_enable_bias=True,
        aligner_attention_probs_dropout_prob=0.1,
        aligner_num_add_layers=8,
        resampler_depth=6,
        resampler_dim_head=64,
        resampler_heads=8,
        resampler_num_latents=64,
        resampler_ff_mult=4,
        initializer_range=0.02,
        pad_token_id=None,
        bos_token_id=128000,
        eos_token_id=128009,
        use_cache=False,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.tie_word_embeddings = tie_word_embeddings
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.aligner_ffn_mult = aligner_ffn_mult
        self.aligner_enable_bias = aligner_enable_bias
        self.aligner_attention_probs_dropout_prob = aligner_attention_probs_dropout_prob
        self.aligner_num_add_layers = aligner_num_add_layers
        self.use_cache = use_cache
        self.initializer_range = initializer_range

        self.resampler_depth = resampler_depth
        self.resampler_dim_head = resampler_dim_head
        self.resampler_heads = resampler_heads
        self.resampler_num_latents = resampler_num_latents
        self.resampler_ff_mult = resampler_ff_mult

        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, copy it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        # Subconfig
        if protein_encoder_config is None:
            protein_encoder_config = {}
            logger.info("`protein_encoder_config` is `None`. Initializing the `SaProtConfig` with default values.")
        self.protein_encoder_config = SaProtConfig(**protein_encoder_config)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["EvollaConfig"]
venv/Lib/site-packages/transformers/models/evolla/modeling_evolla.py (new file, 1761 lines)
File diff suppressed because it is too large.

venv/Lib/site-packages/transformers/models/evolla/modular_evolla.py (new file, 1008 lines)
File diff suppressed because it is too large.
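The modeling and modular files are not shown here, but given the classes exported by the package `__init__` above, loading the released checkpoint would look roughly like the sketch below. This is a hedged sketch: it assumes the usual top-level re-exports (`EvollaModel`, `EvollaProcessor`), the checkpoint name is the one referenced in the `EvollaConfig` docstring, `device_map="auto"` additionally requires `accelerate`, and the 10B model needs correspondingly large memory.

```python
import torch
from transformers import EvollaModel, EvollaProcessor

checkpoint = "westlake-repl/Evolla-10B-hf"  # referenced in the EvollaConfig docstring

# EvollaProcessor.from_pretrained also restores the protein tokenizer from its
# "protein_tokenizer" subfolder (see processing_evolla.py below).
processor = EvollaProcessor.from_pretrained(checkpoint)
model = EvollaModel.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
```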
venv/Lib/site-packages/transformers/models/evolla/processing_evolla.py
@@ -0,0 +1,247 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for EVOLLA.
"""

import os
from typing import Optional, Union

from ...feature_extraction_utils import BatchFeature
from ...processing_utils import (
    ProcessorMixin,
)
from ..auto import AutoTokenizer


PROTEIN_VALID_KEYS = ["aa_seq", "foldseek", "msa"]


class EvollaProcessor(ProcessorMixin):
    r"""
    Constructs an EVOLLA processor which wraps a Llama tokenizer and a SaProt tokenizer (EsmTokenizer) into a single
    processor.

    [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
    docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.

    Args:
        protein_tokenizer (`EsmTokenizer`):
            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
        tokenizer (`LlamaTokenizerFast`, *optional*):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the tokenized protein sequence; longer sequences are truncated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the tokenized text prompt; longer prompts are truncated.
    """

    attributes = ["protein_tokenizer", "tokenizer"]
    valid_kwargs = ["sequence_max_length"]
    # protein_tokenizer_class = "EsmTokenizer"
    # tokenizer_class = "LlamaTokenizerFast"
    protein_tokenizer_class = "AutoTokenizer"
    tokenizer_class = "AutoTokenizer"
    protein_tokenizer_dir_name = "protein_tokenizer"
    # tokenizer_dir_name = "text_tokenizer"

    def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs):
        if protein_tokenizer is None:
            raise ValueError("You need to specify a `protein_tokenizer`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(protein_tokenizer, tokenizer)

        self.tokenizer.pad_token = "<|reserved_special_token_0|>"
        self.protein_max_length = protein_max_length
        self.text_max_length = text_max_length

    def process_proteins(self, proteins, protein_max_length=1024):
        sa_sequences = []
        for protein in proteins:
            aa_seq = protein.get("aa_seq")
            foldseek = protein.get("foldseek")
            # Interleave the amino-acid sequence (upper case) with the per-residue foldseek
            # structure tokens (lower case), e.g. aa_seq="AG", foldseek="dv" -> "AdGv".
            sa_sequence = "".join([s.upper() + f.lower() for s, f in zip(aa_seq, foldseek)])
            sa_sequences.append(sa_sequence)

        sa_tokens = self.protein_tokenizer.batch_encode_plus(
            sa_sequences, return_tensors="pt", truncation=True, max_length=protein_max_length, padding=True
        )
        return sa_tokens

    def process_text(
        self,
        texts,
        text_max_length: int = 512,
    ):
        # Render each conversation with the text tokenizer's chat template, then batch-tokenize.
        prompts = []
        for messages in texts:
            prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            prompts.append(prompt)

        prompt_inputs = self.tokenizer(
            prompts,
            add_special_tokens=False,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=text_max_length,
        )
        return prompt_inputs

    def __call__(
        self,
        proteins: Optional[Union[list[dict], dict]] = None,
        messages_list: Optional[Union[list[list[dict]], list[dict]]] = None,
        protein_max_length: Optional[int] = None,
        text_max_length: Optional[int] = None,
        **kwargs,
    ):
        r"""This method takes batched or non-batched proteins and messages_list and converts them into a format that
        can be used by the model.

        Args:
            proteins (`Union[List[dict], dict]`):
                A list of dictionaries or a single dictionary containing the following keys:
                    - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
                    - `"foldseek"` (`str`) -- The foldseek string of the protein.
            messages_list (`Union[List[List[dict]], List[dict]]`):
                A list of lists of dictionaries or a list of dictionaries containing the following keys:
                    - `"role"` (`str`) -- The role of the message.
                    - `"content"` (`str`) -- The content of the message.
            protein_max_length (`int`, *optional*, defaults to 1024):
                The maximum length of the tokenized protein sequence; longer sequences are truncated.
            text_max_length (`int`, *optional*, defaults to 512):
                The maximum length of the tokenized text prompt; longer prompts are truncated.

        Return:
            a dict with following keys:
                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
                - `input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
                - `attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
        """
        # proteins and messages_list should be provided
        if proteins is None or messages_list is None:
            raise ValueError("You need to specify `messages_list` and `proteins`.")

        protein_max_length = protein_max_length if protein_max_length is not None else self.protein_max_length
        text_max_length = text_max_length if text_max_length is not None else self.text_max_length

        # proteins should be List[dict]
        if isinstance(proteins, dict):
            proteins = [proteins]
        # messages_list should be List[List[dict]]
        if isinstance(messages_list, (list, tuple)) and not isinstance(messages_list[0], (list, tuple)):
            messages_list = [messages_list]
        # Check if batched proteins are in the correct format
        if isinstance(proteins, (list, tuple)) and not all(isinstance(p, dict) for p in proteins):
            raise ValueError("The proteins should be a list of dictionaries, but not all elements are dictionaries.")
        if isinstance(proteins, (list, tuple)) and not all(
            all(k in PROTEIN_VALID_KEYS for k in p.keys()) for p in proteins
        ):
            raise ValueError(
                "There should be a list of dictionaries with keys: "
                f"{', '.join(PROTEIN_VALID_KEYS)} for each protein. "
                f"But got: {proteins}"
            )
        # Check if batched messages_list is in the correct format
        if isinstance(messages_list, (list, tuple)):
            for messages in messages_list:
                if not isinstance(messages, (list, tuple)):
                    raise ValueError(f"Each element of messages_list should be a list instead of {type(messages)}.")
                if not all(isinstance(m, dict) for m in messages):
                    raise ValueError(
                        "Each message in messages_list should be a list of dictionaries, but not all elements are dictionaries."
                    )
                if any(len(m.keys()) != 2 for m in messages) or any(
                    set(m.keys()) != {"role", "content"} for m in messages
                ):
                    raise ValueError(
                        "Each message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'. "
                        f"But got: {messages}"
                    )
        else:
            raise ValueError(
                f"The messages_list should be a list of lists of dictionaries, but it's {type(messages_list)}."
            )
        sa_tokens = self.process_proteins(proteins, protein_max_length)

        text_tokens = self.process_text(messages_list, text_max_length)

        return BatchFeature(
            data={
                "protein_input_ids": sa_tokens["input_ids"],
                "protein_attention_mask": sa_tokens["attention_mask"],
                "input_ids": text_tokens["input_ids"],
                "attention_mask": text_tokens["attention_mask"],
            }
        )

    def batch_decode(self, *args, **kwargs):
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        return self.tokenizer.decode(*args, **kwargs)

    def protein_batch_decode(self, *args, **kwargs):
        return self.protein_tokenizer.batch_decode(*args, **kwargs)

    def protein_decode(self, *args, **kwargs):
        return self.protein_tokenizer.decode(*args, **kwargs)

    # overwrite to save the protein tokenizer in a separate folder
    # Adapted from instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221)
    def save_pretrained(self, save_directory, **kwargs):
        # only save the protein tokenizer in sub_dir
        self.protein_tokenizer.save_pretrained(os.path.join(save_directory, self.protein_tokenizer_dir_name))

        # we modify the attributes so that only the text tokenizer is saved in the main folder
        protein_tokenizer_present = "protein_tokenizer" in self.attributes
        # find the correct position of it in the attributes list
        protein_tokenizer_index = self.attributes.index("protein_tokenizer") if protein_tokenizer_present else None
        if protein_tokenizer_present and protein_tokenizer_index is not None:
            self.attributes.remove("protein_tokenizer")

        outputs = super().save_pretrained(save_directory, **kwargs)

        if protein_tokenizer_present and protein_tokenizer_index is not None:
            self.attributes.insert(protein_tokenizer_index, "protein_tokenizer")

        return outputs

    # overwrite to load the protein tokenizer from a separate folder
    # Adapted from instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)

        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
        if isinstance(processor, tuple):
            processor = processor[0]
        protein_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, subfolder=cls.protein_tokenizer_dir_name
        )

        processor.protein_tokenizer = protein_tokenizer

        return processor


__all__ = ["EvollaProcessor"]