Adding all project files

2025-08-02 02:00:33 +02:00 · 2025-08-02 02:00:33 +02:00 · cd4316ad0f
commit cd4316ad0f
parent 6c9e127bdc
42289 changed files with 8009643 additions and 0 deletions
--- a/venv/Lib/site-packages/transformers/models/evolla/init.py
+++ b/venv/Lib/site-packages/transformers/models/evolla/init.py
@ -0,0 +1,28 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# `TYPE_CHECKING` is False at runtime, so the star-imports below are only seen by static type checkers;
+# a normal import of this package executes the `else` branch instead.
+if TYPE_CHECKING:
+    from .configuration_evolla import *
+    from .modeling_evolla import *
+    from .processing_evolla import *
+else:
+    import sys
+
+    # Replace this package's entry in `sys.modules` with a `_LazyModule` built from the import structure that
+    # `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers importing the submodules until an attribute is first accessed --
+    # confirm against `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
--- a/venv/Lib/site-packages/transformers/models/evolla/pycache/init.cpython-39.pyc
+++ b/venv/Lib/site-packages/transformers/models/evolla/pycache/init.cpython-39.pyc
--- a/venv/Lib/site-packages/transformers/models/evolla/pycache/configuration_evolla.cpython-39.pyc
+++ b/venv/Lib/site-packages/transformers/models/evolla/pycache/configuration_evolla.cpython-39.pyc
--- a/venv/Lib/site-packages/transformers/models/evolla/pycache/modeling_evolla.cpython-39.pyc
+++ b/venv/Lib/site-packages/transformers/models/evolla/pycache/modeling_evolla.cpython-39.pyc
--- a/venv/Lib/site-packages/transformers/models/evolla/pycache/modular_evolla.cpython-39.pyc
+++ b/venv/Lib/site-packages/transformers/models/evolla/pycache/modular_evolla.cpython-39.pyc
--- a/venv/Lib/site-packages/transformers/models/evolla/pycache/processing_evolla.cpython-39.pyc
+++ b/venv/Lib/site-packages/transformers/models/evolla/pycache/processing_evolla.cpython-39.pyc
--- a/venv/Lib/site-packages/transformers/models/evolla/configuration_evolla.py
+++ b/venv/Lib/site-packages/transformers/models/evolla/configuration_evolla.py
@ -0,0 +1,279 @@
+# coding=utf-8
+# Copyright 2025 Westlake Representational Learning Lab (Fajie Yuan Lab) team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evolla model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+# Module-level logger, named after this module through `__name__`.
+logger = logging.get_logger(__name__)
+
+
+class SaProtConfig(PretrainedConfig):
+    r"""This is the configuration class to store the configuration of a [`EvollaSaProtProteinEncoder`]. It is used to instantiate a
+    SaProt model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 446):
+            Vocabulary size of the protein sequence model. Defines the number of different tokens that can be represented
+            by the `inputs_ids` passed when calling [`EvollaModel`].
+        mask_token_id (`int`, *optional*, defaults to 4):
+            The id of the *mask* token in the protein sequence model.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            The id of the *padding* token in the protein sequence model.
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Dimensionality of the protein sequence model layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 33):
+            Number of hidden layers in the protein sequence model.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the protein sequence model.
+        intermediate_size (`int`, *optional*, defaults to 5120):
+            Dimensionality of the intermediate layers in the protein sequence model.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the hidden layers in the protein sequence model.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities in the protein sequence model.
+        max_position_embeddings (`int`, *optional*, defaults to 1026):
+            The maximum sequence length that the protein sequence model might ever be used with. Typically set this to
+            something large just in case (e.g., 512 or 1024 or 2048).
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value for the layer normalization layer in the protein sequence model.
+        position_embedding_type (`str`, *optional*, defaults to `"rotary"`):
+            The type of position embedding to use in the protein sequence model. Currently only `"rotary"` is supported.
+        emb_layer_norm_before (`bool`, *optional*, defaults to `False`):
+            Whether to apply layer normalization before the position embedding in the protein sequence model.
+        token_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to apply dropout to the tokens in the protein sequence model."""
+
+    def __init__(
+        self,
+        vocab_size=446,
+        mask_token_id=4,
+        pad_token_id=1,
+        hidden_size=1280,
+        num_hidden_layers=33,
+        num_attention_heads=20,
+        intermediate_size=5120,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=1026,
+        initializer_range=0.02,
+        layer_norm_eps=1e-05,
+        position_embedding_type="rotary",
+        use_cache=True,
+        emb_layer_norm_before=False,
+        token_dropout=True,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.emb_layer_norm_before = emb_layer_norm_before
+        self.token_dropout = token_dropout
+
+
+class EvollaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`EvollaModel`]. It is used to instantiate an
+    Evolla model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Evolla-10B.
+
+    e.g. [westlake-repl/Evolla-10B-hf](https://huggingface.co/westlake-repl/Evolla-10B-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        protein_encoder_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`SaProtConfig`].
+        vocab_size (`int`, *optional*, defaults to 128256):
+            Vocabulary size of the Evolla llama model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`EvollaModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the llama layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimensionality of the intermediate layers in the llama model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the llama model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the llama model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            Number of key-value pairs for each attention layer in the llama model.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the llama model. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"silu"` are supported.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value for the RMS-norm layer in the llama model.
+        rope_theta (`float`, *optional*, defaults to 500000.0):
+            The threshold value for the RoPE layer in the llama model.
+        rope_scaling (`float`, *optional*):
+            The scaling factor for the RoPE layer in the llama model.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the attention layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention layer.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the MLP layer.
+        aligner_ffn_mult (`int`, *optional*, defaults to 4):
+            The FFN multiplier for the aligner layer.
+        aligner_enable_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the aligner layer.
+        aligner_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities in the aligner layer.
+        aligner_num_add_layers (`int`, *optional*, defaults to 8):
+            The number of additional layers for the aligner layer.
+        resampler_depth (`int`, *optional*, defaults to 6):
+            The depth of the resampler layer in the llama model.
+        resampler_dim_head (`int`, *optional*, defaults to 64):
+            The dimension of the heads in the resampler layer in the llama model.
+        resampler_heads (`int`, *optional*, defaults to 8):
+            The number of heads in the resampler layer in the llama model.
+        resampler_num_latents (`int`, *optional*, defaults to 64):
+            The number of latents in the resampler layer in the llama model.
+        resampler_ff_mult (`int`, *optional*, defaults to 4):
+            The FFN multiplier for the resampler layer.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        bos_token_id (`int`, *optional*, defaults to 128000):
+            The id of the *beginning-of-sequence* token.
+        eos_token_id (`int`, *optional*, defaults to 128009):
+            The id of the *end-of-sequence* token.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to tie the input and output word embeddings.
+
+    Example:
+
+    ```python
+    >>> from transformers import EvollaModel, EvollaConfig
+
+    >>> # Initializing a Evolla evolla-10b style configuration
+    >>> configuration = EvollaConfig()
+
+    >>> # Initializing a model from the evolla-10b style configuration
+    >>> model = EvollaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "EvollaModel"
+    sub_configs = {"protein_encoder_config": SaProtConfig}
+
+    def __init__(
+        self,
+        protein_encoder_config=None,
+        vocab_size=128256,  # llama vocab size
+        hidden_size=4096,  # llama hidden size
+        intermediate_size=14336,  # llama intermediate size
+        num_hidden_layers=32,  # llama num layers
+        num_attention_heads=32,  # llama num heads
+        num_key_value_heads=8,  # llama num key-value heads
+        hidden_act="silu",  # llama activation function
+        max_position_embeddings=8192,  # llama rope max length
+        rms_norm_eps=1e-05,
+        rope_theta=500000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        aligner_ffn_mult=4,
+        aligner_enable_bias=True,
+        aligner_attention_probs_dropout_prob=0.1,
+        aligner_num_add_layers=8,
+        resampler_depth=6,
+        resampler_dim_head=64,
+        resampler_heads=8,
+        resampler_num_latents=64,
+        resampler_ff_mult=4,
+        initializer_range=0.02,
+        pad_token_id=None,
+        bos_token_id=128000,
+        eos_token_id=128009,
+        use_cache=False,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.rms_norm_eps = rms_norm_eps
+        self.tie_word_embeddings = tie_word_embeddings
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.aligner_ffn_mult = aligner_ffn_mult
+        self.aligner_enable_bias = aligner_enable_bias
+        self.aligner_attention_probs_dropout_prob = aligner_attention_probs_dropout_prob
+        self.aligner_num_add_layers = aligner_num_add_layers
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+
+        self.resampler_depth = resampler_depth
+        self.resampler_dim_head = resampler_dim_head
+        self.resampler_heads = resampler_heads
+        self.resampler_num_latents = resampler_num_latents
+        self.resampler_ff_mult = resampler_ff_mult
+
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        # Subconfig
+        if protein_encoder_config is None:
+            protein_encoder_config = {}
+            logger.info("`protein_encoder_config` is `None`. Initializing the `SaProtConfig` with default values.")
+        self.protein_encoder_config = SaProtConfig(**protein_encoder_config)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+# Names exported by a star-import of this module; `SaProtConfig` is deliberately not listed here.
+__all__ = ["EvollaConfig"]
--- a/venv/Lib/site-packages/transformers/models/evolla/modeling_evolla.py
+++ b/venv/Lib/site-packages/transformers/models/evolla/modeling_evolla.py
--- a/venv/Lib/site-packages/transformers/models/evolla/modular_evolla.py
+++ b/venv/Lib/site-packages/transformers/models/evolla/modular_evolla.py
--- a/venv/Lib/site-packages/transformers/models/evolla/processing_evolla.py
+++ b/venv/Lib/site-packages/transformers/models/evolla/processing_evolla.py
@ -0,0 +1,247 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for EVOLLA.
+"""
+
+import os
+from typing import Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import (
+    ProcessorMixin,
+)
+from ..auto import AutoTokenizer
+
+
+# Keys allowed in each protein dictionary passed to `EvollaProcessor.__call__`. Only "aa_seq" and "foldseek" are
+# read by `EvollaProcessor.process_proteins`; "msa" is accepted by the validation but not used in this file.
+PROTEIN_VALID_KEYS = ["aa_seq", "foldseek", "msa"]
+
+
+class EvollaProcessor(ProcessorMixin):
+    r"""
+    Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor.
+
+    [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
+    docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.
+
+    Args:
+        protein_tokenizer (`EsmTokenizer`):
+            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
+        tokenizer (`LlamaTokenizerFast`, *optional*):
+            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
+        protein_max_length (`int`, *optional*, defaults to 1024):
+            The maximum length of the sequence to be generated.
+        text_max_length (`int`, *optional*, defaults to 512):
+            The maximum length of the text to be generated.
+    """
+
+    attributes = ["protein_tokenizer", "tokenizer"]
+    valid_kwargs = ["sequence_max_length"]
+    # protein_tokenizer_class = "EsmTokenizer"
+    # tokenizer_class = "LlamaTokenizerFast"
+    protein_tokenizer_class = "AutoTokenizer"
+    tokenizer_class = "AutoTokenizer"
+    protein_tokenizer_dir_name = "protein_tokenizer"
+    # tokenizer_dir_name = "text_tokenizer"
+
+    def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs):
+        if protein_tokenizer is None:
+            raise ValueError("You need to specify an `protein_tokenizer`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(protein_tokenizer, tokenizer)
+
+        self.tokenizer.pad_token = "<|reserved_special_token_0|>"
+        self.protein_max_length = protein_max_length
+        self.text_max_length = text_max_length
+
+    def process_proteins(self, proteins, protein_max_length=1024):
+        sa_sequences = []
+        for protein in proteins:
+            aa_seq = protein.get("aa_seq")
+            foldseek = protein.get("foldseek")
+            sa_sequence = "".join([s.upper() + f.lower() for s, f in zip(aa_seq, foldseek)])
+            sa_sequences.append(sa_sequence)
+
+        sa_tokens = self.protein_tokenizer.batch_encode_plus(
+            sa_sequences, return_tensors="pt", truncation=True, max_length=protein_max_length, padding=True
+        )
+        return sa_tokens
+
+    def process_text(
+        self,
+        texts,
+        text_max_length: int = 512,
+    ):
+        prompts = []
+        for messages in texts:
+            prompt = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            prompts.append(prompt)
+
+        prompt_inputs = self.tokenizer(
+            prompts,
+            add_special_tokens=False,
+            return_tensors="pt",
+            padding="longest",
+            truncation=True,
+            max_length=text_max_length,
+        )
+        return prompt_inputs
+
+    def __call__(
+        self,
+        proteins: Optional[Union[list[dict], dict]] = None,
+        messages_list: Optional[Union[list[list[dict]], list[dict]]] = None,
+        protein_max_length: Optional[int] = None,
+        text_max_length: Optional[int] = None,
+        **kwargs,
+    ):
+        r"""This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by
+        the model.
+
+        Args:
+            proteins (`Union[List[dict], dict]`):
+                A list of dictionaries or a single dictionary containing the following keys:
+                    - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
+                    - `"foldseek"` (`str`) -- The foldseek string of the protein.
+            messages_list (`Union[List[List[dict]], List[dict]]`):
+                A list of lists of dictionaries or a list of dictionaries containing the following keys:
+                    - `"role"` (`str`) -- The role of the message.
+                    - `"content"` (`str`) -- The content of the message.
+            protein_max_length (`int`, *optional*, defaults to 1024):
+                The maximum length of the sequence to be generated.
+            text_max_length (`int`, *optional*, defaults to 512):
+                The maximum length of the text.
+
+        Return:
+            a dict with following keys:
+                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
+                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
+                - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
+                - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
+        """
+        # proteins and messages_list should be provided
+        if proteins is None or messages_list is None:
+            raise ValueError("You need to specify `messages_list` and `proteins`.")
+
+        protein_max_length = protein_max_length if protein_max_length is not None else self.protein_max_length
+        text_max_length = text_max_length if text_max_length is not None else self.text_max_length
+
+        # proteins should be List[dict]
+        if isinstance(proteins, dict):
+            proteins = [proteins]
+        # messages_list should be List[List[dict]]
+        if isinstance(messages_list, (list, tuple)) and not isinstance(messages_list[0], (list, tuple)):
+            messages_list = [messages_list]
+        # Check if batched proteins are in the correct format
+        if isinstance(proteins, (list, tuple)) and not all(isinstance(p, dict) for p in proteins):
+            raise ValueError("The proteins should be a list of dictionaries, but not all elements are dictionaries.")
+        if isinstance(proteins, (list, tuple)) and not all(
+            all(k in PROTEIN_VALID_KEYS for k in p.keys()) for p in proteins
+        ):
+            raise ValueError(
+                "There should be a list of dictionaries with keys: "
+                f"{', '.join(PROTEIN_VALID_KEYS)} for each protein."
+                f"But got: {proteins}"
+            )
+        # Check if batched messages_list is in the correct format
+        if isinstance(messages_list, (list, tuple)):
+            for messages in messages_list:
+                if not isinstance(messages, (list, tuple)):
+                    raise ValueError(f"Each messages in messages_list should be a list instead of {type(messages)}.")
+                if not all(isinstance(m, dict) for m in messages):
+                    raise ValueError(
+                        "Each message in messages_list should be a list of dictionaries, but not all elements are dictionaries."
+                    )
+                if any(len(m.keys()) != 2 for m in messages) or any(
+                    set(m.keys()) != {"role", "content"} for m in messages
+                ):
+                    raise ValueError(
+                        "Each message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'."
+                        f"But got: {messages}"
+                    )
+        else:
+            raise ValueError(
+                f"The messages_list should be a list of lists of dictionaries, but it's {type(messages_list)}."
+            )
+        sa_tokens = self.process_proteins(proteins, protein_max_length)
+
+        text_tokens = self.process_text(messages_list, text_max_length)
+
+        return BatchFeature(
+            data={
+                "protein_input_ids": sa_tokens["input_ids"],
+                "protein_attention_mask": sa_tokens["attention_mask"],
+                "input_ids": text_tokens["input_ids"],
+                "attention_mask": text_tokens["attention_mask"],
+            }
+        )
+
+    def batch_decode(self, *args, **kwargs):
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def protein_batch_decode(self, *args, **kwargs):
+        return self.protein_tokenizer.batch_decode(*args, **kwargs)
+
+    def protein_decode(self, *args, **kwargs):
+        return self.protein_tokenizer.decode(*args, **kwargs)
+
+    # overwrite to save the protein tokenizer in a separate folder
+    # Adapted from instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221)
+    def save_pretrained(self, save_directory, **kwargs):
+        # only save the protein tokenizer in sub_dir
+        self.protein_tokenizer.save_pretrained(os.path.join(save_directory, self.protein_tokenizer_dir_name))
+
+        # we modify the attributes so that only the text tokenizer are saved in the main folder
+        protein_tokenizer_present = "protein_tokenizer" in self.attributes
+        # find the correct position of it in the attributes list
+        protein_tokenizer_index = self.attributes.index("protein_tokenizer") if protein_tokenizer_present else None
+        if protein_tokenizer_present and protein_tokenizer_index is not None:
+            self.attributes.remove("protein_tokenizer")
+
+        outputs = super().save_pretrained(save_directory, **kwargs)
+
+        if protein_tokenizer_present and protein_tokenizer_index is not None:
+            self.attributes.insert(protein_tokenizer_index, "protein_tokenizer")
+
+        return outputs
+
+    # overwirte to load the protein tokenizer from a separate folder
+    # Adapted from instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
+        if isinstance(processor, tuple):
+            processor = processor[0]
+        protein_tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, subfolder=cls.protein_tokenizer_dir_name
+        )
+
+        processor.protein_tokenizer = protein_tokenizer
+
+        return processor
+
+
+# Names exported by a star-import of this module.
+__all__ = ["EvollaProcessor"]