# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processor class for Dia"""

import math
from pathlib import Path
from typing import Optional, Union

from ...audio_utils import AudioInput, make_list_of_audio
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...utils import is_soundfile_available, is_torch_available


if is_torch_available():
    import torch

if is_soundfile_available():
    import soundfile as sf


class DiaAudioKwargs(AudioKwargs, total=False):
    bos_token_id: int
    eos_token_id: int
    pad_token_id: int
    delay_pattern: list[int]
    generation: bool


class DiaProcessorKwargs(ProcessingKwargs, total=False):
    audio_kwargs: DiaAudioKwargs
    _defaults = {
        "text_kwargs": {
            "padding": True,
            "padding_side": "right",
            "add_special_tokens": False,
        },
        "audio_kwargs": {
            "eos_token_id": 1024,
            "pad_token_id": 1025,
            "bos_token_id": 1026,
            "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15],
            "generation": True,
            "sampling_rate": 44100,
        },
        "common_kwargs": {"return_tensors": "pt"},
    }


class DiaProcessor(ProcessorMixin):
    r"""
    Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into
    a single processor. It inherits the audio feature extraction, tokenization, and audio encode/decode
    functionalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for
    more information.

    Args:
        feature_extractor (`DiaFeatureExtractor`):
            An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`DiaTokenizer`):
            An instance of [`DiaTokenizer`]. The tokenizer is a required input.
        audio_tokenizer (`DacModel`):
            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is a required input.
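
    Example (a minimal usage sketch; the checkpoint name is illustrative and the `[S1]`/`[S2]` speaker tags
    follow the Dia text format):

    ```python
    >>> from transformers import DiaProcessor

    >>> processor = DiaProcessor.from_pretrained("nari-labs/Dia-1.6B-0626")
    >>> inputs = processor(text=["[S1] Dia is a text to speech model. [S2] Nice!"], return_tensors="pt")
    >>> # `inputs` holds `input_ids`/`attention_mask` from the tokenizer plus the prefilled
    >>> # `decoder_input_ids`/`decoder_attention_mask` audio codebook tensors
    ```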
    """

    feature_extractor_class = "DiaFeatureExtractor"
    tokenizer_class = "DiaTokenizer"
    audio_tokenizer_class = "DacModel"

    def __init__(self, feature_extractor, tokenizer, audio_tokenizer):
        super().__init__(feature_extractor, tokenizer, audio_tokenizer=audio_tokenizer)

    @property
    def model_input_names(self):
        """
        We no longer pass the raw audio values but the codebooks encoded by the `audio_tokenizer`.
        Conventions may differ between audio models due to architectural choices.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        audio_tokenizer_input_names = ["decoder_input_ids", "decoder_attention_mask"]
        return list(dict.fromkeys(tokenizer_input_names + audio_tokenizer_input_names))

    def __call__(
        self,
        text: Union[str, list[str]],
        audio: Optional[AudioInput] = None,
        output_labels: Optional[bool] = False,
        **kwargs: Unpack[DiaProcessorKwargs],
    ):
        """
        Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is
        forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the
        DacModel's [`~DacModel.encode`]. The `text` argument is forwarded to [`~DiaTokenizer.__call__`].
        Please refer to the docstrings of those methods for more information.
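
        Example (a sketch of the two call modes; the checkpoint name and `waveform` are placeholders):

        ```python
        >>> from transformers import DiaProcessor

        >>> processor = DiaProcessor.from_pretrained("nari-labs/Dia-1.6B-0626")

        >>> # text-only TTS generation: decoder inputs are prefilled with BOS plus the delay pattern
        >>> inputs = processor(text=["[S1] Hello!"], return_tensors="pt")

        >>> # voice cloning (audio prefix) or training; pass `generation=False` and `output_labels=True`
        >>> # to additionally build `labels` for training:
        >>> # inputs = processor(text=["[S1] Hello!"], audio=waveform, generation=False, output_labels=True)
        ```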
        """
        if not is_torch_available():
            raise ValueError(
                "The `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't "
                "find it in your environment. You can install torch via `pip install torch`."
            )

        if text is None:
            raise ValueError("You need to specify the `text` input to process.")

        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )

        text_kwargs = output_kwargs["text_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        return_tensors = common_kwargs.pop("return_tensors", None)
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        data = {}

        # Text
        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        encodings = self.tokenizer(text, **text_kwargs)
        data.update(encodings)

        # Audio
        delay_pattern = audio_kwargs.pop("delay_pattern", None)
        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
        audio_eos_token_id = audio_kwargs.pop("eos_token_id", None)
        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
        generation = audio_kwargs.pop("generation", True)
        if (
            audio_bos_token_id is None
            or audio_eos_token_id is None
            or audio_pad_token_id is None
            or delay_pattern is None
        ):
            raise ValueError(
                "To enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, "
                "`pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those."
            )

        if generation and output_labels:
            raise ValueError(
                f"Labels are incompatible with `generation`, got generation={generation}, output_labels={output_labels}."
            )

        batch_size = data["input_ids"].shape[0]
        num_channels = len(delay_pattern)
        max_delay = max(delay_pattern)

        # Voice cloning generation / general training
        if audio is not None:
            audio = make_list_of_audio(audio)
            input_audios = self.feature_extractor(audio, **audio_kwargs)

            compression_rate = math.prod(self.audio_tokenizer.config.downsampling_ratios)
            max_encoded_sequence_len = input_audios["padding_mask"][0].shape[-1] // compression_rate

            decoder_input_ids = []
            decoder_attention_mask = []
            # TODO: dac with batching is currently broken, but non-batch is working
            # refer to https://gist.github.com/vasqu/643a45b680cf39fd7467271ee2eb6f80 for a validation script
            for padding_mask, audio in zip(input_audios["padding_mask"], input_audios["input_values"]):
                # get current length with hop length in mind (as if it were sampled as a single audio)
                base_pad_len = self.feature_extractor.hop_length
                current_audio_len = math.ceil(padding_mask.sum(dim=-1) / base_pad_len) * base_pad_len

                encoded_sequence_len = current_audio_len // compression_rate
                padding_len = max_encoded_sequence_len - encoded_sequence_len

                # compute non-padded forward pass; one extra bos (and eos if training) is added
                with torch.no_grad():
                    audio = audio[None, ..., :current_audio_len].to(self.audio_tokenizer.device)
                    input_ids = self.audio_tokenizer.encode(audio).audio_codes.transpose(1, 2)

                if not generation:
                    input_ids = torch.nn.functional.pad(
                        input_ids, pad=(0, 0, 0, 1, 0, 0), mode="constant", value=audio_eos_token_id
                    )

                # apply padding
                # +1 for the bos within the real sequence
                input_ids = torch.nn.functional.pad(
                    input_ids, pad=(0, 0, padding_len + 1, 0, 0, 0), mode="constant", value=audio_bos_token_id
                )
                num_valid_inputs = encoded_sequence_len + 1 + max_delay  # sequence + bos + delay
                num_valid_inputs += 0 if generation else 1  # eos if training
                attention_mask = torch.tensor([0] * padding_len + [1] * num_valid_inputs, dtype=torch.long)[None, :]

                decoder_input_ids.append(input_ids)
                decoder_attention_mask.append(attention_mask)

            decoder_input_ids = torch.cat(decoder_input_ids, dim=0)
            decoder_attention_mask = torch.cat(decoder_attention_mask, dim=0)
        # TTS generation
        elif generation:
            # all bos to start with TTS
            decoder_input_ids = torch.full((batch_size, 1, num_channels), audio_bos_token_id, dtype=torch.long)

            # we preemptively add the delay
            decoder_attention_mask = torch.ones(size=(batch_size, 1 + max_delay), dtype=torch.long)
        else:
            raise ValueError("If you try to train, you should provide audio data as well.")

        if batch_size != decoder_input_ids.shape[0]:
            raise ValueError(
                f"Need the same amount of samples for both text and audio, but got text samples={batch_size} and "
                f"audio samples = {decoder_input_ids.shape[0]} instead."
            )

        # prepare shift indices per delay
        max_seq_len = decoder_attention_mask.shape[-1]
        max_audio_len = max_seq_len - max_delay
        precomputed_idx = self.build_indices(
            bsz=batch_size,
            seq_len=max_seq_len,
            num_channels=num_channels,
            delay_pattern=delay_pattern,
            revert=False,
        )

        # create delay pattern input
        # the pad token will be used for masking which input is valid for prediction during generation
        prefill = torch.full(
            (batch_size, max_seq_len, num_channels),
            fill_value=audio_pad_token_id,
            dtype=torch.int,
        )
        prefill[:, :max_audio_len] = decoder_input_ids

        delayed_decoder_input_ids = self.apply_audio_delay(
            audio=prefill,
            pad_token_id=audio_pad_token_id,
            bos_token_id=audio_bos_token_id,
            precomputed_idx=precomputed_idx,
        )

        data.update({"decoder_input_ids": delayed_decoder_input_ids, "decoder_attention_mask": decoder_attention_mask})

        if output_labels:
            # Base idea is to shift on the sequence dim
            labels = data["decoder_input_ids"].clone()[:, 1:]
            labels[labels == audio_pad_token_id] = -100
            labels[labels == audio_bos_token_id] = -100

            data["labels"] = labels.transpose(1, 2).reshape(batch_size * num_channels, -1).contiguous().long()
            data["decoder_input_ids"] = data["decoder_input_ids"][:, :-1]
            data["decoder_attention_mask"] = data["decoder_attention_mask"][:, :-1]

        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(
        self,
        decoder_input_ids: "torch.Tensor",
        audio_prompt_len: Optional[int] = None,
        **kwargs: Unpack[DiaProcessorKwargs],
    ) -> list["torch.Tensor"]:
        """
        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
        `audio_tokenizer`. See [`~DacModel.decode`] for more information.

        Args:
            decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
            audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
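
        Example (an illustrative sketch; assumes `outputs` are decoder sequences produced by a Dia model's
        `generate` call with the same processor defaults):

        ```python
        >>> audios = processor.batch_decode(outputs)
        >>> processor.save_audio(audios, ["sample_0.wav", "sample_1.wav"])
        ```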
        """
        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )
        audio_kwargs = output_kwargs["audio_kwargs"]

        delay_pattern = audio_kwargs.pop("delay_pattern", None)
        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
        if audio_bos_token_id is None or audio_pad_token_id is None or delay_pattern is None:
            raise ValueError(
                "To enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, "
                "and `delay_pattern`. You may have accidentally overwritten one of those."
            )

        # either decode the whole audio sequence or only the generated parts
        if audio_prompt_len is not None:
            audio_prompt_len = torch.tensor(audio_prompt_len, device=decoder_input_ids.device, dtype=torch.long)
            start_of_generation_idx = audio_prompt_len[None].expand(decoder_input_ids.shape[0])
        else:
            start_of_generation_idx = (decoder_input_ids[:, :, 0] == audio_bos_token_id).sum(dim=-1)
        # -1 for the eos token
        end_of_generation_idx = (
            decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == audio_pad_token_id).sum(dim=-1) - 1
        )

        # revert delay
        bsz, seq_len, num_channels = decoder_input_ids.shape
        precomputed_idx = self.build_indices(
            bsz=bsz,
            seq_len=seq_len,
            num_channels=num_channels,
            delay_pattern=delay_pattern,
            revert=True,
        )

        output_sequences = self.apply_audio_delay(
            audio=decoder_input_ids,
            # We do not care about these values as we cut them out
            # with `start_of_generation_idx` and `end_of_generation_idx`
            pad_token_id=-1,
            bos_token_id=-1,
            precomputed_idx=precomputed_idx,
        ).transpose(1, 2)

        # retrieve the correct sequences each
        audios = []
        # TODO: see above, dac doesn't work in batches yet
        with torch.no_grad():
            for i in range(start_of_generation_idx.shape[0]):
                output_i = output_sequences[i, :, start_of_generation_idx[i] : end_of_generation_idx[i]][None, ...]
                output_i = output_i.to(self.audio_tokenizer.device)
                audio_i = self.audio_tokenizer.decode(audio_codes=output_i).audio_values.cpu().squeeze()
                audios.append(audio_i)

        return audios

    def decode(
        self,
        decoder_input_ids: "torch.Tensor",
        audio_prompt_len: Optional[int] = None,
        **kwargs: Unpack[DiaProcessorKwargs],
    ) -> "torch.Tensor":
        """
        Decodes a single sequence of audio codebooks into the respective audio waveform via the
        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
        """
        if decoder_input_ids.shape[0] != 1:
            raise ValueError(
                f"Expecting a single output to be decoded but received {decoder_input_ids.shape[0]} samples instead."
            )

        return self.batch_decode(decoder_input_ids, audio_prompt_len, **kwargs)[0]

    def get_audio_prompt_len(
        self,
        decoder_attention_mask: "torch.Tensor",
        **kwargs: Unpack[DiaProcessorKwargs],
    ) -> int:
        """Utility function to get the audio prompt length."""
        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )
        audio_kwargs = output_kwargs["audio_kwargs"]

        delay_pattern = audio_kwargs.pop("delay_pattern", None)
        if delay_pattern is None:
            raise ValueError(
                "To enable the utility of retrieving the prompt length for Dia, we need the "
                "`delay_pattern`. You may have accidentally overwritten this."
            )
        return decoder_attention_mask.shape[1] - max(delay_pattern)

    # Copied from transformers.models.csm.processing_csm.CsmProcessor.save_audio with Csm->Dia
    def save_audio(
        self,
        audio: AudioInput,
        saving_path: Union[str, Path, list[Union[str, Path]]],
        **kwargs: Unpack[DiaProcessorKwargs],
    ):
        # TODO: @eustlb, this should be in AudioProcessor
        if not is_soundfile_available():
            raise ImportError("Please install `soundfile` to save audio files.")

        # ensure correct audio input
        audio = make_list_of_audio(audio)

        # ensure correct saving path
        if isinstance(saving_path, (str, Path)):
            saving_path = [saving_path]
        elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)):
            raise ValueError("Invalid input path. Please provide a string, or a list of strings")

        if len(audio) != len(saving_path):
            raise ValueError("The number of audio and saving paths must be the same")

        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )
        audio_kwargs = output_kwargs["audio_kwargs"]
        sampling_rate = audio_kwargs["sampling_rate"]

        for audio_value, p in zip(audio, saving_path):
            if isinstance(audio_value, torch.Tensor):
                audio_value = audio_value.cpu().float().numpy()
            sf.write(p, audio_value, sampling_rate)

    @staticmethod
    def build_indices(
        bsz: int,
        seq_len: int,
        num_channels: int,
        delay_pattern: list[int],
        revert: bool = False,
    ) -> tuple["torch.Tensor", "torch.Tensor"]:
        """
        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
        """
        delay_array = torch.tensor(delay_pattern, dtype=torch.int32)

        # (0..seq_len-1)
        sequence_idx = torch.arange(seq_len, dtype=torch.int32)[None, :].expand(bsz, seq_len)[..., None]
        # + or - delay depending on whether we apply or revert the delay
        if not revert:
            sequence_idx = sequence_idx - delay_array[None, None, :]
        else:
            sequence_idx = sequence_idx + delay_array[None, None, :]
        # if delay goes over the range we clamp back to valid values
        valid_sequence_idx = torch.clamp(sequence_idx, 0, seq_len - 1)

        batch_idx = torch.arange(bsz, dtype=torch.int32)[:, None, None].expand(bsz, seq_len, num_channels)
        channel_idx = torch.arange(num_channels, dtype=torch.int32)[None, None, :].expand(bsz, seq_len, num_channels)

        all_idx = torch.stack(
            [batch_idx.reshape(-1), valid_sequence_idx.reshape(-1), channel_idx.reshape(-1)],
            dim=1,
        ).long()

        return sequence_idx, all_idx

    @staticmethod
    def apply_audio_delay(
        audio: "torch.Tensor",
        pad_token_id: int,
        bos_token_id: int,
        precomputed_idx: tuple["torch.Tensor", "torch.Tensor"],
    ) -> "torch.Tensor":
        """
        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.

        Args:
            audio: audio tokens of shape [bsz, seq_len, num_channels]
            pad_token_id: the PAD token
            bos_token_id: the BOS token
            precomputed_idx: from `build_indices`

        Returns:
            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
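
        Example (a small worked case, assuming the illustrative `delay_pattern=[0, 1]`: channel 1 is shifted
        one step later and backfilled with BOS):

        ```python
        >>> codes = torch.tensor([[[10, 20], [11, 21], [12, 22], [13, 23]]])  # [bsz=1, seq_len=4, channels=2]
        >>> idx = DiaProcessor.build_indices(bsz=1, seq_len=4, num_channels=2, delay_pattern=[0, 1])
        >>> delayed = DiaProcessor.apply_audio_delay(codes, pad_token_id=1025, bos_token_id=1026, precomputed_idx=idx)
        >>> # delayed[0] == [[10, 1026], [11, 20], [12, 21], [13, 22]]
        ```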
        """
        # Move everything to the same device
        device = audio.device
        sequence_idx, all_idx = precomputed_idx
        sequence_idx = sequence_idx.to(device)
        all_idx = all_idx.to(device)

        # Gather per precomputed indices
        batch_idx, valid_sequence_idx, channel_idx = torch.unbind(all_idx, dim=-1)
        gathered_audio = audio[batch_idx, valid_sequence_idx, channel_idx].view(audio.size())

        # Mask according to negative sequence_idx => BOS; sequence_idx >= seq_len => PAD
        mask_bos = sequence_idx < 0
        mask_pad = sequence_idx >= audio.shape[1]
        final_audio = torch.where(mask_bos, bos_token_id, torch.where(mask_pad, pad_token_id, gathered_audio))

        return final_audio


__all__ = ["DiaProcessor"]