team-10/env/Lib/site-packages/diffusers/utils/remote_utils.py

# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import json
from typing import List, Literal, Optional, Union, cast

import requests

from .deprecation_utils import deprecate
from .import_utils import is_safetensors_available, is_torch_available


if is_torch_available():
    import torch

    from ..image_processor import VaeImageProcessor
    from ..video_processor import VideoProcessor

    if is_safetensors_available():
        import safetensors.torch

    DTYPE_MAP = {
        "float16": torch.float16,
        "float32": torch.float32,
        "bfloat16": torch.bfloat16,
        "uint8": torch.uint8,
    }


from PIL import Image


def detect_image_type(data: bytes) -> str:
    if data.startswith(b"\xff\xd8"):
        return "jpeg"
    elif data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    elif data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
        return "gif"
    elif data.startswith(b"BM"):
        return "bmp"
    return "unknown"


def check_inputs_decode(
    endpoint: str,
    tensor: "torch.Tensor",
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    do_scaling: bool = True,
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    return_type: Literal["mp4", "pil", "pt"] = "pil",
    image_format: Literal["png", "jpg"] = "jpg",
    partial_postprocess: bool = False,
    input_tensor_type: Literal["binary"] = "binary",
    output_tensor_type: Literal["binary"] = "binary",
    height: Optional[int] = None,
    width: Optional[int] = None,
):
    if tensor.ndim == 3 and height is None and width is None:
        raise ValueError("`height` and `width` required for packed latents.")
    if (
        output_type == "pt"
        and return_type == "pil"
        and not partial_postprocess
        and not isinstance(processor, (VaeImageProcessor, VideoProcessor))
    ):
        raise ValueError("`processor` is required.")
    if do_scaling and scaling_factor is None:
        deprecate(
            "do_scaling",
            "1.0.0",
            "`do_scaling` is deprecated, pass `scaling_factor` and `shift_factor` if required.",
            standard_warn=False,
        )


def postprocess_decode(
    response: requests.Response,
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    return_type: Literal["mp4", "pil", "pt"] = "pil",
    partial_postprocess: bool = False,
):
    if output_type == "pt" or (output_type == "pil" and processor is not None):
        output_tensor = response.content
        parameters = response.headers
        shape = json.loads(parameters["shape"])
        dtype = parameters["dtype"]
        torch_dtype = DTYPE_MAP[dtype]
        output_tensor = torch.frombuffer(bytearray(output_tensor), dtype=torch_dtype).reshape(shape)
    if output_type == "pt":
        if partial_postprocess:
            if return_type == "pil":
                output = [Image.fromarray(image.numpy()) for image in output_tensor]
                if len(output) == 1:
                    output = output[0]
            elif return_type == "pt":
                output = output_tensor
        else:
            if processor is None or return_type == "pt":
                output = output_tensor
            else:
                if isinstance(processor, VideoProcessor):
                    output = cast(
                        List[Image.Image],
                        processor.postprocess_video(output_tensor, output_type="pil")[0],
                    )
                else:
                    output = cast(
                        Image.Image,
                        processor.postprocess(output_tensor, output_type="pil")[0],
                    )
    elif output_type == "pil" and return_type == "pil" and processor is None:
        output = Image.open(io.BytesIO(response.content)).convert("RGB")
        detected_format = detect_image_type(response.content)
        output.format = detected_format
    elif output_type == "pil" and processor is not None:
        if return_type == "pil":
            output = [
                Image.fromarray(image)
                for image in (output_tensor.permute(0, 2, 3, 1).float().numpy() * 255).round().astype("uint8")
            ]
        elif return_type == "pt":
            output = output_tensor
    elif output_type == "mp4" and return_type == "mp4":
        output = response.content
    return output


def prepare_decode(
    tensor: "torch.Tensor",
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    do_scaling: bool = True,
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    image_format: Literal["png", "jpg"] = "jpg",
    partial_postprocess: bool = False,
    height: Optional[int] = None,
    width: Optional[int] = None,
):
    headers = {}
    parameters = {
        "image_format": image_format,
        "output_type": output_type,
        "partial_postprocess": partial_postprocess,
        "shape": list(tensor.shape),
        "dtype": str(tensor.dtype).split(".")[-1],
    }
    if do_scaling and scaling_factor is not None:
        parameters["scaling_factor"] = scaling_factor
    if do_scaling and shift_factor is not None:
        parameters["shift_factor"] = shift_factor
    if do_scaling and scaling_factor is None:
        parameters["do_scaling"] = do_scaling
    elif do_scaling and scaling_factor is None and shift_factor is None:
        parameters["do_scaling"] = do_scaling
    if height is not None and width is not None:
        parameters["height"] = height
        parameters["width"] = width
    headers["Content-Type"] = "tensor/binary"
    headers["Accept"] = "tensor/binary"
    if output_type == "pil" and image_format == "jpg" and processor is None:
        headers["Accept"] = "image/jpeg"
    elif output_type == "pil" and image_format == "png" and processor is None:
        headers["Accept"] = "image/png"
    elif output_type == "mp4":
        headers["Accept"] = "text/plain"
    tensor_data = safetensors.torch._tobytes(tensor, "tensor")
    return {"data": tensor_data, "params": parameters, "headers": headers}


def remote_decode(
    endpoint: str,
    tensor: "torch.Tensor",
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    do_scaling: bool = True,
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    return_type: Literal["mp4", "pil", "pt"] = "pil",
    image_format: Literal["png", "jpg"] = "jpg",
    partial_postprocess: bool = False,
    input_tensor_type: Literal["binary"] = "binary",
    output_tensor_type: Literal["binary"] = "binary",
    height: Optional[int] = None,
    width: Optional[int] = None,
) -> Union[Image.Image, List[Image.Image], bytes, "torch.Tensor"]:
    """
    Hugging Face Hybrid Inference that allow running VAE decode remotely.

    Args:
        endpoint (`str`):
            Endpoint for Remote Decode.
        tensor (`torch.Tensor`):
            Tensor to be decoded.
        processor (`VaeImageProcessor` or `VideoProcessor`, *optional*):
            Used with `return_type="pt"`, and `return_type="pil"` for Video models.
        do_scaling (`bool`, default `True`, *optional*):
            **DEPRECATED**. **pass `scaling_factor`/`shift_factor` instead.** **still set
            do_scaling=None/do_scaling=False for no scaling until option is removed** When `True` scaling e.g. `latents
            / self.vae.config.scaling_factor` is applied remotely. If `False`, input must be passed with scaling
            applied.
        scaling_factor (`float`, *optional*):
            Scaling is applied when passed e.g. [`latents /
            self.vae.config.scaling_factor`](https://github.com/huggingface/diffusers/blob/7007febae5cff000d4df9059d9cf35133e8b2ca9/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L1083C37-L1083C77).
            - SD v1: 0.18215
            - SD XL: 0.13025
            - Flux: 0.3611
            If `None`, input must be passed with scaling applied.
        shift_factor (`float`, *optional*):
            Shift is applied when passed e.g. `latents + self.vae.config.shift_factor`.
            - Flux: 0.1159
            If `None`, input must be passed with scaling applied.
        output_type (`"mp4"` or `"pil"` or `"pt", default `"pil"):
            **Endpoint** output type. Subject to change. Report feedback on preferred type.

            `"mp4": Supported by video models. Endpoint returns `bytes` of video. `"pil"`: Supported by image and video
            models.
                Image models: Endpoint returns `bytes` of an image in `image_format`. Video models: Endpoint returns
                `torch.Tensor` with partial `postprocessing` applied.
                    Requires `processor` as a flag (any `None` value will work).
            `"pt"`: Support by image and video models. Endpoint returns `torch.Tensor`.
                With `partial_postprocess=True` the tensor is postprocessed `uint8` image tensor.

            Recommendations:
                `"pt"` with `partial_postprocess=True` is the smallest transfer for full quality. `"pt"` with
                `partial_postprocess=False` is the most compatible with third party code. `"pil"` with
                `image_format="jpg"` is the smallest transfer overall.

        return_type (`"mp4"` or `"pil"` or `"pt", default `"pil"):
            **Function** return type.

            `"mp4": Function returns `bytes` of video. `"pil"`: Function returns `PIL.Image.Image`.
                With `output_type="pil" no further processing is applied. With `output_type="pt" a `PIL.Image.Image` is
                created.
                    `partial_postprocess=False` `processor` is required. `partial_postprocess=True` `processor` is
                    **not** required.
            `"pt"`: Function returns `torch.Tensor`.
                `processor` is **not** required. `partial_postprocess=False` tensor is `float16` or `bfloat16`, without
                denormalization. `partial_postprocess=True` tensor is `uint8`, denormalized.

        image_format (`"png"` or `"jpg"`, default `jpg`):
            Used with `output_type="pil"`. Endpoint returns `jpg` or `png`.

        partial_postprocess (`bool`, default `False`):
            Used with `output_type="pt"`. `partial_postprocess=False` tensor is `float16` or `bfloat16`, without
            denormalization. `partial_postprocess=True` tensor is `uint8`, denormalized.

        input_tensor_type (`"binary"`, default `"binary"`):
            Tensor transfer type.

        output_tensor_type (`"binary"`, default `"binary"`):
            Tensor transfer type.

        height (`int`, **optional**):
            Required for `"packed"` latents.

        width (`int`, **optional**):
            Required for `"packed"` latents.

    Returns:
        output (`Image.Image` or `List[Image.Image]` or `bytes` or `torch.Tensor`).
    """
    if input_tensor_type == "base64":
        deprecate(
            "input_tensor_type='base64'",
            "1.0.0",
            "input_tensor_type='base64' is deprecated. Using `binary`.",
            standard_warn=False,
        )
        input_tensor_type = "binary"
    if output_tensor_type == "base64":
        deprecate(
            "output_tensor_type='base64'",
            "1.0.0",
            "output_tensor_type='base64' is deprecated. Using `binary`.",
            standard_warn=False,
        )
        output_tensor_type = "binary"
    check_inputs_decode(
        endpoint,
        tensor,
        processor,
        do_scaling,
        scaling_factor,
        shift_factor,
        output_type,
        return_type,
        image_format,
        partial_postprocess,
        input_tensor_type,
        output_tensor_type,
        height,
        width,
    )
    kwargs = prepare_decode(
        tensor=tensor,
        processor=processor,
        do_scaling=do_scaling,
        scaling_factor=scaling_factor,
        shift_factor=shift_factor,
        output_type=output_type,
        image_format=image_format,
        partial_postprocess=partial_postprocess,
        height=height,
        width=width,
    )
    response = requests.post(endpoint, **kwargs)
    if not response.ok:
        raise RuntimeError(response.json())
    output = postprocess_decode(
        response=response,
        processor=processor,
        output_type=output_type,
        return_type=return_type,
        partial_postprocess=partial_postprocess,
    )
    return output


def check_inputs_encode(
    endpoint: str,
    image: Union["torch.Tensor", Image.Image],
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
):
    pass


def postprocess_encode(
    response: requests.Response,
):
    output_tensor = response.content
    parameters = response.headers
    shape = json.loads(parameters["shape"])
    dtype = parameters["dtype"]
    torch_dtype = DTYPE_MAP[dtype]
    output_tensor = torch.frombuffer(bytearray(output_tensor), dtype=torch_dtype).reshape(shape)
    return output_tensor


def prepare_encode(
    image: Union["torch.Tensor", Image.Image],
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
):
    headers = {}
    parameters = {}
    if scaling_factor is not None:
        parameters["scaling_factor"] = scaling_factor
    if shift_factor is not None:
        parameters["shift_factor"] = shift_factor
    if isinstance(image, torch.Tensor):
        data = safetensors.torch._tobytes(image.contiguous(), "tensor")
        parameters["shape"] = list(image.shape)
        parameters["dtype"] = str(image.dtype).split(".")[-1]
    else:
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        data = buffer.getvalue()
    return {"data": data, "params": parameters, "headers": headers}


def remote_encode(
    endpoint: str,
    image: Union["torch.Tensor", Image.Image],
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
) -> "torch.Tensor":
    """
    Hugging Face Hybrid Inference that allow running VAE encode remotely.

    Args:
        endpoint (`str`):
            Endpoint for Remote Decode.
        image (`torch.Tensor` or `PIL.Image.Image`):
            Image to be encoded.
        scaling_factor (`float`, *optional*):
            Scaling is applied when passed e.g. [`latents * self.vae.config.scaling_factor`].
            - SD v1: 0.18215
            - SD XL: 0.13025
            - Flux: 0.3611
            If `None`, input must be passed with scaling applied.
        shift_factor (`float`, *optional*):
            Shift is applied when passed e.g. `latents - self.vae.config.shift_factor`.
            - Flux: 0.1159
            If `None`, input must be passed with scaling applied.

    Returns:
        output (`torch.Tensor`).
    """
    check_inputs_encode(
        endpoint,
        image,
        scaling_factor,
        shift_factor,
    )
    kwargs = prepare_encode(
        image=image,
        scaling_factor=scaling_factor,
        shift_factor=shift_factor,
    )
    response = requests.post(endpoint, **kwargs)
    if not response.ok:
        raise RuntimeError(response.json())
    output = postprocess_encode(
        response=response,
    )
    return output