# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import TYPE_CHECKING, Any, Optional from ..utils.logging import tqdm from .base import HfQuantizer from .quantizers_utils import get_module_from_name if TYPE_CHECKING: from ..modeling_utils import PreTrainedModel from ..utils import is_accelerate_available, is_flute_available, is_hadamard_available, is_torch_available, logging from ..utils.quantization_config import QuantizationConfigMixin if is_torch_available(): import torch logger = logging.get_logger(__name__) class HiggsHfQuantizer(HfQuantizer): """ Quantizer of the HIGGS method. Enables the loading of prequantized models and in-flight quantization of full-precision models. """ requires_calibration = False requires_parameters_quantization = True required_packages = ["flute-kernel", "fast_hadamard_transform"] def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): super().__init__(quantization_config, **kwargs) self.quantization_config = quantization_config def validate_environment(self, device_map, **kwargs): if not torch.cuda.is_available(): raise NotImplementedError("HIGGS quantization is only supported on GPU. Please use a different quantizer.") if not is_accelerate_available(): raise ImportError("Using `higgs` quantization requires Accelerate: `pip install accelerate`") if not is_flute_available(): raise ImportError("Using `higgs` quantization requires FLUTE: `pip install flute-kernel>=0.3.0`") if not is_hadamard_available(): raise ImportError( "Using `higgs` quantization requires fast_hadamard_transform: `pip install fast_hadamard_transform`" ) if device_map is None: raise ValueError( "You are attempting to load a HIGGS model without setting device_map." " Please set device_map comprised of 'cuda' devices." ) elif isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()): raise ValueError( "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device." " This is not supported. Please remove the CPU or disk device from the device_map." ) def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: logger.info("`torch_dtype` is None. Setting `torch_dtype=torch.float16` for FLUTE compatibility.") torch_dtype = torch.float16 elif torch_dtype != torch.float16 and torch_dtype != torch.bfloat16: raise ValueError( f"Invalid `torch_dtype` {torch_dtype}. HIGGS quantization only supports `torch_dtype=torch.float16` or `torch_dtype=torch.bfloat16`." ) return torch_dtype def create_quantized_param( self, model: "PreTrainedModel", param_value: "torch.Tensor", param_name: str, target_device: "torch.device", state_dict: dict[str, Any], unexpected_keys: Optional[list[str]] = None, ): from ..integrations import quantize_with_higgs """ Quantizes weights into weight and weight_scale """ flute_dict = quantize_with_higgs( param_value.to(target_device), self.quantization_config.bits, self.quantization_config.p, self.quantization_config.group_size, self.quantization_config.hadamard_size, ) del param_value module, _ = get_module_from_name(model, param_name) module_name = ".".join(param_name.split(".")[:-1]) for key, value in flute_dict.items(): if key in module._parameters: module._parameters[key] = torch.nn.Parameter(value, requires_grad=False) elif key in module._buffers: module._buffers[key] = torch.nn.Buffer(value) elif key == "tune_metadata": module.tune_metadata = value self.quantization_config.tune_metadata[module_name] = value.to_dict() else: raise ValueError(f"Unexpected key {key} in module {module}") if unexpected_keys is not None and param_name in unexpected_keys: unexpected_keys.remove(param_name) def _process_model_before_weight_loading( self, model: "PreTrainedModel", **kwargs, ): from ..integrations import replace_with_higgs_linear replace_with_higgs_linear( model, quantization_config=self.quantization_config, ) model.config.quantization_config = self.quantization_config def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): from flute.tune import TuneMetaData, maybe_tune_and_repack from flute.utils import make_workspace_streamk from ..integrations import HiggsLinear flute_workspaces = {} flute_modules = {name: module for name, module in model.named_modules() if isinstance(module, HiggsLinear)} for name, module in tqdm(flute_modules.items(), desc="Repacking HIGGS modules", leave=False): # Every HiggsLinear needs a "workspace": a buffer for the unpacking operation. # This buffer needs to be on the same device as the weights, but can be reused across modules otherwise. if module.weight.device not in flute_workspaces: flute_workspaces[module.weight.device] = make_workspace_streamk(device=module.weight.device) module.workspace = flute_workspaces[module.weight.device] # FLUTE weights are packed in a way that is optimized for a specific number of SMs (GPU streaming multiprocessors). # If the model is loaded on a different device than the one it was saved on, we need to repack the weights. module.tune_metadata = TuneMetaData.from_dict(self.quantization_config.tune_metadata[name]) module.weight.data, module.tune_metadata = maybe_tune_and_repack( weight=module.weight.data, scales=module.scales.data, metadata=module.tune_metadata, ) self.quantization_config.tune_metadata[name] = module.tune_metadata.to_dict() def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]: from ..integrations import HiggsLinear higgs_names = {name for name, module in model.named_modules() if isinstance(module, HiggsLinear)} def should_update(key: str) -> bool: if key.endswith(".weight") or key.endswith(".bias"): return False full_key = f"{prefix}.{key}" return any(name in key or name in full_key for name in higgs_names) return [key for key in missing_keys if not should_update(key)] @property def is_trainable(self) -> bool: return False def is_serializable(self, safe_serialization=None): return True def check_quantized_param( self, model: "PreTrainedModel", param_value: "torch.Tensor", param_name: str, state_dict: dict[str, Any], **kwargs, ) -> bool: from ..integrations import HiggsLinear module, tensor_name = get_module_from_name(model, param_name) if isinstance(module, HiggsLinear) and tensor_name == "weight" and param_value.dtype != torch.int16: # Only quantize weights of HiggsLinear modules that are not already quantized return True else: return False def _dequantize(self, model): from ..integrations import dequantize_higgs model = dequantize_higgs(model) return model