# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ..utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_torch_greater_or_equal


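# Map each integration submodule to the public names it exposes. _LazyModule (see the bottom of
# this file) uses this table to resolve attribute access lazily, so optional backends such as
# bitsandbytes or deepspeed are only imported when one of their names is actually requested.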
_import_structure = {
    "aqlm": ["replace_with_aqlm_linear"],
    "awq": [
        "fuse_awq_modules",
        "post_init_awq_exllama_modules",
        "post_init_awq_ipex_modules",
        "replace_quantization_scales",
        "replace_with_awq_linear",
    ],
    "bitnet": [
        "BitLinear",
        "pack_weights",
        "replace_with_bitnet_linear",
        "unpack_weights",
    ],
    "bitsandbytes": [
        "dequantize_and_replace",
        "get_keys_to_not_convert",
        "replace_8bit_linear",
        "replace_with_bnb_linear",
        "set_module_8bit_tensor_to_device",
        "set_module_quantized_tensor_to_device",
        "validate_bnb_backend_availability",
    ],
    "deepspeed": [
        "HfDeepSpeedConfig",
        "HfTrainerDeepSpeedConfig",
        "deepspeed_config",
        "deepspeed_init",
        "deepspeed_load_checkpoint",
        "deepspeed_optim_sched",
        "is_deepspeed_available",
        "is_deepspeed_zero3_enabled",
        "set_hf_deepspeed_config",
        "unset_hf_deepspeed_config",
    ],
    "eetq": ["replace_with_eetq_linear"],
    "fbgemm_fp8": ["FbgemmFp8Linear", "FbgemmFp8Llama4TextExperts", "replace_with_fbgemm_fp8_linear"],
    "finegrained_fp8": ["FP8Linear", "replace_with_fp8_linear"],
    "fsdp": ["is_fsdp_managed_module"],
    "ggml": [
        "GGUF_CONFIG_MAPPING",
        "GGUF_TOKENIZER_MAPPING",
        "_gguf_parse_value",
        "load_dequant_gguf_tensor",
        "load_gguf",
    ],
    "higgs": [
        "HiggsLinear",
        "dequantize_higgs",
        "quantize_with_higgs",
        "replace_with_higgs_linear",
    ],
    "hqq": ["prepare_for_hqq_linear"],
    "hub_kernels": [
        "LayerRepository",
        "register_kernel_mapping",
        "replace_kernel_forward_from_hub",
        "use_kernel_forward_from_hub",
    ],
    "integration_utils": [
        "INTEGRATION_TO_CALLBACK",
        "AzureMLCallback",
        "ClearMLCallback",
        "CodeCarbonCallback",
        "CometCallback",
        "DagsHubCallback",
        "DVCLiveCallback",
        "FlyteCallback",
        "MLflowCallback",
        "NeptuneCallback",
        "NeptuneMissingConfiguration",
        "SwanLabCallback",
        "TensorBoardCallback",
        "TrackioCallback",
        "WandbCallback",
        "get_available_reporting_integrations",
        "get_reporting_integration_callbacks",
        "hp_params",
        "is_azureml_available",
        "is_clearml_available",
        "is_codecarbon_available",
        "is_comet_available",
        "is_dagshub_available",
        "is_dvclive_available",
        "is_flyte_deck_standard_available",
        "is_flytekit_available",
        "is_mlflow_available",
        "is_neptune_available",
        "is_optuna_available",
        "is_ray_available",
        "is_ray_tune_available",
        "is_sigopt_available",
        "is_swanlab_available",
        "is_tensorboard_available",
        "is_trackio_available",
        "is_wandb_available",
        "rewrite_logs",
        "run_hp_search_optuna",
        "run_hp_search_ray",
        "run_hp_search_sigopt",
        "run_hp_search_wandb",
    ],
    "peft": ["PeftAdapterMixin"],
    "quanto": ["replace_with_quanto_layers"],
    "spqr": ["replace_with_spqr_linear"],
    "vptq": ["replace_with_vptq_linear"],
}

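# Submodules that require torch, or a minimum torch version, are only registered when the
# corresponding check passes; the TYPE_CHECKING branch below mirrors these guards.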
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["executorch"] = [
        "TorchExportableModuleWithStaticCache",
        "convert_and_export_with_cache",
    ]

try:
    if not is_torch_greater_or_equal("2.3"):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tensor_parallel"] = [
        "shard_and_distribute_module",
        "ALL_PARALLEL_STYLES",
        "translate_to_torch_parallel_style",
    ]
try:
    if not is_torch_greater_or_equal("2.5"):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["flex_attention"] = [
        "make_flex_block_causal_mask",
    ]

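# Static type checkers and IDEs follow the eager imports below; at runtime this branch is skipped
# and the module is replaced by a _LazyModule instead (see the final else clause).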
if TYPE_CHECKING:
    from .aqlm import replace_with_aqlm_linear
    from .awq import (
        fuse_awq_modules,
        post_init_awq_exllama_modules,
        post_init_awq_ipex_modules,
        replace_quantization_scales,
        replace_with_awq_linear,
    )
    from .bitnet import (
        BitLinear,
        pack_weights,
        replace_with_bitnet_linear,
        unpack_weights,
    )
    from .bitsandbytes import (
        dequantize_and_replace,
        get_keys_to_not_convert,
        replace_8bit_linear,
        replace_with_bnb_linear,
        set_module_8bit_tensor_to_device,
        set_module_quantized_tensor_to_device,
        validate_bnb_backend_availability,
    )
    from .deepspeed import (
        HfDeepSpeedConfig,
        HfTrainerDeepSpeedConfig,
        deepspeed_config,
        deepspeed_init,
        deepspeed_load_checkpoint,
        deepspeed_optim_sched,
        is_deepspeed_available,
        is_deepspeed_zero3_enabled,
        set_hf_deepspeed_config,
        unset_hf_deepspeed_config,
    )
    from .eetq import replace_with_eetq_linear
    from .fbgemm_fp8 import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts, replace_with_fbgemm_fp8_linear
    from .finegrained_fp8 import FP8Linear, replace_with_fp8_linear
    from .fsdp import is_fsdp_managed_module
    from .ggml import (
        GGUF_CONFIG_MAPPING,
        GGUF_TOKENIZER_MAPPING,
        _gguf_parse_value,
        load_dequant_gguf_tensor,
        load_gguf,
    )
    from .higgs import HiggsLinear, dequantize_higgs, quantize_with_higgs, replace_with_higgs_linear
    from .hqq import prepare_for_hqq_linear
    from .hub_kernels import (
        LayerRepository,
        register_kernel_mapping,
        replace_kernel_forward_from_hub,
        use_kernel_forward_from_hub,
    )
    from .integration_utils import (
        INTEGRATION_TO_CALLBACK,
        AzureMLCallback,
        ClearMLCallback,
        CodeCarbonCallback,
        CometCallback,
        DagsHubCallback,
        DVCLiveCallback,
        FlyteCallback,
        MLflowCallback,
        NeptuneCallback,
        NeptuneMissingConfiguration,
        SwanLabCallback,
        TensorBoardCallback,
        TrackioCallback,
        WandbCallback,
        get_available_reporting_integrations,
        get_reporting_integration_callbacks,
        hp_params,
        is_azureml_available,
        is_clearml_available,
        is_codecarbon_available,
        is_comet_available,
        is_dagshub_available,
        is_dvclive_available,
        is_flyte_deck_standard_available,
        is_flytekit_available,
        is_mlflow_available,
        is_neptune_available,
        is_optuna_available,
        is_ray_available,
        is_ray_tune_available,
        is_sigopt_available,
        is_swanlab_available,
        is_tensorboard_available,
        is_trackio_available,
        is_wandb_available,
        rewrite_logs,
        run_hp_search_optuna,
        run_hp_search_ray,
        run_hp_search_sigopt,
        run_hp_search_wandb,
    )
    from .peft import PeftAdapterMixin
    from .quanto import replace_with_quanto_layers
    from .spqr import replace_with_spqr_linear
    from .vptq import replace_with_vptq_linear

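    # Mirror the optional/version-gated entries registered above, imported eagerly for type checking only.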
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .executorch import TorchExportableModuleWithStaticCache, convert_and_export_with_cache

    try:
        if not is_torch_greater_or_equal("2.3"):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tensor_parallel import (
            ALL_PARALLEL_STYLES,
            shard_and_distribute_module,
            translate_to_torch_parallel_style,
        )

    try:
        if not is_torch_greater_or_equal("2.5"):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .flex_attention import make_flex_block_causal_mask
else:
    import sys

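    # Replace this module with a _LazyModule so submodules are imported only on first attribute access.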
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)