Adding all project files

This commit is contained in:
Martina Burlando 2025-08-02 02:00:33 +02:00
parent 6c9e127bdc
commit cd4316ad0f
42289 changed files with 8009643 additions and 0 deletions

Binary file not shown.

Binary file not shown.


@@ -0,0 +1,50 @@
""" Module to give helpful messages to the user that did not
compile scikit-learn properly.
"""
import os
INPLACE_MSG = """
It appears that you are importing a local scikit-learn source tree. For
this, you need to have an inplace install. Maybe you are in the source
directory and you need to try from another location."""
STANDARD_MSG = """
If you have used an installer, please check that it is suited for your
Python version, your operating system and your platform."""
def raise_build_error(e):
# Raise a comprehensible error and list the contents of the
# directory to help debugging on the mailing list.
local_dir = os.path.split(__file__)[0]
msg = STANDARD_MSG
if local_dir == "sklearn/__check_build":
# Picking up the local install: this will work only if the
# install is an 'inplace build'
msg = INPLACE_MSG
dir_content = list()
for i, filename in enumerate(os.listdir(local_dir)):
if (i + 1) % 3:
dir_content.append(filename.ljust(26))
else:
dir_content.append(filename + "\n")
raise ImportError(
"""%s
___________________________________________________________________________
Contents of %s:
%s
___________________________________________________________________________
It seems that scikit-learn has not been built correctly.
If you have installed scikit-learn from source, please do not forget
to build the package before using it: run `python setup.py install` or
`make` in the source directory.
%s"""
% (e, local_dir, "".join(dir_content).strip(), msg)
)
try:
from ._check_build import check_build # noqa
except ImportError as e:
raise_build_error(e)
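
As a hedged illustration of the guard above (a hypothetical session; the inner ImportError stands in for a real failure to load the compiled `_check_build` extension, and the direct import of `raise_build_error` is for demonstration only):

from sklearn.__check_build import raise_build_error

try:
    raise ImportError("cannot import name '_check_build'")  # simulated broken build
except ImportError as e:
    # Re-raises an ImportError carrying the directory listing and the
    # INPLACE_MSG/STANDARD_MSG hints defined above.
    raise_build_error(e)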


@@ -0,0 +1,144 @@
"""
Machine learning module for Python
==================================
sklearn is a Python module integrating classical machine
learning algorithms in the tightly-knit world of scientific Python
packages (numpy, scipy, matplotlib).
It aims to provide simple and efficient solutions to learning problems
that are accessible to everybody and reusable in various contexts:
machine-learning as a versatile tool for science and engineering.
See http://scikit-learn.org for complete documentation.
"""
import sys
import logging
import os
import random
from ._config import get_config, set_config, config_context
logger = logging.getLogger(__name__)
# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
# X.Y.0 # For first release after an increment in Y
# X.Y.Z # For bugfix releases
#
# Admissible pre-release markers:
# X.Y.ZaN # Alpha release
# X.Y.ZbN # Beta release
# X.Y.ZrcN # Release Candidate
# X.Y.Z # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = "1.2.2"
# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
# libraries to be loaded. It should not degrade performances since we manually
take care of potential over-subscription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")
# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")
try:
# This variable is injected in the __builtins__ by the build
# process. It is used to enable importing subpackages of sklearn when
# the binaries are not built
# mypy error: Cannot determine type of '__SKLEARN_SETUP__'
__SKLEARN_SETUP__ # type: ignore
except NameError:
__SKLEARN_SETUP__ = False
if __SKLEARN_SETUP__:
sys.stderr.write("Partial import of sklearn during the build process.\n")
# We are not importing the rest of scikit-learn during the build
# process, as it may not be compiled yet
else:
# `_distributor_init` allows distributors to run custom init code.
# For instance, for the Windows wheel, this is used to pre-load the
# vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
# sub-folder.
# It is necessary to do this prior to importing show_versions as the
# latter is linked to the OpenMP runtime to make it possible to introspect
# it and importing it first would fail if the OpenMP dll cannot be found.
from . import _distributor_init # noqa: F401
from . import __check_build # noqa: F401
from .base import clone
from .utils._show_versions import show_versions
__all__ = [
"calibration",
"cluster",
"covariance",
"cross_decomposition",
"datasets",
"decomposition",
"dummy",
"ensemble",
"exceptions",
"experimental",
"externals",
"feature_extraction",
"feature_selection",
"gaussian_process",
"inspection",
"isotonic",
"kernel_approximation",
"kernel_ridge",
"linear_model",
"manifold",
"metrics",
"mixture",
"model_selection",
"multiclass",
"multioutput",
"naive_bayes",
"neighbors",
"neural_network",
"pipeline",
"preprocessing",
"random_projection",
"semi_supervised",
"svm",
"tree",
"discriminant_analysis",
"impute",
"compose",
# Non-modules:
"clone",
"get_config",
"set_config",
"config_context",
"show_versions",
]
def setup_module(module):
"""Fixture for the tests to assure globally controllable seeding of RNGs"""
import numpy as np
# Check if a random seed exists in the environment, if not create one.
_random_seed = os.environ.get("SKLEARN_SEED", None)
if _random_seed is None:
_random_seed = np.random.uniform() * np.iinfo(np.int32).max
_random_seed = int(_random_seed)
print("I: Seeding RNGs with %r" % _random_seed)
np.random.seed(_random_seed)
random.seed(_random_seed)
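
A hedged sketch of the seeding contract above: `SKLEARN_SEED` is read once per test module, so a reproducible run can pin it from the environment (the value 42 is arbitrary):

import os
import random
import numpy as np

os.environ["SKLEARN_SEED"] = "42"       # pin before pytest imports the module
seed = int(os.environ["SKLEARN_SEED"])  # mirrors what setup_module does
np.random.seed(seed)
random.seed(seed)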


@@ -0,0 +1,128 @@
"""
Utilities useful during the build.
"""
# author: Andy Mueller, Gael Varoquaux
# license: BSD
import os
import sklearn
import contextlib
from .pre_build_helpers import basic_check_build
from .openmp_helpers import check_openmp_support
from .._min_dependencies import CYTHON_MIN_VERSION
from ..externals._packaging.version import parse
DEFAULT_ROOT = "sklearn"
def _check_cython_version():
message = (
"Please install Cython with a version >= {0} in order "
"to build a scikit-learn from source."
).format(CYTHON_MIN_VERSION)
try:
import Cython
except ModuleNotFoundError as e:
# Re-raise with more informative error message instead:
raise ModuleNotFoundError(message) from e
if parse(Cython.__version__) < parse(CYTHON_MIN_VERSION):
message += " The current version of Cython is {} installed in {}.".format(
Cython.__version__, Cython.__path__
)
raise ValueError(message)
def cythonize_extensions(extension):
"""Check that a recent Cython is available and cythonize extensions"""
_check_cython_version()
from Cython.Build import cythonize
import Cython
# Fast fail before cythonization if compiler fails compiling basic test
# code even without OpenMP
basic_check_build()
# check simple compilation with OpenMP. If it fails scikit-learn will be
# built without OpenMP and the test test_openmp_supported in the test suite
# will fail.
# `check_openmp_support` compiles a small test program to see if the
# compilers are properly configured to build with OpenMP. This is expensive
# and we only want to call this function once.
# The result of this check is cached as a private attribute on the sklearn
# module (only at build-time) to be used twice:
# - First to set the value of SKLEARN_OPENMP_PARALLELISM_ENABLED, the
# cython build-time variable passed to the cythonize() call.
# - Then in the build_ext subclass defined in the top-level setup.py file
# to actually build the compiled extensions with OpenMP flags if needed.
sklearn._OPENMP_SUPPORTED = check_openmp_support()
n_jobs = 1
with contextlib.suppress(ImportError):
import joblib
n_jobs = joblib.cpu_count()
# Additional checks for Cython
cython_enable_debug_directives = (
os.environ.get("SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES", "0") != "0"
)
compiler_directives = {
"language_level": 3,
"boundscheck": cython_enable_debug_directives,
"wraparound": False,
"initializedcheck": False,
"nonecheck": False,
"cdivision": True,
}
# TODO: once Cython 3 is released and we require Cython>=3 we should get
# rid of the `legacy_implicit_noexcept` directive.
# This should mostly consist of:
#
# - ensuring nogil is at the end of function signature,
# e.g. replace "nogil except -1" by "except -1 nogil".
#
# - "noexcept"-qualifying Cython and externalized C interfaces
# which aren't raising nor propagating exceptions.
# See: https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#error-return-values # noqa
#
# See: https://github.com/cython/cython/issues/5088 for more details
if parse(Cython.__version__) > parse("3.0.0a11"):
compiler_directives["legacy_implicit_noexcept"] = True
return cythonize(
extension,
nthreads=n_jobs,
compile_time_env={
"SKLEARN_OPENMP_PARALLELISM_ENABLED": sklearn._OPENMP_SUPPORTED
},
compiler_directives=compiler_directives,
)
def gen_from_templates(templates):
"""Generate cython files from a list of templates"""
# Lazy import because cython is not a runtime dependency.
from Cython import Tempita
for template in templates:
outfile = template.replace(".tp", "")
# if the template is not updated, no need to output the cython file
if not (
os.path.exists(outfile)
and os.stat(template).st_mtime < os.stat(outfile).st_mtime
):
with open(template, "r") as f:
tmpl = f.read()
tmpl_ = Tempita.sub(tmpl)
with open(outfile, "w") as f:
f.write(tmpl_)
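
As a hedged illustration of the substitution performed by `gen_from_templates` (assuming the bundled `Cython.Tempita` API; scikit-learn's `.tp` templates use the same `{{...}}` control syntax, though this particular template is invented for the example):

from Cython import Tempita

template = """\
{{for dtype in ['float', 'double']}}
cdef {{dtype}} add_{{dtype}}({{dtype}} a, {{dtype}} b):
    return a + b
{{endfor}}
"""
# Expands the {{for}} block, emitting one cdef function per dtype.
print(Tempita.sub(template))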


@@ -0,0 +1,132 @@
"""Helpers for OpenMP support during the build."""
# This code is adapted in large part from the astropy OpenMP helpers, which
# can be found at: https://github.com/astropy/extension-helpers/blob/master/extension_helpers/_openmp_helpers.py # noqa
import os
import sys
import textwrap
import warnings
from .pre_build_helpers import compile_test_program
def get_openmp_flag(compiler):
if hasattr(compiler, "compiler"):
compiler = compiler.compiler[0]
else:
compiler = compiler.__class__.__name__
if sys.platform == "win32":
return ["/openmp"]
elif sys.platform == "darwin" and "openmp" in os.getenv("CPPFLAGS", ""):
# -fopenmp can't be passed as a compile flag when using Apple-clang.
# OpenMP support has to be enabled during preprocessing.
#
# For example, our macOS wheel build jobs use the following environment
# variables to build with Apple-clang and the brew installed "libomp":
#
# export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
# export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
# export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
# export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib
# -L/usr/local/opt/libomp/lib -lomp"
return []
# Default flag for GCC and clang:
return ["-fopenmp"]
def check_openmp_support():
"""Check whether OpenMP test code can be compiled and run"""
if "PYODIDE_PACKAGE_ABI" in os.environ:
# Pyodide doesn't support OpenMP
return False
code = textwrap.dedent(
"""\
#include <omp.h>
#include <stdio.h>
int main(void) {
#pragma omp parallel
printf("nthreads=%d\\n", omp_get_num_threads());
return 0;
}
"""
)
extra_preargs = os.getenv("LDFLAGS", None)
if extra_preargs is not None:
extra_preargs = extra_preargs.strip().split(" ")
# FIXME: temporary fix to link against system libraries on linux
# "-Wl,--sysroot=/" should be removed
extra_preargs = [
flag
for flag in extra_preargs
if flag.startswith(("-L", "-Wl,-rpath", "-l", "-Wl,--sysroot=/"))
]
extra_postargs = get_openmp_flag
openmp_exception = None
try:
output = compile_test_program(
code, extra_preargs=extra_preargs, extra_postargs=extra_postargs
)
if output and "nthreads=" in output[0]:
nthreads = int(output[0].strip().split("=")[1])
openmp_supported = len(output) == nthreads
elif "PYTHON_CROSSENV" in os.environ:
# Since we can't run the test program when cross-compiling
# assume that openmp is supported if the program can be
# compiled.
openmp_supported = True
else:
openmp_supported = False
except Exception as exception:
# We could be more specific and only catch: CompileError, LinkError,
# and subprocess.CalledProcessError.
# setuptools introduced CompileError and LinkError, but that requires
# version 61.1. Even the latest version of Ubuntu (22.04LTS) only
# ships with 59.6. So for now we catch all exceptions and reraise a
# generic exception with the original error message instead:
openmp_supported = False
openmp_exception = exception
if not openmp_supported:
if os.getenv("SKLEARN_FAIL_NO_OPENMP"):
raise Exception(
"Failed to build scikit-learn with OpenMP support"
) from openmp_exception
else:
message = textwrap.dedent(
"""
***********
* WARNING *
***********
It seems that scikit-learn cannot be built with OpenMP.
- Make sure you have followed the installation instructions:
https://scikit-learn.org/dev/developers/advanced_installation.html
- If your compiler supports OpenMP but you still see this
message, please submit a bug report at:
https://github.com/scikit-learn/scikit-learn/issues
- The build will continue with OpenMP-based parallelism
disabled. Note however that some estimators will run in
sequential mode instead of leveraging thread-based
parallelism.
***
"""
)
warnings.warn(message)
return openmp_supported
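
A hedged, build-time-only usage sketch (assumes a source checkout with setuptools and a working C compiler; this returns True only when the test program above compiles, links, and prints one line per OpenMP thread):

from sklearn._build_utils.openmp_helpers import check_openmp_support

if check_openmp_support():
    print("toolchain can build scikit-learn with OpenMP")
else:
    print("building without OpenMP; see the warning text above")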


@@ -0,0 +1,82 @@
"""Helpers to check build environment before actual build of scikit-learn"""
import os
import sys
import glob
import tempfile
import textwrap
import subprocess
from setuptools.command.build_ext import customize_compiler, new_compiler
def compile_test_program(code, extra_preargs=[], extra_postargs=[]):
"""Check that some C code can be compiled and run"""
ccompiler = new_compiler()
customize_compiler(ccompiler)
# extra_(pre/post)args can be a callable to make it possible to get its
# value from the compiler
if callable(extra_preargs):
extra_preargs = extra_preargs(ccompiler)
if callable(extra_postargs):
extra_postargs = extra_postargs(ccompiler)
start_dir = os.path.abspath(".")
with tempfile.TemporaryDirectory() as tmp_dir:
try:
os.chdir(tmp_dir)
# Write test program
with open("test_program.c", "w") as f:
f.write(code)
os.mkdir("objects")
# Compile, test program
ccompiler.compile(
["test_program.c"], output_dir="objects", extra_postargs=extra_postargs
)
# Link test program
objects = glob.glob(os.path.join("objects", "*" + ccompiler.obj_extension))
ccompiler.link_executable(
objects,
"test_program",
extra_preargs=extra_preargs,
extra_postargs=extra_postargs,
)
if "PYTHON_CROSSENV" not in os.environ:
# Run test program if not cross compiling
# will raise a CalledProcessError if return code was non-zero
output = subprocess.check_output("./test_program")
output = output.decode(sys.stdout.encoding or "utf-8").splitlines()
else:
# Return an empty output if we are cross compiling
# as we cannot run the test_program
output = []
except Exception:
raise
finally:
os.chdir(start_dir)
return output
def basic_check_build():
"""Check basic compilation and linking of C code"""
if "PYODIDE_PACKAGE_ABI" in os.environ:
# The following check won't work in pyodide
return
code = textwrap.dedent(
"""\
#include <stdio.h>
int main(void) {
return 0;
}
"""
)
compile_test_program(code)
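
And a hedged sketch of driving `compile_test_program` directly (a working C toolchain is assumed; the C program here is invented for the example):

import textwrap
from sklearn._build_utils.pre_build_helpers import compile_test_program

code = textwrap.dedent(
    """\
    #include <stdio.h>
    int main(void) {
        printf("hello from C\\n");
        return 0;
    }
    """
)
# Returns the program's stdout lines, e.g. ['hello from C'],
# or [] when cross-compiling (PYTHON_CROSSENV set).
print(compile_test_program(code))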


@@ -0,0 +1,294 @@
"""Global configuration state and functions for management
"""
import os
from contextlib import contextmanager as contextmanager
import threading
_global_config = {
"assume_finite": bool(os.environ.get("SKLEARN_ASSUME_FINITE", False)),
"working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)),
"print_changed_only": True,
"display": "diagram",
"pairwise_dist_chunk_size": int(
os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256)
),
"enable_cython_pairwise_dist": True,
"array_api_dispatch": False,
"transform_output": "default",
}
_threadlocal = threading.local()
def _get_threadlocal_config():
"""Get a threadlocal **mutable** configuration. If the configuration
does not exist, copy the default global configuration."""
if not hasattr(_threadlocal, "global_config"):
_threadlocal.global_config = _global_config.copy()
return _threadlocal.global_config
def get_config():
"""Retrieve current values for configuration set by :func:`set_config`.
Returns
-------
config : dict
Keys are parameter names that can be passed to :func:`set_config`.
See Also
--------
config_context : Context manager for global scikit-learn configuration.
set_config : Set global scikit-learn configuration.
"""
# Return a copy of the threadlocal configuration so that users will
# not be able to modify the configuration with the returned dict.
return _get_threadlocal_config().copy()
def set_config(
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
pairwise_dist_chunk_size=None,
enable_cython_pairwise_dist=None,
array_api_dispatch=None,
transform_output=None,
):
"""Set global scikit-learn configuration
.. versionadded:: 0.19
Parameters
----------
assume_finite : bool, default=None
If True, validation for finiteness will be skipped,
saving time, but leading to potential crashes. If
False, validation for finiteness will be performed,
avoiding error. Global default: False.
.. versionadded:: 0.19
working_memory : int, default=None
If set, scikit-learn will attempt to limit the size of temporary arrays
to this number of MiB (per job when parallelised), often saving both
computation time and memory on expensive operations that can be
performed in chunks. Global default: 1024.
.. versionadded:: 0.20
print_changed_only : bool, default=None
If True, only the parameters that were set to non-default
values will be printed when printing an estimator. For example,
``print(SVC())`` while True will only print 'SVC()' while the default
behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with
all the non-changed parameters.
.. versionadded:: 0.21
display : {'text', 'diagram'}, default=None
If 'diagram', estimators will be displayed as a diagram in a Jupyter
lab or notebook context. If 'text', estimators will be displayed as
text. Default is 'diagram'.
.. versionadded:: 0.23
pairwise_dist_chunk_size : int, default=None
The number of row vectors per chunk for the accelerated pairwise-
distances reduction backend. Default is 256 (suitable for most
modern laptops' caches and architectures).
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
enable_cython_pairwise_dist : bool, default=None
Use the accelerated pairwise-distances reduction backend when
possible. Global default: True.
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
array_api_dispatch : bool, default=None
Use Array API dispatching when inputs follow the Array API standard.
Default is False.
See the :ref:`User Guide <array_api>` for more details.
.. versionadded:: 1.2
transform_output : str, default=None
Configure output of `transform` and `fit_transform`.
See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
for an example on how to use the API.
- `"default"`: Default output format of a transformer
- `"pandas"`: DataFrame output
- `None`: Transform configuration is unchanged
.. versionadded:: 1.2
See Also
--------
config_context : Context manager for global scikit-learn configuration.
get_config : Retrieve current values of the global configuration.
"""
local_config = _get_threadlocal_config()
if assume_finite is not None:
local_config["assume_finite"] = assume_finite
if working_memory is not None:
local_config["working_memory"] = working_memory
if print_changed_only is not None:
local_config["print_changed_only"] = print_changed_only
if display is not None:
local_config["display"] = display
if pairwise_dist_chunk_size is not None:
local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size
if enable_cython_pairwise_dist is not None:
local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist
if array_api_dispatch is not None:
local_config["array_api_dispatch"] = array_api_dispatch
if transform_output is not None:
local_config["transform_output"] = transform_output
@contextmanager
def config_context(
*,
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
pairwise_dist_chunk_size=None,
enable_cython_pairwise_dist=None,
array_api_dispatch=None,
transform_output=None,
):
"""Context manager for global scikit-learn configuration.
Parameters
----------
assume_finite : bool, default=None
If True, validation for finiteness will be skipped,
saving time, but leading to potential crashes. If
False, validation for finiteness will be performed,
avoiding error. If None, the existing value won't change.
The default value is False.
working_memory : int, default=None
If set, scikit-learn will attempt to limit the size of temporary arrays
to this number of MiB (per job when parallelised), often saving both
computation time and memory on expensive operations that can be
performed in chunks. If None, the existing value won't change.
The default value is 1024.
print_changed_only : bool, default=None
If True, only the parameters that were set to non-default
values will be printed when printing an estimator. For example,
``print(SVC())`` while True will only print 'SVC()', but would print
'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
when False. If None, the existing value won't change.
The default value is True.
.. versionchanged:: 0.23
Default changed from False to True.
display : {'text', 'diagram'}, default=None
If 'diagram', estimators will be displayed as a diagram in a Jupyter
lab or notebook context. If 'text', estimators will be displayed as
text. If None, the existing value won't change.
The default value is 'diagram'.
.. versionadded:: 0.23
pairwise_dist_chunk_size : int, default=None
The number of row vectors per chunk for the accelerated pairwise-
distances reduction backend. Default is 256 (suitable for most
modern laptops' caches and architectures).
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
enable_cython_pairwise_dist : bool, default=None
Use the accelerated pairwise-distances reduction backend when
possible. Global default: True.
Intended for easier benchmarking and testing of scikit-learn internals.
End users are not expected to benefit from customizing this configuration
setting.
.. versionadded:: 1.1
array_api_dispatch : bool, default=None
Use Array API dispatching when inputs follow the Array API standard.
Default is False.
See the :ref:`User Guide <array_api>` for more details.
.. versionadded:: 1.2
transform_output : str, default=None
Configure output of `transform` and `fit_transform`.
See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
for an example on how to use the API.
- `"default"`: Default output format of a transformer
- `"pandas"`: DataFrame output
- `None`: Transform configuration is unchanged
.. versionadded:: 1.2
Yields
------
None.
See Also
--------
set_config : Set global scikit-learn configuration.
get_config : Retrieve current values of the global configuration.
Notes
-----
All settings, not just those presently modified, will be returned to
their previous values when the context manager is exited.
Examples
--------
>>> import sklearn
>>> from sklearn.utils.validation import assert_all_finite
>>> with sklearn.config_context(assume_finite=True):
... assert_all_finite([float('nan')])
>>> with sklearn.config_context(assume_finite=True):
... with sklearn.config_context(assume_finite=False):
... assert_all_finite([float('nan')])
Traceback (most recent call last):
...
ValueError: Input contains NaN...
"""
old_config = get_config()
set_config(
assume_finite=assume_finite,
working_memory=working_memory,
print_changed_only=print_changed_only,
display=display,
pairwise_dist_chunk_size=pairwise_dist_chunk_size,
enable_cython_pairwise_dist=enable_cython_pairwise_dist,
array_api_dispatch=array_api_dispatch,
transform_output=transform_output,
)
try:
yield
finally:
set_config(**old_config)
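
To make the thread-local behaviour concrete, a minimal sketch (assuming SKLEARN_ASSUME_FINITE is unset): a new thread copies the module-level defaults on first access, so `set_config` in one thread does not leak into another.

import threading
import sklearn

sklearn.set_config(assume_finite=True)  # mutates this thread's config only

def report():
    # A fresh thread starts from _global_config, so this prints False.
    print(sklearn.get_config()["assume_finite"])

t = threading.Thread(target=report)
t.start()
t.join()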


@@ -0,0 +1,22 @@
'''Helper to preload vcomp140.dll and msvcp140.dll to prevent
"not found" errors.
Once vcomp140.dll and msvcp140.dll are preloaded, they remain
available to any subsequent load of those DLLs. The bundled .libs
folder is created as part of the scripts that build the wheel.
'''
import os
import os.path as op
from ctypes import WinDLL
if os.name == "nt":
libs_path = op.join(op.dirname(__file__), ".libs")
vcomp140_dll_filename = op.join(libs_path, "vcomp140.dll")
msvcp140_dll_filename = op.join(libs_path, "msvcp140.dll")
WinDLL(op.abspath(vcomp140_dll_filename))
WinDLL(op.abspath(msvcp140_dll_filename))


@@ -0,0 +1,29 @@
"""
The :mod:`sklearn._loss` module includes loss function classes suitable for
fitting classification and regression tasks.
"""
from .loss import (
HalfSquaredError,
AbsoluteError,
PinballLoss,
HalfPoissonLoss,
HalfGammaLoss,
HalfTweedieLoss,
HalfTweedieLossIdentity,
HalfBinomialLoss,
HalfMultinomialLoss,
)
__all__ = [
"HalfSquaredError",
"AbsoluteError",
"PinballLoss",
"HalfPoissonLoss",
"HalfGammaLoss",
"HalfTweedieLoss",
"HalfTweedieLossIdentity",
"HalfBinomialLoss",
"HalfMultinomialLoss",
]


@@ -0,0 +1,81 @@
# cython: language_level=3
cimport numpy as cnp
cnp.import_array()
# Fused types for y_true, y_pred, raw_prediction
ctypedef fused Y_DTYPE_C:
cnp.npy_float64
cnp.npy_float32
# Fused types for gradient and hessian
ctypedef fused G_DTYPE_C:
cnp.npy_float64
cnp.npy_float32
# Struct to return 2 doubles
ctypedef struct double_pair:
double val1
double val2
# C base class for loss functions
cdef class CyLossFunction:
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyHalfSquaredError(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyAbsoluteError(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyPinballLoss(CyLossFunction):
cdef readonly double quantile # readonly makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyHalfPoissonLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyHalfGammaLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyHalfTweedieLoss(CyLossFunction):
cdef readonly double power # readonly makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyHalfTweedieLossIdentity(CyLossFunction):
cdef readonly double power # readonly makes it accessible from Python
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
cdef class CyHalfBinomialLoss(CyLossFunction):
cdef double cy_loss(self, double y_true, double raw_prediction) nogil
cdef double cy_gradient(self, double y_true, double raw_prediction) nogil
cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil
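
As a hedged Python sketch of the contract these declarations encode, here is the math behind `cy_grad_hess` for the half squared error, loss = 0.5 * (raw_prediction - y_true)**2; the returned tuple plays the role of the `double_pair` struct:

def half_squared_error_grad_hess(y_true, raw_prediction):
    # d/d(raw) of 0.5 * (raw - y)**2 is (raw - y); the second derivative is 1.
    gradient = raw_prediction - y_true
    hessian = 1.0
    return gradient, hessian  # (val1, val2) of double_pair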


@@ -0,0 +1,373 @@
"""
Distribution functions used in GLM
"""
# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# License: BSD 3 clause
#
# TODO(1.3): remove file
# This is only used for backward compatibility in _GeneralizedLinearRegressor
# for the deprecated family attribute.
from abc import ABCMeta, abstractmethod
from collections import namedtuple
import numbers
import numpy as np
from scipy.special import xlogy
DistributionBoundary = namedtuple("DistributionBoundary", ("value", "inclusive"))
class ExponentialDispersionModel(metaclass=ABCMeta):
r"""Base class for reproductive Exponential Dispersion Models (EDM).
The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by
.. math:: p(y| \theta, \phi) = c(y, \phi)
\exp\left(\frac{\theta y-A(\theta)}{\phi}\right)
= \tilde{c}(y, \phi)
\exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right)
with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`,
variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`,
unit variance :math:`v(y_\textrm{pred})` and
unit deviance :math:`d(y,y_\textrm{pred})`.
Methods
-------
deviance
deviance_derivative
in_y_range
unit_deviance
unit_deviance_derivative
unit_variance
References
----------
https://en.wikipedia.org/wiki/Exponential_dispersion_model.
"""
def in_y_range(self, y):
"""Returns ``True`` if y is in the valid range of Y~EDM.
Parameters
----------
y : array of shape (n_samples,)
Target values.
"""
# Note that currently supported distributions have +inf upper bound
if not isinstance(self._lower_bound, DistributionBoundary):
raise TypeError(
"_lower_bound attribute must be of type DistributionBoundary"
)
if self._lower_bound.inclusive:
return np.greater_equal(y, self._lower_bound.value)
else:
return np.greater(y, self._lower_bound.value)
@abstractmethod
def unit_variance(self, y_pred):
r"""Compute the unit variance function.
The unit variance :math:`v(y_\textrm{pred})` determines the variance as
a function of the mean :math:`y_\textrm{pred}` by
:math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`.
It can also be derived from the unit deviance
:math:`d(y,y_\textrm{pred})` as
.. math:: v(y_\textrm{pred}) = \frac{2}{
\frac{\partial^2 d(y,y_\textrm{pred})}{
\partial y_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}}
See also :func:`variance`.
Parameters
----------
y_pred : array of shape (n_samples,)
Predicted mean.
"""
@abstractmethod
def unit_deviance(self, y, y_pred, check_input=False):
r"""Compute the unit deviance.
The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
log-likelihood as
:math:`d(y,y_\textrm{pred}) = -2\phi\cdot
\left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`
Parameters
----------
y : array of shape (n_samples,)
Target values.
y_pred : array of shape (n_samples,)
Predicted mean.
check_input : bool, default=False
If True raise an exception on invalid y or y_pred values, otherwise
they will be propagated as NaN.
Returns
-------
deviance: array of shape (n_samples,)
Computed deviance
"""
def unit_deviance_derivative(self, y, y_pred):
r"""Compute the derivative of the unit deviance w.r.t. y_pred.
The derivative of the unit deviance is given by
:math:`\frac{\partial}{\partial y_\textrm{pred}}d(y,y_\textrm{pred})
= -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}`
with unit variance :math:`v(y_\textrm{pred})`.
Parameters
----------
y : array of shape (n_samples,)
Target values.
y_pred : array of shape (n_samples,)
Predicted mean.
"""
return -2 * (y - y_pred) / self.unit_variance(y_pred)
def deviance(self, y, y_pred, weights=1):
r"""Compute the deviance.
The deviance is a weighted sum of the per sample unit deviances,
:math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)`
with weights :math:`s_i` and unit deviance
:math:`d(y,y_\textrm{pred})`.
In terms of the log-likelihood it is :math:`D = -2\phi\cdot
\left(loglike(y,y_\textrm{pred},\frac{\phi}{s})
- loglike(y,y,\frac{\phi}{s})\right)`.
Parameters
----------
y : array of shape (n_samples,)
Target values.
y_pred : array of shape (n_samples,)
Predicted mean.
weights : {int, array of shape (n_samples,)}, default=1
Weights or exposure to which variance is inverse proportional.
"""
return np.sum(weights * self.unit_deviance(y, y_pred))
def deviance_derivative(self, y, y_pred, weights=1):
r"""Compute the derivative of the deviance w.r.t. y_pred.
It gives :math:`\frac{\partial}{\partial y_\textrm{pred}}
D(y, y_\textrm{pred}; weights)`.
Parameters
----------
y : array, shape (n_samples,)
Target values.
y_pred : array, shape (n_samples,)
Predicted mean.
weights : {int, array of shape (n_samples,)}, default=1
Weights or exposure to which variance is inverse proportional.
"""
return weights * self.unit_deviance_derivative(y, y_pred)
class TweedieDistribution(ExponentialDispersionModel):
r"""A class for the Tweedie distribution.
A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]`
is uniquely defined by its mean-variance relationship
:math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^{power}`.
Special cases are:
===== ================
Power Distribution
===== ================
0     Normal
1     Poisson
(1,2) Compound Poisson
2     Gamma
3     Inverse Gaussian
===== ================
Parameters
----------
power : float, default=0
The variance power of the `unit_variance`
:math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`.
For ``0<power<1``, no distribution exists.
"""
def __init__(self, power=0):
self.power = power
@property
def power(self):
return self._power
@power.setter
def power(self, power):
# We use a property with a setter, to update lower and
# upper bound when the power parameter is updated e.g. in grid
# search.
if not isinstance(power, numbers.Real):
raise TypeError("power must be a real number, input was {0}".format(power))
if power <= 0:
# Extreme Stable or Normal distribution
self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False)
elif 0 < power < 1:
raise ValueError(
"Tweedie distribution is only defined for power<=0 and power>=1."
)
elif 1 <= power < 2:
# Poisson or Compound Poisson distribution
self._lower_bound = DistributionBoundary(0, inclusive=True)
elif power >= 2:
# Gamma, Positive Stable, Inverse Gaussian distributions
self._lower_bound = DistributionBoundary(0, inclusive=False)
else: # pragma: no cover
# this branch should be unreachable.
raise ValueError
self._power = power
def unit_variance(self, y_pred):
"""Compute the unit variance of a Tweedie distribution
v(y_\textrm{pred})=y_\textrm{pred}**power.
Parameters
----------
y_pred : array of shape (n_samples,)
Predicted mean.
"""
return np.power(y_pred, self.power)
def unit_deviance(self, y, y_pred, check_input=False):
r"""Compute the unit deviance.
The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
log-likelihood as
:math:`d(y,y_\textrm{pred}) = -2\phi\cdot
\left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`
Parameters
----------
y : array of shape (n_samples,)
Target values.
y_pred : array of shape (n_samples,)
Predicted mean.
check_input : bool, default=False
If True raise an exception on invalid y or y_pred values, otherwise
they will be propagated as NaN.
Returns
-------
deviance: array of shape (n_samples,)
Computed deviance
"""
p = self.power
if check_input:
message = (
"Mean Tweedie deviance error with power={} can only be used on ".format(
p
)
)
if p < 0:
# 'Extreme stable', y any real number, y_pred > 0
if (y_pred <= 0).any():
raise ValueError(message + "strictly positive y_pred.")
elif p == 0:
# Normal, y and y_pred can be any real number
pass
elif 0 < p < 1:
raise ValueError(
"Tweedie deviance is only defined for power<=0 and power>=1."
)
elif 1 <= p < 2:
# Poisson and compound Poisson distribution, y >= 0, y_pred > 0
if (y < 0).any() or (y_pred <= 0).any():
raise ValueError(
message + "non-negative y and strictly positive y_pred."
)
elif p >= 2:
# Gamma and Extreme stable distribution, y and y_pred > 0
if (y <= 0).any() or (y_pred <= 0).any():
raise ValueError(message + "strictly positive y and y_pred.")
else: # pragma: nocover
# Unreachable statement
raise ValueError
if p < 0:
# 'Extreme stable', y any real number, y_pred > 0
dev = 2 * (
np.power(np.maximum(y, 0), 2 - p) / ((1 - p) * (2 - p))
- y * np.power(y_pred, 1 - p) / (1 - p)
+ np.power(y_pred, 2 - p) / (2 - p)
)
elif p == 0:
# Normal distribution, y and y_pred any real number
dev = (y - y_pred) ** 2
elif p < 1:
raise ValueError(
"Tweedie deviance is only defined for power<=0 and power>=1."
)
elif p == 1:
# Poisson distribution
dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
elif p == 2:
# Gamma distribution
dev = 2 * (np.log(y_pred / y) + y / y_pred - 1)
else:
dev = 2 * (
np.power(y, 2 - p) / ((1 - p) * (2 - p))
- y * np.power(y_pred, 1 - p) / (1 - p)
+ np.power(y_pred, 2 - p) / (2 - p)
)
return dev
class NormalDistribution(TweedieDistribution):
"""Class for the Normal (aka Gaussian) distribution."""
def __init__(self):
super().__init__(power=0)
class PoissonDistribution(TweedieDistribution):
"""Class for the scaled Poisson distribution."""
def __init__(self):
super().__init__(power=1)
class GammaDistribution(TweedieDistribution):
"""Class for the Gamma distribution."""
def __init__(self):
super().__init__(power=2)
class InverseGaussianDistribution(TweedieDistribution):
"""Class for the scaled InverseGaussianDistribution distribution."""
def __init__(self):
super().__init__(power=3)
EDM_DISTRIBUTIONS = {
"normal": NormalDistribution,
"poisson": PoissonDistribution,
"gamma": GammaDistribution,
"inverse-gaussian": InverseGaussianDistribution,
}
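
A hedged usage sketch of these deprecated, backward-compatibility-only classes (outputs consistent with `test_family_bounds` and `test_deviance_zero` further below):

import numpy as np
from sklearn._loss.glm_distribution import TweedieDistribution

dist = TweedieDistribution(power=1.5)  # compound Poisson: y >= 0, y_pred > 0
print(dist.in_y_range(np.array([-1.0, 0.0, 1.0])))  # [False  True  True]
print(dist.deviance(np.array([0.1, 1.5]), np.array([0.1, 1.5])))  # 0.0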


@ -0,0 +1,261 @@
"""
Module contains classes for invertible (and differentiable) link functions.
"""
# Author: Christian Lorentzen <lorentzen.ch@gmail.com>
from abc import ABC, abstractmethod
from dataclasses import dataclass
import numpy as np
from scipy.special import expit, logit
from scipy.stats import gmean
from ..utils.extmath import softmax
@dataclass
class Interval:
low: float
high: float
low_inclusive: bool
high_inclusive: bool
def __post_init__(self):
"""Check that low <= high"""
if self.low > self.high:
raise ValueError(
f"One must have low <= high; got low={self.low}, high={self.high}."
)
def includes(self, x):
"""Test whether all values of x are in interval range.
Parameters
----------
x : ndarray
Array whose elements are tested to be in interval range.
Returns
-------
result : bool
"""
if self.low_inclusive:
low = np.greater_equal(x, self.low)
else:
low = np.greater(x, self.low)
if not np.all(low):
return False
if self.high_inclusive:
high = np.less_equal(x, self.high)
else:
high = np.less(x, self.high)
# Note: np.all returns numpy.bool_
return bool(np.all(high))
def _inclusive_low_high(interval, dtype=np.float64):
"""Generate values low and high to be within the interval range.
This is used in tests only.
Returns
-------
low, high : tuple
The returned values low and high lie within the interval.
"""
eps = 10 * np.finfo(dtype).eps
if interval.low == -np.inf:
low = -1e10
elif interval.low < 0:
low = interval.low * (1 - eps) + eps
else:
low = interval.low * (1 + eps) + eps
if interval.high == np.inf:
high = 1e10
elif interval.high < 0:
high = interval.high * (1 + eps) - eps
else:
high = interval.high * (1 - eps) - eps
return low, high
class BaseLink(ABC):
"""Abstract base class for differentiable, invertible link functions.
Convention:
- link function g: raw_prediction = g(y_pred)
- inverse link h: y_pred = h(raw_prediction)
For (generalized) linear models, `raw_prediction = X @ coef` is the so
called linear predictor, and `y_pred = h(raw_prediction)` is the predicted
conditional (on X) expected value of the target `y_true`.
The methods are not implemented as staticmethods in case a link function needs
parameters.
"""
is_multiclass = False # used for testing only
# Usually, raw_prediction may be any real number and y_pred is an open
# interval.
# interval_raw_prediction = Interval(-np.inf, np.inf, False, False)
interval_y_pred = Interval(-np.inf, np.inf, False, False)
@abstractmethod
def link(self, y_pred, out=None):
"""Compute the link function g(y_pred).
The link function maps (predicted) target values to raw predictions,
i.e. `g(y_pred) = raw_prediction`.
Parameters
----------
y_pred : array
Predicted target values.
out : array
A location into which the result is stored. If provided, it must
have a shape that the inputs broadcast to. If not provided or None,
a freshly-allocated array is returned.
Returns
-------
out : array
Output array, element-wise link function.
"""
@abstractmethod
def inverse(self, raw_prediction, out=None):
"""Compute the inverse link function h(raw_prediction).
The inverse link function maps raw predictions to predicted target
values, i.e. `h(raw_prediction) = y_pred`.
Parameters
----------
raw_prediction : array
Raw prediction values (in link space).
out : array
A location into which the result is stored. If provided, it must
have a shape that the inputs broadcast to. If not provided or None,
a freshly-allocated array is returned.
Returns
-------
out : array
Output array, element-wise inverse link function.
"""
class IdentityLink(BaseLink):
"""The identity link function g(x)=x."""
def link(self, y_pred, out=None):
if out is not None:
np.copyto(out, y_pred)
return out
else:
return y_pred
inverse = link
class LogLink(BaseLink):
"""The log link function g(x)=log(x)."""
interval_y_pred = Interval(0, np.inf, False, False)
def link(self, y_pred, out=None):
return np.log(y_pred, out=out)
def inverse(self, raw_prediction, out=None):
return np.exp(raw_prediction, out=out)
class LogitLink(BaseLink):
"""The logit link function g(x)=logit(x)."""
interval_y_pred = Interval(0, 1, False, False)
def link(self, y_pred, out=None):
return logit(y_pred, out=out)
def inverse(self, raw_prediction, out=None):
return expit(raw_prediction, out=out)
class MultinomialLogit(BaseLink):
"""The symmetric multinomial logit function.
Convention:
- y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
Notes:
- The inverse link h is the softmax function.
- The sum is over the second axis, i.e. axis=1 (n_classes).
We have to choose additional constraints in order to make
y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1)
for n_classes classes identifiable and invertible.
We choose the symmetric side constraint where the geometric mean response
is set as reference category, see [2]:
The symmetric multinomial logit link function for a single data point is
then defined as
raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred))
= log(y_pred[k]) - mean(log(y_pred)).
Note that this is equivalent to the definition in [1] and implies mean
centered raw predictions:
sum(raw_prediction[k], k=0..n_classes-1) = 0.
For linear models with raw_prediction = X @ coef, this corresponds to
sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every
feature is zero.
Reference
---------
.. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive
logistic regression: a statistical view of boosting" Ann. Statist.
28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223.
https://projecteuclid.org/euclid.aos/1016218223
.. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for
multinomial logit models with symmetric side constraints."
Computational Statistics 28 (2013): 1017-1034.
http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf
"""
is_multiclass = True
interval_y_pred = Interval(0, 1, False, False)
def symmetrize_raw_prediction(self, raw_prediction):
return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis]
def link(self, y_pred, out=None):
# geometric mean as reference category
gm = gmean(y_pred, axis=1)
return np.log(y_pred / gm[:, np.newaxis], out=out)
def inverse(self, raw_prediction, out=None):
if out is None:
return softmax(raw_prediction, copy=True)
else:
np.copyto(out, raw_prediction)
softmax(out, copy=False)
return out
_LINKS = {
"identity": IdentityLink,
"log": LogLink,
"logit": LogitLink,
"multinomial_logit": MultinomialLogit,
}
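
A hedged round-trip sketch for the `_LINKS` registry (mirroring `test_link_inverse_identity` below):

import numpy as np
from sklearn._loss.link import _LINKS

link = _LINKS["logit"]()                # g = logit, h = expit
y_pred = np.array([0.25, 0.5, 0.75])
raw_prediction = link.link(y_pred)      # probabilities -> link space
np.testing.assert_allclose(link.inverse(raw_prediction), y_pred)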

File diff suppressed because it is too large.


@@ -0,0 +1,123 @@
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
#
# TODO(1.3): remove file
import numpy as np
from numpy.testing import (
assert_allclose,
assert_array_equal,
)
from scipy.optimize import check_grad
import pytest
from sklearn._loss.glm_distribution import (
TweedieDistribution,
NormalDistribution,
PoissonDistribution,
GammaDistribution,
InverseGaussianDistribution,
DistributionBoundary,
)
@pytest.mark.parametrize(
"family, expected",
[
(NormalDistribution(), [True, True, True]),
(PoissonDistribution(), [False, True, True]),
(TweedieDistribution(power=1.5), [False, True, True]),
(GammaDistribution(), [False, False, True]),
(InverseGaussianDistribution(), [False, False, True]),
(TweedieDistribution(power=4.5), [False, False, True]),
],
)
def test_family_bounds(family, expected):
"""Test the valid range of distributions at -1, 0, 1."""
result = family.in_y_range([-1, 0, 1])
assert_array_equal(result, expected)
def test_invalid_distribution_bound():
dist = TweedieDistribution()
dist._lower_bound = 0
with pytest.raises(TypeError, match="must be of type DistributionBoundary"):
dist.in_y_range([-1, 0, 1])
def test_tweedie_distribution_power():
msg = "distribution is only defined for power<=0 and power>=1"
with pytest.raises(ValueError, match=msg):
TweedieDistribution(power=0.5)
with pytest.raises(TypeError, match="must be a real number"):
TweedieDistribution(power=1j)
with pytest.raises(TypeError, match="must be a real number"):
dist = TweedieDistribution()
dist.power = 1j
dist = TweedieDistribution()
assert isinstance(dist._lower_bound, DistributionBoundary)
assert dist._lower_bound.inclusive is False
dist.power = 1
assert dist._lower_bound.value == 0.0
assert dist._lower_bound.inclusive is True
@pytest.mark.parametrize(
"family, chk_values",
[
(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]),
(PoissonDistribution(), [0.1, 1.5]),
(GammaDistribution(), [0.1, 1.5]),
(InverseGaussianDistribution(), [0.1, 1.5]),
(TweedieDistribution(power=-2.5), [0.1, 1.5]),
(TweedieDistribution(power=-1), [0.1, 1.5]),
(TweedieDistribution(power=1.5), [0.1, 1.5]),
(TweedieDistribution(power=2.5), [0.1, 1.5]),
(TweedieDistribution(power=-4), [0.1, 1.5]),
],
)
def test_deviance_zero(family, chk_values):
"""Test deviance(y,y) = 0 for different families."""
for x in chk_values:
assert_allclose(family.deviance(x, x), 0, atol=1e-9)
@pytest.mark.parametrize(
"family",
[
NormalDistribution(),
PoissonDistribution(),
GammaDistribution(),
InverseGaussianDistribution(),
TweedieDistribution(power=-2.5),
TweedieDistribution(power=-1),
TweedieDistribution(power=1.5),
TweedieDistribution(power=2.5),
TweedieDistribution(power=-4),
],
ids=lambda x: x.__class__.__name__,
)
def test_deviance_derivative(family, global_random_seed):
"""Test deviance derivative for different families."""
rng = np.random.RandomState(global_random_seed)
y_true = rng.rand(10)
# make data positive
y_true += np.abs(y_true.min()) + 1e-2
y_pred = y_true + np.fmax(rng.rand(10), 0.0)
dev = family.deviance(y_true, y_pred)
assert isinstance(dev, float)
dev_derivative = family.deviance_derivative(y_true, y_pred)
assert dev_derivative.shape == y_pred.shape
err = check_grad(
lambda y_pred: family.deviance(y_true, y_pred),
lambda y_pred: family.deviance_derivative(y_true, y_pred),
y_pred,
) / np.linalg.norm(dev_derivative)
assert abs(err) < 3e-6


@@ -0,0 +1,109 @@
import numpy as np
from numpy.testing import assert_allclose, assert_array_equal
import pytest
from sklearn._loss.link import (
_LINKS,
_inclusive_low_high,
MultinomialLogit,
Interval,
)
LINK_FUNCTIONS = list(_LINKS.values())
def test_interval_raises():
"""Test that interval with low > high raises ValueError."""
with pytest.raises(
ValueError, match="One must have low <= high; got low=1, high=0."
):
Interval(1, 0, False, False)
@pytest.mark.parametrize(
"interval",
[
Interval(0, 1, False, False),
Interval(0, 1, False, True),
Interval(0, 1, True, False),
Interval(0, 1, True, True),
Interval(-np.inf, np.inf, False, False),
Interval(-np.inf, np.inf, False, True),
Interval(-np.inf, np.inf, True, False),
Interval(-np.inf, np.inf, True, True),
Interval(-10, -1, False, False),
Interval(-10, -1, False, True),
Interval(-10, -1, True, False),
Interval(-10, -1, True, True),
],
)
def test_is_in_range(interval):
# make sure low and high are always within the interval, used for linspace
low, high = _inclusive_low_high(interval)
x = np.linspace(low, high, num=10)
assert interval.includes(x)
# x contains lower bound
assert interval.includes(np.r_[x, interval.low]) == interval.low_inclusive
# x contains upper bound
assert interval.includes(np.r_[x, interval.high]) == interval.high_inclusive
# x contains upper and lower bound
assert interval.includes(np.r_[x, interval.low, interval.high]) == (
interval.low_inclusive and interval.high_inclusive
)
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_inverse_identity(link, global_random_seed):
# Test that link of inverse gives identity.
rng = np.random.RandomState(global_random_seed)
link = link()
n_samples, n_classes = 100, None
# The values for `raw_prediction` are limited from -20 to 20 because in the
# class `LogitLink` the term `expit(x)` comes very close to 1 for large
# positive x and therefore loses precision.
if link.is_multiclass:
n_classes = 10
raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples, n_classes))
if isinstance(link, MultinomialLogit):
raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
else:
raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples))
assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction)
y_pred = link.inverse(raw_prediction)
assert_allclose(link.inverse(link.link(y_pred)), y_pred)
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_out_argument(link):
# Test that out argument gets assigned the result.
rng = np.random.RandomState(42)
link = link()
n_samples, n_classes = 100, None
if link.is_multiclass:
n_classes = 10
raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes))
if isinstance(link, MultinomialLogit):
raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
else:
# So far, the valid interval of raw_prediction is (-inf, inf) and
# we do not need to distinguish.
raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples))
y_pred = link.inverse(raw_prediction, out=None)
out = np.empty_like(raw_prediction)
y_pred_2 = link.inverse(raw_prediction, out=out)
assert_allclose(y_pred, out)
assert_array_equal(out, y_pred_2)
assert np.shares_memory(out, y_pred_2)
out = np.empty_like(y_pred)
raw_prediction_2 = link.link(y_pred, out=out)
assert_allclose(raw_prediction, out)
assert_array_equal(out, raw_prediction_2)
assert np.shares_memory(out, raw_prediction_2)

File diff suppressed because it is too large.


@@ -0,0 +1,72 @@
"""All minimum dependencies for scikit-learn."""
from collections import defaultdict
import platform
import argparse
# scipy and cython should be in sync with pyproject.toml
# NumPy version should match oldest-supported-numpy for the minimum supported
# Python version.
# see: https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg
if platform.python_implementation() == "PyPy":
NUMPY_MIN_VERSION = "1.19.2"
else:
NUMPY_MIN_VERSION = "1.17.3"
SCIPY_MIN_VERSION = "1.3.2"
JOBLIB_MIN_VERSION = "1.1.1"
THREADPOOLCTL_MIN_VERSION = "2.0.0"
PYTEST_MIN_VERSION = "5.3.1"
CYTHON_MIN_VERSION = "0.29.24"
# 'build' and 'install' are included to have structured metadata for CI.
# They will NOT be included in setup's extras_require.
# The values are (version_spec, comma separated tags)
dependent_packages = {
"numpy": (NUMPY_MIN_VERSION, "build, install"),
"scipy": (SCIPY_MIN_VERSION, "build, install"),
"joblib": (JOBLIB_MIN_VERSION, "install"),
"threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"),
"cython": (CYTHON_MIN_VERSION, "build"),
"matplotlib": ("3.1.3", "benchmark, docs, examples, tests"),
"scikit-image": ("0.16.2", "docs, examples, tests"),
"pandas": ("1.0.5", "benchmark, docs, examples, tests"),
"seaborn": ("0.9.0", "docs, examples"),
"memory_profiler": ("0.57.0", "benchmark, docs"),
"pytest": (PYTEST_MIN_VERSION, "tests"),
"pytest-cov": ("2.9.0", "tests"),
"flake8": ("3.8.2", "tests"),
"black": ("22.3.0", "tests"),
"mypy": ("0.961", "tests"),
"pyamg": ("4.0.0", "tests"),
"sphinx": ("4.0.1", "docs"),
"sphinx-gallery": ("0.7.0", "docs"),
"numpydoc": ("1.2.0", "docs, tests"),
"Pillow": ("7.1.2", "docs"),
"pooch": ("1.6.0", "docs, examples, tests"),
"sphinx-prompt": ("1.3.0", "docs"),
"sphinxext-opengraph": ("0.4.2", "docs"),
"plotly": ("5.10.0", "docs, examples"),
# XXX: Pin conda-lock to the latest released version (needs manual update
# from time to time)
"conda-lock": ("1.3.0", "maintenance"),
}
# create inverse mapping for setuptools
tag_to_packages: dict = defaultdict(list)
for package, (min_version, extras) in dependent_packages.items():
for extra in extras.split(", "):
tag_to_packages[extra].append("{}>={}".format(package, min_version))
# Used by CI to get the min dependencies
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Get min dependencies for a package")
parser.add_argument("package", choices=dependent_packages)
args = parser.parse_args()
min_version = dependent_packages[args.package][0]
print(min_version)
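
A hedged sketch of how the inverse mapping is consumed, for example by setup.py's extras_require; CI instead invokes the module directly (`python sklearn/_min_dependencies.py numpy` prints the pinned minimum):

from sklearn._min_dependencies import tag_to_packages

# Build-time requirements with their pinned minimums (CPython values shown):
print(tag_to_packages["build"])
# ['numpy>=1.17.3', 'scipy>=1.3.2', 'cython>=0.29.24']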

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,54 @@
"""
The :mod:`sklearn.cluster` module gathers popular unsupervised clustering
algorithms.
"""
from ._spectral import spectral_clustering, SpectralClustering
from ._mean_shift import mean_shift, MeanShift, estimate_bandwidth, get_bin_seeds
from ._affinity_propagation import affinity_propagation, AffinityPropagation
from ._agglomerative import (
ward_tree,
AgglomerativeClustering,
linkage_tree,
FeatureAgglomeration,
)
from ._kmeans import k_means, KMeans, MiniBatchKMeans, kmeans_plusplus
from ._bisect_k_means import BisectingKMeans
from ._dbscan import dbscan, DBSCAN
from ._optics import (
OPTICS,
cluster_optics_dbscan,
compute_optics_graph,
cluster_optics_xi,
)
from ._bicluster import SpectralBiclustering, SpectralCoclustering
from ._birch import Birch
__all__ = [
"AffinityPropagation",
"AgglomerativeClustering",
"Birch",
"DBSCAN",
"OPTICS",
"cluster_optics_dbscan",
"cluster_optics_xi",
"compute_optics_graph",
"KMeans",
"BisectingKMeans",
"FeatureAgglomeration",
"MeanShift",
"MiniBatchKMeans",
"SpectralClustering",
"affinity_propagation",
"dbscan",
"estimate_bandwidth",
"get_bin_seeds",
"k_means",
"kmeans_plusplus",
"linkage_tree",
"mean_shift",
"spectral_clustering",
"ward_tree",
"SpectralBiclustering",
"SpectralCoclustering",
]


@@ -0,0 +1,582 @@
"""Affinity Propagation clustering algorithm."""
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
from numbers import Integral, Real
import warnings
import numpy as np
from ..exceptions import ConvergenceWarning
from ..base import BaseEstimator, ClusterMixin
from ..utils import as_float_array, check_random_state
from ..utils._param_validation import Interval, StrOptions
from ..utils.validation import check_is_fitted
from ..metrics import euclidean_distances
from ..metrics import pairwise_distances_argmin
from .._config import config_context
def _equal_similarities_and_preferences(S, preference):
def all_equal_preferences():
return np.all(preference == preference.flat[0])
def all_equal_similarities():
# Create mask to ignore diagonal of S
mask = np.ones(S.shape, dtype=bool)
np.fill_diagonal(mask, 0)
return np.all(S[mask].flat == S[mask].flat[0])
return all_equal_preferences() and all_equal_similarities()
def _affinity_propagation(
S,
*,
preference,
convergence_iter,
max_iter,
damping,
verbose,
return_n_iter,
random_state,
):
"""Main affinity propagation algorithm."""
n_samples = S.shape[0]
if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
# It makes no sense to run the algorithm in this case, so return 1 or
# n_samples clusters, depending on preferences
warnings.warn(
"All samples have mutually equal similarities. "
"Returning arbitrary cluster center(s)."
)
if preference.flat[0] >= S.flat[n_samples - 1]:
return (
(np.arange(n_samples), np.arange(n_samples), 0)
if return_n_iter
else (np.arange(n_samples), np.arange(n_samples))
)
else:
return (
(np.array([0]), np.array([0] * n_samples), 0)
if return_n_iter
else (np.array([0]), np.array([0] * n_samples))
)
# Place preference on the diagonal of S
S.flat[:: (n_samples + 1)] = preference
A = np.zeros((n_samples, n_samples))
R = np.zeros((n_samples, n_samples)) # Initialize messages
# Intermediate results
tmp = np.zeros((n_samples, n_samples))
# Remove degeneracies
S += (
np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100
) * random_state.standard_normal(size=(n_samples, n_samples))
# Execute parallel affinity propagation updates
e = np.zeros((n_samples, convergence_iter))
ind = np.arange(n_samples)
for it in range(max_iter):
# tmp = A + S; compute responsibilities
np.add(A, S, tmp)
I = np.argmax(tmp, axis=1)
Y = tmp[ind, I] # np.max(A + S, axis=1)
tmp[ind, I] = -np.inf
Y2 = np.max(tmp, axis=1)
# tmp = Rnew
np.subtract(S, Y[:, None], tmp)
tmp[ind, I] = S[ind, I] - Y2
# Damping
tmp *= 1 - damping
R *= damping
R += tmp
# tmp = Rp; compute availabilities
np.maximum(R, 0, tmp)
tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1]
# tmp = -Anew
tmp -= np.sum(tmp, axis=0)
dA = np.diag(tmp).copy()
tmp.clip(0, np.inf, tmp)
tmp.flat[:: n_samples + 1] = dA
# Damping
tmp *= 1 - damping
A *= damping
A -= tmp
# Check for convergence
E = (np.diag(A) + np.diag(R)) > 0
e[:, it % convergence_iter] = E
K = np.sum(E, axis=0)
if it >= convergence_iter:
se = np.sum(e, axis=1)
unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples
if (not unconverged and (K > 0)) or (it == max_iter):
never_converged = False
if verbose:
print("Converged after %d iterations." % it)
break
else:
never_converged = True
if verbose:
print("Did not converge")
I = np.flatnonzero(E)
K = I.size # Identify exemplars
if K > 0:
if never_converged:
warnings.warn(
"Affinity propagation did not converge, this model "
"may return degenerate cluster centers and labels.",
ConvergenceWarning,
)
c = np.argmax(S[:, I], axis=1)
c[I] = np.arange(K) # Identify clusters
# Refine the final set of exemplars and clusters and return results
for k in range(K):
ii = np.where(c == k)[0]
j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
I[k] = ii[j]
c = np.argmax(S[:, I], axis=1)
c[I] = np.arange(K)
labels = I[c]
# Reduce labels to a sorted, gapless list
cluster_centers_indices = np.unique(labels)
labels = np.searchsorted(cluster_centers_indices, labels)
else:
warnings.warn(
"Affinity propagation did not converge and this model "
"will not have any cluster centers.",
ConvergenceWarning,
)
labels = np.array([-1] * n_samples)
cluster_centers_indices = []
if return_n_iter:
return cluster_centers_indices, labels, it + 1
else:
return cluster_centers_indices, labels
###############################################################################
# Public API
def affinity_propagation(
S,
*,
preference=None,
convergence_iter=15,
max_iter=200,
damping=0.5,
copy=True,
verbose=False,
return_n_iter=False,
random_state=None,
):
"""Perform Affinity Propagation Clustering of data.
Read more in the :ref:`User Guide <affinity_propagation>`.
Parameters
----------
S : array-like of shape (n_samples, n_samples)
Matrix of similarities between points.
preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number of
exemplars, i.e. of clusters, is influenced by the input preferences
value. If the preferences are not passed as arguments, they will be
set to the median of the input similarities (resulting in a moderate
number of clusters). For a smaller amount of clusters, this can be set
to the minimum value of the similarities.
convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.
max_iter : int, default=200
Maximum number of iterations.
damping : float, default=0.5
Damping factor between 0.5 and 1.
copy : bool, default=True
If copy is False, the affinity matrix is modified inplace by the
algorithm, for memory efficiency.
verbose : bool, default=False
The verbosity level.
return_n_iter : bool, default=False
Whether or not to return the number of iterations.
random_state : int, RandomState instance or None, default=None
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.
.. versionadded:: 0.23
this parameter was previously hardcoded as 0.
Returns
-------
cluster_centers_indices : ndarray of shape (n_clusters,)
Index of clusters centers.
labels : ndarray of shape (n_samples,)
Cluster labels for each point.
n_iter : int
Number of iterations run. Returned only if `return_n_iter` is
set to True.
Notes
-----
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
When the algorithm does not converge, it will still return arrays of
``cluster_centers_indices`` and labels if there are any exemplars/clusters;
however, they may be degenerate and should be used with caution.
When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, a single cluster center
and label ``0`` for every sample will be returned. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.
References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
"""
S = as_float_array(S, copy=copy)
estimator = AffinityPropagation(
damping=damping,
max_iter=max_iter,
convergence_iter=convergence_iter,
copy=False,
preference=preference,
affinity="precomputed",
verbose=verbose,
random_state=random_state,
).fit(S)
if return_n_iter:
return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_
return estimator.cluster_centers_indices_, estimator.labels_
class AffinityPropagation(ClusterMixin, BaseEstimator):
"""Perform Affinity Propagation Clustering of data.
Read more in the :ref:`User Guide <affinity_propagation>`.
Parameters
----------
damping : float, default=0.5
Damping factor in the range `[0.5, 1.0)` is the extent to
which the current value is maintained relative to
incoming values (weighted `1 - damping`). This avoids
numerical oscillations when updating these
values (messages).
max_iter : int, default=200
Maximum number of iterations.
convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.
copy : bool, default=True
Make a copy of input data.
preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number
of exemplars, i.e. of clusters, is influenced by the input
preferences value. If the preferences are not passed as arguments,
they will be set to the median of the input similarities.
affinity : {'euclidean', 'precomputed'}, default='euclidean'
Which affinity to use. At the moment 'precomputed' and
'euclidean' are supported. 'euclidean' uses the
negative squared euclidean distance between points.
verbose : bool, default=False
Whether to be verbose.
random_state : int, RandomState instance or None, default=None
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.
.. versionadded:: 0.23
this parameter was previously hardcoded as 0.
Attributes
----------
cluster_centers_indices_ : ndarray of shape (n_clusters,)
Indices of cluster centers.
cluster_centers_ : ndarray of shape (n_clusters, n_features)
Cluster centers (if affinity != ``precomputed``).
labels_ : ndarray of shape (n_samples,)
Labels of each point.
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the affinity matrix used in ``fit``.
n_iter_ : int
Number of iterations taken to converge.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
AgglomerativeClustering : Recursively merges the pair of
clusters that minimally increases a given linkage distance.
FeatureAgglomeration : Similar to AgglomerativeClustering,
but recursively merges features instead of samples.
KMeans : K-Means clustering.
MiniBatchKMeans : Mini-Batch K-Means clustering.
MeanShift : Mean shift clustering using a flat kernel.
SpectralClustering : Apply clustering to a projection
of the normalized Laplacian.
Notes
-----
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
The algorithmic complexity of affinity propagation is quadratic
in the number of points.
When the algorithm does not converge, it will still return arrays of
``cluster_centers_indices`` and labels if there are any exemplars/clusters;
however, they may be degenerate and should be used with caution.
When ``fit`` does not converge, ``cluster_centers_`` is still populated
however it may be degenerate. In such a case, proceed with caution.
If ``fit`` does not converge and fails to produce any ``cluster_centers_``
then ``predict`` will label every sample as ``-1``.
When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, ``fit`` will result in
a single cluster center and label ``0`` for every sample. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.
References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
Examples
--------
>>> from sklearn.cluster import AffinityPropagation
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [4, 2], [4, 4], [4, 0]])
>>> clustering = AffinityPropagation(random_state=5).fit(X)
>>> clustering
AffinityPropagation(random_state=5)
>>> clustering.labels_
array([0, 0, 0, 1, 1, 1])
>>> clustering.predict([[0, 0], [4, 4]])
array([0, 1])
>>> clustering.cluster_centers_
array([[1, 2],
[4, 2]])
"""
_parameter_constraints: dict = {
"damping": [Interval(Real, 0.5, 1.0, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"convergence_iter": [Interval(Integral, 1, None, closed="left")],
"copy": ["boolean"],
"preference": [
"array-like",
Interval(Real, None, None, closed="neither"),
None,
],
"affinity": [StrOptions({"euclidean", "precomputed"})],
"verbose": ["verbose"],
"random_state": ["random_state"],
}
def __init__(
self,
*,
damping=0.5,
max_iter=200,
convergence_iter=15,
copy=True,
preference=None,
affinity="euclidean",
verbose=False,
random_state=None,
):
self.damping = damping
self.max_iter = max_iter
self.convergence_iter = convergence_iter
self.copy = copy
self.verbose = verbose
self.preference = preference
self.affinity = affinity
self.random_state = random_state
def _more_tags(self):
return {"pairwise": self.affinity == "precomputed"}
def fit(self, X, y=None):
"""Fit the clustering from features, or affinity matrix.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
array-like of shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse feature matrix
is provided, it will be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Returns the instance itself.
"""
self._validate_params()
if self.affinity == "precomputed":
accept_sparse = False
else:
accept_sparse = "csr"
X = self._validate_data(X, accept_sparse=accept_sparse)
if self.affinity == "precomputed":
self.affinity_matrix_ = X.copy() if self.copy else X
else: # self.affinity == "euclidean"
self.affinity_matrix_ = -euclidean_distances(X, squared=True)
if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:
raise ValueError(
"The matrix of similarities must be a square array. "
f"Got {self.affinity_matrix_.shape} instead."
)
if self.preference is None:
preference = np.median(self.affinity_matrix_)
else:
preference = self.preference
preference = np.array(preference, copy=False)
random_state = check_random_state(self.random_state)
(
self.cluster_centers_indices_,
self.labels_,
self.n_iter_,
) = _affinity_propagation(
self.affinity_matrix_,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter,
preference=preference,
damping=self.damping,
verbose=self.verbose,
return_n_iter=True,
random_state=random_state,
)
if self.affinity != "precomputed":
self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
return self
def predict(self, X):
"""Predict the closest cluster each sample in X belongs to.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
New data to predict. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels.
"""
check_is_fitted(self)
X = self._validate_data(X, reset=False, accept_sparse="csr")
if not hasattr(self, "cluster_centers_"):
raise ValueError(
"Predict method is not supported when affinity='precomputed'."
)
if self.cluster_centers_.shape[0] > 0:
with config_context(assume_finite=True):
return pairwise_distances_argmin(X, self.cluster_centers_)
else:
warnings.warn(
"This model does not have any cluster centers "
"because affinity propagation did not converge. "
"Labeling every sample as '-1'.",
ConvergenceWarning,
)
return np.array([-1] * X.shape[0])
def fit_predict(self, X, y=None):
"""Fit clustering from features/affinity matrix; return cluster labels.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
array-like of shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse feature matrix
is provided, it will be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels.
"""
return super().fit_predict(X, y)
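
A short usage sketch for the functional API defined above, with a precomputed similarity matrix (assumes scikit-learn and NumPy are installed; the data mirrors the class docstring example):

import numpy as np
from sklearn.cluster import affinity_propagation
from sklearn.metrics import euclidean_distances

X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]], dtype=float)
S = -euclidean_distances(X, squared=True)  # negative squared Euclidean distances
centers, labels = affinity_propagation(S, random_state=5)
print(centers, labels)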

File diff suppressed because it is too large


@@ -0,0 +1,628 @@
"""Spectral biclustering algorithms."""
# Authors: Kemal Eren
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
import numpy as np
from numbers import Integral
from scipy.linalg import norm
from scipy.sparse import dia_matrix, issparse
from scipy.sparse.linalg import eigsh, svds
from . import KMeans, MiniBatchKMeans
from ..base import BaseEstimator, BiclusterMixin
from ..utils import check_random_state
from ..utils import check_scalar
from ..utils.extmath import make_nonnegative, randomized_svd, safe_sparse_dot
from ..utils.validation import assert_all_finite
from ..utils._param_validation import Interval, StrOptions
__all__ = ["SpectralCoclustering", "SpectralBiclustering"]
def _scale_normalize(X):
"""Normalize ``X`` by scaling rows and columns independently.
Returns the normalized matrix and the row and column scaling
factors.
"""
X = make_nonnegative(X)
row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()
row_diag = np.where(np.isnan(row_diag), 0, row_diag)
col_diag = np.where(np.isnan(col_diag), 0, col_diag)
if issparse(X):
n_rows, n_cols = X.shape
r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
an = r * X * c
else:
an = row_diag[:, np.newaxis] * X * col_diag
return an, row_diag, col_diag
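# Illustration (not part of the original file): a hedged, doctest-style
# sketch of the dense path above, assuming numpy is imported as np (as in
# this module). The output equals diag(row_diag) @ X @ diag(col_diag):
#
#     >>> A = np.array([[1.0, 3.0], [2.0, 2.0]])
#     >>> an, row_diag, col_diag = _scale_normalize(A)
#     >>> bool(np.allclose(an, row_diag[:, np.newaxis] * A * col_diag))
#     True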
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
"""Normalize rows and columns of ``X`` simultaneously so that all
rows sum to one constant and all columns sum to a different
constant.
"""
# According to paper, this can also be done more efficiently with
# deviation reduction and balancing algorithms.
X = make_nonnegative(X)
X_scaled = X
for _ in range(max_iter):
X_new, _, _ = _scale_normalize(X_scaled)
if issparse(X):
dist = norm(X_scaled.data - X_new.data)
else:
dist = norm(X_scaled - X_new)
X_scaled = X_new
if dist is not None and dist < tol:
break
return X_scaled
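# Illustration (not part of the original file): after convergence, all row
# sums are approximately one constant and all column sums another. A hedged
# doctest-style sketch, assuming numpy is imported as np:
#
#     >>> rng = np.random.RandomState(0)
#     >>> Xs = _bistochastic_normalize(rng.rand(5, 3))
#     >>> bool(np.ptp(Xs.sum(axis=1)) < 1e-3 and np.ptp(Xs.sum(axis=0)) < 1e-3)
#     True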
def _log_normalize(X):
"""Normalize ``X`` according to Kluger's log-interactions scheme."""
X = make_nonnegative(X, min_value=1)
if issparse(X):
raise ValueError(
"Cannot compute log of a sparse matrix,"
" because log(x) diverges to -infinity as x"
" goes to 0."
)
L = np.log(X)
row_avg = L.mean(axis=1)[:, np.newaxis]
col_avg = L.mean(axis=0)
avg = L.mean()
return L - row_avg - col_avg + avg
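# Illustration (not part of the original file): the log-interactions
# transform above is doubly centered, so every row and column mean of the
# result is (numerically) zero. A hedged doctest-style sketch, assuming
# numpy is imported as np:
#
#     >>> L = _log_normalize(np.arange(1.0, 13.0).reshape(4, 3))
#     >>> bool(np.allclose(L.mean(axis=0), 0) and np.allclose(L.mean(axis=1), 0))
#     True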
class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
"""Base class for spectral biclustering."""
_parameter_constraints: dict = {
"svd_method": [StrOptions({"randomized", "arpack"})],
"n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None],
"mini_batch": ["boolean"],
"init": [StrOptions({"k-means++", "random"}), np.ndarray],
"n_init": [Interval(Integral, 1, None, closed="left")],
"random_state": ["random_state"],
}
@abstractmethod
def __init__(
self,
n_clusters=3,
svd_method="randomized",
n_svd_vecs=None,
mini_batch=False,
init="k-means++",
n_init=10,
random_state=None,
):
self.n_clusters = n_clusters
self.svd_method = svd_method
self.n_svd_vecs = n_svd_vecs
self.mini_batch = mini_batch
self.init = init
self.n_init = n_init
self.random_state = random_state
@abstractmethod
def _check_parameters(self, n_samples):
"""Validate parameters depending on the input data."""
def fit(self, X, y=None):
"""Create a biclustering for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
SpectralBiclustering instance.
"""
self._validate_params()
X = self._validate_data(X, accept_sparse="csr", dtype=np.float64)
self._check_parameters(X.shape[0])
self._fit(X)
return self
def _svd(self, array, n_components, n_discard):
"""Returns first `n_components` left and right singular
vectors u and v, discarding the first `n_discard`.
"""
if self.svd_method == "randomized":
kwargs = {}
if self.n_svd_vecs is not None:
kwargs["n_oversamples"] = self.n_svd_vecs
u, _, vt = randomized_svd(
array, n_components, random_state=self.random_state, **kwargs
)
elif self.svd_method == "arpack":
u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
if np.any(np.isnan(vt)):
# some eigenvalues of A * A.T are negative, causing
# sqrt() to be np.nan. This causes some vectors in vt
# to be np.nan.
A = safe_sparse_dot(array.T, array)
random_state = check_random_state(self.random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, A.shape[0])
_, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
vt = v.T
if np.any(np.isnan(u)):
A = safe_sparse_dot(array, array.T)
random_state = check_random_state(self.random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, A.shape[0])
_, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
assert_all_finite(u)
assert_all_finite(vt)
u = u[:, n_discard:]
vt = vt[n_discard:]
return u, vt.T
def _k_means(self, data, n_clusters):
if self.mini_batch:
model = MiniBatchKMeans(
n_clusters,
init=self.init,
n_init=self.n_init,
random_state=self.random_state,
)
else:
model = KMeans(
n_clusters,
init=self.init,
n_init=self.n_init,
random_state=self.random_state,
)
model.fit(data)
centroid = model.cluster_centers_
labels = model.labels_
return centroid, labels
def _more_tags(self):
return {
"_xfail_checks": {
"check_estimators_dtypes": "raises nan error",
"check_fit2d_1sample": "_scale_normalize fails",
"check_fit2d_1feature": "raises apply_along_axis error",
"check_estimator_sparse_data": "does not fail gracefully",
"check_methods_subset_invariance": "empty array passed inside",
"check_dont_overwrite_parameters": "empty array passed inside",
"check_fit2d_predict1d": "empty array passed inside",
}
}
class SpectralCoclustering(BaseSpectral):
"""Spectral Co-Clustering algorithm (Dhillon, 2001).
Clusters rows and columns of an array `X` to solve the relaxed
normalized cut of the bipartite graph created from `X` as follows:
the edge between row vertex `i` and column vertex `j` has weight
`X[i, j]`.
The resulting bicluster structure is block-diagonal, since each
row and each column belongs to exactly one bicluster.
Supports sparse matrices, as long as they are nonnegative.
Read more in the :ref:`User Guide <spectral_coclustering>`.
Parameters
----------
n_clusters : int, default=3
The number of biclusters to find.
svd_method : {'randomized', 'arpack'}, default='randomized'
Selects the algorithm for finding singular vectors. May be
'randomized' or 'arpack'. If 'randomized', use
:func:`sklearn.utils.extmath.randomized_svd`, which may be faster
for large matrices. If 'arpack', use
:func:`scipy.sparse.linalg.svds`, which is more accurate, but
possibly slower in some cases.
n_svd_vecs : int, default=None
Number of vectors to use in calculating the SVD. Corresponds
to `ncv` when `svd_method='arpack'` and to `n_oversamples` when
`svd_method='randomized'`.
mini_batch : bool, default=False
Whether to use mini-batch k-means, which is faster but may get
different results.
init : {'k-means++', 'random'}, or ndarray of shape \
(n_clusters, n_features), default='k-means++'
Method for initialization of k-means algorithm; defaults to
'k-means++'.
n_init : int, default=10
Number of random initializations that are tried with the
k-means algorithm.
If mini-batch k-means is used, the best initialization is
chosen and the algorithm runs once. Otherwise, the algorithm
is run for each initialization and the best solution chosen.
random_state : int, RandomState instance, default=None
Used for randomizing the singular value decomposition and the k-means
initialization. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Attributes
----------
rows_ : array-like of shape (n_row_clusters, n_rows)
Results of the clustering. `rows[i, r]` is True if
cluster `i` contains row `r`. Available only after calling ``fit``.
columns_ : array-like of shape (n_column_clusters, n_columns)
Results of the clustering, like `rows`.
row_labels_ : array-like of shape (n_rows,)
The bicluster label of each row.
column_labels_ : array-like of shape (n_cols,)
The bicluster label of each column.
biclusters_ : tuple of two ndarrays
The tuple contains the `rows_` and `columns_` arrays.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
SpectralBiclustering : Partitions rows and columns under the assumption
that the data has an underlying checkerboard structure.
References
----------
* :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using
bipartite spectral graph partitioning.
<10.1145/502512.502550>`
Examples
--------
>>> from sklearn.cluster import SpectralCoclustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
>>> clustering.row_labels_ #doctest: +SKIP
array([0, 1, 1, 0, 0, 0], dtype=int32)
>>> clustering.column_labels_ #doctest: +SKIP
array([0, 0], dtype=int32)
>>> clustering
SpectralCoclustering(n_clusters=2, random_state=0)
"""
_parameter_constraints: dict = {
**BaseSpectral._parameter_constraints,
"n_clusters": [Interval(Integral, 1, None, closed="left")],
}
def __init__(
self,
n_clusters=3,
*,
svd_method="randomized",
n_svd_vecs=None,
mini_batch=False,
init="k-means++",
n_init=10,
random_state=None,
):
super().__init__(
n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
)
def _check_parameters(self, n_samples):
if self.n_clusters > n_samples:
raise ValueError(
f"n_clusters should be <= n_samples={n_samples}. Got"
f" {self.n_clusters} instead."
)
def _fit(self, X):
normalized_data, row_diag, col_diag = _scale_normalize(X)
n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
u, v = self._svd(normalized_data, n_sv, n_discard=1)
z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))
_, labels = self._k_means(z, self.n_clusters)
n_rows = X.shape[0]
self.row_labels_ = labels[:n_rows]
self.column_labels_ = labels[n_rows:]
self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])
self.columns_ = np.vstack(
[self.column_labels_ == c for c in range(self.n_clusters)]
)
class SpectralBiclustering(BaseSpectral):
"""Spectral biclustering (Kluger, 2003).
Partitions rows and columns under the assumption that the data has
an underlying checkerboard structure. For instance, if there are
two row partitions and three column partitions, each row will
belong to three biclusters, and each column will belong to two
biclusters. The outer product of the corresponding row and column
label vectors gives this checkerboard structure.
Read more in the :ref:`User Guide <spectral_biclustering>`.
Parameters
----------
n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
The number of row and column clusters in the checkerboard
structure.
method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
Method of normalizing and converting singular vectors into
biclusters. May be one of 'scale', 'bistochastic', or 'log'.
The authors recommend using 'log'. If the data is sparse,
however, log normalization will not work, which is why the
default is 'bistochastic'.
.. warning::
if `method='log'`, the data must not be sparse.
n_components : int, default=6
Number of singular vectors to check.
n_best : int, default=3
Number of best singular vectors to which to project the data
for clustering.
svd_method : {'randomized', 'arpack'}, default='randomized'
Selects the algorithm for finding singular vectors. May be
'randomized' or 'arpack'. If 'randomized', uses
:func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
for large matrices. If 'arpack', uses
`scipy.sparse.linalg.svds`, which is more accurate, but
possibly slower in some cases.
n_svd_vecs : int, default=None
Number of vectors to use in calculating the SVD. Corresponds
to `ncv` when `svd_method='arpack'` and to `n_oversamples` when
`svd_method='randomized'`.
mini_batch : bool, default=False
Whether to use mini-batch k-means, which is faster but may get
different results.
init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \
default='k-means++'
Method for initialization of k-means algorithm; defaults to
'k-means++'.
n_init : int, default=10
Number of random initializations that are tried with the
k-means algorithm.
If mini-batch k-means is used, the best initialization is
chosen and the algorithm runs once. Otherwise, the algorithm
is run for each initialization and the best solution chosen.
random_state : int, RandomState instance, default=None
Used for randomizing the singular value decomposition and the k-means
initialization. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Attributes
----------
rows_ : array-like of shape (n_row_clusters, n_rows)
Results of the clustering. `rows[i, r]` is True if
cluster `i` contains row `r`. Available only after calling ``fit``.
columns_ : array-like of shape (n_column_clusters, n_columns)
Results of the clustering, like `rows`.
row_labels_ : array-like of shape (n_rows,)
Row partition labels.
column_labels_ : array-like of shape (n_cols,)
Column partition labels.
biclusters_ : tuple of two ndarrays
The tuple contains the `rows_` and `columns_` arrays.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001).
References
----------
* :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray
data: coclustering genes and conditions.
<10.1101/gr.648603>`
Examples
--------
>>> from sklearn.cluster import SpectralBiclustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
>>> clustering.row_labels_
array([1, 1, 1, 0, 0, 0], dtype=int32)
>>> clustering.column_labels_
array([0, 1], dtype=int32)
>>> clustering
SpectralBiclustering(n_clusters=2, random_state=0)
"""
_parameter_constraints: dict = {
**BaseSpectral._parameter_constraints,
"n_clusters": [Interval(Integral, 1, None, closed="left"), tuple],
"method": [StrOptions({"bistochastic", "scale", "log"})],
"n_components": [Interval(Integral, 1, None, closed="left")],
"n_best": [Interval(Integral, 1, None, closed="left")],
}
def __init__(
self,
n_clusters=3,
*,
method="bistochastic",
n_components=6,
n_best=3,
svd_method="randomized",
n_svd_vecs=None,
mini_batch=False,
init="k-means++",
n_init=10,
random_state=None,
):
super().__init__(
n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
)
self.method = method
self.n_components = n_components
self.n_best = n_best
def _check_parameters(self, n_samples):
if isinstance(self.n_clusters, Integral):
if self.n_clusters > n_samples:
raise ValueError(
f"n_clusters should be <= n_samples={n_samples}. Got"
f" {self.n_clusters} instead."
)
else: # tuple
try:
n_row_clusters, n_column_clusters = self.n_clusters
check_scalar(
n_row_clusters,
"n_row_clusters",
target_type=Integral,
min_val=1,
max_val=n_samples,
)
check_scalar(
n_column_clusters,
"n_column_clusters",
target_type=Integral,
min_val=1,
max_val=n_samples,
)
except (ValueError, TypeError) as e:
raise ValueError(
"Incorrect parameter n_clusters has value:"
f" {self.n_clusters}. It should either be a single integer"
" or an iterable with two integers:"
" (n_row_clusters, n_column_clusters)"
" And the values are should be in the"
" range: (1, n_samples)"
) from e
if self.n_best > self.n_components:
raise ValueError(
f"n_best={self.n_best} must be <= n_components={self.n_components}."
)
def _fit(self, X):
n_sv = self.n_components
if self.method == "bistochastic":
normalized_data = _bistochastic_normalize(X)
n_sv += 1
elif self.method == "scale":
normalized_data, _, _ = _scale_normalize(X)
n_sv += 1
elif self.method == "log":
normalized_data = _log_normalize(X)
n_discard = 0 if self.method == "log" else 1
u, v = self._svd(normalized_data, n_sv, n_discard)
ut = u.T
vt = v.T
try:
n_row_clusters, n_col_clusters = self.n_clusters
except TypeError:
n_row_clusters = n_col_clusters = self.n_clusters
best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters)
best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters)
self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters)
self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters)
self.rows_ = np.vstack(
[
self.row_labels_ == label
for label in range(n_row_clusters)
for _ in range(n_col_clusters)
]
)
self.columns_ = np.vstack(
[
self.column_labels_ == label
for _ in range(n_row_clusters)
for label in range(n_col_clusters)
]
)
def _fit_best_piecewise(self, vectors, n_best, n_clusters):
"""Find the ``n_best`` vectors that are best approximated by piecewise
constant vectors.
The piecewise vectors are found by k-means; the best is chosen
according to Euclidean distance.
"""
def make_piecewise(v):
centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters)
return centroid[labels].ravel()
piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors)
dists = np.apply_along_axis(norm, axis=1, arr=(vectors - piecewise_vectors))
result = vectors[np.argsort(dists)[:n_best]]
return result
def _project_and_cluster(self, data, vectors, n_clusters):
"""Project ``data`` to ``vectors`` and cluster the result."""
projected = safe_sparse_dot(data, vectors)
_, labels = self._k_means(projected, n_clusters)
return labels
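
A brief usage sketch for the biclustering estimators above on synthetic checkerboard data (assumes scikit-learn is installed; the parameters are illustrative only):

import numpy as np
from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard

data, rows, cols = make_checkerboard(
    shape=(30, 30), n_clusters=(3, 2), noise=0.5, random_state=0
)
model = SpectralBiclustering(n_clusters=(3, 2), method="log", random_state=0)
model.fit(data)
print(np.unique(model.row_labels_), np.unique(model.column_labels_))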


@@ -0,0 +1,742 @@
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# Joel Nothman <joel.nothman@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
from numbers import Integral, Real
from scipy import sparse
from math import sqrt
from ..metrics import pairwise_distances_argmin
from ..metrics.pairwise import euclidean_distances
from ..base import (
TransformerMixin,
ClusterMixin,
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
)
from ..utils.extmath import row_norms
from ..utils._param_validation import Interval
from ..utils.validation import check_is_fitted
from ..exceptions import ConvergenceWarning
from . import AgglomerativeClustering
from .._config import config_context
def _iterate_sparse_X(X):
"""This little hack returns a densified row when iterating over a sparse
matrix, instead of constructing a sparse matrix for every row, which is
expensive.
"""
n_samples = X.shape[0]
X_indices = X.indices
X_data = X.data
X_indptr = X.indptr
for i in range(n_samples):
row = np.zeros(X.shape[1])
startptr, endptr = X_indptr[i], X_indptr[i + 1]
nonzero_indices = X_indices[startptr:endptr]
row[nonzero_indices] = X_data[startptr:endptr]
yield row
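# Illustration (not part of the original file): each iteration yields one
# densified row of the sparse input. A hedged doctest-style sketch, using
# the numpy (np) and scipy.sparse (sparse) imports of this module:
#
#     >>> Xs = sparse.csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0]]))
#     >>> [row.tolist() for row in _iterate_sparse_X(Xs)]
#     [[0.0, 1.0], [2.0, 0.0]]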
def _split_node(node, threshold, branching_factor):
"""The node has to be split if there is no place for a new subcluster
in the node.
1. Two empty nodes and two empty subclusters are initialized.
2. The pair of most distant subclusters is found.
3. The properties of the empty subclusters and nodes are updated
according to which of the two distant subclusters each existing
subcluster is nearest to.
4. The two nodes are set as children to the two subclusters.
"""
new_subcluster1 = _CFSubcluster()
new_subcluster2 = _CFSubcluster()
new_node1 = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=node.is_leaf,
n_features=node.n_features,
dtype=node.init_centroids_.dtype,
)
new_node2 = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=node.is_leaf,
n_features=node.n_features,
dtype=node.init_centroids_.dtype,
)
new_subcluster1.child_ = new_node1
new_subcluster2.child_ = new_node2
if node.is_leaf:
if node.prev_leaf_ is not None:
node.prev_leaf_.next_leaf_ = new_node1
new_node1.prev_leaf_ = node.prev_leaf_
new_node1.next_leaf_ = new_node2
new_node2.prev_leaf_ = new_node1
new_node2.next_leaf_ = node.next_leaf_
if node.next_leaf_ is not None:
node.next_leaf_.prev_leaf_ = new_node2
dist = euclidean_distances(
node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
)
n_clusters = dist.shape[0]
farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
node1_dist, node2_dist = dist[(farthest_idx,)]
node1_closer = node1_dist < node2_dist
# make sure node1 is closest to itself even if all distances are equal.
# This can only happen when all node.centroids_ are duplicates leading to all
# distances between centroids being zero.
node1_closer[farthest_idx[0]] = True
for idx, subcluster in enumerate(node.subclusters_):
if node1_closer[idx]:
new_node1.append_subcluster(subcluster)
new_subcluster1.update(subcluster)
else:
new_node2.append_subcluster(subcluster)
new_subcluster2.update(subcluster)
return new_subcluster1, new_subcluster2
class _CFNode:
"""Each node in a CFTree is called a CFNode.
The CFNode can have a maximum of branching_factor
number of CFSubclusters.
Parameters
----------
threshold : float
Threshold needed for a new subcluster to enter a CFSubcluster.
branching_factor : int
Maximum number of CF subclusters in each node.
is_leaf : bool
We need to know if the CFNode is a leaf or not, in order to
retrieve the final subclusters.
n_features : int
The number of features.
Attributes
----------
subclusters_ : list
List of subclusters for a particular CFNode.
prev_leaf_ : _CFNode
Useful only if is_leaf is True.
next_leaf_ : _CFNode
Useful only if is_leaf is True; used to retrieve
the final subclusters.
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
Manipulate ``init_centroids_`` throughout rather than centroids_ since
the centroids are just a view of the ``init_centroids_`` .
init_sq_norm_ : ndarray of shape (branching_factor + 1,)
manipulate init_sq_norm_ throughout. similar to ``init_centroids_``.
centroids_ : ndarray of shape (branching_factor + 1, n_features)
View of ``init_centroids_``.
squared_norm_ : ndarray of shape (branching_factor + 1,)
View of ``init_sq_norm_``.
"""
def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
self.threshold = threshold
self.branching_factor = branching_factor
self.is_leaf = is_leaf
self.n_features = n_features
# The list of subclusters, centroids and squared norms
# to manipulate throughout.
self.subclusters_ = []
self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
self.squared_norm_ = []
self.prev_leaf_ = None
self.next_leaf_ = None
def append_subcluster(self, subcluster):
n_samples = len(self.subclusters_)
self.subclusters_.append(subcluster)
self.init_centroids_[n_samples] = subcluster.centroid_
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
# Keep centroids and squared norm as views. In this way,
# updating init_centroids_ and init_sq_norm_ is
# sufficient.
self.centroids_ = self.init_centroids_[: n_samples + 1, :]
self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]
def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
"""Remove a subcluster from a node and update it with the
split subclusters.
"""
ind = self.subclusters_.index(subcluster)
self.subclusters_[ind] = new_subcluster1
self.init_centroids_[ind] = new_subcluster1.centroid_
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
self.append_subcluster(new_subcluster2)
def insert_cf_subcluster(self, subcluster):
"""Insert a new subcluster into the node."""
if not self.subclusters_:
self.append_subcluster(subcluster)
return False
threshold = self.threshold
branching_factor = self.branching_factor
# We need to find the closest subcluster among all the
# subclusters so that we can insert our new subcluster.
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
dist_matrix *= -2.0
dist_matrix += self.squared_norm_
closest_index = np.argmin(dist_matrix)
closest_subcluster = self.subclusters_[closest_index]
# If the subcluster has a child, we need a recursive strategy.
if closest_subcluster.child_ is not None:
split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)
if not split_child:
# If it is determined that the child need not be split, we
# can just update the closest_subcluster
closest_subcluster.update(subcluster)
self.init_centroids_[closest_index] = self.subclusters_[
closest_index
].centroid_
self.init_sq_norm_[closest_index] = self.subclusters_[
closest_index
].sq_norm_
return False
# Things are not so good: we need to redistribute the subclusters in
# our child node, and add a new subcluster in the parent
# subcluster to accommodate the new child.
else:
new_subcluster1, new_subcluster2 = _split_node(
closest_subcluster.child_,
threshold,
branching_factor,
)
self.update_split_subclusters(
closest_subcluster, new_subcluster1, new_subcluster2
)
if len(self.subclusters_) > self.branching_factor:
return True
return False
# good to go!
else:
merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
if merged:
self.init_centroids_[closest_index] = closest_subcluster.centroid_
self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
return False
# not close to any other subclusters, and we still
# have space, so add.
elif len(self.subclusters_) < self.branching_factor:
self.append_subcluster(subcluster)
return False
# We do not have enough space nor is it closer to an
# other subcluster. We need to split.
else:
self.append_subcluster(subcluster)
return True
class _CFSubcluster:
"""Each subcluster in a CFNode is called a CFSubcluster.
A CFSubcluster can have a CFNode as its child.
Parameters
----------
linear_sum : ndarray of shape (n_features,), default=None
Sample. This is kept optional to allow initialization of empty
subclusters.
Attributes
----------
n_samples_ : int
Number of samples that belong to each subcluster.
linear_sum_ : ndarray
Linear sum of all the samples in a subcluster. Prevents holding
all sample data in memory.
squared_sum_ : float
Sum of the squared l2 norms of all samples belonging to a subcluster.
centroid_ : ndarray of shape (n_features,)
Centroid of the subcluster. Prevents recomputing of centroids when
``CFNode.centroids_`` is called.
child_ : _CFNode
Child Node of the subcluster. Once a given _CFNode is set as the child
of the _CFNode, it is set to ``self.child_``.
sq_norm_ : float
Squared norm of the centroid. Used to prevent recomputing when
pairwise minimum distances are computed.
"""
def __init__(self, *, linear_sum=None):
if linear_sum is None:
self.n_samples_ = 0
self.squared_sum_ = 0.0
self.centroid_ = self.linear_sum_ = 0
else:
self.n_samples_ = 1
self.centroid_ = self.linear_sum_ = linear_sum
self.squared_sum_ = self.sq_norm_ = np.dot(
self.linear_sum_, self.linear_sum_
)
self.child_ = None
def update(self, subcluster):
self.n_samples_ += subcluster.n_samples_
self.linear_sum_ += subcluster.linear_sum_
self.squared_sum_ += subcluster.squared_sum_
self.centroid_ = self.linear_sum_ / self.n_samples_
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
def merge_subcluster(self, nominee_cluster, threshold):
"""Check if a cluster is worthy enough to be merged. If
yes then merge.
"""
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
new_n = self.n_samples_ + nominee_cluster.n_samples_
new_centroid = (1 / new_n) * new_ls
new_sq_norm = np.dot(new_centroid, new_centroid)
# The squared radius of the cluster is defined:
# r^2 = sum_i ||x_i - c||^2 / n
# with x_i the n points assigned to the cluster and c its centroid:
# c = sum_i x_i / n
# This can be expanded to:
# r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n
# and therefore simplifies to:
# r^2 = sum_i ||x_i||^2 / n - ||c||^2
sq_radius = new_ss / new_n - new_sq_norm
if sq_radius <= threshold**2:
(
self.n_samples_,
self.linear_sum_,
self.squared_sum_,
self.centroid_,
self.sq_norm_,
) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
return True
return False
@property
def radius(self):
"""Return radius of the subcluster"""
# Because of numerical issues, this could become negative
sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_
return sqrt(max(0, sq_radius))
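# Illustration (not part of the original file): a numeric check of the
# radius expansion used in merge_subcluster above,
# r^2 = sum_i ||x_i||^2 / n - ||c||^2. A hedged doctest-style sketch,
# assuming numpy is imported as np:
#
#     >>> pts = np.array([[0.0, 0.0], [2.0, 0.0]])
#     >>> c = pts.mean(axis=0)
#     >>> direct = np.mean(((pts - c) ** 2).sum(axis=1))
#     >>> expanded = np.mean((pts ** 2).sum(axis=1)) - np.dot(c, c)
#     >>> bool(np.isclose(direct, expanded))
#     True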
class Birch(
ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
):
"""Implements the BIRCH clustering algorithm.
It is a memory-efficient, online-learning algorithm provided as an
alternative to :class:`MiniBatchKMeans`. It constructs a tree
data structure with the cluster centroids being read off the leaf.
These can be either the final cluster centroids or can be provided as input
to another clustering algorithm such as :class:`AgglomerativeClustering`.
Read more in the :ref:`User Guide <birch>`.
.. versionadded:: 0.16
Parameters
----------
threshold : float, default=0.5
The radius of the subcluster obtained by merging a new sample and the
closest subcluster should be less than the threshold; otherwise a new
subcluster is started. Setting this value very low promotes splitting,
and vice versa.
branching_factor : int, default=50
Maximum number of CF subclusters in each node. If a new sample enters
such that the number of subclusters exceeds the branching_factor, then
that node is split into two nodes with the subclusters redistributed
in each. The parent subcluster of that node is removed and two new
subclusters are added as parents of the two split nodes.
n_clusters : int, instance of sklearn.cluster model or None, default=3
Number of clusters after the final clustering step, which treats the
subclusters from the leaves as new samples.
- `None` : the final clustering step is not performed and the
subclusters are returned as they are.
- :mod:`sklearn.cluster` Estimator : If a model is provided, the model
is fit treating the subclusters as new samples and the initial data
is mapped to the label of the closest subcluster.
- `int` : the model fit is :class:`AgglomerativeClustering` with
`n_clusters` set to be equal to the int.
compute_labels : bool, default=True
Whether or not to compute labels for each fit.
copy : bool, default=True
Whether or not to make a copy of the given data. If set to False,
the initial data will be overwritten.
Attributes
----------
root_ : _CFNode
Root of the CFTree.
dummy_leaf_ : _CFNode
Start pointer to all the leaves.
subcluster_centers_ : ndarray
Centroids of all subclusters read directly from the leaves.
subcluster_labels_ : ndarray
Labels assigned to the centroids of the subclusters after
they are clustered globally.
labels_ : ndarray of shape (n_samples,)
Array of labels assigned to the input data.
If partial_fit is used instead of fit, they are assigned to the
last batch of data.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
MiniBatchKMeans : Alternative implementation that does incremental updates
of the centers' positions using mini-batches.
Notes
-----
The tree data structure consists of nodes with each node consisting of
a number of subclusters. The maximum number of subclusters in a node
is determined by the branching factor. Each subcluster maintains a
linear sum, squared sum and the number of samples in that subcluster.
In addition, each subcluster can also have a node as its child, if the
subcluster is not a member of a leaf node.
For a new point entering the root, it is merged with the subcluster closest
to it and the linear sum, squared sum and the number of samples of that
subcluster are updated. This is done recursively till the properties of
the leaf node are updated.
References
----------
* Tian Zhang, Raghu Ramakrishnan, Miron Livny
BIRCH: An efficient data clustering method for large databases.
https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf
* Roberto Perdisci
JBirch - Java implementation of BIRCH clustering algorithm
https://code.google.com/archive/p/jbirch
Examples
--------
>>> from sklearn.cluster import Birch
>>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
>>> brc = Birch(n_clusters=None)
>>> brc.fit(X)
Birch(n_clusters=None)
>>> brc.predict(X)
array([0, 0, 0, 1, 1, 1])
"""
_parameter_constraints: dict = {
"threshold": [Interval(Real, 0.0, None, closed="neither")],
"branching_factor": [Interval(Integral, 1, None, closed="neither")],
"n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
"compute_labels": ["boolean"],
"copy": ["boolean"],
}
def __init__(
self,
*,
threshold=0.5,
branching_factor=50,
n_clusters=3,
compute_labels=True,
copy=True,
):
self.threshold = threshold
self.branching_factor = branching_factor
self.n_clusters = n_clusters
self.compute_labels = compute_labels
self.copy = copy
def fit(self, X, y=None):
"""
Build a CF Tree for the input data.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Fitted estimator.
"""
self._validate_params()
return self._fit(X, partial=False)
def _fit(self, X, partial):
has_root = getattr(self, "root_", None)
first_call = not (partial and has_root)
X = self._validate_data(
X,
accept_sparse="csr",
copy=self.copy,
reset=first_call,
dtype=[np.float64, np.float32],
)
threshold = self.threshold
branching_factor = self.branching_factor
n_samples, n_features = X.shape
# If partial_fit is called for the first time or fit is called, we
# start a new tree.
if first_call:
# The first root is the leaf. Manipulate this object throughout.
self.root_ = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=True,
n_features=n_features,
dtype=X.dtype,
)
# To enable getting back subclusters.
self.dummy_leaf_ = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=True,
n_features=n_features,
dtype=X.dtype,
)
self.dummy_leaf_.next_leaf_ = self.root_
self.root_.prev_leaf_ = self.dummy_leaf_
# Cannot vectorize. Enough to convince us to use Cython.
if not sparse.issparse(X):
iter_func = iter
else:
iter_func = _iterate_sparse_X
for sample in iter_func(X):
subcluster = _CFSubcluster(linear_sum=sample)
split = self.root_.insert_cf_subcluster(subcluster)
if split:
new_subcluster1, new_subcluster2 = _split_node(
self.root_, threshold, branching_factor
)
del self.root_
self.root_ = _CFNode(
threshold=threshold,
branching_factor=branching_factor,
is_leaf=False,
n_features=n_features,
dtype=X.dtype,
)
self.root_.append_subcluster(new_subcluster1)
self.root_.append_subcluster(new_subcluster2)
centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
self.subcluster_centers_ = centroids
self._n_features_out = self.subcluster_centers_.shape[0]
self._global_clustering(X)
return self
def _get_leaves(self):
"""
Retrieve the leaves of the CF Node.
Returns
-------
leaves : list of shape (n_leaves,)
List of the leaf nodes.
"""
leaf_ptr = self.dummy_leaf_.next_leaf_
leaves = []
while leaf_ptr is not None:
leaves.append(leaf_ptr)
leaf_ptr = leaf_ptr.next_leaf_
return leaves
def partial_fit(self, X=None, y=None):
"""
Online learning. Prevents rebuilding of CFTree from scratch.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), \
default=None
Input data. If X is not provided, only the global clustering
step is done.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Fitted estimator.
"""
self._validate_params()
if X is None:
# Perform just the final global clustering step.
self._global_clustering()
return self
else:
return self._fit(X, partial=True)
def _check_fit(self, X):
check_is_fitted(self)
if (
hasattr(self, "subcluster_centers_")
and X.shape[1] != self.subcluster_centers_.shape[1]
):
raise ValueError(
"Training data and predicted data do not have same number of features."
)
def predict(self, X):
"""
Predict data using the ``centroids_`` of subclusters.
Avoid computation of the row norms of X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
Returns
-------
labels : ndarray of shape (n_samples,)
Labelled data.
"""
check_is_fitted(self)
X = self._validate_data(X, accept_sparse="csr", reset=False)
return self._predict(X)
def _predict(self, X):
"""Predict data using the ``centroids_`` of subclusters."""
kwargs = {"Y_norm_squared": self._subcluster_norms}
with config_context(assume_finite=True):
argmin = pairwise_distances_argmin(
X, self.subcluster_centers_, metric_kwargs=kwargs
)
return self.subcluster_labels_[argmin]
def transform(self, X):
"""
Transform X into subcluster centroids dimension.
Each dimension represents the distance from the sample point to each
cluster centroid.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
Returns
-------
X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
Transformed data.
"""
check_is_fitted(self)
X = self._validate_data(X, accept_sparse="csr", reset=False)
with config_context(assume_finite=True):
return euclidean_distances(X, self.subcluster_centers_)
def _global_clustering(self, X=None):
"""
Global clustering for the subclusters obtained after fitting
"""
clusterer = self.n_clusters
centroids = self.subcluster_centers_
compute_labels = (X is not None) and self.compute_labels
# Preprocessing for the global clustering.
not_enough_centroids = False
if isinstance(clusterer, Integral):
clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
# There is no need to perform the global clustering step.
if len(centroids) < self.n_clusters:
not_enough_centroids = True
# To use in predict to avoid recalculation.
self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)
if clusterer is None or not_enough_centroids:
self.subcluster_labels_ = np.arange(len(centroids))
if not_enough_centroids:
warnings.warn(
"Number of subclusters found (%d) by BIRCH is less "
"than (%d). Decrease the threshold."
% (len(centroids), self.n_clusters),
ConvergenceWarning,
)
else:
# The global clustering step that clusters the subclusters of
# the leaves. It assumes the centroids of the subclusters as
# samples and finds the final centroids.
self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)
if compute_labels:
self.labels_ = self._predict(X)
def _more_tags(self):
return {"preserves_dtype": [np.float64, np.float32]}


@@ -0,0 +1,523 @@
"""Bisecting K-means clustering."""
# Author: Michal Krawczyk <mkrwczyk.1@gmail.com>
import warnings
import numpy as np
import scipy.sparse as sp
from ._kmeans import _BaseKMeans
from ._kmeans import _kmeans_single_elkan
from ._kmeans import _kmeans_single_lloyd
from ._kmeans import _labels_inertia_threadpool_limit
from ._k_means_common import _inertia_dense
from ._k_means_common import _inertia_sparse
from ..utils.extmath import row_norms
from ..utils._openmp_helpers import _openmp_effective_n_threads
from ..utils.validation import check_is_fitted
from ..utils.validation import _check_sample_weight
from ..utils.validation import check_random_state
from ..utils._param_validation import StrOptions
class _BisectingTree:
"""Tree structure representing the hierarchical clusters of BisectingKMeans."""
def __init__(self, center, indices, score):
"""Create a new cluster node in the tree.
The node holds the center of this cluster and the indices of the data points
that belong to it.
"""
self.center = center
self.indices = indices
self.score = score
self.left = None
self.right = None
def split(self, labels, centers, scores):
"""Split the cluster node into two subclusters."""
self.left = _BisectingTree(
indices=self.indices[labels == 0], center=centers[0], score=scores[0]
)
self.right = _BisectingTree(
indices=self.indices[labels == 1], center=centers[1], score=scores[1]
)
# reset the indices attribute to save memory
self.indices = None
def get_cluster_to_bisect(self):
"""Return the cluster node to bisect next.
It's based on the score of the cluster, which can be either the number of
data points assigned to that cluster or the inertia of that cluster
(see `bisecting_strategy` for details).
"""
max_score = None
for cluster_leaf in self.iter_leaves():
if max_score is None or cluster_leaf.score > max_score:
max_score = cluster_leaf.score
best_cluster_leaf = cluster_leaf
return best_cluster_leaf
def iter_leaves(self):
"""Iterate over all the cluster leaves in the tree."""
if self.left is None:
yield self
else:
yield from self.left.iter_leaves()
yield from self.right.iter_leaves()
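# Illustration (not part of the original file): get_cluster_to_bisect
# returns the leaf with the highest score. A hedged doctest-style sketch,
# assuming numpy is imported as np:
#
#     >>> root = _BisectingTree(center=np.zeros(2), indices=np.arange(6), score=0.0)
#     >>> root.split(labels=np.array([0, 0, 1, 1, 1, 1]),
#     ...            centers=np.zeros((2, 2)), scores=(2.0, 4.0))
#     >>> root.get_cluster_to_bisect() is root.right
#     True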
class BisectingKMeans(_BaseKMeans):
"""Bisecting K-Means clustering.
Read more in the :ref:`User Guide <bisect_k_means>`.
.. versionadded:: 1.1
Parameters
----------
n_clusters : int, default=8
The number of clusters to form as well as the number of
centroids to generate.
init : {'k-means++', 'random'} or callable, default='random'
Method for initialization:
'k-means++' : selects initial cluster centers for k-means
clustering in a smart way to speed up convergence. See section
Notes in k_init for more details.
'random': choose `n_clusters` observations (rows) at random from data
for the initial centroids.
If a callable is passed, it should take arguments X, n_clusters and a
random state and return an initialization.
n_init : int, default=1
Number of times the inner k-means algorithm will be run with different
centroid seeds in each bisection.
For each bisection, this produces the best output of n_init
consecutive runs in terms of inertia.
random_state : int, RandomState instance or None, default=None
Determines random number generation for centroid initialization
in inner K-Means. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
max_iter : int, default=300
Maximum number of iterations of the inner k-means algorithm at each
bisection.
verbose : int, default=0
Verbosity mode.
tol : float, default=1e-4
Relative tolerance with regard to the Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence. Used in inner k-means algorithm at each bisection to pick
best possible clusters.
copy_x : bool, default=True
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True (default), then the original data is
not modified. If False, the original data is modified, and put back
before the function returns, but small numerical differences may be
introduced by subtracting and then adding the data mean. Note that if
the original data is not C-contiguous, a copy will be made even if
copy_x is False. If the original data is sparse, but not in CSR format,
a copy will be made even if copy_x is False.
algorithm : {"lloyd", "elkan"}, default="lloyd"
Inner K-means algorithm used in bisection.
The classical EM-style algorithm is `"lloyd"`.
The `"elkan"` variation can be more efficient on some datasets with
well-defined clusters, by using the triangle inequality. However it's
more memory intensive due to the allocation of an extra array of shape
`(n_samples, n_clusters)`.
bisecting_strategy : {"biggest_inertia", "largest_cluster"},\
default="biggest_inertia"
Defines how bisection should be performed:
- "biggest_inertia" means that BisectingKMeans will always check
all calculated cluster for cluster with biggest SSE
(Sum of squared errors) and bisect it. This approach concentrates on
precision, but may be costly in terms of execution time (especially for
larger amount of data points).
- "largest_cluster" - BisectingKMeans will always split cluster with
largest amount of points assigned to it from all clusters
previously calculated. That should work faster than picking by SSE
('biggest_inertia') and may produce similar results in most cases.
Attributes
----------
cluster_centers_ : ndarray of shape (n_clusters, n_features)
Coordinates of cluster centers. If the algorithm stops before fully
converging (see ``tol`` and ``max_iter``), these will not be
consistent with ``labels_``.
labels_ : ndarray of shape (n_samples,)
Labels of each point.
inertia_ : float
Sum of squared distances of samples to their closest cluster center,
weighted by the sample weights if provided.
n_features_in_ : int
Number of features seen during :term:`fit`.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
See Also
--------
KMeans : Original implementation of K-Means algorithm.
Notes
-----
It might be inefficient when n_clusters is less than 3, due to unnecessary
calculations for that case.
Examples
--------
>>> from sklearn.cluster import BisectingKMeans
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [10, 2], [10, 4], [10, 0],
... [10, 6], [10, 8], [10, 10]])
>>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
>>> bisect_means.labels_
array([2, 2, 2, 0, 0, 0, 1, 1, 1], dtype=int32)
>>> bisect_means.predict([[0, 0], [12, 3]])
array([2, 0], dtype=int32)
>>> bisect_means.cluster_centers_
array([[10., 2.],
[10., 8.],
[ 1., 2.]])
"""
_parameter_constraints: dict = {
**_BaseKMeans._parameter_constraints,
"init": [StrOptions({"k-means++", "random"}), callable],
"copy_x": ["boolean"],
"algorithm": [StrOptions({"lloyd", "elkan"})],
"bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})],
}
def __init__(
self,
n_clusters=8,
*,
init="random",
n_init=1,
random_state=None,
max_iter=300,
verbose=0,
tol=1e-4,
copy_x=True,
algorithm="lloyd",
bisecting_strategy="biggest_inertia",
):
super().__init__(
n_clusters=n_clusters,
init=init,
max_iter=max_iter,
verbose=verbose,
random_state=random_state,
tol=tol,
n_init=n_init,
)
self.copy_x = copy_x
self.algorithm = algorithm
self.bisecting_strategy = bisecting_strategy
def _warn_mkl_vcomp(self, n_active_threads):
"""Warn when vcomp and mkl are both present"""
warnings.warn(
"BisectingKMeans is known to have a memory leak on Windows "
"with MKL, when there are fewer chunks than available "
"threads. You can avoid it by setting the environment"
f" variable OMP_NUM_THREADS={n_active_threads}."
)
def _inertia_per_cluster(self, X, centers, labels, sample_weight):
"""Calculate the sum of squared errors (inertia) per cluster.
Parameters
----------
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
The input samples.
centers : ndarray of shape (n_clusters, n_features)
The cluster centers.
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
sample_weight : ndarray of shape (n_samples,)
The weights for each observation in X.
Returns
-------
inertia_per_cluster : ndarray of shape (n_clusters,)
Sum of squared errors (inertia) for each cluster.
"""
_inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
inertia_per_cluster = np.empty(centers.shape[0])
for label in range(centers.shape[0]):
inertia_per_cluster[label] = _inertia(
X, sample_weight, centers, labels, self._n_threads, single_label=label
)
return inertia_per_cluster
def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
"""Split a cluster into 2 subsclusters.
Parameters
----------
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
Training instances to cluster.
x_squared_norms : ndarray of shape (n_samples,)
Squared euclidean norm of each data point.
sample_weight : ndarray of shape (n_samples,)
The weights for each observation in X.
cluster_to_bisect : _BisectingTree node object
The cluster node to split.
"""
X = X[cluster_to_bisect.indices]
x_squared_norms = x_squared_norms[cluster_to_bisect.indices]
sample_weight = sample_weight[cluster_to_bisect.indices]
best_inertia = None
# Split samples in X into 2 clusters.
# Repeating `n_init` times to obtain best clusters
for _ in range(self.n_init):
centers_init = self._init_centroids(
X, x_squared_norms, self.init, self._random_state, n_centroids=2
)
labels, inertia, centers, _ = self._kmeans_single(
X,
sample_weight,
centers_init,
max_iter=self.max_iter,
verbose=self.verbose,
tol=self.tol,
n_threads=self._n_threads,
)
# allow a small tolerance on the inertia to account for
# non-deterministic rounding errors due to parallel computation
if best_inertia is None or inertia < best_inertia * (1 - 1e-6):
best_labels = labels
best_centers = centers
best_inertia = inertia
if self.verbose:
print(f"New centroids from bisection: {best_centers}")
if self.bisecting_strategy == "biggest_inertia":
scores = self._inertia_per_cluster(
X, best_centers, best_labels, sample_weight
)
else: # bisecting_strategy == "largest_cluster"
# Using minlength to make sure that we have the counts for both labels even
# if all samples are labelled 0.
scores = np.bincount(best_labels, minlength=2)
cluster_to_bisect.split(best_labels, best_centers, scores)
def fit(self, X, y=None, sample_weight=None):
"""Compute bisecting k-means clustering.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training instances to cluster.
.. note:: The data will be converted to C ordering,
which will cause a memory copy
if the given data is not C-contiguous.
y : Ignored
Not used, present here for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
The weights for each observation in X. If None, all observations
are assigned equal weight.
Returns
-------
self
Fitted estimator.
"""
self._validate_params()
X = self._validate_data(
X,
accept_sparse="csr",
dtype=[np.float64, np.float32],
order="C",
copy=self.copy_x,
accept_large_sparse=False,
)
self._check_params_vs_input(X)
self._random_state = check_random_state(self.random_state)
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
self._n_threads = _openmp_effective_n_threads()
if self.algorithm == "lloyd" or self.n_clusters == 1:
self._kmeans_single = _kmeans_single_lloyd
self._check_mkl_vcomp(X, X.shape[0])
else:
self._kmeans_single = _kmeans_single_elkan
# Subtract the mean of X for more accurate distance computations
if not sp.issparse(X):
self._X_mean = X.mean(axis=0)
X -= self._X_mean
# Initialize the hierarchical clusters tree
self._bisecting_tree = _BisectingTree(
indices=np.arange(X.shape[0]),
center=X.mean(axis=0),
score=0,
)
x_squared_norms = row_norms(X, squared=True)
for _ in range(self.n_clusters - 1):
# Choose the cluster to bisect
cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()
# Split this cluster into 2 subclusters
self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect)
# Aggregate final labels and centers from the bisecting tree
self.labels_ = np.full(X.shape[0], -1, dtype=np.int32)
self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()):
self.labels_[cluster_node.indices] = i
self.cluster_centers_[i] = cluster_node.center
cluster_node.label = i # label final clusters for future prediction
cluster_node.indices = None # release memory
# Restore original data
if not sp.issparse(X):
X += self._X_mean
self.cluster_centers_ += self._X_mean
_inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
self.inertia_ = _inertia(
X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads
)
self._n_features_out = self.cluster_centers_.shape[0]
return self
def predict(self, X):
"""Predict which cluster each sample in X belongs to.
Prediction is made by going down the hierarchical tree
in search of the closest leaf cluster.
In the vector quantization literature, `cluster_centers_` is called
the code book and each value returned by `predict` is the index of
the closest code in the code book.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
New data to predict.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
check_is_fitted(self)
X = self._check_test_data(X)
x_squared_norms = row_norms(X, squared=True)
# sample weights are unused but necessary in cython helpers
sample_weight = np.ones_like(x_squared_norms)
labels = self._predict_recursive(X, sample_weight, self._bisecting_tree)
return labels
def _predict_recursive(self, X, sample_weight, cluster_node):
"""Predict recursively by going down the hierarchical tree.
Parameters
----------
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
The data points, currently assigned to `cluster_node`, to predict between
the subclusters of this node.
sample_weight : ndarray of shape (n_samples,)
The weights for each observation in X.
cluster_node : _BisectingTree node object
The cluster node of the hierarchical tree.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
if cluster_node.left is None:
# This cluster has no subcluster. Labels are just the label of the cluster.
return np.full(X.shape[0], cluster_node.label, dtype=np.int32)
# Determine if data points belong to the left or right subcluster
centers = np.vstack((cluster_node.left.center, cluster_node.right.center))
if hasattr(self, "_X_mean"):
centers += self._X_mean
cluster_labels = _labels_inertia_threadpool_limit(
X,
sample_weight,
centers,
self._n_threads,
return_inertia=False,
)
mask = cluster_labels == 0
# Compute the labels for each subset of the data points.
labels = np.full(X.shape[0], -1, dtype=np.int32)
labels[mask] = self._predict_recursive(
X[mask], sample_weight[mask], cluster_node.left
)
labels[~mask] = self._predict_recursive(
X[~mask], sample_weight[~mask], cluster_node.right
)
return labels
def _more_tags(self):
return {"preserves_dtype": [np.float64, np.float32]}

View file

@ -0,0 +1,450 @@
"""
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
"""
# Author: Robert Layton <robertlayton@gmail.com>
# Joel Nothman <joel.nothman@gmail.com>
# Lars Buitinck
#
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from ..metrics.pairwise import _VALID_METRICS
from ..base import BaseEstimator, ClusterMixin
from ..utils.validation import _check_sample_weight
from ..utils._param_validation import Interval, StrOptions
from ..neighbors import NearestNeighbors
from ._dbscan_inner import dbscan_inner
def dbscan(
X,
eps=0.5,
*,
min_samples=5,
metric="minkowski",
metric_params=None,
algorithm="auto",
leaf_size=30,
p=2,
sample_weight=None,
n_jobs=None,
):
"""Perform DBSCAN clustering from vector array or distance matrix.
Read more in the :ref:`User Guide <dbscan>`.
Parameters
----------
X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``.
eps : float, default=0.5
The maximum distance between two samples for one to be considered
as in the neighborhood of the other. This is not a maximum bound
on the distances of points within a cluster. This is the most
important DBSCAN parameter to choose appropriately for your data set
and distance function.
min_samples : int, default=5
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point. This includes the point itself.
metric : str or callable, default='minkowski'
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit.
X may be a :term:`sparse graph <sparse graph>`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.19
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.
leaf_size : int, default=30
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.
p : float, default=2
The power of the Minkowski metric to be used to calculate distance
between points.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with negative
weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search. ``None`` means
1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
using all processors. See :term:`Glossary <n_jobs>` for more details.
If precomputed distances are used, parallel execution is not available
and thus n_jobs will have no effect.
Returns
-------
core_samples : ndarray of shape (n_core_samples,)
Indices of core samples.
labels : ndarray of shape (n_samples,)
Cluster labels for each point. Noisy samples are given the label -1.
See Also
--------
DBSCAN : An estimator interface for this clustering algorithm.
OPTICS : A similar estimator interface clustering at multiple values of
eps. Our implementation is optimized for memory usage.
Notes
-----
For an example, see :ref:`examples/cluster/plot_dbscan.py
<sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d) where d is the average number of neighbors,
while the original DBSCAN had memory complexity O(n). It may incur a higher
memory complexity when querying these nearest neighborhoods, depending
on the ``algorithm``.
One way to avoid the query complexity is to pre-compute sparse
neighborhoods in chunks using
:func:`NearestNeighbors.radius_neighbors_graph
<sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
``mode='distance'``, then using ``metric='precomputed'`` here.
Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.
:func:`cluster.optics <sklearn.cluster.optics>` provides a similar
clustering with lower memory usage.
References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
<https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf>`_.
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
:doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
<10.1145/3068335>`
ACM Transactions on Database Systems (TODS), 42(3), 19.
"""
est = DBSCAN(
eps=eps,
min_samples=min_samples,
metric=metric,
metric_params=metric_params,
algorithm=algorithm,
leaf_size=leaf_size,
p=p,
n_jobs=n_jobs,
)
est.fit(X, sample_weight=sample_weight)
return est.core_sample_indices_, est.labels_
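# Illustrative sketch: the functional interface returns the core-sample
# indices and the labels directly. The expected output below assumes this toy
# data and mirrors the DBSCAN class example further down (minkowski with p=2
# reduces to euclidean).
#
# >>> import numpy as np
# >>> from sklearn.cluster import dbscan
# >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]])
# >>> core_samples, labels = dbscan(X, eps=3, min_samples=2)
# >>> labels
# array([ 0,  0,  0,  1,  1, -1])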
class DBSCAN(ClusterMixin, BaseEstimator):
"""Perform DBSCAN clustering from vector array or distance matrix.
DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
Finds core samples of high density and expands clusters from them.
Good for data which contains clusters of similar density.
Read more in the :ref:`User Guide <dbscan>`.
Parameters
----------
eps : float, default=0.5
The maximum distance between two samples for one to be considered
as in the neighborhood of the other. This is not a maximum bound
on the distances of points within a cluster. This is the most
important DBSCAN parameter to choose appropriately for your data set
and distance function.
min_samples : int, default=5
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point. This includes the point itself.
metric : str, or callable, default='euclidean'
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square. X may be a :term:`sparse graph`, in which
case only "nonzero" elements may be considered neighbors for DBSCAN.
.. versionadded:: 0.17
metric *precomputed* to accept precomputed sparse matrix.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.19
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.
leaf_size : int, default=30
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.
p : float, default=None
The power of the Minkowski metric to be used to calculate distance
between points. If None, then ``p=2`` (equivalent to the Euclidean
distance).
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
core_sample_indices_ : ndarray of shape (n_core_samples,)
Indices of core samples.
components_ : ndarray of shape (n_core_samples, n_features)
Copy of each core sample found by training.
labels_ : ndarray of shape (n_samples)
Cluster labels for each point in the dataset given to fit().
Noisy samples are given the label -1.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
OPTICS : A similar clustering at multiple values of eps. Our implementation
is optimized for memory usage.
Notes
-----
For an example, see :ref:`examples/cluster/plot_dbscan.py
<sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d) where d is the average number of neighbors,
while the original DBSCAN had memory complexity O(n). It may incur a higher
memory complexity when querying these nearest neighborhoods, depending
on the ``algorithm``.
One way to avoid the query complexity is to pre-compute sparse
neighborhoods in chunks using
:func:`NearestNeighbors.radius_neighbors_graph
<sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
``mode='distance'``, then using ``metric='precomputed'`` here.
Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.
:class:`cluster.OPTICS` provides a similar clustering with lower memory
usage.
References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
<https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf>`_.
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
:doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
<10.1145/3068335>`
ACM Transactions on Database Systems (TODS), 42(3), 19.
Examples
--------
>>> from sklearn.cluster import DBSCAN
>>> import numpy as np
>>> X = np.array([[1, 2], [2, 2], [2, 3],
... [8, 7], [8, 8], [25, 80]])
>>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
>>> clustering.labels_
array([ 0, 0, 0, 1, 1, -1])
>>> clustering
DBSCAN(eps=3, min_samples=2)
"""
_parameter_constraints: dict = {
"eps": [Interval(Real, 0.0, None, closed="neither")],
"min_samples": [Interval(Integral, 1, None, closed="left")],
"metric": [
StrOptions(set(_VALID_METRICS) | {"precomputed"}),
callable,
],
"metric_params": [dict, None],
"algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})],
"leaf_size": [Interval(Integral, 1, None, closed="left")],
"p": [Interval(Real, 0.0, None, closed="left"), None],
"n_jobs": [Integral, None],
}
def __init__(
self,
eps=0.5,
*,
min_samples=5,
metric="euclidean",
metric_params=None,
algorithm="auto",
leaf_size=30,
p=None,
n_jobs=None,
):
self.eps = eps
self.min_samples = min_samples
self.metric = metric
self.metric_params = metric_params
self.algorithm = algorithm
self.leaf_size = leaf_size
self.p = p
self.n_jobs = n_jobs
def fit(self, X, y=None, sample_weight=None):
"""Perform DBSCAN clustering from features, or distance matrix.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``metric='precomputed'``. If a sparse matrix is provided, it will
be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with a
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
Returns
-------
self : object
Returns a fitted instance of self.
"""
self._validate_params()
X = self._validate_data(X, accept_sparse="csr")
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
# Calculate neighborhood for all samples. This leaves the original
# point in, which needs to be considered later (i.e. point i is in the
# neighborhood of point i; while true, this is useless information).
if self.metric == "precomputed" and sparse.issparse(X):
# set the diagonal to explicit values, as a point is its own
# neighbor
with warnings.catch_warnings():
warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place
neighbors_model = NearestNeighbors(
radius=self.eps,
algorithm=self.algorithm,
leaf_size=self.leaf_size,
metric=self.metric,
metric_params=self.metric_params,
p=self.p,
n_jobs=self.n_jobs,
)
neighbors_model.fit(X)
# This has worst case O(n^2) memory complexity
neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)
if sample_weight is None:
n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
else:
n_neighbors = np.array(
[np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
)
# Initially, all samples are noise.
labels = np.full(X.shape[0], -1, dtype=np.intp)
# A list of all core samples found.
core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
dbscan_inner(core_samples, neighborhoods, labels)
self.core_sample_indices_ = np.where(core_samples)[0]
self.labels_ = labels
if len(self.core_sample_indices_):
# fix for scipy sparse indexing issue
self.components_ = X[self.core_sample_indices_].copy()
else:
# no core samples
self.components_ = np.empty((0, X.shape[1]))
return self
def fit_predict(self, X, y=None, sample_weight=None):
"""Compute clusters from a data or distance matrix and predict labels.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``metric='precomputed'``. If a sparse matrix is provided, it will
be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with a
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels. Noisy samples are given the label -1.
"""
self.fit(X, sample_weight=sample_weight)
return self.labels_
def _more_tags(self):
return {"pairwise": self.metric == "precomputed"}

View file

@ -0,0 +1,75 @@
"""
Feature agglomeration. Base classes and functions for performing feature
agglomeration.
"""
# Author: V. Michel, A. Gramfort
# License: BSD 3 clause
import numpy as np
from ..base import TransformerMixin
from ..utils.validation import check_is_fitted
from scipy.sparse import issparse
###############################################################################
# Mixin class for feature agglomeration.
class AgglomerationTransform(TransformerMixin):
"""
A class for feature agglomeration via the transform interface.
"""
def transform(self, X):
"""
Transform a new matrix using the built clustering.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
An M by N array of M observations in N dimensions or a length
M array of M one-dimensional observations.
Returns
-------
Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)
The pooled values for each feature cluster.
"""
check_is_fitted(self)
X = self._validate_data(X, reset=False)
if self.pooling_func == np.mean and not issparse(X):
size = np.bincount(self.labels_)
n_samples = X.shape[0]
# a fast way to compute the mean of grouped features
nX = np.array(
[np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)]
)
else:
nX = [
self.pooling_func(X[:, self.labels_ == l], axis=1)
for l in np.unique(self.labels_)
]
nX = np.array(nX).T
return nX
def inverse_transform(self, Xred):
"""
Inverse the transformation and return a vector of size `n_features`.
Parameters
----------
Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,)
The values to be assigned to each cluster of features.
Returns
-------
X : ndarray of shape (n_samples, n_features) or (n_features,)
A vector of size `n_features` with the values of `Xred` assigned to
each cluster of features.
"""
check_is_fitted(self)
unil, inverse = np.unique(self.labels_, return_inverse=True)
return Xred[..., inverse]
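# Illustrative sketch: this mixin backs FeatureAgglomeration, which pools
# features cluster-wise in transform and broadcasts the pooled values back in
# inverse_transform. Cluster assignments are data-dependent; only the shapes
# are checked here.
#
# >>> import numpy as np
# >>> from sklearn.cluster import FeatureAgglomeration
# >>> X = np.array([[0.0, 0.1, 10.0], [1.0, 1.1, 20.0]])
# >>> agglo = FeatureAgglomeration(n_clusters=2).fit(X)
# >>> agglo.transform(X).shape  # one pooled column per feature cluster
# (2, 2)
# >>> agglo.inverse_transform(agglo.transform(X)).shape
# (2, 3)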

View file

@ -0,0 +1,19 @@
from cython cimport floating
cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil
cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
floating, bint) nogil
cpdef void _relocate_empty_clusters_dense(
floating[:, ::1], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cpdef void _relocate_empty_clusters_sparse(
floating[::1], int[::1], int[::1], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cdef void _average_centers(floating[:, ::1], floating[::1])
cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1])

File diff suppressed because it is too large

View file

@ -0,0 +1,536 @@
"""Mean shift clustering algorithm.
Mean shift clustering aims to discover *blobs* in a smooth density of
samples. It is a centroid-based algorithm, which works by updating candidates
for centroids to be the mean of the points within a given region. These
candidates are then filtered in a post-processing stage to eliminate
near-duplicates to form the final set of centroids.
Seeding is performed using a binning technique for scalability.
"""
# Authors: Conrad Lee <conradlee@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Martino Sorbaro <martino.sorbaro@ed.ac.uk>
import numpy as np
import warnings
from numbers import Integral, Real
from collections import defaultdict
from ..utils._param_validation import Interval
from ..utils.validation import check_is_fitted
from ..utils.parallel import delayed, Parallel
from ..utils import check_random_state, gen_batches, check_array
from ..base import BaseEstimator, ClusterMixin
from ..neighbors import NearestNeighbors
from ..metrics.pairwise import pairwise_distances_argmin
from .._config import config_context
def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None):
"""Estimate the bandwidth to use with the mean-shift algorithm.
Note that this function takes time at least quadratic in n_samples. For large
datasets, it is wise to set `n_samples` to a small value.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input points.
quantile : float, default=0.3
Should be in the interval [0, 1];
0.5 means that the median of all pairwise distances is used.
n_samples : int, default=None
The number of samples to use. If not given, all samples are used.
random_state : int, RandomState instance, default=0
The generator used to randomly select the samples from input points
for bandwidth estimation. Use an int to make the randomness
deterministic.
See :term:`Glossary <random_state>`.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
bandwidth : float
The bandwidth parameter.
"""
X = check_array(X)
random_state = check_random_state(random_state)
if n_samples is not None:
idx = random_state.permutation(X.shape[0])[:n_samples]
X = X[idx]
n_neighbors = int(X.shape[0] * quantile)
if n_neighbors < 1: # cannot fit NearestNeighbors with n_neighbors = 0
n_neighbors = 1
nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs)
nbrs.fit(X)
bandwidth = 0.0
for batch in gen_batches(len(X), 500):
d, _ = nbrs.kneighbors(X[batch, :], return_distance=True)
bandwidth += np.max(d, axis=1).sum()
return bandwidth / X.shape[0]
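# Illustrative sketch: the estimate is the average, over samples, of the
# distance to their k-th nearest neighbor, with k derived from `quantile`,
# so it is non-decreasing in `quantile`. Values are data-dependent; only the
# ordering is checked here.
#
# >>> import numpy as np
# >>> from sklearn.cluster import estimate_bandwidth
# >>> X = np.array([[1.0, 1.0], [1.5, 2.0], [8.0, 8.0], [8.5, 9.0]])
# >>> estimate_bandwidth(X, quantile=0.5) <= estimate_bandwidth(X, quantile=1.0)
# True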
# separate function for each seed's iterative loop
def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
# For each seed, climb gradient until convergence or max_iter
bandwidth = nbrs.get_params()["radius"]
stop_thresh = 1e-3 * bandwidth # when mean has converged
completed_iterations = 0
while True:
# Find mean of points within bandwidth
i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0]
points_within = X[i_nbrs]
if len(points_within) == 0:
break # Depending on seeding strategy this condition may occur
my_old_mean = my_mean # save the old mean
my_mean = np.mean(points_within, axis=0)
# If converged or at max_iter, add the cluster
if (
np.linalg.norm(my_mean - my_old_mean) < stop_thresh
or completed_iterations == max_iter
):
break
completed_iterations += 1
return tuple(my_mean), len(points_within), completed_iterations
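# Illustrative numpy sketch (not the code path above) of a single flat-kernel
# update: the seed moves to the mean of all points within `bandwidth` of it.
#
# >>> import numpy as np
# >>> X = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 10.0]])
# >>> mean, bandwidth = np.array([0.5, 0.5]), 2.0
# >>> within = np.linalg.norm(X - mean, axis=1) <= bandwidth
# >>> X[within].mean(axis=0)  # the far point is outside the kernel
# array([0.5, 0. ])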
def mean_shift(
X,
*,
bandwidth=None,
seeds=None,
bin_seeding=False,
min_bin_freq=1,
cluster_all=True,
max_iter=300,
n_jobs=None,
):
"""Perform mean shift clustering of data using a flat kernel.
Read more in the :ref:`User Guide <mean_shift>`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data.
bandwidth : float, default=None
Kernel bandwidth.
If bandwidth is not given, it is determined using a heuristic based on
the median of all pairwise distances. This will take quadratic time in
the number of samples. The sklearn.cluster.estimate_bandwidth function
can be used to do this more efficiently.
seeds : array-like of shape (n_seeds, n_features) or None
Points used as initial kernel locations. If None and bin_seeding=False,
each data point is used as a seed. If None and bin_seeding=True,
see bin_seeding.
bin_seeding : bool, default=False
If true, initial kernel locations are not locations of all
points, but rather the location of the discretized version of
points, where points are binned onto a grid whose coarseness
corresponds to the bandwidth. Setting this option to True will speed
up the algorithm because fewer seeds will be initialized.
Ignored if seeds argument is not None.
min_bin_freq : int, default=1
To speed up the algorithm, accept only those bins with at least
min_bin_freq points as seeds.
cluster_all : bool, default=True
If true, then all points are clustered, even those orphans that are
not within any kernel. Orphans are assigned to the nearest kernel.
If false, then orphans are given cluster label -1.
max_iter : int, default=300
Maximum number of iterations per seed point before the clustering
operation terminates (for that seed point), if it has not converged yet.
n_jobs : int, default=None
The number of jobs to use for the computation. The following tasks benefit
from the parallelization:
- The search of nearest neighbors for bandwidth estimation and label
assignments. See the details in the docstring of the
``NearestNeighbors`` class.
- Hill-climbing optimization for all seeds.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. versionadded:: 0.17
Parallel Execution using *n_jobs*.
Returns
-------
cluster_centers : ndarray of shape (n_clusters, n_features)
Coordinates of cluster centers.
labels : ndarray of shape (n_samples,)
Cluster labels for each point.
Notes
-----
For an example, see :ref:`examples/cluster/plot_mean_shift.py
<sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
"""
model = MeanShift(
bandwidth=bandwidth,
seeds=seeds,
min_bin_freq=min_bin_freq,
bin_seeding=bin_seeding,
cluster_all=cluster_all,
n_jobs=n_jobs,
max_iter=max_iter,
).fit(X)
return model.cluster_centers_, model.labels_
def get_bin_seeds(X, bin_size, min_bin_freq=1):
"""Find seeds for mean_shift.
Finds seeds by first binning data onto a grid whose lines are
spaced bin_size apart, and then choosing those bins with at least
min_bin_freq points.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input points, the same points that will be used in mean_shift.
bin_size : float
Controls the coarseness of the binning. Smaller values lead
to more seeding (which is computationally more expensive). If you're
not sure how to set this, set it to the value of the bandwidth used
in clustering.mean_shift.
min_bin_freq : int, default=1
Only bins with at least min_bin_freq will be selected as seeds.
Raising this value decreases the number of seeds found, which
makes mean_shift computationally cheaper.
Returns
-------
bin_seeds : array-like of shape (n_seeds, n_features)
Points used as initial kernel positions in clustering.mean_shift.
"""
if bin_size == 0:
return X
# Bin points
bin_sizes = defaultdict(int)
for point in X:
binned_point = np.round(point / bin_size)
bin_sizes[tuple(binned_point)] += 1
# Select only those bins as seeds which have enough members
bin_seeds = np.array(
[point for point, freq in bin_sizes.items() if freq >= min_bin_freq],
dtype=np.float32,
)
if len(bin_seeds) == len(X):
warnings.warn(
"Binning data failed with provided bin_size=%f, using data points as seeds."
% bin_size
)
return X
bin_seeds = bin_seeds * bin_size
return bin_seeds
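# Illustrative sketch: with bin_size=1 points snap to the integer grid, and
# only bins holding at least min_bin_freq points survive as seeds.
#
# >>> import numpy as np
# >>> from sklearn.cluster import get_bin_seeds
# >>> X = np.array([[1.0, 1.0], [1.4, 1.4], [1.8, 1.2],
# ...               [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]])
# >>> get_bin_seeds(X, bin_size=1, min_bin_freq=2)
# array([[1., 1.],
#        [2., 1.]], dtype=float32)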
class MeanShift(ClusterMixin, BaseEstimator):
"""Mean shift clustering using a flat kernel.
Mean shift clustering aims to discover "blobs" in a smooth density of
samples. It is a centroid-based algorithm, which works by updating
candidates for centroids to be the mean of the points within a given
region. These candidates are then filtered in a post-processing stage to
eliminate near-duplicates to form the final set of centroids.
Seeding is performed using a binning technique for scalability.
Read more in the :ref:`User Guide <mean_shift>`.
Parameters
----------
bandwidth : float, default=None
Bandwidth used in the flat kernel.
If not given, the bandwidth is estimated using
sklearn.cluster.estimate_bandwidth; see the documentation for that
function for hints on scalability (see also the Notes, below).
seeds : array-like of shape (n_samples, n_features), default=None
Seeds used to initialize kernels. If not set,
the seeds are calculated by clustering.get_bin_seeds
with bandwidth as the grid size and default values for
other parameters.
bin_seeding : bool, default=False
If true, initial kernel locations are not locations of all
points, but rather the location of the discretized version of
points, where points are binned onto a grid whose coarseness
corresponds to the bandwidth. Setting this option to True will speed
up the algorithm because fewer seeds will be initialized.
The default value is False.
Ignored if seeds argument is not None.
min_bin_freq : int, default=1
To speed up the algorithm, accept only those bins with at least
min_bin_freq points as seeds.
cluster_all : bool, default=True
If true, then all points are clustered, even those orphans that are
not within any kernel. Orphans are assigned to the nearest kernel.
If false, then orphans are given cluster label -1.
n_jobs : int, default=None
The number of jobs to use for the computation. The following tasks benefit
from the parallelization:
- The search of nearest neighbors for bandwidth estimation and label
assignments. See the details in the docstring of the
``NearestNeighbors`` class.
- Hill-climbing optimization for all seeds.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_iter : int, default=300
Maximum number of iterations per seed point before the clustering
operation terminates (for that seed point), if it has not converged yet.
.. versionadded:: 0.22
Attributes
----------
cluster_centers_ : ndarray of shape (n_clusters, n_features)
Coordinates of cluster centers.
labels_ : ndarray of shape (n_samples,)
Labels of each point.
n_iter_ : int
Maximum number of iterations performed on each seed.
.. versionadded:: 0.22
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
KMeans : K-Means clustering.
Notes
-----
Scalability:
Because this implementation uses a flat kernel and
a Ball Tree to look up members of each kernel, the complexity will tend
towards O(T*n*log(n)) in lower dimensions, with n the number of samples
and T the number of iterations. In higher dimensions the complexity will
tend towards O(T*n^2).
Scalability can be boosted by using fewer seeds, for example by using
a higher value of min_bin_freq in the get_bin_seeds function.
Note that the estimate_bandwidth function is much less scalable than the
mean shift algorithm and will be the bottleneck if it is used.
References
----------
Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
feature space analysis". IEEE Transactions on Pattern Analysis and
Machine Intelligence. 2002. pp. 603-619.
Examples
--------
>>> from sklearn.cluster import MeanShift
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = MeanShift(bandwidth=2).fit(X)
>>> clustering.labels_
array([1, 1, 1, 0, 0, 0])
>>> clustering.predict([[0, 0], [5, 5]])
array([1, 0])
>>> clustering
MeanShift(bandwidth=2)
"""
_parameter_constraints: dict = {
"bandwidth": [Interval(Real, 0, None, closed="neither"), None],
"seeds": ["array-like", None],
"bin_seeding": ["boolean"],
"min_bin_freq": [Interval(Integral, 1, None, closed="left")],
"cluster_all": ["boolean"],
"n_jobs": [Integral, None],
"max_iter": [Interval(Integral, 0, None, closed="left")],
}
def __init__(
self,
*,
bandwidth=None,
seeds=None,
bin_seeding=False,
min_bin_freq=1,
cluster_all=True,
n_jobs=None,
max_iter=300,
):
self.bandwidth = bandwidth
self.seeds = seeds
self.bin_seeding = bin_seeding
self.cluster_all = cluster_all
self.min_bin_freq = min_bin_freq
self.n_jobs = n_jobs
self.max_iter = max_iter
def fit(self, X, y=None):
"""Perform clustering.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Samples to cluster.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Fitted instance.
"""
self._validate_params()
X = self._validate_data(X)
bandwidth = self.bandwidth
if bandwidth is None:
bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)
seeds = self.seeds
if seeds is None:
if self.bin_seeding:
seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
else:
seeds = X
n_samples, n_features = X.shape
center_intensity_dict = {}
# We use n_jobs=1 because this will be used in nested calls under
# parallel calls to _mean_shift_single_seed so there is no need for
# further parallelism.
nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
# execute iterations on all seeds in parallel
all_res = Parallel(n_jobs=self.n_jobs)(
delayed(_mean_shift_single_seed)(seed, X, nbrs, self.max_iter)
for seed in seeds
)
# copy results in a dictionary
for i in range(len(seeds)):
if all_res[i][1]: # i.e. len(points_within) > 0
center_intensity_dict[all_res[i][0]] = all_res[i][1]
self.n_iter_ = max([x[2] for x in all_res])
if not center_intensity_dict:
# nothing near seeds
raise ValueError(
"No point was within bandwidth=%f of any seed. Try a different seeding"
" strategy or increase the bandwidth."
% bandwidth
)
# POST PROCESSING: remove near duplicate points
# If the distance between two kernels is less than the bandwidth,
# then we have to remove one because it is a duplicate. Remove the
# one with fewer points.
sorted_by_intensity = sorted(
center_intensity_dict.items(),
key=lambda tup: (tup[1], tup[0]),
reverse=True,
)
sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
unique = np.ones(len(sorted_centers), dtype=bool)
nbrs = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit(
sorted_centers
)
for i, center in enumerate(sorted_centers):
if unique[i]:
neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[
0
]
unique[neighbor_idxs] = 0
unique[i] = 1 # leave the current point as unique
cluster_centers = sorted_centers[unique]
# ASSIGN LABELS: a point belongs to the cluster that it is closest to
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(cluster_centers)
labels = np.zeros(n_samples, dtype=int)
distances, idxs = nbrs.kneighbors(X)
if self.cluster_all:
labels = idxs.flatten()
else:
labels.fill(-1)
bool_selector = distances.flatten() <= bandwidth
labels[bool_selector] = idxs.flatten()[bool_selector]
self.cluster_centers_, self.labels_ = cluster_centers, labels
return self
def predict(self, X):
"""Predict the closest cluster each sample in X belongs to.
Parameters
----------
X : array-like of shape (n_samples, n_features)
New data to predict.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
check_is_fitted(self)
X = self._validate_data(X, reset=False)
with config_context(assume_finite=True):
return pairwise_distances_argmin(X, self.cluster_centers_)

File diff suppressed because it is too large

View file

@ -0,0 +1,791 @@
"""Algorithms for spectral clustering"""
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Brian Cheung
# Wei LI <kuantkid@gmail.com>
# Andrew Knyazev <Andrew.Knyazev@ucdenver.edu>
# License: BSD 3 clause
from numbers import Integral, Real
import warnings
import numpy as np
from scipy.linalg import LinAlgError, qr, svd
from scipy.sparse import csc_matrix
from ..base import BaseEstimator, ClusterMixin
from ..utils._param_validation import Interval, StrOptions
from ..utils import check_random_state, as_float_array
from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS
from ..neighbors import kneighbors_graph, NearestNeighbors
from ..manifold import spectral_embedding
from ._kmeans import k_means
def cluster_qr(vectors):
"""Find the discrete partition closest to the eigenvector embedding.
This implementation was proposed in [1]_.
.. versionadded:: 1.1
Parameters
----------
vectors : array-like, shape: (n_samples, n_clusters)
The embedding space of the samples.
Returns
-------
labels : array of integers, shape: n_samples
The cluster labels of vectors.
References
----------
.. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
Anil Damle, Victor Minden, Lexing Ying
<10.1093/imaiai/iay008>`
"""
k = vectors.shape[1]
_, _, piv = qr(vectors.T, pivoting=True)
ut, _, v = svd(vectors[piv[:k], :].T)
vectors = abs(np.dot(vectors, np.dot(ut, v.conj())))
return vectors.argmax(axis=1)
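# Illustrative sketch: when rows of the embedding already align with distinct
# coordinate directions, cluster_qr recovers the groups deterministically,
# with no iterations and no random initialization.
#
# >>> import numpy as np
# >>> rng = np.random.RandomState(0)
# >>> vectors = np.repeat(np.eye(2), [3, 3], axis=0) + 0.01 * rng.randn(6, 2)
# >>> labels = cluster_qr(vectors)
# >>> bool((labels[:3] == labels[0]).all() and (labels[3:] == labels[3]).all())
# True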
def discretize(
vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None
):
"""Search for a partition matrix which is closest to the eigenvector embedding.
This implementation was proposed in [1]_.
Parameters
----------
vectors : array-like of shape (n_samples, n_clusters)
The embedding space of the samples.
copy : bool, default=True
Whether to copy vectors, or perform in-place normalization.
max_svd_restarts : int, default=30
Maximum number of attempts to restart SVD if convergence fails.
n_iter_max : int, default=20
Maximum number of iterations to attempt in rotation and partition
matrix search if machine precision convergence is not reached
random_state : int, RandomState instance, default=None
Determines random number generation for rotation matrix initialization.
Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Returns
-------
labels : array of integers, shape: n_samples
The labels of the clusters.
References
----------
.. [1] `Multiclass spectral clustering, 2003
Stella X. Yu, Jianbo Shi
<https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_
Notes
-----
The eigenvector embedding is used to iteratively search for the
closest discrete partition. First, the eigenvector embedding is
normalized to the space of partition matrices. An optimal discrete
partition matrix closest to this normalized embedding multiplied by
an initial rotation is calculated. Fixing this discrete partition
matrix, an optimal rotation matrix is calculated. These two
calculations are performed until convergence. The discrete partition
matrix is returned as the clustering solution. Used in spectral
clustering, this method tends to be faster and more robust to random
initialization than k-means.
"""
random_state = check_random_state(random_state)
vectors = as_float_array(vectors, copy=copy)
eps = np.finfo(float).eps
n_samples, n_components = vectors.shape
# Normalize the eigenvectors to an equal length of a vector of ones.
# Reorient the eigenvectors to point in the negative direction with respect
# to the first element. This may have to do with constraining the
# eigenvectors to lie in a specific quadrant to make the discretization
# search easier.
norm_ones = np.sqrt(n_samples)
for i in range(vectors.shape[1]):
vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones
if vectors[0, i] != 0:
vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])
# Normalize the rows of the eigenvectors. Samples should lie on the unit
# hypersphere centered at the origin. This transforms the samples in the
# embedding space to the space of partition matrices.
vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]
svd_restarts = 0
has_converged = False
# If there is an exception we try to randomize and rerun SVD again,
# doing this at most max_svd_restarts times.
while (svd_restarts < max_svd_restarts) and not has_converged:
# Initialize first column of rotation matrix with a row of the
# eigenvectors
rotation = np.zeros((n_components, n_components))
rotation[:, 0] = vectors[random_state.randint(n_samples), :].T
# To initialize the rest of the rotation matrix, find the rows
# of the eigenvectors that are as orthogonal to each other as
# possible
c = np.zeros(n_samples)
for j in range(1, n_components):
# Accumulate c to ensure row is as orthogonal as possible to
# previous picks as well as current one
c += np.abs(np.dot(vectors, rotation[:, j - 1]))
rotation[:, j] = vectors[c.argmin(), :].T
last_objective_value = 0.0
n_iter = 0
while not has_converged:
n_iter += 1
t_discrete = np.dot(vectors, rotation)
labels = t_discrete.argmax(axis=1)
vectors_discrete = csc_matrix(
(np.ones(len(labels)), (np.arange(0, n_samples), labels)),
shape=(n_samples, n_components),
)
t_svd = vectors_discrete.T * vectors
try:
U, S, Vh = np.linalg.svd(t_svd)
except LinAlgError:
svd_restarts += 1
print("SVD did not converge, randomizing and trying again")
break
ncut_value = 2.0 * (n_samples - S.sum())
if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max):
has_converged = True
else:
# otherwise calculate rotation and continue
last_objective_value = ncut_value
rotation = np.dot(Vh.T, U.T)
if not has_converged:
raise LinAlgError("SVD did not converge")
return labels
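# Illustrative sketch: like cluster_qr, discretize turns an embedding into
# discrete labels, but via alternating SVD and rotation updates;
# `random_state` only affects the rotation initialization.
#
# >>> import numpy as np
# >>> vectors = np.repeat(np.eye(2), [3, 3], axis=0)
# >>> labels = discretize(vectors, random_state=0)
# >>> bool((labels[:3] == labels[0]).all() and (labels[3:] == labels[3]).all())
# True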
def spectral_clustering(
affinity,
*,
n_clusters=8,
n_components=None,
eigen_solver=None,
random_state=None,
n_init=10,
eigen_tol="auto",
assign_labels="kmeans",
verbose=False,
):
"""Apply clustering to a projection of the normalized Laplacian.
In practice Spectral Clustering is very useful when the structure of
the individual clusters is highly non-convex or more generally when
a measure of the center and spread of the cluster is not a suitable
description of the complete cluster. For instance, when clusters are
nested circles on the 2D plane.
If affinity is the adjacency matrix of a graph, this method can be
used to find normalized graph cuts [1]_, [2]_.
Read more in the :ref:`User Guide <spectral_clustering>`.
Parameters
----------
affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)
The affinity matrix describing the relationship of the samples to
embed. **Must be symmetric**.
Possible examples:
- adjacency matrix of a graph,
- heat kernel of the pairwise distance matrix of the samples,
- symmetric k-nearest neighbours connectivity matrix of the samples.
n_clusters : int, default=8
Number of clusters to extract.
n_components : int, default=n_clusters
Number of eigenvectors to use for the spectral embedding.
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
The eigenvalue decomposition method. If None then ``'arpack'`` is used.
See [4]_ for more details regarding ``'lobpcg'``.
Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
Algebraic MultiGrid preconditioning and requires pyamg to be installed.
It can be faster on very large sparse problems [6]_ and [7]_.
random_state : int, RandomState instance, default=None
A pseudo random number generator used for the initialization
of the lobpcg eigenvectors decomposition when `eigen_solver ==
'amg'`, and for the K-Means initialization. Use an int to make
the results deterministic across calls (See
:term:`Glossary <random_state>`).
.. note::
When using `eigen_solver == 'amg'`,
it is necessary to also fix the global numpy seed with
`np.random.seed(int)` to get deterministic results. See
https://github.com/pyamg/pyamg/issues/139 for further
information.
n_init : int, default=10
Number of time the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of n_init
consecutive runs in terms of inertia. Only used if
``assign_labels='kmeans'``.
eigen_tol : float, default="auto"
Stopping criterion for eigendecomposition of the Laplacian matrix.
If `eigen_tol="auto"` then the passed tolerance will depend on the
`eigen_solver`:
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
`eigen_tol=None` which configures the underlying `lobpcg` solver to
automatically resolve the value according to their heuristics. See,
:func:`scipy.sparse.linalg.lobpcg` for details.
Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
values of `tol<1e-5` may lead to convergence issues and should be
avoided.
.. versionadded:: 1.2
Added 'auto' option.
assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
The strategy to use to assign labels in the embedding
space. There are three ways to assign labels after the Laplacian
embedding. k-means can be applied and is a popular choice. But it can
also be sensitive to initialization. Discretization is another
approach which is less sensitive to random initialization [3]_.
The cluster_qr method [5]_ directly extracts clusters from eigenvectors
in spectral clustering. In contrast to k-means and discretization, cluster_qr
has no tuning parameters and is not an iterative method, yet may outperform
k-means and discretization in terms of both quality and speed.
.. versionchanged:: 1.1
Added new labeling method 'cluster_qr'.
verbose : bool, default=False
Verbosity mode.
.. versionadded:: 0.24
Returns
-------
labels : array of integers, shape: n_samples
The labels of the clusters.
Notes
-----
The graph should contain only one connected component; otherwise
the results make little sense.
This algorithm solves the normalized cut for `k=2`: it is a
normalized spectral clustering.
References
----------
.. [1] :doi:`Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
<10.1109/34.868688>`
.. [2] :doi:`A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
<10.1007/s11222-007-9033-z>`
.. [3] `Multiclass spectral clustering, 2003
Stella X. Yu, Jianbo Shi
<https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_
.. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
A. V. Knyazev
SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
<10.1137/S1064827500366124>`
.. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
Anil Damle, Victor Minden, Lexing Ying
<10.1093/imaiai/iay008>`
.. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning
for computing eigenvalues of graph Laplacians in image segmentation, 2006
Andrew Knyazev
<10.13140/RG.2.2.35280.02565>`
.. [7] :doi:`Preconditioned spectral clustering for stochastic block partition
streaming graph challenge (Preliminary version at arXiv.)
David Zhuzhunashvili, Andrew Knyazev
<10.1109/HPEC.2017.8091045>`
"""
if assign_labels not in ("kmeans", "discretize", "cluster_qr"):
raise ValueError(
"The 'assign_labels' parameter should be "
"'kmeans' or 'discretize', or 'cluster_qr', "
f"but {assign_labels!r} was given"
)
if isinstance(affinity, np.matrix):
raise TypeError(
"spectral_clustering does not support passing in affinity as an "
"np.matrix. Please convert to a numpy array with np.asarray. For "
"more information see: "
"https://numpy.org/doc/stable/reference/generated/numpy.matrix.html", # noqa
)
random_state = check_random_state(random_state)
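# Default the embedding dimension to the number of clusters sought.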
n_components = n_clusters if n_components is None else n_components
# We now obtain the real valued solution matrix to the
# relaxed Ncut problem, solving the eigenvalue problem
# L_sym x = lambda x and recovering u = D^-1/2 x.
# The first eigenvector is constant only for fully connected graphs
# and should be kept for spectral clustering (drop_first = False)
# See spectral_embedding documentation.
maps = spectral_embedding(
affinity,
n_components=n_components,
eigen_solver=eigen_solver,
random_state=random_state,
eigen_tol=eigen_tol,
drop_first=False,
)
if verbose:
print(f"Computing label assignment using {assign_labels}")
if assign_labels == "kmeans":
_, labels, _ = k_means(
maps, n_clusters, random_state=random_state, n_init=n_init, verbose=verbose
)
elif assign_labels == "cluster_qr":
labels = cluster_qr(maps)
else:
labels = discretize(maps, random_state=random_state)
return labels
class SpectralClustering(ClusterMixin, BaseEstimator):
"""Apply clustering to a projection of the normalized Laplacian.
In practice Spectral Clustering is very useful when the structure of
the individual clusters is highly non-convex, or more generally when
a measure of the center and spread of the cluster is not a suitable
description of the complete cluster, such as when clusters are
nested circles on the 2D plane.
If the affinity matrix is the adjacency matrix of a graph, this method
can be used to find normalized graph cuts [1]_, [2]_.
When calling ``fit``, an affinity matrix is constructed using either
a kernel function such as the Gaussian (aka RBF) kernel with Euclidean
distance ``d(X, X)``::
np.exp(-gamma * d(X,X) ** 2)
or a k-nearest neighbors connectivity matrix.
Alternatively, a user-provided affinity matrix can be specified by
setting ``affinity='precomputed'``.
Read more in the :ref:`User Guide <spectral_clustering>`.
Parameters
----------
n_clusters : int, default=8
The dimension of the projection subspace.
eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems,
but may also lead to instabilities. If None, then ``'arpack'`` is
used. See [4]_ for more details regarding `'lobpcg'`.
n_components : int, default=None
Number of eigenvectors to use for the spectral embedding. If None,
defaults to `n_clusters`.
random_state : int, RandomState instance, default=None
A pseudo random number generator used for the initialization
of the lobpcg eigenvectors decomposition when `eigen_solver ==
'amg'`, and for the K-Means initialization. Use an int to make
the results deterministic across calls (See
:term:`Glossary <random_state>`).
.. note::
When using `eigen_solver == 'amg'`,
it is necessary to also fix the global numpy seed with
`np.random.seed(int)` to get deterministic results. See
https://github.com/pyamg/pyamg/issues/139 for further
information.
n_init : int, default=10
Number of times the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of n_init
consecutive runs in terms of inertia. Only used if
``assign_labels='kmeans'``.
gamma : float, default=1.0
Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
Ignored for ``affinity='nearest_neighbors'``.
affinity : str or callable, default='rbf'
How to construct the affinity matrix.
- 'nearest_neighbors': construct the affinity matrix by computing a
graph of nearest neighbors.
- 'rbf': construct the affinity matrix using a radial basis function
(RBF) kernel.
- 'precomputed': interpret ``X`` as a precomputed affinity matrix,
where larger values indicate greater similarity between instances.
- 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph
of precomputed distances, and construct a binary affinity matrix
from the ``n_neighbors`` nearest neighbors of each instance.
- one of the kernels supported by
:func:`~sklearn.metrics.pairwise_kernels`.
Only kernels that produce similarity scores (non-negative values that
increase with similarity) should be used. This property is not checked
by the clustering algorithm.
n_neighbors : int, default=10
Number of neighbors to use when constructing the affinity matrix using
the nearest neighbors method. Ignored for ``affinity='rbf'``.
eigen_tol : float, default="auto"
Stopping criterion for eigendecomposition of the Laplacian matrix.
If `eigen_tol="auto"` then the passed tolerance will depend on the
`eigen_solver`:
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
`eigen_tol=None`, which configures the underlying `lobpcg` solver to
resolve the value automatically according to its heuristics. See
:func:`scipy.sparse.linalg.lobpcg` for details.
Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`,
values of `tol<1e-5` may lead to convergence issues and should be
avoided.
.. versionadded:: 1.2
Added 'auto' option.
assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
The strategy for assigning labels in the embedding space. There are
three ways to assign labels after the Laplacian embedding. k-means is
a popular choice, but it can be sensitive to initialization.
Discretization is another approach which is less sensitive to random
initialization [3]_.
The cluster_qr method [5]_ directly extracts clusters from eigenvectors
in spectral clustering. In contrast to k-means and discretization, cluster_qr
has no tuning parameters and runs no iterations, yet may outperform
k-means and discretization in terms of both quality and speed.
.. versionchanged:: 1.1
Added new labeling method 'cluster_qr'.
degree : float, default=3
Degree of the polynomial kernel. Ignored by other kernels.
coef0 : float, default=1
Zero coefficient for polynomial and sigmoid kernels.
Ignored by other kernels.
kernel_params : dict of str to any, default=None
Parameters (keyword arguments) and values for kernel passed as
callable object. Ignored by other kernels.
n_jobs : int, default=None
The number of parallel jobs to run when `affinity='nearest_neighbors'`
or `affinity='precomputed_nearest_neighbors'`. The neighbors search
will be done in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : bool, default=False
Verbosity mode.
.. versionadded:: 0.24
Attributes
----------
affinity_matrix_ : array-like of shape (n_samples, n_samples)
Affinity matrix used for clustering. Available only after calling
``fit``.
labels_ : ndarray of shape (n_samples,)
Labels of each point.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
sklearn.cluster.KMeans : K-Means clustering.
sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of
Applications with Noise.
Notes
-----
A distance matrix for which 0 indicates identical elements and high values
indicate very dissimilar elements can be transformed into an affinity /
similarity matrix that is well-suited for the algorithm by
applying the Gaussian (aka RBF, heat) kernel (a sketch appears in the
Examples section below)::
np.exp(- dist_matrix ** 2 / (2. * delta ** 2))
where ``delta`` is a free parameter representing the width of the Gaussian
kernel.
An alternative is to take a symmetric version of the k-nearest neighbors
connectivity matrix of the points.
If the pyamg package is installed and ``eigen_solver='amg'`` is
selected, algebraic multigrid is used to precondition the eigenproblem,
which can greatly speed up computation on large, sparse problems.
References
----------
.. [1] :doi:`Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
<10.1109/34.868688>`
.. [2] :doi:`A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
<10.1007/s11222-007-9033-z>`
.. [3] `Multiclass spectral clustering, 2003
Stella X. Yu, Jianbo Shi
<https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_
.. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
A. V. Knyazev
SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
<10.1137/S1064827500366124>`
.. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
Anil Damle, Victor Minden, Lexing Ying
<10.1093/imaiai/iay008>`
Examples
--------
>>> from sklearn.cluster import SpectralClustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralClustering(n_clusters=2,
... assign_labels='discretize',
... random_state=0).fit(X)
>>> clustering.labels_
array([1, 1, 1, 0, 0, 0])
>>> clustering
SpectralClustering(assign_labels='discretize', n_clusters=2,
random_state=0)
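A sketch of a precomputed affinity built from a distance matrix via the
heat kernel described in the Notes (``delta = 1.0`` is an arbitrary,
purely illustrative choice):

>>> from sklearn.metrics import pairwise_distances
>>> delta = 1.0
>>> affinity = np.exp(-pairwise_distances(X) ** 2 / (2.0 * delta**2))
>>> pre = SpectralClustering(n_clusters=2, affinity='precomputed',
...                          random_state=0).fit(affinity)
>>> pre.labels_.shape
(6,)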
"""
_parameter_constraints: dict = {
"n_clusters": [Interval(Integral, 1, None, closed="left")],
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
"n_components": [Interval(Integral, 1, None, closed="left"), None],
"random_state": ["random_state"],
"n_init": [Interval(Integral, 1, None, closed="left")],
"gamma": [Interval(Real, 0, None, closed="left")],
"affinity": [
callable,
StrOptions(
set(KERNEL_PARAMS)
| {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
),
],
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
"eigen_tol": [
Interval(Real, 0.0, None, closed="left"),
StrOptions({"auto"}),
],
"assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
"degree": [Interval(Integral, 0, None, closed="left")],
"coef0": [Interval(Real, None, None, closed="neither")],
"kernel_params": [dict, None],
"n_jobs": [Integral, None],
"verbose": ["verbose"],
}
def __init__(
self,
n_clusters=8,
*,
eigen_solver=None,
n_components=None,
random_state=None,
n_init=10,
gamma=1.0,
affinity="rbf",
n_neighbors=10,
eigen_tol="auto",
assign_labels="kmeans",
degree=3,
coef0=1,
kernel_params=None,
n_jobs=None,
verbose=False,
):
self.n_clusters = n_clusters
self.eigen_solver = eigen_solver
self.n_components = n_components
self.random_state = random_state
self.n_init = n_init
self.gamma = gamma
self.affinity = affinity
self.n_neighbors = n_neighbors
self.eigen_tol = eigen_tol
self.assign_labels = assign_labels
self.degree = degree
self.coef0 = coef0
self.kernel_params = kernel_params
self.n_jobs = n_jobs
self.verbose = verbose
def fit(self, X, y=None):
"""Perform spectral clustering from features, or affinity matrix.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
Training instances to cluster, similarities / affinities between
instances if ``affinity='precomputed'``, or distances between
instances if ``affinity='precomputed_nearest_neighbors'``. If a
sparse matrix is provided in a format other than ``csr_matrix``,
``csc_matrix``, or ``coo_matrix``, it will be converted into a
sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : object
A fitted instance of the estimator.
"""
self._validate_params()
X = self._validate_data(
X,
accept_sparse=["csr", "csc", "coo"],
dtype=np.float64,
ensure_min_samples=2,
)
allow_squared = self.affinity in [
"precomputed",
"precomputed_nearest_neighbors",
]
if X.shape[0] == X.shape[1] and not allow_squared:
warnings.warn(
"The spectral clustering API has changed. ``fit``"
"now constructs an affinity matrix from data. To use"
" a custom affinity matrix, "
"set ``affinity=precomputed``."
)
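# Construct the affinity matrix according to the ``affinity`` parameter:
# a symmetrized kNN graph, the precomputed input itself, or a pairwise
# kernel evaluated on the features.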
if self.affinity == "nearest_neighbors":
connectivity = kneighbors_graph(
X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
)
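# Symmetrize: the kNN graph is directed in general (i may be among
# j's nearest neighbors without the converse), so average the
# connectivity matrix with its transpose.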
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
elif self.affinity == "precomputed_nearest_neighbors":
estimator = NearestNeighbors(
n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
).fit(X)
connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
elif self.affinity == "precomputed":
self.affinity_matrix_ = X
else:
params = self.kernel_params
if params is None:
params = {}
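# gamma, degree and coef0 are forwarded only for named kernels; a
# callable affinity receives just the user-supplied ``kernel_params``.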
if not callable(self.affinity):
params["gamma"] = self.gamma
params["degree"] = self.degree
params["coef0"] = self.coef0
self.affinity_matrix_ = pairwise_kernels(
X, metric=self.affinity, filter_params=True, **params
)
random_state = check_random_state(self.random_state)
self.labels_ = spectral_clustering(
self.affinity_matrix_,
n_clusters=self.n_clusters,
n_components=self.n_components,
eigen_solver=self.eigen_solver,
random_state=random_state,
n_init=self.n_init,
eigen_tol=self.eigen_tol,
assign_labels=self.assign_labels,
verbose=self.verbose,
)
return self
def fit_predict(self, X, y=None):
"""Perform spectral clustering on `X` and return cluster labels.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
Training instances to cluster, similarities / affinities between
instances if ``affinity='precomputed'``, or distances between
instances if ``affinity='precomputed_nearest_neighbors'``. If a
sparse matrix is provided in a format other than ``csr_matrix``,
``csc_matrix``, or ``coo_matrix``, it will be converted into a
sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels.
"""
return super().fit_predict(X, y)
def _more_tags(self):
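# The "pairwise" tag marks the input as an (n_samples, n_samples)
# matrix so that model-selection utilities slice rows and columns
# consistently when the affinity is precomputed.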
return {
"pairwise": self.affinity
in ["precomputed", "precomputed_nearest_neighbors"]
}

View file

@ -0,0 +1,38 @@
"""
Common utilities for testing clustering.
"""
import numpy as np
###############################################################################
# Generate sample data
def generate_clustered_data(
seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4
):
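"""Generate isotropic Gaussian blobs shifted away from the origin.

A minimal doctest sketch (the shape follows from the defaults:
``n_clusters * n_samples_per_cluster`` rows, ``n_features`` columns):

>>> X = generate_clustered_data(n_clusters=3, n_samples_per_cluster=20)
>>> X.shape
(60, 2)
"""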
prng = np.random.RandomState(seed)
# the data is deliberately shifted away from zero to check the
# clustering algorithm's robustness to non-centered data
means = (
np.array(
[
[1, 1, 1, 0],
[-1, -1, 0, 1],
[1, -1, 1, 1],
[-1, 1, 1, 0],
]
)
+ 10
)
X = np.empty((0, n_features))
for i in range(n_clusters):
X = np.r_[
X,
means[i][:n_features] + std * prng.randn(n_samples_per_cluster, n_features),
]
return X
