import numpy as np
from scipy.special import betainc
from scipy._lib._array_api import xp_ravel, array_namespace, xp_promote
import scipy._lib.array_api_extra as xpx
from scipy.stats._axis_nan_policy import _broadcast_arrays, _contains_nan
from scipy.stats._stats_py import _length_nonmasked


def _quantile_iv(x, p, method, axis, nan_policy, keepdims):
    xp = array_namespace(x, p)

    if not xp.isdtype(xp.asarray(x).dtype, ('integral', 'real floating')):
        raise ValueError("`x` must have real dtype.")

    if not xp.isdtype(xp.asarray(p).dtype, 'real floating'):
        raise ValueError("`p` must have real floating dtype.")

    x, p = xp_promote(x, p, force_floating=True, xp=xp)
    dtype = x.dtype

    axis_none = axis is None
    ndim = max(x.ndim, p.ndim)
    if axis_none:
        x = xp_ravel(x)
        p = xp_ravel(p)
        axis = 0
    elif np.iterable(axis) or int(axis) != axis:
        message = "`axis` must be an integer or None."
        raise ValueError(message)
    elif (axis >= ndim) or (axis < -ndim):
        message = "`axis` is not compatible with the shapes of the inputs."
        raise ValueError(message)
    axis = int(axis)

    methods = {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
               'hazen', 'interpolated_inverted_cdf', 'linear',
               'median_unbiased', 'normal_unbiased', 'weibull',
               'harrell-davis'}
    if method not in methods:
        message = f"`method` must be one of {methods}"
        raise ValueError(message)

    contains_nans = _contains_nan(x, nan_policy, xp_omit_okay=True, xp=xp)

    if keepdims not in {None, True, False}:
        message = "If specified, `keepdims` must be True or False."
        raise ValueError(message)

    # If data has length zero along `axis`, the result will be an array of NaNs just
    # as if the data had length 1 along axis and were filled with NaNs. This is treated
    # naturally below whether `nan_policy` is `'propagate'` or `'omit'`.
    if x.shape[axis] == 0:
        shape = list(x.shape)
        shape[axis] = 1
        x = xp.full(shape, xp.asarray(xp.nan, dtype=dtype))

    y = xp.sort(x, axis=axis)
    y, p = _broadcast_arrays((y, p), axis=axis)

    if (keepdims is False) and (p.shape[axis] != 1):
        message = "`keepdims` may be False only if the length of `p` along `axis` is 1."
        raise ValueError(message)
    keepdims = (p.shape[axis] != 1) if keepdims is None else keepdims

    y = xp.moveaxis(y, axis, -1)
    p = xp.moveaxis(p, axis, -1)

    n = _length_nonmasked(y, -1, xp=xp, keepdims=True)
    n = xp.asarray(n, dtype=dtype)
    if contains_nans:
        nans = xp.isnan(y)

        # Note that if length along `axis` were 0 to begin with,
        # it is now length 1 and filled with NaNs.
        if nan_policy == 'propagate':
            nan_out = xp.any(nans, axis=-1)
        else:  # 'omit'
            non_nan = xp.astype(~nans, xp.uint64)
            n_int = xp.sum(non_nan, axis=-1, keepdims=True)
            n = xp.astype(n_int, dtype)
            # NaNs are produced only if slice is empty after removing NaNs
            nan_out = xp.any(n == 0, axis=-1)
            n = xpx.at(n, nan_out).set(y.shape[-1])  # avoids pytorch/pytorch#146211

        if xp.any(nan_out):
            y = xp.asarray(y, copy=True)  # ensure writable
            y = xpx.at(y, nan_out).set(xp.nan)
        elif xp.any(nans) and method == 'harrell-davis':
            y = xp.asarray(y, copy=True)  # ensure writable
            y = xpx.at(y, nans).set(0)  # any non-nan will prevent NaN from propagating

    p_mask = (p > 1) | (p < 0) | xp.isnan(p)
    if xp.any(p_mask):
        p = xp.asarray(p, copy=True)
        p = xpx.at(p, p_mask).set(0.5)  # these get NaN-ed out at the end

    return y, p, method, axis, nan_policy, keepdims, n, axis_none, ndim, p_mask, xp


def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims=None):
    """
    Compute the p-th quantile of the data along the specified axis.

    Parameters
    ----------
    x : array_like of real numbers
        Data array.
    p : array_like of float
        Probability or sequence of probabilities of the quantiles to compute.
        Values must be between 0 and 1 (inclusive).
        Must have length 1 along `axis` unless ``keepdims=True``.
    method : str, default: 'linear'
        The method to use for estimating the quantile.
        The available options, numbered as they appear in [1]_, are:

        1. 'inverted_cdf'
        2. 'averaged_inverted_cdf'
        3. 'closest_observation'
        4. 'interpolated_inverted_cdf'
        5. 'hazen'
        6. 'weibull'
        7. 'linear' (default)
        8. 'median_unbiased'
        9. 'normal_unbiased'

        'harrell-davis' is also available to compute the quantile estimate
        according to [2]_.
        See Notes for details.
    axis : int or None, default: 0
        Axis along which the quantiles are computed.
        ``None`` ravels both `x` and `p` before performing the calculation,
        without checking whether the original shapes were compatible.
    nan_policy : str, default: 'propagate'
        Defines how to handle NaNs in the input data `x`.

        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
          which the statistic is computed, the corresponding slice of the output
          will contain NaN(s).
        - ``omit``: NaNs will be omitted when performing the calculation.
          If insufficient data remains in the axis slice along which the
          statistic is computed, the corresponding slice of the output will
          contain NaN(s).
        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.

        If NaNs are present in `p`, the corresponding entries of the output
        will be NaN.
    keepdims : bool, optional
        Consider the case in which `x` is 1-D and `p` is a scalar: the quantile
        is a reducing statistic, and the default behavior is to return a scalar.
        If `keepdims` is set to True, the axis will not be reduced away, and the
        result will be a 1-D array with one element.

        The general case is more subtle, since multiple quantiles may be
        requested for each axis-slice of `x`. For instance, if both `x` and `p`
        are 1-D and ``p.size > 1``, no axis can be reduced away; there must be an
        axis to contain the number of quantiles given by ``p.size``. Therefore:

        - By default, the axis will be reduced away if possible (i.e. if there is
          exactly one element of `p` per axis-slice of `x`).
        - If `keepdims` is set to True, the axis will not be reduced away.
        - If `keepdims` is set to False, the axis will be reduced away
          if possible, and an error will be raised otherwise.

    Returns
    -------
    quantile : scalar or ndarray
        The resulting quantile(s). The dtype is the result dtype of `x` and `p`.

    Notes
    -----
    Given a sample `x` from an underlying distribution, `quantile` provides a
    nonparametric estimate of the inverse cumulative distribution function.

    By default, this is done by interpolating between adjacent elements in
    ``y``, a sorted copy of `x`::

        (1-g)*y[j] + g*y[j+1]

    where the index ``j`` and coefficient ``g`` are the integral and
    fractional components of ``p * (n-1)``, and ``n`` is the number of
    elements in the sample.

    This is a special case of Equation 1 of H&F [1]_. More generally,

    - ``j = (p*n + m - 1) // 1``, and
    - ``g = (p*n + m - 1) % 1``,

    where ``m`` may be defined according to several different conventions.
    The preferred convention may be selected using the ``method`` parameter:

    =============================== =============== ===============
    ``method``                      number in H&F   ``m``
    =============================== =============== ===============
    ``interpolated_inverted_cdf``   4               ``0``
    ``hazen``                       5               ``1/2``
    ``weibull``                     6               ``p``
    ``linear`` (default)            7               ``1 - p``
    ``median_unbiased``             8               ``p/3 + 1/3``
    ``normal_unbiased``             9               ``p/4 + 3/8``
    =============================== =============== ===============

    Note that indices ``j`` and ``j + 1`` are clipped to the range ``0`` to
    ``n - 1`` when the results of the formula would be outside the allowed
    range of non-negative indices. When ``j`` is clipped to zero, ``g`` is
    set to zero as well. The ``-1`` in the formulas for ``j`` and ``g``
    accounts for Python's 0-based indexing.
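
    For example, with ``n = 5`` and ``p = 0.35``, the default ``'linear'``
    method gives ``p*(n-1) = 1.4``, so ``j = 1``, ``g = 0.4``, and the
    estimate is ``0.6*y[1] + 0.4*y[2]``. A minimal NumPy sketch of this
    computation (for illustration only; it omits the broadcasting and NaN
    handling performed by `quantile`)::

        import numpy as np

        y = np.sort(np.asarray([4., 5., 7., 8., 10.]))
        p, n = 0.35, y.shape[-1]
        m = 1 - p                                    # 'linear' (H&F 7)
        jg = p*n + m - 1                             # 1.4
        j, g = int(jg // 1), jg % 1                  # j = 1, g = 0.4
        est = (1 - g)*y[j] + g*y[min(j + 1, n - 1)]  # 0.6*5 + 0.4*7 = 5.8
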

    The table above includes only the estimators from [1]_ that are continuous
    functions of probability `p` (estimators 4-9). SciPy also provides the
    three discontinuous estimators from [1]_ (estimators 1-3), where ``j`` is
    defined as above, ``m`` is defined as follows, and ``g`` is ``0`` when
    ``index = p*n + m - 1`` is less than ``0`` and otherwise is defined below.

    1. ``inverted_cdf``: ``m = 0`` and ``g = int(index - j > 0)``
    2. ``averaged_inverted_cdf``: ``m = 0`` and
       ``g = (1 + int(index - j > 0)) / 2``
    3. ``closest_observation``: ``m = -1/2`` and
       ``g = 1 - int((index == j) & (j%2 == 1))``
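
    For example, with ``n = 5`` and ``p = 0.4``, both ``'inverted_cdf'`` and
    ``'averaged_inverted_cdf'`` give ``index = 1`` exactly; the former then
    has ``g = 0`` and returns ``y[1]``, while the latter has ``g = 1/2`` and
    returns ``(y[1] + y[2])/2``.
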

    A different strategy for computing quantiles from [2]_, ``method='harrell-davis'``,
    uses a weighted combination of all elements. The weights are computed as:

    .. math::

        w_{n, i} = I_{i/n}(a, b) - I_{(i - 1)/n}(a, b)

    where :math:`n` is the number of elements in the sample,
    :math:`i` are the indices :math:`1, 2, ..., n-1, n` of the sorted elements,
    :math:`a = p (n + 1)`, :math:`b = (1 - p)(n + 1)`,
    :math:`p` is the probability of the quantile, and
    :math:`I` is the regularized, lower incomplete beta function
    (`scipy.special.betainc`).
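
    For instance, the weights for a sample of size :math:`n = 5` at
    :math:`p = 1/2` can be evaluated directly with `scipy.special.betainc`
    (an illustrative sketch of the formula above, not the internal
    implementation)::

        import numpy as np
        from scipy.special import betainc

        n, p = 5, 0.5
        a, b = p*(n + 1), (1 - p)*(n + 1)
        edges = betainc(a, b, np.arange(n + 1)/n)  # I_{i/n}(a, b) for i = 0, ..., n
        w = edges[1:] - edges[:-1]                 # weights w_{n, i}; they sum to 1
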

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> x = np.asarray([[10, 8, 7, 5, 4],
    ...                 [0, 1, 2, 3, 5]])

    Take the median along the last axis.

    >>> stats.quantile(x, 0.5, axis=-1)
    array([7., 2.])

    Take a different quantile for each row.

    >>> stats.quantile(x, [[0.25], [0.75]], axis=-1, keepdims=True)
    array([[5.],
           [3.]])

    Take multiple quantiles for each row.

    >>> stats.quantile(x, [0.25, 0.75], axis=-1)
    array([[5., 8.],
           [1., 3.]])
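
    With a scalar `p`, the quantile axis is reduced away by default;
    pass ``keepdims=True`` to retain it.

    >>> stats.quantile(x, 0.5, axis=-1).shape
    (2,)
    >>> stats.quantile(x, 0.5, axis=-1, keepdims=True).shape
    (2, 1)

    NaNs in `x` propagate to the output by default; use ``nan_policy='omit'``
    to compute the quantile from the remaining observations.

    >>> x_nan = np.asarray([[1., 2., np.nan, 4., 5.]])
    >>> stats.quantile(x_nan, 0.5, axis=-1)
    array([nan])
    >>> stats.quantile(x_nan, 0.5, axis=-1, nan_policy='omit')
    array([3.])
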

    References
    ----------
    .. [1] R. J. Hyndman and Y. Fan,
           "Sample quantiles in statistical packages,"
           The American Statistician, 50(4), pp. 361-365, 1996.
    .. [2] Harrell, Frank E., and C. E. Davis.
           "A new distribution-free quantile estimator."
           Biometrika 69.3 (1982): 635-640.

    """

    # Input validation / standardization
    temp = _quantile_iv(x, p, method, axis, nan_policy, keepdims)
    y, p, method, axis, nan_policy, keepdims, n, axis_none, ndim, p_mask, xp = temp

    if method in {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
                  'hazen', 'interpolated_inverted_cdf', 'linear',
                  'median_unbiased', 'normal_unbiased', 'weibull'}:
        res = _quantile_hf(y, p, n, method, xp)
    elif method in {'harrell-davis'}:
        res = _quantile_hd(y, p, n, xp)

    res = xpx.at(res, p_mask).set(xp.nan)

    # Reshape per axis/keepdims
    if axis_none and keepdims:
        shape = (1,)*(ndim - 1) + res.shape
        res = xp.reshape(res, shape)
        axis = -1

    res = xp.moveaxis(res, -1, axis)

    if not keepdims:
        res = xp.squeeze(res, axis=axis)

    return res[()] if res.ndim == 0 else res

def _quantile_hf(y, p, n, method, xp):
    ms = dict(inverted_cdf=0, averaged_inverted_cdf=0, closest_observation=-0.5,
              interpolated_inverted_cdf=0, hazen=0.5, weibull=p, linear=1 - p,
              median_unbiased=p/3 + 1/3, normal_unbiased=p/4 + 3/8)
    m = ms[method]
    jg = p*n + m - 1
    j = jg // 1
    g = jg % 1
    if method == 'inverted_cdf':
        g = xp.astype((g > 0), jg.dtype)
    elif method == 'averaged_inverted_cdf':
        g = (1 + xp.astype((g > 0), jg.dtype)) / 2
    elif method == 'closest_observation':
        g = (1 - xp.astype((g == 0) & (j % 2 == 1), jg.dtype))
    if method in {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation'}:
        g = xp.asarray(g)
        g = xpx.at(g, jg < 0).set(0)

    # When `j` is clipped up to 0 below, `g` must also be 0 so the result is `y[0]`;
    # use `xpx.at` rather than in-place assignment for array API compatibility.
    g = xpx.at(g, j < 0).set(0)
    j = xp.clip(j, 0., n - 1)
    jp1 = xp.clip(j + 1, 0., n - 1)

    return ((1 - g) * xp.take_along_axis(y, xp.astype(j, xp.int64), axis=-1)
            + g * xp.take_along_axis(y, xp.astype(jp1, xp.int64), axis=-1))


def _quantile_hd(y, p, n, xp):
    # RE axis handling: We need to perform a reducing operation over rows of `y` for
    # each element in the corresponding row of `p` (a la Cartesian product). Strategy:
    # move rows of `p` to an axis at the front that is orthogonal to all the rest,
    # perform the reducing operation over the last axis, then move the front axis back
    # to the end.
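    # Shapes, for illustration: `p` of shape (..., k) becomes (k, ..., 1); with
    # `y` of shape (..., m), the weights broadcast to (k, ..., m); `vecdot` over
    # the last axis gives (k, ...), and the final `moveaxis` restores (..., k).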
    p = xp.moveaxis(p, -1, 0)[..., xp.newaxis]
    a = p * (n + 1)
    b = (1 - p) * (n + 1)
    i = xp.arange(y.shape[-1] + 1, dtype=y.dtype)
    w = betainc(a, b, i / n)
    w = w[..., 1:] - w[..., :-1]
    w = xpx.at(w, xp.isnan(w)).set(0)
    res = xp.vecdot(w, y, axis=-1)
    return xp.moveaxis(res, 0, -1)