Skip to content

ENH improve _weighted_percentile to provide several interpolation #17768

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
347 changes: 315 additions & 32 deletions sklearn/utils/stats.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,266 @@
from collections.abc import Iterable

import numpy as np

from .extmath import stable_cumsum
from .fixes import _take_along_axis


def _weighted_percentile(array, sample_weight, percentile=50):
"""Compute weighted percentile
def _squeeze_arr(arr, n_dim):
return arr[0] if n_dim == 1 else arr


def _nearest_rank(
array,
sorted_idx,
percentile,
cum_weights,
n_rows,
n_cols,
n_dim,
):
"""Compute weighted percentile using the nearest-rank approach.

Percentile can be computed using the nearest-rank:
https://en.wikipedia.org/wiki/Percentile

n = ceil(P / 100 * N)

where n is the index of the percentile value of the percentile P and N is
the number of samples.

This definition can be adapted with weights:

n_w = ceil(P / 100 * S_w) where S_w is the sum of the weights.
"""
adjusted_percentile = percentile * cum_weights[-1]

# For percentile=0, ignore leading observations with sample_weight=0. GH20528
mask = adjusted_percentile == 0
adjusted_percentile[mask] = np.nextafter(
adjusted_percentile[mask], adjusted_percentile[mask] + 1
)

percentile_idx = np.array(
[
np.searchsorted(cum_weights[:, i], adjusted_percentile[i])
for i in range(cum_weights.shape[1])
]
)
percentile_idx = np.array(percentile_idx)
# In rare cases, percentile_idx equals to sorted_idx.shape[0]
max_idx = n_rows - 1
percentile_idx = np.apply_along_axis(
lambda x: np.clip(x, 0, max_idx), axis=0, arr=percentile_idx
)

col_index = np.arange(n_cols)
percentile_in_sorted = sorted_idx[percentile_idx, col_index]
percentile_value = array[percentile_in_sorted, col_index]
return _squeeze_arr(percentile_value, n_dim)


def _interpolation_closest_ranks(
array,
sorted_idx,
sample_weight,
sorted_weights,
percentile,
cum_weights,
interpolation,
n_rows,
n_cols,
n_dim,
):
"""Compute the weighted percentile by interpolating between the adjacent
ranks.

Percentile can be computed with 3 different alternatives when using the
interpolation between adjacent ranks:
https://en.wikipedia.org/wiki/Percentile

These 3 alternatives depend of the value of a parameter C. NumPy uses
the variant where C=1 which allows to obtain a strictly monotonically
increasing function which is defined as:

P = (x - 1) / (N - 1),

where x in [1, N]

Weighted percentile change this formula by taking into account the
weights instead of the data frequency.

P_w_i = (x - w_i) / (S_w - w_i),

where x in [1, N], w_i being the weight of sample i and S_w being the sum
of the weights.
"""
Copy link
Member

@lucyleeow lucyleeow Jun 30, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the desire to be consistent with numpy implementation of C=0 C=1 but I do think extrapolating to samples with weights causes problems. (copied over from old PR) It 'ignores' the last weight so, it doesn't give the same answer as with np.percentile and repeating the array by integer weights:

>>> _weighted_percentile(np.array([1, 2, 3, 4]), sample_weight=np.array([1, 1, 1, 5]),
...                      percentile=50, interpolation="linear")
3.3
>>> np.percentile(np.array([1, 2, 3, 4, 4, 4, 4, 4]), 50, interpolation='linear')
4.0
>>> np.median(np.array([1, 2, 3, 4, 4, 4, 4, 4]))
4.0

'ignoring' the last weight also means that these 2 have the same result:

>>> _weighted_percentile(np.array([1, 2, 3, 4]), sample_weight=np.array([1, 1, 1, 5]),
...                      percentile=50, interpolation="linear")
3.3
>>> _weighted_percentile(np.array([1, 2, 3, 4]), sample_weight=np.array([1, 1, 5, 5]),
...                      percentile=50, interpolation="linear")
3.3

It (C=1) doesn't have the same properties when implemented for unit weights. I am not sure when it would be desirable to use this implementation.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The repeating issue will not be a big deal because we are not going to use the linear interpolation to keep this property.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the case

>>> _weighted_percentile(np.array([1, 2, 3, 4]), sample_weight=np.array([1, 1, 1, 5]),
...                      percentile=50, interpolation="linear")

the percentile requested is between 3 and 4. It will be computed using linear interpolation. In this case, it is weird because we would have the intuition that the 4 should be more attractive. But then it means that linear interpolation is probably that we don't want with weighted percentile but we would expect a more non-linear interpolation.


def _fill_nan(array, value):
nan_mask = np.isnan(array)
array[nan_mask] = value
return array

adjusted_percentile = cum_weights - sorted_weights
with np.errstate(invalid="ignore"):
adjusted_percentile /= cum_weights[-1] - sorted_weights
adjusted_percentile = _fill_nan(adjusted_percentile, 1)

percentile_higher_idx = np.array(
[
np.searchsorted(
adjusted_percentile[:, col],
percentile[col],
side="left",
)
for col in range(n_cols)
]
)

if interpolation != "higher":
# other interpolations which requires the lower percentile indice
percentile_lower_idx = np.array(
[
idx - 1 if val < 1 else idx
for idx, val in zip(percentile_higher_idx, percentile)
]
)

if interpolation == "nearest":

def _compute_abs_error(p_measured, p_desired, idx, col):
return abs(p_measured[idx[col], col] - p_desired[col])

error_higher = np.array(
[
_compute_abs_error(
p_measured=adjusted_percentile,
p_desired=percentile,
idx=percentile_higher_idx,
col=col,
)
for col in range(n_cols)
]
)
error_lower = np.array(
[
_compute_abs_error(
p_measured=adjusted_percentile,
p_desired=percentile,
idx=percentile_lower_idx,
col=col,
)
for col in range(n_cols)
]
)

percentile_idx = np.where(
error_higher >= error_lower, percentile_lower_idx, percentile_higher_idx
)
elif interpolation == "lower":
percentile_idx = percentile_lower_idx
else:
percentile_idx = percentile_higher_idx

if interpolation in ("weighted", "linear"):
percentile_value_lower = array[
sorted_idx[percentile_lower_idx, np.arange(n_cols)], np.arange(n_cols)
]
percentile_value_higher = array[
sorted_idx[percentile_higher_idx, np.arange(n_cols)], np.arange(n_cols)
]

if interpolation == "weighted":
weight_higher = sorted_weights[percentile_higher_idx, np.arange(n_cols)]
weight_lower = sorted_weights[percentile_lower_idx, np.arange(n_cols)]
weight_normalizer = weight_higher + weight_lower
with np.errstate(invalid="ignore"):
weight_higher = weight_higher / weight_normalizer
weight_higher = _fill_nan(weight_higher, 0)
weight_lower = weight_lower / weight_normalizer
weight_lower = _fill_nan(weight_lower, 0)

else:
# linear interpolation does not use weights
weight_lower = np.zeros(shape=(n_cols,))
weight_higher = np.zeros(shape=(n_cols,))

def _weighted_interpolation(
x,
x_low,
x_high,
y_low,
y_high,
w_low,
w_high,
):
"""Weighted interpolation between 2 values.

If the weights are null, then it is equivalent to a linear
interpolation.
"""
with np.errstate(invalid="ignore"):
percentile_value = y_low + ((1 - w_low) * (x - x_low)) * (
(y_high - y_low)
/ ((1 - w_high) * (x_high - x) + (1 - w_low) * (x - x_low))
)
if np.isnan(percentile_value):
# case that percentile is 100
return y_high
return percentile_value

percentile_value = np.array(
[
_weighted_interpolation(
x=percentile[col],
x_low=adjusted_percentile[percentile_lower_idx[col], col],
x_high=adjusted_percentile[percentile_higher_idx[col], col],
y_low=percentile_value_lower[col],
y_high=percentile_value_higher[col],
w_low=weight_lower[col],
w_high=weight_higher[col],
)
for col in range(n_cols)
]
)
else:
# no need to interpolate
percentile_idx = np.apply_along_axis(
lambda x: np.clip(x, 0, n_rows - 1), axis=0, arr=percentile_idx
)

percentile_value = array[
sorted_idx[percentile_idx, np.arange(n_cols)], np.arange(n_cols)
]

percentile_value = _squeeze_arr(percentile_value, n_dim)

single_sample_weight = np.count_nonzero(sample_weight, axis=0)
if np.any(single_sample_weight == 1):
# edge case where a single weight is non-null in which case the
# previous methods will fail
if not isinstance(percentile_value, Iterable):
percentile_value = _squeeze_arr(array[np.nonzero(sample_weight)], n_dim)
else:
percentile_value = np.array(
[
array[np.flatnonzero(sample_weight[:, col])[0], col]
if n_nonzero == 1
else percentile_value[col]
for col, n_nonzero in enumerate(single_sample_weight)
]
)

return percentile_value


def _weighted_percentile(
    array,
    sample_weight,
    percentile=50,
    interpolation=None,
):
    """Compute weighted percentile.

    Computes lower weighted percentile. If `array` is a 2D array, the
    `percentile` is computed along the axis 0.

    Parameters
    ----------
    array : ndarray of shape (n,) or (n, m)
        Values to take the weighted percentile of.

    sample_weight : ndarray of shape (n,) or (n, m)
        Weights for each value in `array`. Must be same shape as `array` or
        of shape `(array.shape[0],)`.

    percentile : int or float, default=50
        Percentile to compute. Must be value between 0 and 100.

    interpolation : {"linear", "weighted", "lower", "higher", "nearest"}, \
            default=None
        The interpolation method to use when the percentile lies between
        data points `i` and `j`:

        * None: no interpolation will be done and the "nearest-rank" method
          will be used (default);
        * "linear": linearly interpolate between `i` and `j`;
        * "weighted": interpolate between `i` and `j` taking into account the
          weights assigned to `i` and `j`;
        * "lower": `i`;
        * "higher": `j`;
        * "nearest": `i` or `j`, whichever is nearest.

        .. versionadded:: 0.24

    Returns
    -------
    percentile_value : float or int if `array` of shape (n,), otherwise \
            ndarray of shape (m,)
        Weighted percentile.

    Raises
    ------
    ValueError
        If `interpolation` is not a supported strategy, or if any column of
        `sample_weight` is entirely zero.
    """
    possible_interpolation = ("linear", "lower", "higher", "nearest", "weighted", None)
    if interpolation not in possible_interpolation:
        raise ValueError(
            "'interpolation' should be one of "
            f"{', '.join([str(val) for val in possible_interpolation])}. "
            f"Got '{interpolation}' instead."
        )

    if np.any(np.count_nonzero(sample_weight, axis=0) < 1):
        raise ValueError(
            "All weights cannot be null when computing a weighted percentile."
        )

    n_dim = array.ndim
    if n_dim == 0:
        # 0-d array: the percentile of a single value is that value.
        return array[()]
    elif n_dim == 1:
        # Internally work on a 2D (n, 1) view; squeezed back on return.
        array = array.reshape((-1, 1))
    # When sample_weight is 1D, repeat it for each column of `array`.
    if array.shape != sample_weight.shape and array.shape[0] == sample_weight.shape[0]:
        sample_weight = np.tile(sample_weight, (array.shape[1], 1)).T

    n_rows, n_cols = array.shape
    sorted_idx = np.argsort(array, axis=0)
    sorted_weights = _take_along_axis(sample_weight, sorted_idx, axis=0)
    # Express the requested percentile as a per-column fraction in [0, 1].
    percentile = np.array([percentile / 100] * n_cols)
    cum_weights = stable_cumsum(sorted_weights, axis=0)

    if interpolation is None:
        return _nearest_rank(
            array,
            sorted_idx,
            percentile,
            cum_weights,
            n_rows,
            n_cols,
            n_dim,
        )
    return _interpolation_closest_ranks(
        array,
        sorted_idx,
        sample_weight,
        sorted_weights,
        percentile,
        cum_weights,
        interpolation,
        n_rows,
        n_cols,
        n_dim,
    )
Loading