-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
ENH improve _weighted_percentile to provide several interpolation #17768
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b2abd20
79911c3
be5135e
730c218
35fbfb3
e3a54bb
e8ed64b
0dc3b31
8ccbdea
ab2696b
ab0374e
68b044a
d2a3e1e
c6de014
4578fa9
01e61bd
a20c1cb
dab5204
310068e
83daa43
dfaf653
5029065
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,266 @@ | ||
from collections.abc import Iterable | ||
|
||
import numpy as np | ||
|
||
from .extmath import stable_cumsum | ||
from .fixes import _take_along_axis | ||
|
||
|
||
def _weighted_percentile(array, sample_weight, percentile=50): | ||
"""Compute weighted percentile | ||
def _squeeze_arr(arr, n_dim): | ||
return arr[0] if n_dim == 1 else arr | ||
|
||
|
||
def _nearest_rank( | ||
array, | ||
sorted_idx, | ||
percentile, | ||
cum_weights, | ||
n_rows, | ||
n_cols, | ||
n_dim, | ||
): | ||
"""Compute weighted percentile using the nearest-rank approach. | ||
|
||
Percentile can be computed using the nearest-rank: | ||
https://en.wikipedia.org/wiki/Percentile | ||
|
||
n = ceil(P / 100 * N) | ||
|
||
where n is the index of the percentile value of the percentile P and N is | ||
the number of samples. | ||
|
||
This definition can be adapted with weights: | ||
|
||
n_w = ceil(P / 100 * S_w) where S_w is the sum of the weights. | ||
""" | ||
adjusted_percentile = percentile * cum_weights[-1] | ||
|
||
# For percentile=0, ignore leading observations with sample_weight=0. GH20528 | ||
mask = adjusted_percentile == 0 | ||
adjusted_percentile[mask] = np.nextafter( | ||
adjusted_percentile[mask], adjusted_percentile[mask] + 1 | ||
) | ||
|
||
percentile_idx = np.array( | ||
[ | ||
np.searchsorted(cum_weights[:, i], adjusted_percentile[i]) | ||
for i in range(cum_weights.shape[1]) | ||
] | ||
) | ||
percentile_idx = np.array(percentile_idx) | ||
# In rare cases, percentile_idx equals to sorted_idx.shape[0] | ||
max_idx = n_rows - 1 | ||
percentile_idx = np.apply_along_axis( | ||
lambda x: np.clip(x, 0, max_idx), axis=0, arr=percentile_idx | ||
) | ||
|
||
col_index = np.arange(n_cols) | ||
percentile_in_sorted = sorted_idx[percentile_idx, col_index] | ||
percentile_value = array[percentile_in_sorted, col_index] | ||
return _squeeze_arr(percentile_value, n_dim) | ||
|
||
|
||
def _interpolation_closest_ranks( | ||
array, | ||
sorted_idx, | ||
sample_weight, | ||
sorted_weights, | ||
percentile, | ||
cum_weights, | ||
interpolation, | ||
n_rows, | ||
n_cols, | ||
n_dim, | ||
): | ||
"""Compute the weighted percentile by interpolating between the adjacent | ||
ranks. | ||
|
||
Percentile can be computed with 3 different alternatives when using the | ||
interpolation between adjacent ranks: | ||
https://en.wikipedia.org/wiki/Percentile | ||
|
||
These 3 alternatives depend of the value of a parameter C. NumPy uses | ||
the variant where C=1 which allows to obtain a strictly monotonically | ||
increasing function which is defined as: | ||
|
||
P = (x - 1) / (N - 1), | ||
|
||
where x in [1, N] | ||
|
||
Weighted percentile change this formula by taking into account the | ||
weights instead of the data frequency. | ||
|
||
P_w_i = (x - w_i) / (S_w - w_i), | ||
|
||
where x in [1, N], w_i being the weight of sample i and S_w being the sum | ||
of the weights. | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand the desire to be consistent with numpy implementation of
'ignoring' the last weight also means that these 2 have the same result:
It ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The repeating issue will not be a big deal because we are not going to use the linear interpolation to keep this property. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Regarding the case >>> _weighted_percentile(np.array([1, 2, 3, 4]), sample_weight=np.array([1, 1, 1, 5]),
... percentile=50, interpolation="linear") the percentile requested is between 3 and 4. It will be computed using linear interpolation. In this case, it is weird because we would have the intuition that the 4 should be more attractive. But then it means that linear interpolation is probably that we don't want with weighted percentile but we would expect a more non-linear interpolation. |
||
|
||
def _fill_nan(array, value): | ||
nan_mask = np.isnan(array) | ||
array[nan_mask] = value | ||
return array | ||
|
||
adjusted_percentile = cum_weights - sorted_weights | ||
with np.errstate(invalid="ignore"): | ||
adjusted_percentile /= cum_weights[-1] - sorted_weights | ||
adjusted_percentile = _fill_nan(adjusted_percentile, 1) | ||
|
||
percentile_higher_idx = np.array( | ||
[ | ||
np.searchsorted( | ||
adjusted_percentile[:, col], | ||
percentile[col], | ||
side="left", | ||
) | ||
for col in range(n_cols) | ||
] | ||
) | ||
|
||
if interpolation != "higher": | ||
# other interpolations which requires the lower percentile indice | ||
percentile_lower_idx = np.array( | ||
[ | ||
idx - 1 if val < 1 else idx | ||
for idx, val in zip(percentile_higher_idx, percentile) | ||
] | ||
) | ||
|
||
if interpolation == "nearest": | ||
|
||
def _compute_abs_error(p_measured, p_desired, idx, col): | ||
return abs(p_measured[idx[col], col] - p_desired[col]) | ||
|
||
error_higher = np.array( | ||
[ | ||
_compute_abs_error( | ||
p_measured=adjusted_percentile, | ||
p_desired=percentile, | ||
idx=percentile_higher_idx, | ||
col=col, | ||
) | ||
for col in range(n_cols) | ||
] | ||
) | ||
error_lower = np.array( | ||
[ | ||
_compute_abs_error( | ||
p_measured=adjusted_percentile, | ||
p_desired=percentile, | ||
idx=percentile_lower_idx, | ||
col=col, | ||
) | ||
for col in range(n_cols) | ||
] | ||
) | ||
|
||
percentile_idx = np.where( | ||
error_higher >= error_lower, percentile_lower_idx, percentile_higher_idx | ||
) | ||
elif interpolation == "lower": | ||
percentile_idx = percentile_lower_idx | ||
else: | ||
percentile_idx = percentile_higher_idx | ||
|
||
if interpolation in ("weighted", "linear"): | ||
percentile_value_lower = array[ | ||
sorted_idx[percentile_lower_idx, np.arange(n_cols)], np.arange(n_cols) | ||
] | ||
percentile_value_higher = array[ | ||
sorted_idx[percentile_higher_idx, np.arange(n_cols)], np.arange(n_cols) | ||
] | ||
|
||
if interpolation == "weighted": | ||
weight_higher = sorted_weights[percentile_higher_idx, np.arange(n_cols)] | ||
weight_lower = sorted_weights[percentile_lower_idx, np.arange(n_cols)] | ||
weight_normalizer = weight_higher + weight_lower | ||
with np.errstate(invalid="ignore"): | ||
weight_higher = weight_higher / weight_normalizer | ||
weight_higher = _fill_nan(weight_higher, 0) | ||
weight_lower = weight_lower / weight_normalizer | ||
weight_lower = _fill_nan(weight_lower, 0) | ||
|
||
else: | ||
# linear interpolation does not use weights | ||
weight_lower = np.zeros(shape=(n_cols,)) | ||
weight_higher = np.zeros(shape=(n_cols,)) | ||
|
||
def _weighted_interpolation( | ||
x, | ||
x_low, | ||
x_high, | ||
y_low, | ||
y_high, | ||
w_low, | ||
w_high, | ||
): | ||
"""Weighted interpolation between 2 values. | ||
|
||
If the weights are null, then it is equivalent to a linear | ||
interpolation. | ||
""" | ||
with np.errstate(invalid="ignore"): | ||
percentile_value = y_low + ((1 - w_low) * (x - x_low)) * ( | ||
(y_high - y_low) | ||
/ ((1 - w_high) * (x_high - x) + (1 - w_low) * (x - x_low)) | ||
) | ||
if np.isnan(percentile_value): | ||
# case that percentile is 100 | ||
return y_high | ||
return percentile_value | ||
|
||
percentile_value = np.array( | ||
[ | ||
_weighted_interpolation( | ||
x=percentile[col], | ||
x_low=adjusted_percentile[percentile_lower_idx[col], col], | ||
x_high=adjusted_percentile[percentile_higher_idx[col], col], | ||
y_low=percentile_value_lower[col], | ||
y_high=percentile_value_higher[col], | ||
w_low=weight_lower[col], | ||
w_high=weight_higher[col], | ||
) | ||
for col in range(n_cols) | ||
] | ||
) | ||
else: | ||
# no need to interpolate | ||
percentile_idx = np.apply_along_axis( | ||
lambda x: np.clip(x, 0, n_rows - 1), axis=0, arr=percentile_idx | ||
) | ||
|
||
percentile_value = array[ | ||
sorted_idx[percentile_idx, np.arange(n_cols)], np.arange(n_cols) | ||
] | ||
|
||
percentile_value = _squeeze_arr(percentile_value, n_dim) | ||
|
||
single_sample_weight = np.count_nonzero(sample_weight, axis=0) | ||
if np.any(single_sample_weight == 1): | ||
# edge case where a single weight is non-null in which case the | ||
# previous methods will fail | ||
if not isinstance(percentile_value, Iterable): | ||
percentile_value = _squeeze_arr(array[np.nonzero(sample_weight)], n_dim) | ||
else: | ||
percentile_value = np.array( | ||
[ | ||
array[np.flatnonzero(sample_weight[:, col])[0], col] | ||
if n_nonzero == 1 | ||
else percentile_value[col] | ||
for col, n_nonzero in enumerate(single_sample_weight) | ||
] | ||
) | ||
|
||
return percentile_value | ||
|
||
|
||
def _weighted_percentile( | ||
array, | ||
sample_weight, | ||
percentile=50, | ||
interpolation=None, | ||
): | ||
"""Compute weighted percentile. | ||
|
||
Computes lower weighted percentile. If `array` is a 2D array, the | ||
`percentile` is computed along the axis 0. | ||
|
@@ -15,56 +270,84 @@ def _weighted_percentile(array, sample_weight, percentile=50): | |
|
||
Parameters | ||
---------- | ||
array : 1D or 2D array | ||
array : ndarray of shape (n,) or (n, m) | ||
Values to take the weighted percentile of. | ||
|
||
sample_weight: 1D or 2D array | ||
sample_weight: ndarray of shape (n,) or (n, m) | ||
Weights for each value in `array`. Must be same shape as `array` or | ||
of shape `(array.shape[0],)`. | ||
|
||
percentile: int or float, default=50 | ||
glemaitre marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Percentile to compute. Must be value between 0 and 100. | ||
|
||
interpolation : {"linear", "lower", "higher", "nearest"}, default=None | ||
The interpolation method to use when the percentile lies between | ||
data points `i` and `j`: | ||
|
||
* None: no interpolation will be done and the "nearest-rank" method | ||
will be used (default). | ||
* "linear": linearly interpolate between `i` and `j`; | ||
* "weighted": interpolate between `i` and `j` taking into account the | ||
weights assigned to `i` and `j`. | ||
* "lower": i`; | ||
* "higher": `j`; | ||
* "nearest": `i` or `j`, whichever is nearest. | ||
|
||
.. versionadded: 0.24 | ||
|
||
Returns | ||
------- | ||
percentile : int if `array` 1D, ndarray if `array` 2D | ||
percentile_value : float or int if `array` of shape (n,), otherwise \ | ||
ndarray of shape (m,) | ||
Weighted percentile. | ||
""" | ||
possible_interpolation = ("linear", "lower", "higher", "nearest", "weighted", None) | ||
if interpolation not in possible_interpolation: | ||
raise ValueError( | ||
"'interpolation' should be one of " | ||
f"{', '.join([str(val) for val in possible_interpolation])}. " | ||
f"Got '{interpolation}' instead." | ||
) | ||
|
||
if np.any(np.count_nonzero(sample_weight, axis=0) < 1): | ||
raise ValueError( | ||
"All weights cannot be null when computing a weighted percentile." | ||
) | ||
|
||
n_dim = array.ndim | ||
if n_dim == 0: | ||
return array[()] | ||
if array.ndim == 1: | ||
elif n_dim == 1: | ||
array = array.reshape((-1, 1)) | ||
# When sample_weight 1D, repeat for each array.shape[1] | ||
if array.shape != sample_weight.shape and array.shape[0] == sample_weight.shape[0]: | ||
sample_weight = np.tile(sample_weight, (array.shape[1], 1)).T | ||
|
||
n_rows, n_cols = array.shape | ||
sorted_idx = np.argsort(array, axis=0) | ||
sorted_weights = _take_along_axis(sample_weight, sorted_idx, axis=0) | ||
percentile = np.array([percentile / 100] * n_cols) | ||
cum_weights = stable_cumsum(sorted_weights, axis=0) | ||
|
||
# Find index of median prediction for each sample | ||
weight_cdf = stable_cumsum(sorted_weights, axis=0) | ||
adjusted_percentile = percentile / 100 * weight_cdf[-1] | ||
|
||
# For percentile=0, ignore leading observations with sample_weight=0. GH20528 | ||
mask = adjusted_percentile == 0 | ||
adjusted_percentile[mask] = np.nextafter( | ||
adjusted_percentile[mask], adjusted_percentile[mask] + 1 | ||
if interpolation is None: | ||
return _nearest_rank( | ||
array, | ||
sorted_idx, | ||
percentile, | ||
cum_weights, | ||
n_rows, | ||
n_cols, | ||
n_dim, | ||
) | ||
return _interpolation_closest_ranks( | ||
array, | ||
sorted_idx, | ||
sample_weight, | ||
sorted_weights, | ||
percentile, | ||
cum_weights, | ||
interpolation, | ||
n_rows, | ||
n_cols, | ||
n_dim, | ||
) | ||
|
||
percentile_idx = np.array( | ||
[ | ||
np.searchsorted(weight_cdf[:, i], adjusted_percentile[i]) | ||
for i in range(weight_cdf.shape[1]) | ||
] | ||
) | ||
percentile_idx = np.array(percentile_idx) | ||
# In rare cases, percentile_idx equals to sorted_idx.shape[0] | ||
max_idx = sorted_idx.shape[0] - 1 | ||
percentile_idx = np.apply_along_axis( | ||
lambda x: np.clip(x, 0, max_idx), axis=0, arr=percentile_idx | ||
) | ||
|
||
col_index = np.arange(array.shape[1]) | ||
percentile_in_sorted = sorted_idx[percentile_idx, col_index] | ||
percentile = array[percentile_in_sorted, col_index] | ||
return percentile[0] if n_dim == 1 else percentile |
Uh oh!
There was an error while loading. Please reload this page.