Skip to content

[MRG] Ignore and pass-through NaN values in MaxAbsScaler and maxabs_scale #11011

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 23, 2018
5 changes: 5 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,11 @@ Preprocessing
ignore and pass-through NaN values.
:issue:`11206` by :user:`Guillaume Lemaitre <glemaitre>`.

- :class:`preprocessing.MaxAbsScaler` and :func:`preprocessing.maxabs_scale`
handle and ignore NaN values.
:issue:`11011` by :user:`Lucija Gregov <LucijaGregov>` and
:user:`Guillaume Lemaitre <glemaitre>`.

- :class:`preprocessing.PowerTransformer` and
:func:`preprocessing.power_transform` ignore and pass-through NaN values.
:issue:`11306` by :user:`Guillaume Lemaitre <glemaitre>`.
Expand Down
29 changes: 19 additions & 10 deletions sklearn/preprocessing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,13 +890,14 @@ def partial_fit(self, X, y=None):
Ignored
"""
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
estimator=self, dtype=FLOAT_DTYPES)
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')

if sparse.issparse(X):
mins, maxs = min_max_axis(X, axis=0)
mins, maxs = min_max_axis(X, axis=0, ignore_nan=True)
max_abs = np.maximum(np.abs(mins), np.abs(maxs))
else:
max_abs = np.abs(X).max(axis=0)
max_abs = np.nanmax(np.abs(X), axis=0)

# First pass
if not hasattr(self, 'n_samples_seen_'):
Expand All @@ -920,7 +921,8 @@ def transform(self, X):
"""
check_is_fitted(self, 'scale_')
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
estimator=self, dtype=FLOAT_DTYPES)
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')

if sparse.issparse(X):
inplace_column_scale(X, 1.0 / self.scale_)
Expand All @@ -938,7 +940,8 @@ def inverse_transform(self, X):
"""
check_is_fitted(self, 'scale_')
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
estimator=self, dtype=FLOAT_DTYPES)
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')

if sparse.issparse(X):
inplace_column_scale(X, self.scale_)
Expand Down Expand Up @@ -987,7 +990,8 @@ def maxabs_scale(X, axis=0, copy=True):

# If copy is required, it will be done inside the scaler object.
X = check_array(X, accept_sparse=('csr', 'csc'), copy=False,
ensure_2d=False, dtype=FLOAT_DTYPES)
ensure_2d=False, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')
original_ndim = X.ndim

if original_ndim == 1:
Expand Down Expand Up @@ -2110,7 +2114,8 @@ def _transform_col(self, X_col, quantiles, inverse):
lower_bound_y = quantiles[0]
upper_bound_y = quantiles[-1]
# for inverse transform, match a uniform PDF
X_col = output_distribution.cdf(X_col)
with np.errstate(invalid='ignore'): # hide NaN comparison warnings
X_col = output_distribution.cdf(X_col)
# find index for lower and higher bounds
with np.errstate(invalid='ignore'): # hide NaN comparison warnings
lower_bounds_idx = (X_col - BOUNDS_THRESHOLD <
Expand Down Expand Up @@ -2563,9 +2568,13 @@ def _check_input(self, X, check_positive=False, check_shape=False,
X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy,
force_all_finite='allow-nan')

if check_positive and self.method == 'box-cox' and np.nanmin(X) <= 0:
raise ValueError("The Box-Cox transformation can only be applied "
"to strictly positive data")
with np.warnings.catch_warnings():
np.warnings.filterwarnings(
'ignore', r'All-NaN (slice|axis) encountered')
if (check_positive and self.method == 'box-cox' and
np.nanmin(X) <= 0):
raise ValueError("The Box-Cox transformation can only be "
"applied to strictly positive data")

if check_shape and not X.shape[1] == len(self.lambdas_):
raise ValueError("Input data has a different number of features "
Expand Down
40 changes: 29 additions & 11 deletions sklearn/preprocessing/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@

from sklearn.base import clone

from sklearn.preprocessing import maxabs_scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import scale
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import quantile_transform

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
Expand All @@ -31,7 +33,8 @@ def _get_valid_samples_by_column(X, col):

@pytest.mark.parametrize(
"est, func, support_sparse, strictly_positive",
[(MinMaxScaler(), minmax_scale, False, False),
[(MaxAbsScaler(), maxabs_scale, True, False),
(MinMaxScaler(), minmax_scale, False, False),
(StandardScaler(), scale, False, False),
(StandardScaler(with_mean=False), scale, True, False),
(PowerTransformer(), power_transform, False, True),
Expand All @@ -53,12 +56,17 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive):
assert np.any(np.isnan(X_test), axis=0).all()
X_test[:, 0] = np.nan # make sure this boundary case is tested

Xt = est.fit(X_train).transform(X_test)
with pytest.warns(None) as records:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use sklearn.utils.testing.assert_no_warnings?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to stick to pytest while awaiting this feature: pytest-dev/pytest#1830
The second point is that I find the following more readable:

with pytest.warns(None):
    X_t = est.whatever(X)

than

X_t = assert_no_warnings(est, whatever, X)

@ogrisel, are you also in favor of assert_no_warnings? If yes, it's 2 vs 1 and I will make the change :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No ok, this is fine as it is.

Xt = est.fit(X_train).transform(X_test)
# ensure no warnings are raised
assert len(records) == 0
# missing values should still be missing, and only them
assert_array_equal(np.isnan(Xt), np.isnan(X_test))

# check that the function leads to the same results as the class
Xt_class = est.transform(X_train)
with pytest.warns(None) as records:
Xt_class = est.transform(X_train)
assert len(records) == 0
Xt_func = func(X_train, **est.get_params())
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
Expand All @@ -74,7 +82,9 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive):
# train only on non-NaN
est.fit(_get_valid_samples_by_column(X_train, i))
# check transforming with NaN works even when training without NaN
Xt_col = est.transform(X_test[:, [i]])
with pytest.warns(None) as records:
Xt_col = est.transform(X_test[:, [i]])
assert len(records) == 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new test_common.py assertion breaks with the PowerTransform that complains with all-nan columns.

I am not sure if we should raise this warning or not (maybe not at transform time) but this should be consistent across all the transformers.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made a push. The reason was on the np.nanmin(X) to check that the matrix was strictly positive. This case will return a full NaN matrix as well so everything will be fine (or at least the problem is forwarded to the next step in the pipeline).

assert_allclose(Xt_col, Xt[:, [i]])
# check non-NaN is handled as before - the 1st column is all nan
if not np.isnan(X_test[:, i]).all():
Expand All @@ -87,15 +97,23 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive):
est_dense = clone(est)
est_sparse = clone(est)

Xt_dense = est_dense.fit(X_train).transform(X_test)
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
with pytest.warns(None) as records:
Xt_dense = est_dense.fit(X_train).transform(X_test)
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
assert len(records) == 0
for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
sparse.bsr_matrix, sparse.coo_matrix,
sparse.dia_matrix, sparse.dok_matrix,
sparse.lil_matrix):
# check that the dense and sparse inputs lead to the same results
Xt_sparse = (est_sparse.fit(sparse_constructor(X_train))
.transform(sparse_constructor(X_test)))
assert_allclose(Xt_sparse.A, Xt_dense)
Xt_inv_sparse = est_sparse.inverse_transform(Xt_sparse)
assert_allclose(Xt_inv_sparse.A, Xt_inv_dense)
# precompute the matrix to avoid catching side warnings
X_train_sp = sparse_constructor(X_train)
X_test_sp = sparse_constructor(X_test)
with pytest.warns(None) as records:
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
assert len(records) == 0
assert_allclose(Xt_sp.A, Xt_dense)
with pytest.warns(None) as records:
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
assert len(records) == 0
assert_allclose(Xt_inv_sp.A, Xt_inv_dense)
4 changes: 2 additions & 2 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@
'RandomForestRegressor', 'Ridge', 'RidgeCV']

ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer',
'MinMaxScaler', 'StandardScaler', 'PowerTransformer',
'QuantileTransformer']
'MaxAbsScaler', 'MinMaxScaler', 'StandardScaler',
'PowerTransformer', 'QuantileTransformer']


def _yield_non_meta_checks(name, estimator):
Expand Down
2 changes: 1 addition & 1 deletion sklearn/utils/extmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
last_unnormalized_variance = last_variance * last_sample_count

with np.errstate(divide='ignore'):
with np.errstate(divide='ignore', invalid='ignore'):
last_over_new_count = last_sample_count / new_sample_count
updated_unnormalized_variance = (
last_unnormalized_variance + new_unnormalized_variance +
Expand Down