Prevent scalers from scaling near-constant features to very large values #19527

Merged
merged 6 commits on Feb 25, 2021

Changes from all commits

7 changes: 7 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -187,6 +187,13 @@ Changelog
positioning strategy ``knots``.
:pr:`18368` by :user:`Christian Lorentzen <lorentzenchr>`.

- |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler`
and similar scalers detect near-constant features to avoid scaling them to
very large values. This problem happens in particular when using a scaler on
sparse data with a constant column and sample weights, in which case
centering is typically disabled. :pr:`19527` by :user:`Olivier Grisel
<ogrisel>` and :user:`Maria Telenczuk <maikia>`.

:mod:`sklearn.tree`
...................

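The fix recorded in the changelog entry above can be sketched as follows. The shapes, the constant value and the random weights are illustrative choices, not taken from the pull request; sparse input is used because centering is disabled there, which is exactly the situation where rounding noise in the weighted variance used to blow the feature up.

import numpy as np
from scipy import sparse
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = sparse.csr_matrix(np.ones((100, 1)))       # constant non-zero feature
sample_weight = rng.uniform(size=100)

# with_mean=False because centering is not supported on sparse matrices.
scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)

print(scaler.var_)    # close to 0: the feature is (near) constant
print(scaler.scale_)  # expected [1.] with this fix: the feature is not scaled
print(scaler.transform(X).toarray()[:3])       # the column passes through unchanged
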
6 changes: 5 additions & 1 deletion sklearn/linear_model/_base.py
@@ -246,9 +246,13 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
X_var = X_var.astype(X.dtype, copy=False)

if normalize:
# Detect constant features on the computed variance, before taking
# the np.sqrt. Otherwise constant features cannot be detected with
# sample_weights.
constant_mask = X_var < 10 * np.finfo(X.dtype).eps
X_var *= X.shape[0]
X_scale = np.sqrt(X_var, out=X_var)
X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1.
X_scale[constant_mask] = 1.
if sp.issparse(X):
inplace_column_scale(X, 1. / X_scale)
else:
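The comment in the hunk above is the heart of the change: with sample weights, the computed variance of a constant column is a tiny positive number rather than exactly zero, and that number only falls below the `10 * eps` threshold on the variance scale, not after the square root. A minimal numeric sketch (the value 1e-20 is an assumed rounding error, not taken from the code):

import numpy as np

eps = np.finfo(np.float64).eps      # ~2.2e-16
computed_var = 1e-20                # tiny spurious variance of a constant column

# Checking on the variance, before the square root, flags the feature:
print(computed_var < 10 * eps)              # True

# After np.sqrt the value (1e-10) is orders of magnitude above the threshold,
# so the old check on X_scale missed it and the column was divided by a very
# small scale, producing huge values.
print(np.sqrt(computed_var) < 10 * eps)     # False
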
24 changes: 16 additions & 8 deletions sklearn/linear_model/tests/test_base.py
@@ -478,10 +478,8 @@ def test_preprocess_data_weighted(is_sparse):
# better check the impact of feature scaling.
X[:, 0] *= 10

# Constant non-zero feature: this edge-case is currently not handled
# correctly for sparse data, see:
# https://github.com/scikit-learn/scikit-learn/issues/19450
# X[:, 2] = 1.
# Constant non-zero feature.
X[:, 2] = 1.

# Constant zero feature (non-materialized in the sparse case)
X[:, 3] = 0.
@@ -495,10 +493,12 @@
X_sample_weight_var = np.average((X - X_sample_weight_avg)**2,
weights=sample_weight,
axis=0)
constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
assert_array_equal(constant_mask, [0, 0, 1, 1])
expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples)

# near constant features should not be scaled
expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1
expected_X_scale[constant_mask] = 1

if is_sparse:
X = sparse.csr_matrix(X)
@@ -538,14 +538,22 @@ def test_preprocess_data_weighted(is_sparse):
# _preprocess_data with normalize=True scales the data by the feature-wise
# euclidean norms while StandardScaler scales the data by the feature-wise
# standard deviations.
# The two are equivalent up to a ratio of np.sqrt(n_samples)
# The two are equivalent up to a ratio of np.sqrt(n_samples).
if is_sparse:
scaler = StandardScaler(with_mean=False).fit(
X, sample_weight=sample_weight)

# Non-constant features are scaled similarly with np.sqrt(n_samples)
assert_array_almost_equal(
scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray()
)
scaler.transform(X).toarray()[:, :2] / np.sqrt(n_samples),
Xt.toarray()[:, :2]
)

# Constant features go through un-scaled.
assert_array_almost_equal(
scaler.transform(X).toarray()[:, 2:],
Xt.toarray()[:, 2:]
)
else:
scaler = StandardScaler(with_mean=True).fit(
X, sample_weight=sample_weight)
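The `np.sqrt(n_samples)` ratio asserted in the test above follows from the fact that the Euclidean norm of a centered column equals its standard deviation times `sqrt(n)`. A quick unweighted sketch of that identity:

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=100)
n = x.shape[0]

# ||x - mean(x)|| = sqrt(sum((x - mean)**2)) = sqrt(n * var) = std * sqrt(n)
print(np.linalg.norm(x - x.mean()))
print(x.std() * np.sqrt(n))   # same value up to floating point error
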
34 changes: 26 additions & 8 deletions sklearn/preprocessing/_data.py
@@ -60,22 +60,36 @@
]


def _handle_zeros_in_scale(scale, copy=True):
"""Makes sure that whenever scale is zero, we handle it correctly.
def _handle_zeros_in_scale(scale, copy=True, constant_mask=None):
"""Set scales of near constant features to 1.

This happens in most scalers when we have constant features.
"""
The goal is to avoid division by very small or zero values.

Near constant features are detected automatically by identifying
scales close to machine precision unless they are precomputed by
the caller and passed with the `constant_mask` kwarg.

Typically for standard scaling, the scales are the standard
deviations, while near constant features are better detected on the
computed variances, which are closer to machine precision by
construction.
"""
# if we are fitting on 1D arrays, scale might be a scalar
if np.isscalar(scale):
if scale == .0:
scale = 1.
return scale
elif isinstance(scale, np.ndarray):
if constant_mask is None:
# Detect near constant values to avoid dividing by a very small
# value that could lead to surprising results and numerical
# stability issues.
constant_mask = scale < 10 * np.finfo(scale.dtype).eps

if copy:
# New array to avoid side-effects
scale = scale.copy()
scale[scale == 0.0] = 1.0
scale[constant_mask] = 1.0
return scale
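
A short sketch of how the helper above behaves with and without a precomputed mask; it imports the private function purely for illustration, and the module path is an implementation detail.

import numpy as np
from sklearn.preprocessing._data import _handle_zeros_in_scale

scale = np.array([0.0, 1e-17, 0.5, 2.0])

# Without constant_mask, anything below 10 * eps is treated as constant
# and replaced by 1 so that dividing by the scale is a no-op.
print(_handle_zeros_in_scale(scale, copy=True))
# [1.   1.   0.5  2. ]

# With a precomputed mask (e.g. derived from the raw variances), only the
# flagged entries are replaced, whatever their magnitude.
mask = np.array([True, False, False, False])
print(_handle_zeros_in_scale(scale, copy=True, constant_mask=mask))
# [1.e+00 1.e-17 5.e-01 2.e+00]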


@@ -408,7 +422,7 @@ def partial_fit(self, X, y=None):

data_range = data_max - data_min
self.scale_ = ((feature_range[1] - feature_range[0]) /
_handle_zeros_in_scale(data_range))
_handle_zeros_in_scale(data_range, copy=True))
self.min_ = feature_range[0] - data_min * self.scale_
self.data_min_ = data_min
self.data_max_ = data_max
@@ -850,7 +864,11 @@ def partial_fit(self, X, y=None, sample_weight=None):
self.n_samples_seen_ = self.n_samples_seen_[0]

if self.with_std:
self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
# Extract the list of near constant features on the raw variances,
# before taking the square root.
constant_mask = self.var_ < 10 * np.finfo(X.dtype).eps
self.scale_ = _handle_zeros_in_scale(
np.sqrt(self.var_), copy=False, constant_mask=constant_mask)
else:
self.scale_ = None

@@ -1078,7 +1096,7 @@ def partial_fit(self, X, y=None):
self.n_samples_seen_ += X.shape[0]

self.max_abs_ = max_abs
self.scale_ = _handle_zeros_in_scale(max_abs)
self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)
return self

def transform(self, X):
63 changes: 59 additions & 4 deletions sklearn/preprocessing/tests/test_data.py
@@ -414,6 +414,62 @@ def test_standard_scaler_dtype(add_sample_weight, sparse_constructor):
assert scaler.scale_.dtype == np.float64


@pytest.mark.parametrize("scaler", [
StandardScaler(with_mean=False),
RobustScaler(with_centering=False),
])
@pytest.mark.parametrize("sparse_constructor",
[np.asarray, sparse.csc_matrix, sparse.csr_matrix])
@pytest.mark.parametrize("add_sample_weight", [False, True])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("constant", [0, 1., 100.])
def test_standard_scaler_constant_features(
scaler, add_sample_weight, sparse_constructor, dtype, constant):
if (isinstance(scaler, StandardScaler)
and constant > 1
and sparse_constructor is not np.asarray
and add_sample_weight):
# https://github.com/scikit-learn/scikit-learn/issues/19546
pytest.xfail("Computation of weighted variance is numerically unstable"
" for sparse data. See: #19546.")

if isinstance(scaler, RobustScaler) and add_sample_weight:
pytest.skip(f"{scaler.__class__.__name__} does not yet support"
f" sample_weight")

rng = np.random.RandomState(0)
n_samples = 100
n_features = 1
if add_sample_weight:
fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2)
else:
fit_params = {}
X_array = np.full(shape=(n_samples, n_features), fill_value=constant,
dtype=dtype)
X = sparse_constructor(X_array)
X_scaled = scaler.fit(X, **fit_params).transform(X)

if isinstance(scaler, StandardScaler):
# The variance info should be close to zero for constant features.
assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7)

# Constant features should not be scaled (scale of 1.):
assert_allclose(scaler.scale_, np.ones(X.shape[1]))

if hasattr(X_scaled, "toarray"):
assert_allclose(X_scaled.toarray(), X_array)
else:
assert_allclose(X_scaled, X)

if isinstance(scaler, StandardScaler) and not add_sample_weight:
# Also check consistency with the standard scale function.
X_scaled_2 = scale(X, with_mean=scaler.with_mean)
if hasattr(X_scaled_2, "toarray"):
assert_allclose(X_scaled_2.toarray(), X_scaled.toarray())
else:
assert_allclose(X_scaled_2, X_scaled)


def test_scale_1d():
# 1-d inputs
X_list = [1., 3., 5., 0.]
@@ -538,12 +594,11 @@ def test_scaler_float16_overflow():


def test_handle_zeros_in_scale():
s1 = np.array([0, 1, 2, 3])
s1 = np.array([0, 1e-16, 1, 2, 3])
s2 = _handle_zeros_in_scale(s1, copy=True)

assert not s1[0] == s2[0]
assert_array_equal(s1, np.array([0, 1, 2, 3]))
assert_array_equal(s2, np.array([1, 1, 2, 3]))
assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3]))
assert_allclose(s2, np.array([1, 1, 1, 2, 3]))


def test_minmax_scaler_partial_fit():