Skip to content

Commit 25aeaf3

Browse files
glevv and jeremiedbb authored
ENH Add clip parameter to MaxAbsScaler (#31790)
Co-authored-by: Jérémie du Boisberranger <jeremie@probabl.ai>
1 parent 5833812 commit 25aeaf3

File tree

4 files changed

+66
-4
lines changed

4 files changed

+66
-4
lines changed
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
- :class:`preprocessing.MaxAbsScaler` can now clip out-of-range values in held-out data
2+
with the parameter `clip`.
3+
By :user:`Hleb Levitski <glevv>`.

sklearn/preprocessing/_data.py

Lines changed: 40 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -329,7 +329,16 @@ class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
329329
330330
clip : bool, default=False
331331
Set to True to clip transformed values of held-out data to
332-
provided `feature range`.
332+
provided `feature_range`.
333+
Since this parameter will clip values, `inverse_transform` may not
334+
be able to restore the original data.
335+
336+
.. note::
337+
Setting `clip=True` does not prevent feature drift (a distribution
338+
shift between training and test data). The transformed values are clipped
339+
to the `feature_range`, which helps avoid unintended behavior in models
340+
sensitive to out-of-range inputs (e.g. linear models). Use with care,
341+
as clipping can distort the distribution of test data.
333342
334343
.. versionadded:: 0.24
335344
@@ -1172,6 +1181,18 @@ class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
11721181
Set to False to perform inplace scaling and avoid a copy (if the input
11731182
is already a numpy array).
11741183
1184+
clip : bool, default=False
1185+
Set to True to clip transformed values of held-out data to [-1, 1].
1186+
Since this parameter will clip values, `inverse_transform` may not
1187+
be able to restore the original data.
1188+
1189+
.. note::
1190+
Setting `clip=True` does not prevent feature drift (a distribution
1191+
shift between training and test data). The transformed values are clipped
1192+
to the [-1, 1] range, which helps avoid unintended behavior in models
1193+
sensitive to out-of-range inputs (e.g. linear models). Use with care,
1194+
as clipping can distort the distribution of test data.
1195+
11751196
Attributes
11761197
----------
11771198
scale_ : ndarray of shape (n_features,)
@@ -1222,10 +1243,14 @@ class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
12221243
[ 0. , 1. , -0.5]])
12231244
"""
12241245

1225-
_parameter_constraints: dict = {"copy": ["boolean"]}
1246+
_parameter_constraints: dict = {
1247+
"copy": ["boolean"],
1248+
"clip": ["boolean"],
1249+
}
12261250

1227-
def __init__(self, *, copy=True):
1251+
def __init__(self, *, copy=True, clip=False):
12281252
self.copy = copy
1253+
self.clip = clip
12291254

12301255
def _reset(self):
12311256
"""Reset internal data-dependent state of the scaler, if necessary.
@@ -1340,8 +1365,20 @@ def transform(self, X):
13401365

13411366
if sparse.issparse(X):
13421367
inplace_column_scale(X, 1.0 / self.scale_)
1368+
if self.clip:
1369+
np.clip(X.data, -1.0, 1.0, out=X.data)
13431370
else:
13441371
X /= self.scale_
1372+
if self.clip:
1373+
device_ = device(X)
1374+
X = _modify_in_place_if_numpy(
1375+
xp,
1376+
xp.clip,
1377+
X,
1378+
xp.asarray(-1.0, dtype=X.dtype, device=device_),
1379+
xp.asarray(1.0, dtype=X.dtype, device=device_),
1380+
out=X,
1381+
)
13451382
return X
13461383

13471384
def inverse_transform(self, X):

sklearn/preprocessing/tests/test_common.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ def _get_valid_samples_by_column(X, col):
4242
@pytest.mark.parametrize(
4343
"est, func, support_sparse, strictly_positive, omit_kwargs",
4444
[
45-
(MaxAbsScaler(), maxabs_scale, True, False, []),
45+
(MaxAbsScaler(), maxabs_scale, True, False, ["clip"]),
4646
(MinMaxScaler(), minmax_scale, False, False, ["clip"]),
4747
(StandardScaler(), scale, False, False, []),
4848
(StandardScaler(with_mean=False), scale, True, False, []),

sklearn/preprocessing/tests/test_data.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -707,6 +707,7 @@ def test_standard_check_array_of_inverse_transform():
707707
"estimator",
708708
[
709709
MaxAbsScaler(),
710+
MaxAbsScaler(clip=True),
710711
MinMaxScaler(),
711712
MinMaxScaler(clip=True),
712713
KernelCenterer(),
@@ -2517,6 +2518,8 @@ def test_minmax_scaler_clip(feature_range):
25172518
# test behaviour of the parameter 'clip' in MinMaxScaler
25182519
X = iris.data
25192520
scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X)
2521+
# create a test sample with features outside the training feature range:
2522+
# first 2 features < min(X) and last 2 features > max(X)
25202523
X_min, X_max = np.min(X, axis=0), np.max(X, axis=0)
25212524
X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]]
25222525
X_transformed = scaler.transform(X_test)
@@ -2526,6 +2529,25 @@ def test_minmax_scaler_clip(feature_range):
25262529
)
25272530

25282531

2532+
@pytest.mark.parametrize(
2533+
"data_constructor", [np.array] + CSC_CONTAINERS + CSR_CONTAINERS
2534+
)
2535+
def test_maxabs_scaler_clip(data_constructor):
2536+
# test behaviour of the parameter 'clip' in MaxAbsScaler
2537+
X = data_constructor(iris.data)
2538+
is_sparse = sparse.issparse(X)
2539+
scaler = MaxAbsScaler(clip=True).fit(X)
2540+
# create a test sample with features outside the training max abs range:
2541+
# first 2 features > max(abs(X)) and last 2 features < -max(abs(X))
2542+
max_abs = np.max(np.abs(X), axis=0)
2543+
max_abs = max_abs.data if is_sparse else max_abs
2544+
X_test = data_constructor(
2545+
np.hstack((max_abs[:2] + 10, -max_abs[2:] - 10)).reshape(1, -1)
2546+
)
2547+
X_transformed = scaler.transform(X_test)
2548+
assert_allclose_dense_sparse(X_transformed, data_constructor([[1, 1, -1, -1]]))
2549+
2550+
25292551
def test_standard_scaler_raise_error_for_1d_input():
25302552
"""Check that `inverse_transform` from `StandardScaler` raises an error
25312553
with 1D array.

0 commit comments

Comments
 (0)