-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
[MRG] Ignore and pass-through NaN values in MaxAbsScaler and maxabs_scale #11011
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
fb35442
c76ce20
4fb338f
fbd4049
7909057
49e23e4
1750375
2497932
d9eaf46
9668eb3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,11 +8,13 @@ | |
|
||
from sklearn.base import clone | ||
|
||
from sklearn.preprocessing import maxabs_scale | ||
from sklearn.preprocessing import minmax_scale | ||
from sklearn.preprocessing import scale | ||
from sklearn.preprocessing import power_transform | ||
from sklearn.preprocessing import quantile_transform | ||
|
||
from sklearn.preprocessing import MaxAbsScaler | ||
from sklearn.preprocessing import MinMaxScaler | ||
from sklearn.preprocessing import StandardScaler | ||
from sklearn.preprocessing import PowerTransformer | ||
|
@@ -31,7 +33,8 @@ def _get_valid_samples_by_column(X, col): | |
|
||
@pytest.mark.parametrize( | ||
"est, func, support_sparse, strictly_positive", | ||
[(MinMaxScaler(), minmax_scale, False, False), | ||
[(MaxAbsScaler(), maxabs_scale, True, False), | ||
(MinMaxScaler(), minmax_scale, False, False), | ||
(StandardScaler(), scale, False, False), | ||
(StandardScaler(with_mean=False), scale, True, False), | ||
(PowerTransformer(), power_transform, False, True), | ||
|
@@ -53,12 +56,17 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive): | |
assert np.any(np.isnan(X_test), axis=0).all() | ||
X_test[:, 0] = np.nan # make sure this boundary case is tested | ||
|
||
Xt = est.fit(X_train).transform(X_test) | ||
with pytest.warns(None) as records: | ||
Xt = est.fit(X_train).transform(X_test) | ||
# ensure no warnings are raised | ||
assert len(records) == 0 | ||
# missing values should still be missing, and only them | ||
assert_array_equal(np.isnan(Xt), np.isnan(X_test)) | ||
|
||
# check that the function leads to the same results as the class | ||
Xt_class = est.transform(X_train) | ||
with pytest.warns(None) as records: | ||
Xt_class = est.transform(X_train) | ||
assert len(records) == 0 | ||
Xt_func = func(X_train, **est.get_params()) | ||
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class)) | ||
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)]) | ||
|
@@ -74,7 +82,9 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive): | |
# train only on non-NaN | ||
est.fit(_get_valid_samples_by_column(X_train, i)) | ||
# check transforming with NaN works even when training without NaN | ||
Xt_col = est.transform(X_test[:, [i]]) | ||
with pytest.warns(None) as records: | ||
Xt_col = est.transform(X_test[:, [i]]) | ||
assert len(records) == 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This new I am not sure if we should raise this warning or not (maybe not at transform time) but this should be consistent across all the transformers. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I made a push. The reason was on the |
||
assert_allclose(Xt_col, Xt[:, [i]]) | ||
# check non-NaN is handled as before - the 1st column is all nan | ||
if not np.isnan(X_test[:, i]).all(): | ||
|
@@ -87,15 +97,23 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive): | |
est_dense = clone(est) | ||
est_sparse = clone(est) | ||
|
||
Xt_dense = est_dense.fit(X_train).transform(X_test) | ||
Xt_inv_dense = est_dense.inverse_transform(Xt_dense) | ||
with pytest.warns(None) as records: | ||
Xt_dense = est_dense.fit(X_train).transform(X_test) | ||
Xt_inv_dense = est_dense.inverse_transform(Xt_dense) | ||
assert len(records) == 0 | ||
for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix, | ||
sparse.bsr_matrix, sparse.coo_matrix, | ||
sparse.dia_matrix, sparse.dok_matrix, | ||
sparse.lil_matrix): | ||
# check that the dense and sparse inputs lead to the same results | ||
Xt_sparse = (est_sparse.fit(sparse_constructor(X_train)) | ||
.transform(sparse_constructor(X_test))) | ||
assert_allclose(Xt_sparse.A, Xt_dense) | ||
Xt_inv_sparse = est_sparse.inverse_transform(Xt_sparse) | ||
assert_allclose(Xt_inv_sparse.A, Xt_inv_dense) | ||
# precompute the matrix to avoid catching side warnings | ||
X_train_sp = sparse_constructor(X_train) | ||
X_test_sp = sparse_constructor(X_test) | ||
with pytest.warns(None) as records: | ||
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp) | ||
assert len(records) == 0 | ||
assert_allclose(Xt_sp.A, Xt_dense) | ||
with pytest.warns(None) as records: | ||
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) | ||
assert len(records) == 0 | ||
assert_allclose(Xt_inv_sp.A, Xt_inv_dense) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not using
sklearn.utils.testing.assert_no_warnings
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted to stick to pytest awaiting for this feature: pytest-dev/pytest#1830
The second point is that I find more readable
than
@ogrisel are you also in favor of
assert_no_warnings
. If yes, 2 vs 1 and I will make the change :)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No ok, this is fine as it is.