
[WIP] NEP-18 support for preprocessing algorithms #17744


Closed
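For context: NEP-18 is numpy's `__array_function__` protocol (numpy >= 1.17). It lets top-level numpy functions dispatch to non-numpy array types such as cupy arrays instead of coercing them to ndarray. A minimal sketch of the hook this PR keys on (the DuckArray class is illustrative, not part of the PR):

import numpy as np

class DuckArray:
    # Minimal NEP-18 participant: numpy functions that receive one of
    # these defer to this hook instead of coercing to np.ndarray.
    def __array_function__(self, func, types, args, kwargs):
        return "dispatched {}".format(func.__name__)

print(np.concatenate([DuckArray()]))  # dispatched concatenate (numpy >= 1.17)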
18 changes: 11 additions & 7 deletions sklearn/preprocessing/_data.py
@@ -23,6 +23,7 @@
 from ..utils import check_array
 from ..utils.extmath import row_norms
 from ..utils.extmath import _incremental_mean_and_var
+from ..utils.array_creation import empty_like, zeros_like
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale,
@@ -70,7 +71,7 @@ def _handle_zeros_in_scale(scale, copy=True):
         if scale == .0:
             scale = 1.
         return scale
-    elif isinstance(scale, np.ndarray):
+    elif hasattr(scale, "__array_function__"):
         if copy:
             # New array to avoid side-effects
             scale = scale.copy()
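Note on the guard above: since numpy 1.17, np.ndarray itself defines __array_function__, so the new hasattr test still matches plain numpy arrays while additionally admitting NEP-18 duck arrays; lists and scalars keep falling through to the other branches. A quick check:

import numpy as np

# np.ndarray defines __array_function__ from numpy 1.17 on, so the new
# guard accepts plain numpy arrays as well as duck arrays.
print(hasattr(np.ones(3), "__array_function__"))  # True on numpy >= 1.17
print(hasattr([1.0, 2.0], "__array_function__"))  # False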
@@ -739,7 +740,8 @@ def partial_fit(self, X, y=None):
             self.n_samples_seen_ += X.shape[0] - counts_nan
         else:
             if not hasattr(self, 'n_samples_seen_'):
-                self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)
+                self.n_samples_seen_ = zeros_like(X, shape=X.shape[1],
+                                                  dtype=np.int64)

             # First pass
             if not hasattr(self, 'scale_'):
@@ -752,7 +754,7 @@
             if not self.with_mean and not self.with_std:
                 self.mean_ = None
                 self.var_ = None
-                self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
+                self.n_samples_seen_ += (X == X).sum(axis=0)
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
                     _incremental_mean_and_var(X, self.mean_, self.var_,
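The (X == X) expression above is a non-missing mask in disguise: IEEE 754 NaN compares unequal to itself, so summing the mask per column counts observed samples, exactly what X.shape[0] - np.isnan(X).sum(axis=0) computed, but using only element-wise operators that duck arrays already support. A worked example in plain numpy:

import numpy as np

X = np.array([[1.0, np.nan],
              [2.0, 3.0],
              [4.0, np.nan]])
# NaN != NaN, so (X == X) is False exactly at the missing entries
print((X == X).sum(axis=0))                  # [3 1]
print(X.shape[0] - np.isnan(X).sum(axis=0))  # [3 1], the same counts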
@@ -761,7 +763,8 @@
         # for backward-compatibility, reduce n_samples_seen_ to an integer
         # if the number of samples is the same for each feature (i.e. no
         # missing values)
-        if np.ptp(self.n_samples_seen_) == 0:
+        ptp = self.n_samples_seen_.max() - self.n_samples_seen_.min()
+        if ptp == 0:
             self.n_samples_seen_ = self.n_samples_seen_[0]

         if self.with_std:
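Spelling peak-to-peak as max() - min() trades the np.ptp free function for methods, presumably because not every NEP-18 backend dispatches np.ptp, while essentially all array types provide .max() and .min(). The two are equivalent on numpy:

import numpy as np

n_seen = np.array([150, 150, 150, 150])
# method calls work on any array type; np.ptp would need NEP-18 dispatch
assert n_seen.max() - n_seen.min() == np.ptp(n_seen) == 0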
@@ -834,7 +837,8 @@ def inverse_transform(self, X, copy=None):
             if self.scale_ is not None:
                 inplace_column_scale(X, self.scale_)
         else:
-            X = np.asarray(X)
+            if not hasattr(X, "__array_function__"):
+                X = np.asarray(X)
             if copy:
                 X = X.copy()
             if self.with_std:
@@ -1582,8 +1586,8 @@ def transform(self, X):
                 columns.append(bias)
             XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
         else:
-            XP = np.empty((n_samples, self.n_output_features_),
-                          dtype=X.dtype, order=self.order)
+            XP = empty_like(X, order=self.order,
+                            shape=(n_samples, self.n_output_features_))

             # What follows is a faster implementation of:
             # for i, comb in enumerate(combinations):
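empty_like here is the helper this PR adds in sklearn/utils/array_creation.py (shown further down): given a prototype array it allocates a new array of the same type, inheriting the dtype unless one is passed explicitly, so a cupy input yields a cupy output. A small usage sketch with a numpy prototype:

import numpy as np
from sklearn.utils.array_creation import empty_like  # helper added by this PR

X = np.ones((5, 3), dtype=np.float32)
XP = empty_like(X, shape=(5, 9))  # dtype inherited from the prototype
assert XP.shape == (5, 9) and XP.dtype == np.float32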
5 changes: 3 additions & 2 deletions sklearn/preprocessing/_encoders.py
@@ -7,6 +7,7 @@

 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
+from ..utils.array_creation import zeros_like, ones_like
 from ..utils.validation import check_is_fitted

 from ._label import _encode, _encode_check_unknown
@@ -100,8 +101,8 @@ def _fit(self, X, handle_unknown='error'):
     def _transform(self, X, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)

-        X_int = np.zeros((n_samples, n_features), dtype=np.int)
-        X_mask = np.ones((n_samples, n_features), dtype=np.bool)
+        X_int = zeros_like(X, shape=(n_samples, n_features), dtype=np.int)
+        X_mask = ones_like(X, shape=(n_samples, n_features), dtype=np.bool)

         if n_features != len(self.categories_):
             raise ValueError(
62 changes: 62 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
@@ -2506,3 +2506,65 @@ def test_standard_scaler_sparse_partial_fit_finite_variance(X_2):
     scaler = StandardScaler(with_mean=False)
     scaler.fit(X_1).partial_fit(X_2)
     assert np.isfinite(scaler.var_[0])
+
+
+def test_minmax_scaler_cupy():
+    cp = pytest.importorskip("cupy")
+    X_np = iris.data
+    X_cp = cp.asarray(X_np)
+
+    scaler = MinMaxScaler(copy=True)
+    t_X_cp = scaler.fit_transform(X_cp)
+    assert type(t_X_cp) == type(X_cp)
Member: maybe also check that the min and max values of t_X_cp are 0 and 1.

Author: Done

+    assert t_X_cp.min() == 0
+    assert t_X_cp.max() == 1
+
+    r_X_cp = scaler.inverse_transform(t_X_cp)
+    assert type(r_X_cp) == type(t_X_cp)
+
+    r_X_cp = cp.asnumpy(r_X_cp)
+    assert_almost_equal(r_X_cp, X_np, decimal=3)
+
+    scaler = MinMaxScaler(copy=True)
+    t_X_np = scaler.fit_transform(X_np)
+
+    t_X_cp = cp.asnumpy(t_X_cp)
Member: Is this really needed? Doesn't assert_almost_equal (or better assert_allclose) work with cupy arrays directly?

Author: It doesn't seem to work.

+    assert_almost_equal(t_X_cp, t_X_np, decimal=3)
+
+
+def test_minmax_scale_cupy():
+    cp = pytest.importorskip("cupy")
+    X_np = iris.data
+    X_cp = cp.asarray(X_np)
+
+    t_X_cp = minmax_scale(X_cp)
+    assert type(t_X_cp) == type(X_cp)
+
+    t_X_np = minmax_scale(X_np)
+
+    t_X_cp = cp.asnumpy(t_X_cp)
+    assert_almost_equal(t_X_cp, t_X_np, decimal=3)
+
+
+@pytest.mark.parametrize("with_mean", [True, False])
+@pytest.mark.parametrize("with_std", [True, False])
+def test_standard_scaler_cupy(with_mean, with_std):
+    cp = pytest.importorskip("cupy")
+    X_np = iris.data
+    X_cp = cp.asarray(X_np)
+
+    scaler = StandardScaler(copy=True, with_mean=with_mean, with_std=with_std)
+    t_X_cp = scaler.fit_transform(X_cp)
+    assert type(t_X_cp) == type(X_cp)
+
+    r_X_cp = scaler.inverse_transform(t_X_cp)
+    assert type(r_X_cp) == type(t_X_cp)
+
+    r_X_cp = cp.asnumpy(r_X_cp)
+    assert_almost_equal(r_X_cp, X_np, decimal=3)
+
+    scaler = StandardScaler(copy=True, with_mean=with_mean, with_std=with_std)
+    t_X_np = scaler.fit_transform(X_np)
+
+    t_X_cp = cp.asnumpy(t_X_cp)
+    assert_almost_equal(t_X_cp, t_X_np, decimal=3)
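Note: all three tests skip themselves when cupy is not installed (pytest.importorskip), so the suite stays green on CPU-only machines; with a working cupy install they can be run selectively via `pytest sklearn/preprocessing/tests/test_data.py -k cupy`.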
49 changes: 49 additions & 0 deletions sklearn/utils/array_creation.py
@@ -0,0 +1,49 @@
"""Central place for array creation, to support non-numpy arrays.

This currently leverages NEP-18 via np.{empty|zeros|ones}_like to create
non-numpy arrays.
"""

from .fixes import np_version

import numpy as np


def create_like(create, create_like):
    """Generalization of np.(empty|zeros|ones)_like."""
    name = create.__name__

    def metafunction(prototype, dtype=None, order='C', subok=True, shape=None):
        if np_version < (1, 17):
            if shape is not None:
                if dtype is None:
                    if not hasattr(prototype, 'dtype'):
                        raise NotImplementedError(
                            'Passed prototype to {name}_like without a '
                            'dtype'.format(name=name))
                    dtype = prototype.dtype
                if order == 'A':
                    order = 'F' if prototype.flags['F_CONTIGUOUS'] else 'C'
                elif order == 'K':
                    raise NotImplementedError('order=K not implemented')
                return create(shape, dtype=dtype, order=order)
            else:
                return create_like(prototype, dtype=dtype, order=order,
                                   subok=subok)
        else:
            return create_like(prototype, dtype=dtype, order=order,
                               shape=shape)

    # Set the docstring dynamically; a string literal followed by .format()
    # in the function body would be a plain expression, not __doc__.
    metafunction.__doc__ = (
        """Forward the call to numpy.{name}_like or numpy.{name}, to be
        compatible with NEP-18.

        Before numpy 1.17, numpy.{name}_like did not take a shape argument.

        For numpy < 1.17, if shape is provided the call is forwarded to
        numpy.{name}; otherwise it is forwarded to numpy.{name}_like.
        """.format(name=name))
    return metafunction


empty_like = create_like(np.empty, np.empty_like)
zeros_like = create_like(np.zeros, np.zeros_like)
ones_like = create_like(np.ones, np.ones_like)
ogrisel (Member), Nov 4, 2020: Now that NEP-35 has been accepted, this could be simplified (on the dev version of numpy): numpy/numpy#16935

Member: We would still need a similar hack for backward compatibility (e.g. in sklearn/utils/fixes.py) to make it work for older versions of numpy, but we should definitely use the actual NEP-35 implementation on numpy versions that support it.
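For reference, NEP-35 surfaces the same dispatch directly on numpy's creation functions through a like= keyword (available from numpy 1.20), which is the simplification suggested above; a sketch of what it looks like:

import numpy as np

prototype = np.zeros(3, dtype=np.float32)
# NEP-35 (numpy >= 1.20): like= routes creation through the prototype's
# __array_function__, so a cupy prototype would yield a cupy array here
out = np.empty((4, 2), dtype=np.float32, like=prototype)
assert isinstance(out, np.ndarray) and out.shape == (4, 2)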

74 changes: 74 additions & 0 deletions sklearn/utils/tests/test_array_creation.py
@@ -0,0 +1,74 @@
import numpy as np
import pytest
from unittest.mock import MagicMock

from sklearn.utils.array_creation import empty_like
from sklearn.utils.array_creation import np_version
import sklearn.utils.array_creation


@pytest.mark.skipif(np_version < (1, 17),
                    reason="NEP18 not supported before 1.17")
def test_empty_like_nep18():
    class ArrayLike:
        __array_function__ = MagicMock(return_value=42)

    # if NEP18 is supported, empty_like should be forwarded to us
    array_like = ArrayLike()
    value = empty_like(array_like, dtype=np.float32, shape=(4, 2))
    assert value == 42


def test_empty_like():
    # Normally, arrays should just work with all versions of numpy
    X = np.arange(8)
    Y = empty_like(X.reshape((4, 2)))
    assert isinstance(Y, np.ndarray)
    assert Y.shape == (4, 2)


def test_empty_like_no_nep18():
    class NotAnArray:
        def __array__(self):
            return np.arange(8, dtype=np.float64).reshape((4, 2))

    try:
        # we trick this module into thinking it is working with an older
        # version to also test/cover this branch with newer versions of numpy
        real_np_version = sklearn.utils.array_creation.np_version
        sklearn.utils.array_creation.np_version = (1, 16)

        no_array = NotAnArray()
        empty_like(no_array, dtype=np.float32, shape=(4, 2))
        # for numpy < 1.17, an error should be raised if shape is provided
        # with a non-numpy array and no dtype
        with pytest.raises(NotImplementedError):
            empty_like(no_array, shape=(4, 2))

        # we can pass a non-ndarray object, but without shape
        no_array = NotAnArray()
        an_array = empty_like(no_array, dtype=np.float32)
        assert an_array.shape == (4, 2)
        assert an_array.dtype == np.float32

        # but with an ndarray, we can pass a shape
        second_array = empty_like(an_array, dtype=np.float64, shape=(3, 5))
        assert second_array.shape == (3, 5)
        assert second_array.dtype == np.float64

        # and the dtype is optional for ndarrays
        second_array_same_type = empty_like(an_array, shape=(3, 5))
        assert second_array_same_type.shape == (3, 5)
        assert second_array_same_type.dtype == np.float32

        c_like_array = empty_like(an_array.T, shape=(3, 5))
        assert c_like_array.flags['C_CONTIGUOUS']

        fortran_like_array = empty_like(an_array.T, order='A', shape=(3, 5))
        assert fortran_like_array.flags['F_CONTIGUOUS']

        # unlike numpy, we don't implement order='K'
        with pytest.raises(NotImplementedError):
            empty_like(an_array, order='K', shape=(4, 2))

    finally:
        sklearn.utils.array_creation.np_version = real_np_version
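As an aside, the try/finally dance above could also be written with pytest's monkeypatch fixture, which restores the patched attribute automatically; a sketch of the same idea (test name hypothetical):

import numpy as np
import pytest

import sklearn.utils.array_creation
from sklearn.utils.array_creation import empty_like


def test_empty_like_no_nep18_monkeypatched(monkeypatch):
    # monkeypatch undoes the setattr when the test finishes
    monkeypatch.setattr(sklearn.utils.array_creation, "np_version", (1, 16))
    with pytest.raises(NotImplementedError):
        empty_like(np.arange(4), order='K', shape=(4, 2))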
8 changes: 5 additions & 3 deletions sklearn/utils/validation.py
@@ -43,7 +43,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None):

     if _get_config()['assume_finite']:
         return
-    X = np.asanyarray(X)
+    if not hasattr(X, "__array_function__"):
+        X = np.asanyarray(X)
     # First try an O(n) time, O(1) space solution for the common case that
     # everything is finite; fall back to O(n) space np.isfinite to prevent
     # false positives from overflow in sum method. The sum is also calculated
@@ -530,7 +531,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
                                           msg_dtype=dtype)
                    array = array.astype(dtype, casting="unsafe", copy=False)
                else:
-                    array = np.asarray(array, order=order, dtype=dtype)
+                    if not hasattr(array, "__array_function__"):
+                        array = np.asarray(array, order=order, dtype=dtype)
            except ComplexWarning:
                raise ValueError("Complex data not supported\n"
                                 "{}\n".format(array))
@@ -596,7 +598,7 @@
                                    context))

    if copy and np.may_share_memory(array, array_orig):
-        array = np.array(array, dtype=dtype, order=order)
+        array = array.copy()

    return array

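Using the .copy() method instead of np.array(...) keeps the copy in the input's own array type, where np.array would coerce a duck array or ndarray subclass back to a plain ndarray; presumably this is safe because dtype and order were already enforced earlier in check_array. A small illustration with an ndarray subclass:

import numpy as np

class TaggedArray(np.ndarray):
    """Stand-in for a duck/subclass array type (illustrative only)."""

a = np.arange(6, dtype=np.float64).view(TaggedArray)
print(type(np.array(a)))  # <class 'numpy.ndarray'>: coerced
print(type(a.copy()))     # TaggedArray: the method preserves the type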