
[WIP] NEP-18 support for preprocessing algorithms #17744


Closed
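For context: NEP-18 is numpy's `__array_function__` protocol (numpy >= 1.17). It lets top-level numpy functions dispatch to non-numpy array types such as cupy arrays instead of coercing them to ndarray. A minimal sketch of the hook this PR keys on (the DuckArray class is illustrative, not part of the PR):

import numpy as np

class DuckArray:
    # Minimal NEP-18 participant: numpy functions that receive one of
    # these defer to this hook instead of coercing to np.ndarray.
    def __array_function__(self, func, types, args, kwargs):
        return "dispatched {}".format(func.__name__)

print(np.concatenate([DuckArray()]))  # dispatched concatenate (numpy >= 1.17)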
18 changes: 11 additions & 7 deletions sklearn/preprocessing/_data.py
@@ -23,6 +23,7 @@
 from ..utils import check_array
 from ..utils.extmath import row_norms
 from ..utils.extmath import _incremental_mean_and_var
+from ..utils.array_creation import empty_like, zeros_like
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale,
@@ -70,7 +71,7 @@ def _handle_zeros_in_scale(scale, copy=True):
         if scale == .0:
             scale = 1.
         return scale
-    elif isinstance(scale, np.ndarray):
+    elif hasattr(scale, "__array_function__"):
         if copy:
             # New array to avoid side-effects
             scale = scale.copy()
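Note on the guard above: since numpy 1.17, np.ndarray itself defines __array_function__, so the new hasattr test still matches plain numpy arrays while additionally admitting NEP-18 duck arrays; lists and scalars keep falling through to the other branches. A quick check:

import numpy as np

# np.ndarray defines __array_function__ from numpy 1.17 on, so the new
# guard accepts plain numpy arrays as well as duck arrays.
print(hasattr(np.ones(3), "__array_function__"))  # True on numpy >= 1.17
print(hasattr([1.0, 2.0], "__array_function__"))  # False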
@@ -739,7 +740,8 @@ def partial_fit(self, X, y=None):
             self.n_samples_seen_ += X.shape[0] - counts_nan
         else:
             if not hasattr(self, 'n_samples_seen_'):
-                self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)
+                self.n_samples_seen_ = zeros_like(X, shape=X.shape[1],
+                                                  dtype=np.int64)

             # First pass
             if not hasattr(self, 'scale_'):
@@ -752,7 +754,7 @@
             if not self.with_mean and not self.with_std:
                 self.mean_ = None
                 self.var_ = None
-                self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
+                self.n_samples_seen_ += (X == X).sum(axis=0)
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
                     _incremental_mean_and_var(X, self.mean_, self.var_,
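The (X == X) expression above is a non-missing mask in disguise: IEEE 754 NaN compares unequal to itself, so summing the mask per column counts observed samples, exactly what X.shape[0] - np.isnan(X).sum(axis=0) computed, but using only element-wise operators that duck arrays already support. A worked example in plain numpy:

import numpy as np

X = np.array([[1.0, np.nan],
              [2.0, 3.0],
              [4.0, np.nan]])
# NaN != NaN, so (X == X) is False exactly at the missing entries
print((X == X).sum(axis=0))                  # [3 1]
print(X.shape[0] - np.isnan(X).sum(axis=0))  # [3 1], the same counts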
@@ -761,7 +763,8 @@
         # for backward-compatibility, reduce n_samples_seen_ to an integer
         # if the number of samples is the same for each feature (i.e. no
         # missing values)
-        if np.ptp(self.n_samples_seen_) == 0:
+        ptp = self.n_samples_seen_.max() - self.n_samples_seen_.min()
+        if ptp == 0:
             self.n_samples_seen_ = self.n_samples_seen_[0]

         if self.with_std:
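Spelling peak-to-peak as max() - min() trades the np.ptp free function for methods, presumably because not every NEP-18 backend dispatches np.ptp, while essentially all array types provide .max() and .min(). The two are equivalent on numpy:

import numpy as np

n_seen = np.array([150, 150, 150, 150])
# method calls work on any array type; np.ptp would need NEP-18 dispatch
assert n_seen.max() - n_seen.min() == np.ptp(n_seen) == 0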
@@ -834,7 +837,8 @@ def inverse_transform(self, X, copy=None):
             if self.scale_ is not None:
                 inplace_column_scale(X, self.scale_)
         else:
-            X = np.asarray(X)
+            if not hasattr(X, "__array_function__"):
+                X = np.asarray(X)
             if copy:
                 X = X.copy()
             if self.with_std:
@@ -1582,8 +1586,8 @@ def transform(self, X):
                 columns.append(bias)
             XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
         else:
-            XP = np.empty((n_samples, self.n_output_features_),
-                          dtype=X.dtype, order=self.order)
+            XP = empty_like(X, order=self.order,
+                            shape=(n_samples, self.n_output_features_))

             # What follows is a faster implementation of:
             # for i, comb in enumerate(combinations):
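empty_like here is the helper this PR adds in sklearn/utils/array_creation.py (shown further down): given a prototype array it allocates a new array of the same type, inheriting the dtype unless one is passed explicitly, so a cupy input yields a cupy output. A small usage sketch with a numpy prototype:

import numpy as np
from sklearn.utils.array_creation import empty_like  # helper added by this PR

X = np.ones((5, 3), dtype=np.float32)
XP = empty_like(X, shape=(5, 9))  # dtype inherited from the prototype
assert XP.shape == (5, 9) and XP.dtype == np.float32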
5 changes: 3 additions & 2 deletions sklearn/preprocessing/_encoders.py
@@ -7,6 +7,7 @@

 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array
+from ..utils.array_creation import zeros_like, ones_like
 from ..utils.validation import check_is_fitted

 from ._label import _encode, _encode_check_unknown
@@ -100,8 +101,8 @@ def _fit(self, X, handle_unknown='error'):
     def _transform(self, X, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)

-        X_int = np.zeros((n_samples, n_features), dtype=np.int)
-        X_mask = np.ones((n_samples, n_features), dtype=np.bool)
+        X_int = zeros_like(X, shape=(n_samples, n_features), dtype=np.int)
+        X_mask = ones_like(X, shape=(n_samples, n_features), dtype=np.bool)

         if n_features != len(self.categories_):
             raise ValueError(
62 changes: 62 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
@@ -2506,3 +2506,65 @@ def test_standard_scaler_sparse_partial_fit_finite_variance(X_2):
     scaler = StandardScaler(with_mean=False)
     scaler.fit(X_1).partial_fit(X_2)
     assert np.isfinite(scaler.var_[0])
+
+
+def test_minmax_scaler_cupy():
+    cp = pytest.importorskip("cupy")
+    X_np = iris.data
+    X_cp = cp.asarray(X_np)
+
+    scaler = MinMaxScaler(copy=True)
+    t_X_cp = scaler.fit_transform(X_cp)
+    assert type(t_X_cp) == type(X_cp)
Member: maybe also check that the min and max values of t_X_cp are 0 and 1.

Author: Done

+    assert t_X_cp.min() == 0
+    assert t_X_cp.max() == 1
+
+    r_X_cp = scaler.inverse_transform(t_X_cp)
+    assert type(r_X_cp) == type(t_X_cp)
+
+    r_X_cp = cp.asnumpy(r_X_cp)
+    assert_almost_equal(r_X_cp, X_np, decimal=3)
+
+    scaler = MinMaxScaler(copy=True)
+    t_X_np = scaler.fit_transform(X_np)
+
+    t_X_cp = cp.asnumpy(t_X_cp)
Member: Is this really needed? Doesn't assert_almost_equal (or better assert_allclose) work with cupy arrays directly?

Author: It doesn't seem to work.

+    assert_almost_equal(t_X_cp, t_X_np, decimal=3)
+
+
+def test_minmax_scale_cupy():
+    cp = pytest.importorskip("cupy")
+    X_np = iris.data
+    X_cp = cp.asarray(X_np)
+
+    t_X_cp = minmax_scale(X_cp)
+    assert type(t_X_cp) == type(X_cp)
+
+    t_X_np = minmax_scale(X_np)
+
+    t_X_cp = cp.asnumpy(t_X_cp)
+    assert_almost_equal(t_X_cp, t_X_np, decimal=3)
+
+
+@pytest.mark.parametrize("with_mean", [True, False])
+@pytest.mark.parametrize("with_std", [True, False])
+def test_standard_scaler_cupy(with_mean, with_std):
+    cp = pytest.importorskip("cupy")
+    X_np = iris.data
+    X_cp = cp.asarray(X_np)
+
+    scaler = StandardScaler(copy=True, with_mean=with_mean, with_std=with_std)
+    t_X_cp = scaler.fit_transform(X_cp)
+    assert type(t_X_cp) == type(X_cp)
+
+    r_X_cp = scaler.inverse_transform(t_X_cp)
+    assert type(r_X_cp) == type(t_X_cp)
+
+    r_X_cp = cp.asnumpy(r_X_cp)
+    assert_almost_equal(r_X_cp, X_np, decimal=3)
+
+    scaler = StandardScaler(copy=True, with_mean=with_mean, with_std=with_std)
+    t_X_np = scaler.fit_transform(X_np)
+
+    t_X_cp = cp.asnumpy(t_X_cp)
+    assert_almost_equal(t_X_cp, t_X_np, decimal=3)
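Note: all three tests skip themselves when cupy is not installed (pytest.importorskip), so the suite stays green on CPU-only machines; with a working cupy install they can be run selectively via `pytest sklearn/preprocessing/tests/test_data.py -k cupy`.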
49 changes: 49 additions & 0 deletions sklearn/utils/array_creation.py
@@ -0,0 +1,49 @@
"""Central place for array creation, to support non-numpy arrays.

This currently leverages NEP-18 via np.{empty|zeros|ones}_like to create
non-numpy arrays.
"""

from .fixes import np_version

import numpy as np


def create_like(create, create_like):
    """Generalization of np.(empty|zeros|ones)_like."""
    name = create.__name__

    def metafunction(prototype, dtype=None, order='C', subok=True, shape=None):
        if np_version < (1, 17):
            if shape is not None:
                if dtype is None:
                    if not hasattr(prototype, 'dtype'):
                        raise NotImplementedError(
                            'Passed prototype to {name}_like without a '
                            'dtype'.format(name=name))
                    dtype = prototype.dtype
                if order == 'A':
                    order = 'F' if prototype.flags['F_CONTIGUOUS'] else 'C'
                elif order == 'K':
                    raise NotImplementedError('order=K not implemented')
                return create(shape, dtype=dtype, order=order)
            else:
                return create_like(prototype, dtype=dtype, order=order,
                                   subok=subok)
        else:
            return create_like(prototype, dtype=dtype, order=order,
                               shape=shape)

    # Set the docstring dynamically; a string literal followed by .format()
    # in the function body would be a plain expression, not __doc__.
    metafunction.__doc__ = (
        """Forward the call to numpy.{name}_like or numpy.{name}, to be
        compatible with NEP-18.

        Before numpy 1.17, numpy.{name}_like did not take a shape argument.

        For numpy < 1.17, if shape is provided the call is forwarded to
        numpy.{name}; otherwise it is forwarded to numpy.{name}_like.
        """.format(name=name))
    return metafunction


empty_like = create_like(np.empty, np.empty_like)
zeros_like = create_like(np.zeros, np.zeros_like)
ones_like = create_like(np.ones, np.ones_like)
ogrisel (Member), Nov 4, 2020: Now that NEP-35 has been accepted, this could be simplified (on the dev version of numpy): numpy/numpy#16935

Member: We would still need a similar hack for backward compatibility (e.g. in sklearn/utils/fixes.py) to make it work for older versions of numpy, but we should definitely use the actual NEP-35 implementation on numpy versions that support it.
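For reference, NEP-35 surfaces the same dispatch directly on numpy's creation functions through a like= keyword (available from numpy 1.20), which is the simplification suggested above; a sketch of what it looks like:

import numpy as np

prototype = np.zeros(3, dtype=np.float32)
# NEP-35 (numpy >= 1.20): like= routes creation through the prototype's
# __array_function__, so a cupy prototype would yield a cupy array here
out = np.empty((4, 2), dtype=np.float32, like=prototype)
assert isinstance(out, np.ndarray) and out.shape == (4, 2)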

74 changes: 74 additions & 0 deletions sklearn/utils/tests/test_array_creation.py
@@ -0,0 +1,74 @@
import numpy as np
import pytest
from unittest.mock import MagicMock

from sklearn.utils.array_creation import empty_like
from sklearn.utils.array_creation import np_version
import sklearn.utils.array_creation


@pytest.mark.skipif(np_version < (1, 17),
                    reason="NEP18 not supported before 1.17")
def test_empty_like_nep18():
    class ArrayLike:
        __array_function__ = MagicMock(return_value=42)

    # if NEP18 is supported, empty_like should be forwarded to us
    array_like = ArrayLike()
    value = empty_like(array_like, dtype=np.float32, shape=(4, 2))
    assert value == 42


def test_empty_like():
    # Normally, arrays should just work with all versions of numpy
    X = np.arange(8)
    Y = empty_like(X.reshape((4, 2)))
    assert isinstance(Y, np.ndarray)
    assert Y.shape == (4, 2)


def test_empty_like_no_nep18():
    class NotAnArray:
        def __array__(self):
            return np.arange(8, dtype=np.float64).reshape((4, 2))

    try:
        # we trick this module into thinking it is working with an older
        # version to also test/cover this branch with newer versions of numpy
        real_np_version = sklearn.utils.array_creation.np_version
        sklearn.utils.array_creation.np_version = (1, 16)

        no_array = NotAnArray()
        empty_like(no_array, dtype=np.float32, shape=(4, 2))
        # for numpy < 1.17, an error should be raised if shape is provided
        # with a non-numpy array and no dtype
        with pytest.raises(NotImplementedError):
            empty_like(no_array, shape=(4, 2))

        # we can pass a non-ndarray object, but without shape
        no_array = NotAnArray()
        an_array = empty_like(no_array, dtype=np.float32)
        assert an_array.shape == (4, 2)
        assert an_array.dtype == np.float32

        # but with an ndarray, we can pass a shape
        second_array = empty_like(an_array, dtype=np.float64, shape=(3, 5))
        assert second_array.shape == (3, 5)
        assert second_array.dtype == np.float64

        # and the dtype is optional for ndarrays
        second_array_same_type = empty_like(an_array, shape=(3, 5))
        assert second_array_same_type.shape == (3, 5)
        assert second_array_same_type.dtype == np.float32

        c_like_array = empty_like(an_array.T, shape=(3, 5))
        assert c_like_array.flags['C_CONTIGUOUS']

        fortran_like_array = empty_like(an_array.T, order='A', shape=(3, 5))
        assert fortran_like_array.flags['F_CONTIGUOUS']

        # unlike numpy, we don't implement order='K'
        with pytest.raises(NotImplementedError):
            empty_like(an_array, order='K', shape=(4, 2))

    finally:
        sklearn.utils.array_creation.np_version = real_np_version
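As an aside, the try/finally dance above could also be written with pytest's monkeypatch fixture, which restores the patched attribute automatically; a sketch of the same idea (test name hypothetical):

import numpy as np
import pytest

import sklearn.utils.array_creation
from sklearn.utils.array_creation import empty_like


def test_empty_like_no_nep18_monkeypatched(monkeypatch):
    # monkeypatch undoes the setattr when the test finishes
    monkeypatch.setattr(sklearn.utils.array_creation, "np_version", (1, 16))
    with pytest.raises(NotImplementedError):
        empty_like(np.arange(4), order='K', shape=(4, 2))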
8 changes: 5 additions & 3 deletions sklearn/utils/validation.py
@@ -43,7 +43,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None):

     if _get_config()['assume_finite']:
         return
-    X = np.asanyarray(X)
+    if not hasattr(X, "__array_function__"):
+        X = np.asanyarray(X)
     # First try an O(n) time, O(1) space solution for the common case that
     # everything is finite; fall back to O(n) space np.isfinite to prevent
     # false positives from overflow in sum method. The sum is also calculated
@@ -530,7 +531,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
                                           msg_dtype=dtype)
                    array = array.astype(dtype, casting="unsafe", copy=False)
                else:
-                    array = np.asarray(array, order=order, dtype=dtype)
+                    if not hasattr(array, "__array_function__"):
+                        array = np.asarray(array, order=order, dtype=dtype)
            except ComplexWarning:
                raise ValueError("Complex data not supported\n"
                                 "{}\n".format(array))
@@ -596,7 +598,7 @@
                                    context))

    if copy and np.may_share_memory(array, array_orig):
-        array = np.array(array, dtype=dtype, order=order)
+        array = array.copy()

    return array

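Using the .copy() method instead of np.array(...) keeps the copy in the input's own array type, where np.array would coerce a duck array or ndarray subclass back to a plain ndarray; presumably this is safe because dtype and order were already enforced earlier in check_array. A small illustration with an ndarray subclass:

import numpy as np

class TaggedArray(np.ndarray):
    """Stand-in for a duck/subclass array type (illustrative only)."""

a = np.arange(6, dtype=np.float64).view(TaggedArray)
print(type(np.array(a)))  # <class 'numpy.ndarray'>: coerced
print(type(a.copy()))     # TaggedArray: the method preserves the type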