scikit-learn · NicolasHug · Oct 24, 2019 · Aug 20, 2019 · Aug 20, 2019 · Aug 20, 2019
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -687,6 +687,12 @@ Changelog
 Miscellaneous
 .............
 
+- |API| Scikit-learn now converts any duck-array input (an object that
+  implements ``__array__``) to a NumPy array, to ensure consistent behavior
+  instead of relying on ``__array_function__`` (see `NEP 18
+  <https://numpy.org/neps/nep-0018-array-function-protocol.html>`_).
+  :pr:`14702` by `Andreas Müller`_.
+
 - |API| Replace manual checks with ``check_is_fitted``. Errors thrown when
   using a non-fitted estimators are now more uniform.
   :pr:`13013` by :user:`Agamemnon Krasoulis <agamemnonc>`.
@@ -709,6 +715,12 @@ These changes mostly affect library developers.
   Such classifiers need to have the `binary_only=True` estimator tag.
   :pr:`13875` by `Trevor Stephens`_.
 
+- Estimators are expected to convert input data (``X``, ``y``,
+  ``sample_weight``) to :class:`numpy.ndarray` and never call
+  ``__array_function__`` on the original datatype that is passed (see `NEP 18
+  <https://numpy.org/neps/nep-0018-array-function-protocol.html>`_).
+  :pr:`14702` by `Andreas Müller`_.
+
 - `requires_positive_X` estimator tag (for models that require
   X to be non-negative) is now used by :meth:`utils.estimator_checks.check_estimator`
   to make sure a proper error message is raised if X contains some negative entries.

diff --git a/sklearn/cross_decomposition/_pls_.py b/sklearn/cross_decomposition/_pls_.py
@@ -881,6 +881,7 @@ def transform(self, X, Y=None):
         Xr = (X - self.x_mean_) / self.x_std_
         x_scores = np.dot(Xr, self.x_weights_)
         if Y is not None:
+            Y = check_array(Y, ensure_2d=False, dtype=np.float64)
             if Y.ndim == 1:
                 Y = Y.reshape(-1, 1)
             Yr = (Y - self.y_mean_) / self.y_std_

diff --git a/sklearn/dummy.py b/sklearn/dummy.py
@@ -133,6 +133,7 @@ def fit(self, X, y, sample_weight=None):
         self.sparse_output_ = sp.issparse(y)
 
         if not self.sparse_output_:
+            y = np.asarray(y)
             y = np.atleast_1d(y)
 
         if y.ndim == 1:
@@ -470,6 +471,8 @@ def fit(self, X, y, sample_weight=None):
         self.n_outputs_ = y.shape[1]
 
         check_consistent_length(X, y, sample_weight)
+        if sample_weight is not None:
+            sample_weight = np.asarray(sample_weight)
 
         if self.strategy == "mean":
             self.constant_ = np.average(y, axis=0, weights=sample_weight)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
@@ -120,6 +120,8 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
 
     if isinstance(sample_weight, numbers.Number):
         sample_weight = None
+    if sample_weight is not None:
+        sample_weight = np.asarray(sample_weight)
 
     if check_input:
         X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
@@ -181,8 +183,10 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
 def _rescale_data(X, y, sample_weight):
     """Rescale data so as to support sample_weight"""
     n_samples = X.shape[0]
-    sample_weight = np.full(n_samples, sample_weight,
-                            dtype=np.array(sample_weight).dtype)
+    sample_weight = np.array(sample_weight)
+    if sample_weight.ndim == 0:
+        sample_weight = np.full(n_samples, sample_weight,
+                                dtype=sample_weight.dtype)
     sample_weight = np.sqrt(sample_weight)
     sw_matrix = sparse.dia_matrix((sample_weight, 0),
                                   shape=(n_samples, n_samples))
@@ -467,7 +471,7 @@ def fit(self, X, y, sample_weight=None):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                          y_numeric=True, multi_output=True)
 
-        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
+        if sample_weight is not None and np.asarray(sample_weight).ndim > 1:
             raise ValueError("Sample weights must be 1D array or scalar")
 
         X, y, X_offset, y_offset, X_scale = self._preprocess_data(

diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
@@ -567,7 +567,7 @@ def fit(self, X, y, sample_weight=None):
             solver = self.solver
 
         if ((sample_weight is not None) and
-                np.atleast_1d(sample_weight).ndim > 1):
+                np.asarray(sample_weight).ndim > 1):
             raise ValueError("Sample weights must be 1D array or scalar")
 
         # when X is sparse we only remove offset from y

diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -643,6 +643,10 @@ def fit(self, X, y=None, groups=None, **fit_params):
             refit_metric = 'score'
 
         X, y, groups = indexable(X, y, groups)
+        # make sure fit_params are sliceable
+        fit_params_values = indexable(*fit_params.values())
+        fit_params = dict(zip(fit_params.keys(), fit_params_values))
+
         n_splits = cv.get_n_splits(X, y, groups)
 
         base_estimator = clone(self.estimator)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -209,10 +209,11 @@ def check_hyperparameter_searcher_with_fit_params(klass, **klass_kwargs):
     assert_raise_message(AssertionError,
                          "Expected fit parameter(s) ['eggs'] not seen.",
                          searcher.fit, X, y, spam=np.ones(10))
-    assert_raise_message(AssertionError,
-                         "Fit parameter spam has length 1; expected",
-                         searcher.fit, X, y, spam=np.ones(1),
-                         eggs=np.zeros(10))
+    assert_raise_message(
+        ValueError,
+        "Found input variables with inconsistent numbers of samples: [",
+        searcher.fit, X, y, spam=np.ones(1),
+        eggs=np.zeros(10))
     searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10))
 
 

diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
@@ -31,7 +31,7 @@
 from .utils.extmath import safe_sparse_dot
 from .utils.fixes import logsumexp
 from .utils.multiclass import _check_partial_fit_first_call
-from .utils.validation import check_is_fitted, check_non_negative
+from .utils.validation import check_is_fitted, check_non_negative, column_or_1d
 
 __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB',
            'CategoricalNB']
@@ -204,6 +204,7 @@ def fit(self, X, y, sample_weight=None):
         -------
         self : object
         """
+        y = column_or_1d(y, warn=True)
         return self._partial_fit(X, y, np.unique(y), _refit=True,
                                  sample_weight=sample_weight)
 
@@ -622,6 +623,7 @@ def fit(self, X, y, sample_weight=None):
         # this means we also don't have to cast X to floating point
         if sample_weight is not None:
             Y = Y.astype(np.float64, copy=False)
+            sample_weight = np.asarray(sample_weight)
             sample_weight = np.atleast_2d(sample_weight)
             Y *= check_array(sample_weight).T
 

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -80,6 +80,7 @@ def _yield_checks(name, estimator):
     yield check_estimators_dtypes
     yield check_fit_score_takes_y
     yield check_sample_weights_pandas_series
+    yield check_sample_weights_not_an_array
     yield check_sample_weights_list
     yield check_sample_weights_invariance
     yield check_estimators_fit_returns_self
@@ -538,11 +539,17 @@ class _NotAnArray:
     """
 
     def __init__(self, data):
-        self.data = data
+        self.data = np.asarray(data)
 
     def __array__(self, dtype=None):
         return self.data
 
+    def __array_function__(self, func, types, args, kwargs):
+        if func.__name__ == "may_share_memory":  # the only allowed function
+            return True
+        raise TypeError("Don't want to call array_function {}!".format(
+            func.__name__))
+
 
 @deprecated("NotAnArray is deprecated in version "
             "0.22 and will be removed in version 0.24.")
@@ -719,6 +726,23 @@ def check_sample_weights_pandas_series(name, estimator_orig):
                            "input of type pandas.Series to class weight.")
 
 
+@ignore_warnings(category=(DeprecationWarning, FutureWarning))
+def check_sample_weights_not_an_array(name, estimator_orig):
+    # check that 'fit' accepts X, y and 'sample_weight' all given as
+    # _NotAnArray; skipped if 'fit' has no 'sample_weight' parameter.
+    estimator = clone(estimator_orig)
+    if has_fit_parameter(estimator, "sample_weight"):
+        X = np.array([[1, 1], [1, 2], [1, 3], [1, 4],
+                      [2, 1], [2, 2], [2, 3], [2, 4],
+                      [3, 1], [3, 2], [3, 3], [3, 4]])
+        X = _NotAnArray(pairwise_estimator_convert_X(X, estimator_orig))
+        y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
+        weights = _NotAnArray([1] * 12)
+        if _safe_tags(estimator, "multioutput_only"):
+            y = _NotAnArray(y.data.reshape(-1, 1))  # make y 2D: one column
+        estimator.fit(X, y, sample_weight=weights)
+
+
 @ignore_warnings(category=(DeprecationWarning, FutureWarning))
 def check_sample_weights_list(name, estimator_orig):
     # check that estimators will accept a 'sample_weight' parameter of
@@ -1162,8 +1186,10 @@ def _check_transformer(name, transformer_orig, X, y):
     # fit
 
     if name in CROSS_DECOMPOSITION:
-        y_ = np.c_[y, y]
+        y_ = np.c_[np.asarray(y), np.asarray(y)]
         y_[::2, 1] *= 2
+        if isinstance(X, _NotAnArray):
+            y_ = _NotAnArray(y_)
     else:
         y_ = y
 

diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py
@@ -353,6 +353,8 @@ def class_distribution(y, sample_weight=None):
     class_prior = []
 
     n_samples, n_outputs = y.shape
+    if sample_weight is not None:
+        sample_weight = np.asarray(sample_weight)
 
     if issparse(y):
         y = y.tocsc()
@@ -362,7 +364,7 @@ def class_distribution(y, sample_weight=None):
             col_nonzero = y.indices[y.indptr[k]:y.indptr[k + 1]]
             # separate sample weights for zero and non-zero elements
             if sample_weight is not None:
-                nz_samp_weight = np.asarray(sample_weight)[col_nonzero]
+                nz_samp_weight = sample_weight[col_nonzero]
                 zeros_samp_weight_sum = (np.sum(sample_weight) -
                                          np.sum(nz_samp_weight))
             else:

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
@@ -13,7 +13,7 @@
                                    ignore_warnings,
                                    assert_warns, assert_raises,
                                    SkipTest)
-from sklearn.utils.estimator_checks import check_estimator
+from sklearn.utils.estimator_checks import check_estimator, _NotAnArray
 from sklearn.utils.estimator_checks \
     import check_class_weight_balanced_linear_classifier
 from sklearn.utils.estimator_checks import set_random_state
@@ -23,6 +23,7 @@
 from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
 from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.estimator_checks import check_outlier_corruption
+from sklearn.utils.fixes import _parse_version
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression, SGDClassifier
 from sklearn.mixture import GaussianMixture
@@ -305,6 +306,17 @@ def _more_tags(self):
         return {"requires_positive_y": True}
 
 
+def test_not_an_array_array_function():
+    np_version = _parse_version(np.__version__)
+    if np_version < (1, 17):
+        raise SkipTest("array_function protocol not supported in numpy <1.17")
+    not_array = _NotAnArray(np.ones(10))
+    msg = "Don't want to call array_function sum!"
+    assert_raises_regex(TypeError, msg, np.sum, not_array)
+    # may_share_memory is special-cased by _NotAnArray and returns True
+    assert np.may_share_memory(not_array, None)
+
+
 def test_check_fit_score_takes_y_works_on_deprecated_fit():
     # Tests that check_fit_score_takes_y works on a class with
     # a deprecated fit method

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -765,6 +765,7 @@ def column_or_1d(y, warn=False):
     y : array
 
     """
+    y = np.asarray(y)
     shape = np.shape(y)
     if len(shape) == 1:
         return np.ravel(y)