scikit-learn · anhqngo · Jun 9, 2020 · Jun 11, 2020 · Jun 12, 2020 · Jun 12, 2020
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -567,6 +567,15 @@ Changelog
   with different endianness.
   :pr:`17644` by :user:`Qi Zhang <qzhang90>`.
 
+:mod:`sklearn.utils`
+.........................
+
+- |Enhancement| Add ``check_methods_sample_order_invariance`` to
+  :func:`~utils.estimator_checks.check_estimator`, which checks that
+  estimator methods are invariant if applied to the same dataset
+  with a different sample order.
+  :pr:`17598` by :user:`Jason Ngo <ngojason9>`.
+
+
 Code and Documentation Contributors
 -----------------------------------
 

diff --git a/sklearn/dummy.py b/sklearn/dummy.py
@@ -351,6 +351,8 @@ def _more_tags(self):
             'poor_score': True, 'no_validation': True,
             '_xfail_checks': {
                 'check_methods_subset_invariance':
+                'fails for the predict method',
+                'check_methods_sample_order_invariance':
                 'fails for the predict method'
             }
         }

diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py
@@ -370,6 +370,14 @@ def fit_transform(self, X, y=None):
         """
         return self.fit(X).transform(X)
 
+    def _more_tags(self):
+        """Return the extra estimator tags declared by this transformer.
+
+        The only entry is ``_xfail_checks``, which lists
+        ``check_methods_sample_order_invariance`` together with the reason
+        it is expected to fail.
+        """
+        expected_failures = {
+            'check_methods_sample_order_invariance':
+            'check is not applicable.'
+        }
+        return {'_xfail_checks': expected_failures}
+
 
 class RadiusNeighborsTransformer(RadiusNeighborsMixin,
                                  TransformerMixin,
@@ -543,3 +551,11 @@ def fit_transform(self, X, y=None):
             The matrix is of CSR format.
         """
         return self.fit(X).transform(X)
+
+    def _more_tags(self):
+        """Return the extra estimator tags declared by this transformer.
+
+        The only entry is ``_xfail_checks``, which lists
+        ``check_methods_sample_order_invariance`` together with the reason
+        it is expected to fail.
+        """
+        expected_failures = {
+            'check_methods_sample_order_invariance':
+            'check is not applicable.'
+        }
+        return {'_xfail_checks': expected_failures}
diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py
@@ -382,6 +382,8 @@ def _more_tags(self):
         return {
             '_xfail_checks': {
                 'check_methods_subset_invariance':
-                'fails for the decision_function method'
+                'fails for the decision_function method',
+                'check_methods_sample_order_invariance':
+                'fails for the score_samples method',
             }
         }
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -270,6 +270,7 @@ def _yield_all_checks(estimator):
             yield check
     yield check_parameters_default_constructible
     yield check_fit2d_predict1d
+    yield check_methods_sample_order_invariance
     yield check_methods_subset_invariance
     yield check_fit2d_1sample
     yield check_fit2d_1feature
@@ -1168,6 +1169,41 @@ def check_methods_subset_invariance(name, estimator_orig, strict_mode=True):
                             atol=1e-7, err_msg=msg)
 
 
+@ignore_warnings(category=FutureWarning)
+def check_methods_sample_order_invariance(name, estimator_orig,
+                                          strict_mode=True):
+    """Check that estimator methods do not depend on the sample order.
+
+    A clone of the estimator is fitted on a small random dataset. Each of
+    ``predict``, ``transform``, ``decision_function``, ``score_samples``
+    and ``predict_proba`` that the estimator exposes is then called on the
+    data in its original order and on a row permutation of it; permuting
+    the first result must give the second one.
+
+    Parameters
+    ----------
+    name : str
+        Estimator name, only used in the error message.
+    estimator_orig : estimator instance
+        Estimator to check. It is cloned before fitting.
+    strict_mode : bool, default=True
+        Not used by this check.
+
+    Raises
+    ------
+    AssertionError
+        If a method's output changes with the order of the samples.
+    """
+    # check that method gives invariant results if applied
+    # on the same dataset with a different sample order
+    rnd = np.random.RandomState(0)
+    X = 3 * rnd.uniform(size=(20, 3))
+    X = _pairwise_estimator_convert_X(X, estimator_orig)
+    # builtin int: the ``np.int`` alias is deprecated since NumPy 1.20
+    y = X[:, 0].astype(int)
+    if estimator_orig._get_tags()['binary_only']:
+        # merge class 2 into class 1 so that only two labels remain
+        y[y == 2] = 1
+    estimator = clone(estimator_orig)
+    y = _enforce_estimator_tags_y(estimator, y)
+
+    if hasattr(estimator, "n_components"):
+        estimator.n_components = 1
+    if hasattr(estimator, "n_clusters"):
+        estimator.n_clusters = 2
+
+    set_random_state(estimator, 1)
+    estimator.fit(X, y)
+
+    # draw the permutation from the seeded local generator so that the
+    # check is reproducible and independent of the global NumPy RNG state
+    idx = rnd.permutation(X.shape[0])
+
+    for method in ["predict", "transform", "decision_function",
+                   "score_samples", "predict_proba"]:
+        # NOTE: the two literals join without a space ("datasetwith"); the
+        # regex in test_check_estimator matches this exact text, so both
+        # places have to be corrected together.
+        msg = ("{method} of {name} is not invariant when applied to a dataset"
+               "with different sample order.").format(method=method, name=name)
+
+        if hasattr(estimator, method):
+            assert_allclose_dense_sparse(getattr(estimator, method)(X)[idx],
+                                         getattr(estimator, method)(X[idx]),
+                                         atol=1e-9,
+                                         err_msg=msg)
+
+
 @ignore_warnings
 def check_fit2d_1sample(name, estimator_orig, strict_mode=True):
     # Check that fitting a 2d array with only one sample either works or

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
@@ -270,6 +270,27 @@ def predict(self, X):
         return np.zeros(X.shape[0])
 
 
+class NotInvariantSampleOrder(BaseEstimator):
+    """Estimator whose ``predict`` deliberately depends on the sample order.
+
+    ``predict`` returns zeros when it is given the training data with its
+    rows in a different order, and the first feature column otherwise.
+    """
+
+    def fit(self, X, y):
+        """Validate ``X`` and ``y`` and remember the validated ``X``."""
+        X, y = self._validate_data(
+            X, y, accept_sparse=("csr", "csc"), multi_output=True,
+            y_numeric=True)
+        # keep the training data so that predict can spot a reordered copy
+        self._X = X
+        return self
+
+    def predict(self, X):
+        """Return zeros for reordered training data, else ``X[:, 0]``."""
+        X = check_array(X)
+        # same values once every column is sorted, i.e. same content
+        same_content = np.array_equiv(np.sort(X, axis=0),
+                                      np.sort(self._X, axis=0))
+        # ``and`` short-circuits: the elementwise comparison only runs when
+        # the sorted contents already match
+        if same_content and (X != self._X).any():
+            return np.zeros(X.shape[0])
+        return X[:, 0]
+
+
 class LargeSparseNotSupportedClassifier(BaseEstimator):
     def fit(self, X, y):
         X, y = self._validate_data(
@@ -455,6 +476,13 @@ def test_check_estimator():
            ' with _ but wrong_attribute added')
     assert_raises_regex(AssertionError, msg,
                         check_estimator, SetsWrongAttribute())
+    # check for sample order invariance
+    name = NotInvariantSampleOrder.__name__
+    method = 'predict'
+    msg = ("{method} of {name} is not invariant when applied to a dataset"
+           "with different sample order.").format(method=method, name=name)
+    assert_raises_regex(AssertionError, msg,
+                        check_estimator, NotInvariantSampleOrder())
     # check for invariant method
     name = NotInvariantPredict.__name__
     method = 'predict'