Skip to content

TST Add sample order invariance to estimator_checks #17598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
9 changes: 9 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,15 @@ Changelog
with different endianness.
:pr:`17644` by :user:`Qi Zhang <qzhang90>`.

:mod:`sklearn.utils`
.........................

- |Enhancement| Add ``check_methods_sample_order_invariance`` to
:func:`~utils.estimator_checks.check_estimator`, which checks that
estimator methods give consistent results when the same dataset is
passed with its samples in a different order.
:pr:`17598` by :user:`Jason Ngo <ngojason9>`.


Code and Documentation Contributors
-----------------------------------

Expand Down
2 changes: 2 additions & 0 deletions sklearn/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,8 @@ def _more_tags(self):
'poor_score': True, 'no_validation': True,
'_xfail_checks': {
'check_methods_subset_invariance':
'fails for the predict method',
'check_methods_sample_order_invariance':
'fails for the predict method'
}
}
Expand Down
16 changes: 16 additions & 0 deletions sklearn/neighbors/_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,14 @@ def fit_transform(self, X, y=None):
"""
return self.fit(X).transform(X)

def _more_tags(self):
    """Return the extra estimator tags declared by this class.

    The only tag set here is ``'_xfail_checks'``, which lists
    ``check_methods_sample_order_invariance`` together with the reason
    string attached to it.
    """
    xfail_reasons = {
        'check_methods_sample_order_invariance': 'check is not applicable.',
    }
    return {'_xfail_checks': xfail_reasons}


class RadiusNeighborsTransformer(RadiusNeighborsMixin,
TransformerMixin,
Expand Down Expand Up @@ -543,3 +551,11 @@ def fit_transform(self, X, y=None):
The matrix is of CSR format.
"""
return self.fit(X).transform(X)

def _more_tags(self):
    """Return the extra estimator tags declared by this class.

    The only tag set here is ``'_xfail_checks'``, which lists
    ``check_methods_sample_order_invariance`` together with the reason
    string attached to it.
    """
    xfail_reasons = {
        'check_methods_sample_order_invariance': 'check is not applicable.',
    }
    return {'_xfail_checks': xfail_reasons}
4 changes: 3 additions & 1 deletion sklearn/neural_network/_rbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,8 @@ def _more_tags(self):
return {
'_xfail_checks': {
'check_methods_subset_invariance':
'fails for the decision_function method'
'fails for the decision_function method',
'check_methods_sample_order_invariance':
'fails for the score_samples method',
}
}
36 changes: 36 additions & 0 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def _yield_all_checks(estimator):
yield check
yield check_parameters_default_constructible
yield check_fit2d_predict1d
yield check_methods_sample_order_invariance
yield check_methods_subset_invariance
yield check_fit2d_1sample
yield check_fit2d_1feature
Expand Down Expand Up @@ -1168,6 +1169,41 @@ def check_methods_subset_invariance(name, estimator_orig, strict_mode=True):
atol=1e-7, err_msg=msg)


@ignore_warnings(category=FutureWarning)
def check_methods_sample_order_invariance(name, estimator_orig,
                                          strict_mode=True):
    """Check that estimator methods are invariant to the sample order.

    A clone of ``estimator_orig`` is fitted on a small random dataset.
    Then, for each of ``predict``, ``transform``, ``decision_function``,
    ``score_samples`` and ``predict_proba`` that the fitted estimator
    exposes, the output computed on ``X`` and reordered afterwards is
    compared with the output computed directly on the reordered ``X``.

    Parameters
    ----------
    name : str
        Estimator name; only used to build the assertion message.
    estimator_orig : estimator instance
        Estimator to check. ``fit`` is called on a clone of it.
    strict_mode : bool, default=True
        Not used by this check.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    # Use the builtin ``int``: the ``np.int`` alias is deprecated since
    # NumPy 1.20 and was removed in NumPy 1.24.
    y = X[:, 0].astype(int)
    if estimator_orig._get_tags()['binary_only']:
        y[y == 2] = 1
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 2

    set_random_state(estimator, 1)
    estimator.fit(X, y)

    # Draw the permutation from the seeded generator (not the global
    # ``np.random`` state) so that the check is reproducible across runs.
    idx = rnd.permutation(X.shape[0])

    for method in ["predict", "transform", "decision_function",
                   "score_samples", "predict_proba"]:
        if not hasattr(estimator, method):
            continue

        # NOTE(review): the two string fragments are joined without a
        # space ("datasetwith"). The text is kept verbatim because
        # test_check_estimator builds the same message to match on it.
        msg = ("{method} of {name} is not invariant when applied to a dataset"
               "with different sample order.").format(method=method, name=name)

        method_func = getattr(estimator, method)
        assert_allclose_dense_sparse(method_func(X)[idx],
                                     method_func(X[idx]),
                                     atol=1e-9,
                                     err_msg=msg)


@ignore_warnings
def check_fit2d_1sample(name, estimator_orig, strict_mode=True):
# Check that fitting a 2d array with only one sample either works or
Expand Down
28 changes: 28 additions & 0 deletions sklearn/utils/tests/test_estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,27 @@ def predict(self, X):
return np.zeros(X.shape[0])


class NotInvariantSampleOrder(BaseEstimator):
    """Mock estimator whose ``predict`` depends on the sample order.

    ``fit`` remembers the validated training data. ``predict`` returns
    zeros when it receives data whose column-wise sorted values match those
    of the training data but whose entries are arranged differently;
    otherwise it returns the first column of its input.
    """

    def fit(self, X, y):
        """Validate ``X`` and ``y`` and remember ``X`` for ``predict``."""
        validation_options = dict(
            accept_sparse=("csr", "csc"),
            multi_output=True,
            y_numeric=True,
        )
        X, y = self._validate_data(X, y, **validation_options)
        # Keep the training data so that predict can detect a reordering.
        self._X = X
        return self

    def predict(self, X):
        """Return zeros for reordered training data, else ``X[:, 0]``."""
        X = check_array(X)
        fitted_X = self._X

        # Columns are sorted independently, so this compares the values
        # found in each column regardless of the row they sit in.
        same_values = np.array_equiv(np.sort(X, axis=0),
                                     np.sort(fitted_X, axis=0))
        if not same_values:
            return X[:, 0]

        # Same values in every column: only a different arrangement of the
        # entries triggers the deliberately inconsistent output.
        if not (X != fitted_X).any():
            return X[:, 0]
        return np.zeros(X.shape[0])


class LargeSparseNotSupportedClassifier(BaseEstimator):
def fit(self, X, y):
X, y = self._validate_data(
Expand Down Expand Up @@ -455,6 +476,13 @@ def test_check_estimator():
' with _ but wrong_attribute added')
assert_raises_regex(AssertionError, msg,
check_estimator, SetsWrongAttribute())
# check for sample order invariance
name = NotInvariantSampleOrder.__name__
method = 'predict'
msg = ("{method} of {name} is not invariant when applied to a dataset"
"with different sample order.").format(method=method, name=name)
assert_raises_regex(AssertionError, msg,
check_estimator, NotInvariantSampleOrder())
# check for invariant method
name = NotInvariantPredict.__name__
method = 'predict'
Expand Down