From 4f5097520c95ed3089c45397e1def893e09a61b2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 28 Sep 2020 18:50:24 -0400 Subject: [PATCH 01/21] WIP --- sklearn/utils/estimator_checks.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5b99e8e56c420..257da7dad4ada 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -760,6 +760,9 @@ def _generate_sparse_matrix(X_csr): def check_estimator_sparse_data(name, estimator_orig, strict_mode=True): + # Make sure that the estimator either accepts sparse data in fit and + # predict, or that it fails with a helpful error message. + # XXX this is a non-API check rng = np.random.RandomState(0) X = rng.rand(40, 10) X[X < .8] = 0 @@ -817,6 +820,7 @@ def check_estimator_sparse_data(name, estimator_orig, strict_mode=True): def check_sample_weights_pandas_series(name, estimator_orig, strict_mode=True): # check that estimators will accept a 'sample_weight' parameter of # type pandas.Series in the 'fit' function. + # XXX pure API check estimator = clone(estimator_orig) if has_fit_parameter(estimator, "sample_weight"): try: @@ -844,6 +848,7 @@ def check_sample_weights_pandas_series(name, estimator_orig, strict_mode=True): def check_sample_weights_not_an_array(name, estimator_orig, strict_mode=True): # check that estimators will accept a 'sample_weight' parameter of # type _NotAnArray in the 'fit' function. + # XXX pure API check estimator = clone(estimator_orig) if has_fit_parameter(estimator, "sample_weight"): X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], @@ -861,6 +866,7 @@ def check_sample_weights_not_an_array(name, estimator_orig, strict_mode=True): def check_sample_weights_list(name, estimator_orig, strict_mode=True): # check that estimators will accept a 'sample_weight' parameter of # type list in the 'fit' function. + # XXX: pure API check if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) @@ -878,6 +884,7 @@ def check_sample_weights_list(name, estimator_orig, strict_mode=True): def check_sample_weights_shape(name, estimator_orig, strict_mode=True): # check that estimators raise an error if sample_weight # shape mismatches the input + # XXX: pure API check?????? Are error checks API checks????? if (has_fit_parameter(estimator_orig, "sample_weight") and not (hasattr(estimator_orig, "_pairwise") and estimator_orig._pairwise)): @@ -906,6 +913,7 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones", # unit weights and no weights # For kind="zeros" check that setting sample_weight to 0 is equivalent # to removing corresponding samples. + # XXX: non-API check estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) set_random_state(estimator1, random_state=0) @@ -955,6 +963,7 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones", @ignore_warnings(category=(FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig, strict_mode=True): # check that estimators treat dtype object as numeric if possible + # XXXX api or not????? partially???? 
rng = np.random.RandomState(0) X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) @@ -988,6 +997,7 @@ def check_dtype_object(name, estimator_orig, strict_mode=True): def check_complex_data(name, estimator_orig, strict_mode=True): # check that estimators raise an exception on providing complex data + #XXX: error check... ????? X = np.random.sample(10) + 1j * np.random.sample(10) X = X.reshape(-1, 1) y = np.random.sample(10) + 1j * np.random.sample(10) @@ -998,12 +1008,9 @@ def check_complex_data(name, estimator_orig, strict_mode=True): @ignore_warnings def check_dict_unchanged(name, estimator_orig, strict_mode=True): - # this estimator raises - # ValueError: Found array with 0 feature(s) (shape=(23, 0)) - # while a minimum of 1 is required. - # error - if name in ['SpectralCoclustering']: - return + # check that calling the prediction method does not alter the __dict__ + # attribute of the estimator. + # XXX: pure API check rnd = np.random.RandomState(0) if name in ['RANSACRegressor']: X = 3 * rnd.uniform(size=(20, 3)) From ef04fceb957a73869412514929439902a3f98127 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 5 Oct 2020 13:39:41 -0400 Subject: [PATCH 02/21] WIP --- sklearn/model_selection/_split.py | 15 ++-- sklearn/utils/estimator_checks.py | 115 +++++++++++++++++++++++------- 2 files changed, 100 insertions(+), 30 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index dfdbdebeb8b58..65aac104af6e4 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1744,15 +1744,20 @@ def _iter_indices(self, X, y, groups=None): test = [] for i in range(n_classes): - permutation = rng.permutation(class_counts[i]) - perm_indices_class_i = class_indices[i].take(permutation, - mode='clip') + # permutation = rng.permutation(class_counts[i]) + # perm_indices_class_i = class_indices[i].take(permutation, + # mode='clip') + perm_indices_class_i = class_indices[i] + + train.extend(perm_indices_class_i[:n_i[i]]) test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]) - train = rng.permutation(train) - test = rng.permutation(test) + # train = rng.permutation(train) + # test = rng.permutation(test) + train = np.array(train) + test = np.array(test) yield train, test diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 257da7dad4ada..9456dae78ffe3 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -963,7 +963,7 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones", @ignore_warnings(category=(FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig, strict_mode=True): # check that estimators treat dtype object as numeric if possible - # XXXX api or not????? partially???? 
+ # XXX probably API except for error msg rng = np.random.RandomState(0) X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) @@ -1050,6 +1050,7 @@ def _is_public_parameter(attr): @ignore_warnings(category=FutureWarning) def check_dont_overwrite_parameters(name, estimator_orig, strict_mode=True): # check that fit method only changes or sets private attributes + #XXX pure API check if hasattr(estimator_orig.__init__, "deprecated_original"): # to not check deprecated classes return @@ -1101,7 +1102,8 @@ def check_dont_overwrite_parameters(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_fit2d_predict1d(name, estimator_orig, strict_mode=True): - # check by fitting a 2d array and predicting with a 1d array + # check that predicting with a 1d array raises an error + # XXX Make message validation optional rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1151,6 +1153,7 @@ def _apply_on_subsets(func, X): def check_methods_subset_invariance(name, estimator_orig, strict_mode=True): # check that method gives invariant results if applied # on mini batches or the whole set + # XXX: non API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1184,6 +1187,7 @@ def check_fit2d_1sample(name, estimator_orig, strict_mode=True): # Check that fitting a 2d array with only one sample either works or # returns an informative message. The error message should either mention # the number of samples or the number of classes. + # XXX Non API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(1, 10)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1214,6 +1218,7 @@ def check_fit2d_1sample(name, estimator_orig, strict_mode=True): def check_fit2d_1feature(name, estimator_orig, strict_mode=True): # check fitting a 2d array with only 1 feature either works or returns # informative message + # XXX non API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(10, 1)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1244,6 +1249,7 @@ def check_fit2d_1feature(name, estimator_orig, strict_mode=True): @ignore_warnings def check_fit1d(name, estimator_orig, strict_mode=True): # check fitting 1d X array raises a ValueError + # XXX Pure API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20)) y = X.astype(int) @@ -1297,6 +1303,9 @@ def check_transformer_data_not_an_array(name, transformer, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_transformers_unfitted(name, transformer, strict_mode=True): + # Make sure the unfitted transformer raises an error when transform is + # called + # XXX: non API check X, y = _regression_dataset() transformer = clone(transformer) @@ -1311,6 +1320,13 @@ def check_transformers_unfitted(name, transformer, strict_mode=True): def _check_transformer(name, transformer_orig, X, y, strict_mode=True): + # Check that: + # - fit_transform returns n_samples transformed samples + # - fit_transform and transform give equivalent results. 
+ # - fit_transform gives the same results twice + # - an error is raised if transform is called with an incorrect number of + # features + # XXX: Only make first and last checks part of API n_samples, n_features = np.asarray(X).shape transformer = clone(transformer_orig) set_random_state(transformer) @@ -1331,12 +1347,13 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): X_pred = transformer_clone.fit_transform(X, y=y_) if isinstance(X_pred, tuple): + # for cross-decomposition estimators that transform both X and y for x_pred in X_pred: assert x_pred.shape[0] == n_samples else: # check for consistent n_samples assert X_pred.shape[0] == n_samples - + if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) @@ -1379,7 +1396,6 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): not transformer._get_tags()["stateless"] and \ X.ndim == 2 and X.shape[1] > 1: - # If it's not an array, it does not have a 'T' property with raises( ValueError, err_msg=f"The transformer {name} does not raise an error " @@ -1391,11 +1407,14 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): @ignore_warnings def check_pipeline_consistency(name, estimator_orig, strict_mode=True): + # check that make_pipeline(est) gives results as est for scores and + # transforms + # XXX: full API + if estimator_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' raise SkipTest(msg) - # check that make_pipeline(est) gives same score as est X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() @@ -1422,6 +1441,7 @@ def check_pipeline_consistency(name, estimator_orig, strict_mode=True): def check_fit_score_takes_y(name, estimator_orig, strict_mode=True): # check that all estimators accept an optional y # in fit and score so they can be used in pipelines + # XXX : full API check rnd = np.random.RandomState(0) n_samples = 30 X = rnd.uniform(size=(n_samples, 3)) @@ -1449,6 +1469,8 @@ def check_fit_score_takes_y(name, estimator_orig, strict_mode=True): @ignore_warnings def check_estimators_dtypes(name, estimator_orig, strict_mode=True): + # Check that methods can handle X input of different float and int dtypes + # XXX not an API check rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) X_train_32 = _pairwise_estimator_convert_X(X_train_32, estimator_orig) @@ -1475,6 +1497,7 @@ def check_transformer_preserve_dtypes( ): # check that dtype are preserved meaning if input X is of some dtype # X_transformed should be from the same dtype. + # XXX: not an API check X, y = make_blobs( n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], @@ -1506,6 +1529,9 @@ def check_transformer_preserve_dtypes( @ignore_warnings(category=FutureWarning) def check_estimators_empty_data_messages(name, estimator_orig, strict_mode=True): + # Make sure that a ValueError is raised when fit is called on data with no + # sample or no features. + # XXX: API or not? e = clone(estimator_orig) set_random_state(e, 1) @@ -1531,7 +1557,9 @@ def check_estimators_empty_data_messages(name, estimator_orig, @ignore_warnings(category=FutureWarning) def check_estimators_nan_inf(name, estimator_orig, strict_mode=True): - # Checks that Estimator X's do not contain NaN or inf. + # Checks that fit, predict and transform raise an error if X contains nans + # or inf. + # XXX: probably not API? 
rnd = np.random.RandomState(0) X_train_finite = _pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) @@ -1581,7 +1609,8 @@ def check_estimators_nan_inf(name, estimator_orig, strict_mode=True): @ignore_warnings def check_nonsquare_error(name, estimator_orig, strict_mode=True): - """Test that error is thrown when non-square data provided.""" + # Check that error is raised when non-square data is provided in fit + # XXX: API X, y = make_blobs(n_samples=20, n_features=10) estimator = clone(estimator_orig) @@ -1596,7 +1625,9 @@ def check_nonsquare_error(name, estimator_orig, strict_mode=True): @ignore_warnings def check_estimators_pickle(name, estimator_orig, strict_mode=True): - """Test that we can pickle all estimators.""" + # Test that we can pickle all estimators and that the pickled estimator + # gives the same predictions + # XXX: Non API check check_methods = ["predict", "transform", "decision_function", "predict_proba"] @@ -1641,7 +1672,9 @@ def check_estimators_pickle(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_estimators_partial_fit_n_features(name, estimator_orig, strict_mode=True): - # check if number of features changes between calls to partial_fit. + # check that an error is raised when number of features changes between + # calls to partial_fit. + # XXX: non API check if not hasattr(estimator_orig, 'partial_fit'): return estimator = clone(estimator_orig) @@ -1668,6 +1701,11 @@ def check_estimators_partial_fit_n_features(name, estimator_orig, @ignore_warnings(category=FutureWarning) def check_classifier_multioutput(name, estimator, strict_mode=True): + # Make sure that the output of predict_proba and decision_function is + # correct for multiouput classification (multilabel, multiclass). Also + # checks that predict_proba and decision_function have consistent + # predictions, i.e. the orders are consistent. + # XXX: full API check n_samples, n_labels, n_classes = 42, 5, 3 tags = estimator._get_tags() estimator = clone(estimator) @@ -1726,6 +1764,9 @@ def check_classifier_multioutput(name, estimator, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_regressor_multioutput(name, estimator, strict_mode=True): + # Make sure that multioutput regressors output float64 predictions and that + # the shape is correct. + # XXX: make the first check not an API check estimator = clone(estimator) n_samples = n_features = 10 @@ -1743,7 +1784,7 @@ def check_regressor_multioutput(name, estimator, strict_mode=True): "Multioutput predictions by a regressor are expected to be" " floating-point precision. Got {} instead".format(y_pred.dtype)) assert y_pred.shape == y.shape, ( - "The shape of the orediction for multioutput data is incorrect." + "The shape of the prediction for multioutput data is incorrect." 
" Expected {}, got {}.") @@ -1776,6 +1817,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False, pred = clusterer.labels_ assert pred.shape == (n_samples,) + # XXX: skip the rest when api_only is True assert adjusted_rand_score(pred, y) > 0.4 if clusterer._get_tags()['non_deterministic']: return @@ -1810,7 +1852,8 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False, @ignore_warnings(category=FutureWarning) def check_clusterer_compute_labels_predict(name, clusterer_orig, strict_mode=True): - """Check that predict is invariant of compute_labels.""" + # Check that predict is invariant of compute_labels + # XXX: non API check X, y = make_blobs(n_samples=20, random_state=0) clusterer = clone(clusterer_orig) set_random_state(clusterer) @@ -1825,6 +1868,10 @@ def check_clusterer_compute_labels_predict(name, clusterer_orig, @ignore_warnings(category=FutureWarning) def check_classifiers_one_label(name, classifier_orig, strict_mode=True): + # Check that a classifier can fit when there's only 1 class, or that it + # raises a proper error. If it can fit, we also make sure that it can + # predict. + # XXX: non API check error_string_fit = "Classifier can't train when only one class is present." error_string_predict = ("Classifier can't predict when only one class is " "present.") @@ -1902,7 +1949,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, assert y_pred.shape == (n_samples,) # training set performance - if not tags['poor_score']: + if not tags['poor_score']: # XXX: not API assert accuracy_score(y, y_pred) > 0.83 # raises error on malformed input for predict @@ -1933,10 +1980,10 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, else: assert decision.shape == (n_samples, 1) dec_pred = (decision.ravel() > 0).astype(int) - assert_array_equal(dec_pred, y_pred) + assert_array_equal(dec_pred, y_pred) # XXX not API else: assert decision.shape == (n_samples, n_classes) - assert_array_equal(np.argmax(decision, axis=1), y_pred) + assert_array_equal(np.argmax(decision, axis=1), y_pred) # XXX not API # raises error on malformed input for decision_function if not tags["no_validation"]: @@ -1961,9 +2008,9 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, # predict_proba agrees with predict y_prob = classifier.predict_proba(X) assert y_prob.shape == (n_samples, n_classes) - assert_array_equal(np.argmax(y_prob, axis=1), y_pred) + assert_array_equal(np.argmax(y_prob, axis=1), y_pred)# XXX not API # check that probas for all classes sum to one - assert_array_almost_equal(np.sum(y_prob, axis=1), + assert_array_almost_equal(np.sum(y_prob, axis=1),# XXX not API np.ones(n_samples)) if not tags["no_validation"]: # raises error on malformed input for predict_proba @@ -1979,7 +2026,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, err_msg=msg.format(name, "predict_proba"), ): classifier.predict_proba(X.T) - if hasattr(classifier, "predict_log_proba"): + if hasattr(classifier, "predict_log_proba"):# XXX not API # predict_log_proba is a transformation of predict_proba y_log_prob = classifier.predict_log_proba(X) assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9) @@ -2040,7 +2087,7 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True, with raises(ValueError): estimator.predict(X.T) - # decision_function agrees with predict + # decision_function agrees with predict XXX not API dec_pred = (decision >= 0).astype(int) dec_pred[dec_pred == 0] = -1 
assert_array_equal(dec_pred, y_pred) @@ -2058,6 +2105,7 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True, estimator.score_samples(X.T) # contamination parameter (not for OneClassSVM which has the nu parameter) + # XXX: not API if (hasattr(estimator, 'contamination') and not hasattr(estimator, 'novelty')): # proportion of outliers equal to contamination parameter when not @@ -2090,6 +2138,8 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True, @ignore_warnings(category=(FutureWarning)) def check_classifiers_multilabel_representation_invariance( name, classifier_orig, strict_mode=True): + # check different target representations for multilabel classifiers + # XXX: pure API check X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, n_labels=3, @@ -2125,7 +2175,8 @@ def check_classifiers_multilabel_representation_invariance( @ignore_warnings(category=FutureWarning) def check_estimators_fit_returns_self(name, estimator_orig, readonly_memmap=False, strict_mode=True): - """Check if self is returned when calling fit.""" + # Check that self is returned when calling fit. + # XXX pure API check X, y = make_blobs(random_state=0, n_samples=21) # some want non-negative input X -= X.min() @@ -2143,11 +2194,10 @@ def check_estimators_fit_returns_self(name, estimator_orig, @ignore_warnings def check_estimators_unfitted(name, estimator_orig, strict_mode=True): - """Check that predict raises an exception in an unfitted estimator. - - Unfitted estimators should raise a NotFittedError. - """ + # Check that predict raises an exception in an unfitted estimator. + # Unfitted estimators should raise a NotFittedError. # Common test for Regressors, Classifiers and Outlier detection estimators + # XXX pure API X, y = _regression_dataset() estimator = clone(estimator_orig) @@ -2160,6 +2210,9 @@ def check_estimators_unfitted(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_supervised_y_2d(name, estimator_orig, strict_mode=True): + # Check that estimators that don't support multi-ouput raise a warning if y + # is not 1d, and that they just ravel y + # XXX pure API check tags = estimator_orig._get_tags() if tags['multioutput_only']: # These only work on 2d, so this test makes no sense @@ -2227,7 +2280,6 @@ def check_classifiers_predictions(X, y, name, classifier_orig, (classifier, ", ".join(map(str, y_exp)), ", ".join(map(str, y_pred)))) - # training set performance if name != "ComplementNB": # This is a pathological data set for ComplementNB. # For some specific cases 'ComplementNB' predicts less classes @@ -2245,6 +2297,9 @@ def _choose_check_classifiers_labels(name, y, y_names): def check_classifiers_classes(name, classifier_orig, strict_mode=True): + # Check that decision function > 0 => pos class + # Also checks the classes_ attribute. + # XXX pure API check X_multiclass, y_multiclass = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) X_multiclass, y_multiclass = shuffle(X_multiclass, y_multiclass, @@ -2283,6 +2338,9 @@ def check_classifiers_classes(name, classifier_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_regressors_int(name, regressor_orig, strict_mode=True): + # Check that regressors give same prediction when y is encoded as int or + # float + # XXX: API check ? 
X, _ = _regression_dataset() X = _pairwise_estimator_convert_X(X[:50], regressor_orig) rnd = np.random.RandomState(0) @@ -2312,6 +2370,12 @@ def check_regressors_int(name, regressor_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_regressors_train(name, regressor_orig, readonly_memmap=False, X_dtype=np.float64, strict_mode=True): + # Check that regressors: + # - raise an error when X and y have different number of samples + # - accept lists as input to fit + # - predict n_samples predictions + # - have a score > .5 on simple data + # XXX: all API checks except the last one X, y = _regression_dataset() X = X.astype(X_dtype) X = _pairwise_estimator_convert_X(X, regressor_orig) @@ -2353,6 +2417,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped + # XXX: non API if not regressor._get_tags()["poor_score"]: assert regressor.score(X, y_) > 0.5 @@ -2360,7 +2425,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, @ignore_warnings def check_regressors_no_decision_function(name, regressor_orig, strict_mode=True): - # checks whether regressors have decision_function or predict_proba + # check that regressors decision_function or predict_proba rng = np.random.RandomState(0) regressor = clone(regressor_orig) From 94b069ea73bc9c7eb89b0e6bec2ce8777348e4e4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 6 Oct 2020 11:24:20 -0400 Subject: [PATCH 03/21] WIP --- sklearn/utils/estimator_checks.py | 46 +++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9456dae78ffe3..79ce263441381 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2426,6 +2426,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, def check_regressors_no_decision_function(name, regressor_orig, strict_mode=True): # check that regressors decision_function or predict_proba + # XXX: full API check rng = np.random.RandomState(0) regressor = clone(regressor_orig) @@ -2451,7 +2452,11 @@ def check_regressors_no_decision_function(name, regressor_orig, @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig, strict_mode=True): - + # Make sure that classifiers take class_weight into account by creating a + # very noisy balanced dataset. We make sure that passing a very imbalanced + # class_weights helps recovering a good score. + # XXX: full non-API check + if classifier_orig._get_tags()['binary_only']: problems = [2] else: @@ -2499,6 +2504,7 @@ def check_class_weight_classifiers(name, classifier_orig, strict_mode=True): def check_class_weight_balanced_classifiers(name, classifier_orig, X_train, y_train, X_test, y_test, weights, strict_mode=True): + # XXX: it's never ever used, just ignore classifier = clone(classifier_orig) if hasattr(classifier, "n_iter"): classifier.set_params(n_iter=100) @@ -2519,8 +2525,10 @@ def check_class_weight_balanced_classifiers(name, classifier_orig, X_train, @ignore_warnings(category=FutureWarning) def check_class_weight_balanced_linear_classifier(name, Classifier, strict_mode=True): - """Test class weights with non-contiguous class labels.""" + # Check that class_weight='balanced' is equivalent to manually passing + # class proportions. 
# this is run on classes, not instances, though this should be changed + # XXX: non API check X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = np.array([1, 1, 1, -1, -1]) @@ -2558,6 +2566,8 @@ def check_class_weight_balanced_linear_classifier(name, Classifier, @ignore_warnings(category=FutureWarning) def check_estimators_overwrite_params(name, estimator_orig, strict_mode=True): + # Check that calling fit does not alter the output of get_params + # XXX: full API check X, y = make_blobs(random_state=0, n_samples=21) # some want non-negative input X -= X.min() @@ -2593,7 +2603,10 @@ def check_estimators_overwrite_params(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_no_attributes_set_in_init(name, estimator_orig, strict_mode=True): - """Check setting during init.""" + # Check that: + # - init does not set any attribute apart from the parameters + # - all parameters of init are set as attributes + # XXX: full API check estimator = clone(estimator_orig) if hasattr(type(estimator).__init__, "deprecated_original"): return @@ -2627,6 +2640,9 @@ def check_no_attributes_set_in_init(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_sparsify_coefficients(name, estimator_orig, strict_mode=True): + # Check that sparsified coefs produce the same predictions as the + # originals coefs + # XXX: full non API check X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, -2], [2, 2], [-2, -2]]) y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]) @@ -2651,6 +2667,9 @@ def check_sparsify_coefficients(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_classifier_data_not_an_array(name, estimator_orig, strict_mode=True): + # Check that estimator yields same predictions whether an array was passed + # or not + # XXX: full API X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1], [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]]) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -2663,6 +2682,9 @@ def check_classifier_data_not_an_array(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_regressor_data_not_an_array(name, estimator_orig, strict_mode=True): + # Check that estimator yields same predictions whether an array was passed + # or not + # XXX: full API X, y = _regression_dataset() X = _pairwise_estimator_convert_X(X, estimator_orig) y = _enforce_estimator_tags_y(estimator_orig, y) @@ -2716,8 +2738,9 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type, def check_parameters_default_constructible(name, Estimator, strict_mode=True): - # test default-constructibility - # get rid of deprecation warnings + # Check that the estimator's default parameters are immutable (sort of). + # Also check that get_params returns exactly the default parameters values + # XXX: full API check Estimator = Estimator.__class__ @@ -2846,6 +2869,7 @@ def check_non_transformer_estimators_n_iter(name, estimator_orig, strict_mode=True): # Test that estimators that are not transformers with a parameter # max_iter, return the attribute of n_iter_ at least 1. + # XXX: full API # These models are dependent on external solvers like # libsvm and accessing the iter parameter is non-trivial. 
@@ -2880,6 +2904,7 @@ def check_non_transformer_estimators_n_iter(name, estimator_orig, def check_transformer_n_iter(name, estimator_orig, strict_mode=True): # Test that transformers with a parameter max_iter, return the # attribute of n_iter_ at least 1. + # XXX: full API estimator = clone(estimator_orig) if hasattr(estimator, "max_iter"): if name in CROSS_DECOMPOSITION: @@ -2904,7 +2929,8 @@ def check_transformer_n_iter(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_get_params_invariance(name, estimator_orig, strict_mode=True): - # Checks if get_params(deep=False) is a subset of get_params(deep=True) + # Checks that get_params(deep=False) is a subset of get_params(deep=True) + # XXX: full API e = clone(estimator_orig) shallow_params = e.get_params(deep=False) @@ -2918,6 +2944,7 @@ def check_get_params_invariance(name, estimator_orig, strict_mode=True): def check_set_params(name, estimator_orig, strict_mode=True): # Check that get_params() returns the same thing # before and after set_params() with some fuzz + # XXX: full API check estimator = clone(estimator_orig) orig_params = estimator.get_params(deep=False) @@ -2972,6 +2999,7 @@ def check_set_params(name, estimator_orig, strict_mode=True): def check_classifiers_regression_target(name, estimator_orig, strict_mode=True): # Check if classifier throws an exception when fed regression targets + # XXX API check X, y = _regression_dataset() @@ -2987,6 +3015,7 @@ def check_classifiers_regression_target(name, estimator_orig, def check_decision_proba_consistency(name, estimator_orig, strict_mode=True): # Check whether an estimator having both decision_function and # predict_proba methods has outputs with perfect rank correlation. + # XXX: fulll non API check centers = [(2, 2), (4, 4)] X, y = make_blobs(n_samples=100, random_state=0, n_features=4, @@ -3028,6 +3057,8 @@ def check_outliers_fit_predict(name, estimator_orig, strict_mode=True): if hasattr(estimator, 'predict'): y_pred_2 = estimator.fit(X).predict(X) assert_array_equal(y_pred, y_pred_2) + + # XXX: next check isn't API check if hasattr(estimator, "contamination"): # proportion of outliers equal to contamination parameter when not @@ -3057,6 +3088,7 @@ def check_outliers_fit_predict(name, estimator_orig, strict_mode=True): def check_fit_non_negative(name, estimator_orig, strict_mode=True): # Check that proper warning is raised for non-negative X # when tag requires_positive_X is present + # XXX: full non API check + remove if else X = np.array([[-1., 1], [-1., 1]]) y = np.array([1, 2]) estimator = clone(estimator_orig) @@ -3076,6 +3108,8 @@ def check_fit_idempotent(name, estimator_orig, strict_mode=True): # predict(), predict_proba(), decision_function() and transform() return # the same results. 
+ # XXX full API check + check_methods = ["predict", "transform", "decision_function", "predict_proba"] rng = np.random.RandomState(0) From e4f889ffa44b31f92f375130789062c77c21415b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 9 Oct 2020 10:35:45 -0400 Subject: [PATCH 04/21] some more --- sklearn/tests/test_common.py | 62 ++--- sklearn/utils/estimator_checks.py | 434 ++++++++++++++---------------- 2 files changed, 236 insertions(+), 260 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index b84b66d1fb919..e0ed782fe596b 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -212,8 +212,9 @@ def test_class_support_removed(): class MyNMFWithBadErrorMessage(NMF): # Same as NMF but raises an uninformative error message if X has negative - # value. This estimator would fail the check suite in strict mode, - # specifically it would fail check_fit_non_negative + # value. This estimator would fail the check suite with api_only=False, + # specifically it would fail check_fit_non_negative because its error + # message doesn't match def fit(self, X, y=None, **params): X = check_array(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32]) @@ -225,51 +226,52 @@ def fit(self, X, y=None, **params): return super().fit(X, y, **params) -def test_strict_mode_check_estimator(): - # Tests various conditions for the strict mode of check_estimator() +def test_api_only_check_estimator(): + # Tests various conditions for the api_only parameter of check_estimator() # Details are in the comments - # LogisticRegression has no _xfail_checks, so when strict_mode is on, there + # LogisticRegression has no _xfail_checks, so when api_only=False, there # should be no skipped tests. with pytest.warns(None) as catched_warnings: - check_estimator(LogisticRegression(), strict_mode=True) + check_estimator(LogisticRegression(), api_only=False) assert not any(isinstance(w, SkipTestWarning) for w in catched_warnings) - # When strict mode is off, check_n_features should be skipped because it's - # a fully strict check - msg_check_n_features_in = 'check_n_features_in is fully strict ' - with pytest.warns(SkipTestWarning, match=msg_check_n_features_in): - check_estimator(LogisticRegression(), strict_mode=False) + # When api_only is True, check_fit2d_1sample should be skipped + # because it's not an API check + skip_match = 'check_fit2d_1sample is not an API check' + with pytest.warns(SkipTestWarning, match=skip_match): + check_estimator(LogisticRegression(), api_only=True) # NuSVC has some _xfail_checks. 
They should be skipped regardless of - # strict_mode + # api_only with pytest.warns(SkipTestWarning, match='fails for the decision_function method'): - check_estimator(NuSVC(), strict_mode=True) - # When strict mode is off, check_n_features_in is skipped along with the - # rest of the xfail_checks - with pytest.warns(SkipTestWarning, match=msg_check_n_features_in): - check_estimator(NuSVC(), strict_mode=False) - - # MyNMF will fail check_fit_non_negative() in strict mode because it yields - # a bad error message + check_estimator(NuSVC(), api_only=False) + # When api_only is True, check_fit2d_1sample is skipped along + # with the rest of the xfail_checks + with pytest.warns(SkipTestWarning, match=skip_match): + check_estimator(NuSVC(), api_only=True) + + # MyNMF will fail check_fit_non_negative() with api_only=False because it + # yields a bad error message with pytest.raises( AssertionError, match="The error message should contain" ): - check_estimator(MyNMFWithBadErrorMessage(), strict_mode=True) - # However, it should pass the test suite in non-strict mode because when - # strict mode is off, check_fit_non_negative() will not check the exact - # error messsage. (We still assert that the warning from - # check_n_features_in is raised) - with pytest.warns(SkipTestWarning, match=msg_check_n_features_in): - check_estimator(MyNMFWithBadErrorMessage(), strict_mode=False) + check_estimator(MyNMFWithBadErrorMessage(), api_only=False) + # However, it should pass the test suite with api_only=True because when in + # this case, check_fit_non_negative() will not check the exact error + # messsage. (We still assert that the warning from + # check_fit2d_1sample is raised) + with pytest.warns(SkipTestWarning, match=skip_match): + check_estimator(MyNMFWithBadErrorMessage(), api_only=True) @parametrize_with_checks([LogisticRegression(), NuSVC(), MyNMFWithBadErrorMessage()], - strict_mode=False) -def test_strict_mode_parametrize_with_checks(estimator, check): - # Ideally we should assert that the strict checks are Xfailed... + api_only=True) +def test_api_only_parametrize_with_checks(estimator, check): + # Ideally we should assert that the NON_API checks are either Xfailed or + # Xpassed check(estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6dfbc55d6d956..267c5677831c6 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -14,7 +14,6 @@ from . import IS_PYPY from .. import config_context from ._testing import _get_args -from ._testing import assert_raise_message from ._testing import assert_array_equal from ._testing import assert_array_almost_equal from ._testing import assert_allclose @@ -150,7 +149,7 @@ def _yield_classifier_checks(classifier): @ignore_warnings(category=FutureWarning) -def check_supervised_y_no_nan(name, estimator_orig, strict_mode=True): +def check_supervised_y_no_nan(name, estimator_orig, api_only=False): # Checks that the Estimator targets are not NaN. 
estimator = clone(estimator_orig) rng = np.random.RandomState(888) @@ -370,14 +369,14 @@ def _construct_instance(Estimator): return estimator -def _maybe_mark_xfail(estimator, check, strict_mode, pytest): +def _maybe_mark_xfail(estimator, check, api_only, pytest): # Mark (estimator, check) pairs as XFAIL if needed (see conditions in # _should_be_skipped_or_marked()) # This is similar to _maybe_skip(), but this one is used by # @parametrize_with_checks() instead of check_estimator() should_be_marked, reason = _should_be_skipped_or_marked(estimator, check, - strict_mode) + api_only) if not should_be_marked: return estimator, check else: @@ -385,14 +384,14 @@ def _maybe_mark_xfail(estimator, check, strict_mode, pytest): marks=pytest.mark.xfail(reason=reason)) -def _maybe_skip(estimator, check, strict_mode): +def _maybe_skip(estimator, check, api_only): # Wrap a check so that it's skipped if needed (see conditions in # _should_be_skipped_or_marked()) # This is similar to _maybe_mark_xfail(), but this one is used by # check_estimator() instead of @parametrize_with_checks which requires # pytest should_be_skipped, reason = _should_be_skipped_or_marked(estimator, check, - strict_mode) + api_only) if not should_be_skipped: return check @@ -409,15 +408,15 @@ def wrapped(*args, **kwargs): return wrapped -def _should_be_skipped_or_marked(estimator, check, strict_mode): +def _should_be_skipped_or_marked(estimator, check, api_only): # Return whether a check should be skipped (when using check_estimator()) # or marked as XFAIL (when using @parametrize_with_checks()), along with a # reason. # A check should be skipped or marked if either: # - the check is in the _xfail_checks tag of the estimator - # - the check is fully strict and strict mode is off - # Checks that are only partially strict will not be skipped since we want - # to run their non-strict parts. + # - the check is not an API check and api_only is True + # Checks that are a mix of API and non-API checks will not be skipped since + # we want to run their API-checking parts. check_name = (check.func.__name__ if isinstance(check, partial) else check.__name__) @@ -426,13 +425,13 @@ def _should_be_skipped_or_marked(estimator, check, strict_mode): if check_name in xfail_checks: return True, xfail_checks[check_name] - if check_name in _FULLY_STRICT_CHECKS and not strict_mode: - return True, f'{check_name} is fully strict and strict mode is off' + if check_name in _NON_API_CHECKS and api_only: + return True, f'{check_name} is not an API check and api_only is True.' return False, 'placeholder reason that will never be used' -def parametrize_with_checks(estimators, strict_mode=True): +def parametrize_with_checks(estimators, api_only=False): """Pytest specific decorator for parametrizing estimator checks. The `id` of each check is set to be a pprint version of the estimator @@ -450,18 +449,18 @@ def parametrize_with_checks(estimators, strict_mode=True): Passing a class was deprecated in version 0.23, and support for classes was removed in 0.24. Pass an instance instead. - strict_mode : bool, default=True - If True, the full check suite is run. - If False, only the non-strict part of the check suite is run. + api_only : bool, default=False + If True, the check suite will only ensure pure API-compatibility, and + will ignore other checks like controlling error messages or + prediction performance on easy datasets. + By default, the entire check suite is run. 
- In non-strict mode, some checks will be easier to pass: e.g., they - will only make sure an error is raised instead of also checking the - full error message. - Some checks are considered completely strict, in which case they are - treated as if they were in the estimators' `_xfails_checks` tag: they - will be marked as `xfail` for pytest. See :ref:`estimator_tags` for - more info on the `_xfails_check` tag. The set of strict checks is in - `sklearn.utils.estimator_checks._FULLY_STRICT_CHECKS`. + When True, some checks will be easier to pass. Some other checks will + be treated as if they were in the estimators' `_xfails_checks` tag: + they will be marked as `xfail` for pytest, but they will still be + run. If they pass, pytest will label them as `xpass`. These checks + are in `sklearn.utils.estimator_checks._NON_API_CHECKS`. See + :ref:`estimator_tags` for more info on the `_xfails_check` tag. .. versionadded:: 0.24 @@ -493,14 +492,14 @@ def checks_generator(): for estimator in estimators: name = type(estimator).__name__ for check in _yield_all_checks(estimator): - check = partial(check, name, strict_mode=strict_mode) - yield _maybe_mark_xfail(estimator, check, strict_mode, pytest) + check = partial(check, name, api_only=api_only) + yield _maybe_mark_xfail(estimator, check, api_only, pytest) return pytest.mark.parametrize("estimator, check", checks_generator(), ids=_get_check_estimator_ids) -def check_estimator(Estimator, generate_only=False, strict_mode=True): +def check_estimator(Estimator, generate_only=False, api_only=False): """Check if estimator adheres to scikit-learn conventions. This estimator will run an extensive test-suite for input validation, @@ -536,18 +535,17 @@ def check_estimator(Estimator, generate_only=False, strict_mode=True): .. versionadded:: 0.22 - strict_mode : bool, default=True - If True, the full check suite is run. - If False, only the non-strict part of the check suite is run. + api_only : bool, default=False + If True, the check suite will only ensure pure API-compatibility, and + will ignore other checks like controlling error messages or + prediction performance on easy datasets. + By default, the entire check suite is run. - In non-strict mode, some checks will be easier to pass: e.g., they - will only make sure an error is raised instead of also checking the - full error message. - Some checks are considered completely strict, in which case they are - treated as if they were in the estimators' `_xfails_checks` tag: they - will be ignored with a warning. See :ref:`estimator_tags` for more - info on the `_xfails_check` tag. The set of strict checks is in - `sklearn.utils.estimator_checks._FULLY_STRICT_CHECKS`. + When True, some checks will be easier to pass. Some other checks will + be treated as if they were in the estimators' `_xfails_checks` tag: + they will be ignored with a warning. These checks are in + `sklearn.utils.estimator_checks._NON_API_CHECKS`. See + :ref:`estimator_tags` for more info on the `_xfails_check` tag. .. 
versionadded:: 0.24 @@ -568,8 +566,8 @@ def check_estimator(Estimator, generate_only=False, strict_mode=True): def checks_generator(): for check in _yield_all_checks(estimator): - check = _maybe_skip(estimator, check, strict_mode) - yield estimator, partial(check, name, strict_mode=strict_mode) + check = _maybe_skip(estimator, check, api_only) + yield estimator, partial(check, name, api_only=api_only) if generate_only: return checks_generator() @@ -761,10 +759,9 @@ def _generate_sparse_matrix(X_csr): yield sparse_format + "_64", X -def check_estimator_sparse_data(name, estimator_orig, strict_mode=True): +def check_estimator_sparse_data(name, estimator_orig, api_only=False): # Make sure that the estimator either accepts sparse data in fit and # predict, or that it fails with a helpful error message. - # XXX this is a non-API check rng = np.random.RandomState(0) X = rng.rand(40, 10) X[X < .8] = 0 @@ -819,10 +816,9 @@ def check_estimator_sparse_data(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_sample_weights_pandas_series(name, estimator_orig, strict_mode=True): +def check_sample_weights_pandas_series(name, estimator_orig, api_only=False): # check that estimators will accept a 'sample_weight' parameter of # type pandas.Series in the 'fit' function. - # XXX pure API check estimator = clone(estimator_orig) if has_fit_parameter(estimator, "sample_weight"): try: @@ -847,10 +843,9 @@ def check_sample_weights_pandas_series(name, estimator_orig, strict_mode=True): @ignore_warnings(category=(FutureWarning)) -def check_sample_weights_not_an_array(name, estimator_orig, strict_mode=True): +def check_sample_weights_not_an_array(name, estimator_orig, api_only=False): # check that estimators will accept a 'sample_weight' parameter of # type _NotAnArray in the 'fit' function. - # XXX pure API check estimator = clone(estimator_orig) if has_fit_parameter(estimator, "sample_weight"): X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], @@ -865,10 +860,9 @@ def check_sample_weights_not_an_array(name, estimator_orig, strict_mode=True): @ignore_warnings(category=(FutureWarning)) -def check_sample_weights_list(name, estimator_orig, strict_mode=True): +def check_sample_weights_list(name, estimator_orig, api_only=False): # check that estimators will accept a 'sample_weight' parameter of # type list in the 'fit' function. - # XXX: pure API check if has_fit_parameter(estimator_orig, "sample_weight"): estimator = clone(estimator_orig) rnd = np.random.RandomState(0) @@ -883,10 +877,9 @@ def check_sample_weights_list(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_sample_weights_shape(name, estimator_orig, strict_mode=True): +def check_sample_weights_shape(name, estimator_orig, api_only=False): # check that estimators raise an error if sample_weight # shape mismatches the input - # XXX: pure API check?????? Are error checks API checks????? if (has_fit_parameter(estimator_orig, "sample_weight") and not _is_pairwise(estimator_orig)): estimator = clone(estimator_orig) @@ -909,12 +902,11 @@ def check_sample_weights_shape(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_sample_weights_invariance(name, estimator_orig, kind="ones", - strict_mode=True): + api_only=False): # For kind="ones" check that the estimators yield same results for # unit weights and no weights # For kind="zeros" check that setting sample_weight to 0 is equivalent # to removing corresponding samples. 
- # XXX: non-API check estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) set_random_state(estimator1, random_state=0) @@ -962,9 +954,8 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones", @ignore_warnings(category=(FutureWarning, UserWarning)) -def check_dtype_object(name, estimator_orig, strict_mode=True): +def check_dtype_object(name, estimator_orig, api_only=False): # check that estimators treat dtype object as numeric if possible - # XXX probably API except for error msg rng = np.random.RandomState(0) X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) @@ -985,8 +976,8 @@ def check_dtype_object(name, estimator_orig, strict_mode=True): if 'string' not in tags['X_types']: X[0, 0] = {'foo': 'bar'} - msg = "argument must be a string.* number" - with raises(TypeError, match=msg): + match = None if api_only else "argument must be a string.* number" + with raises(TypeError, match=match): estimator.fit(X, y) else: # Estimators supporting string will not call np.asarray to convert the @@ -996,9 +987,8 @@ def check_dtype_object(name, estimator_orig, strict_mode=True): estimator.fit(X, y) -def check_complex_data(name, estimator_orig, strict_mode=True): +def check_complex_data(name, estimator_orig, api_only=False): # check that estimators raise an exception on providing complex data - #XXX: error check... ????? X = np.random.sample(10) + 1j * np.random.sample(10) X = X.reshape(-1, 1) y = np.random.sample(10) + 1j * np.random.sample(10) @@ -1008,10 +998,9 @@ def check_complex_data(name, estimator_orig, strict_mode=True): @ignore_warnings -def check_dict_unchanged(name, estimator_orig, strict_mode=True): +def check_dict_unchanged(name, estimator_orig, api_only=False): # check that calling the prediction method does not alter the __dict__ # attribute of the estimator. 
- # XXX: pure API check rnd = np.random.RandomState(0) if name in ['RANSACRegressor']: X = 3 * rnd.uniform(size=(20, 3)) @@ -1049,9 +1038,8 @@ def _is_public_parameter(attr): @ignore_warnings(category=FutureWarning) -def check_dont_overwrite_parameters(name, estimator_orig, strict_mode=True): +def check_dont_overwrite_parameters(name, estimator_orig, api_only=False): # check that fit method only changes or sets private attributes - #XXX pure API check if hasattr(estimator_orig.__init__, "deprecated_original"): # to not check deprecated classes return @@ -1102,9 +1090,8 @@ def check_dont_overwrite_parameters(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_fit2d_predict1d(name, estimator_orig, strict_mode=True): +def check_fit2d_predict1d(name, estimator_orig, api_only=False): # check that predicting with a 1d array raises an error - # XXX Make message validation optional rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1123,8 +1110,9 @@ def check_fit2d_predict1d(name, estimator_orig, strict_mode=True): for method in ["predict", "transform", "decision_function", "predict_proba"]: if hasattr(estimator, method): - assert_raise_message(ValueError, "Reshape your data", - getattr(estimator, method), X[0]) + match = None if api_only else "Reshape your data" + with raises(ValueError, match=match): + getattr(estimator, method)(X[0]) def _apply_on_subsets(func, X): @@ -1147,10 +1135,9 @@ def _apply_on_subsets(func, X): @ignore_warnings(category=FutureWarning) -def check_methods_subset_invariance(name, estimator_orig, strict_mode=True): +def check_methods_subset_invariance(name, estimator_orig, api_only=False): # check that method gives invariant results if applied # on mini batches or the whole set - # XXX: non API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1181,7 +1168,7 @@ def check_methods_subset_invariance(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_methods_sample_order_invariance( - name, estimator_orig, strict_mode=True + name, estimator_orig, api_only=False ): # check that method gives invariant results if applied # on a subset with different sample order @@ -1217,11 +1204,10 @@ def check_methods_sample_order_invariance( @ignore_warnings -def check_fit2d_1sample(name, estimator_orig, strict_mode=True): +def check_fit2d_1sample(name, estimator_orig, api_only=False): # Check that fitting a 2d array with only one sample either works or # returns an informative message. The error message should either mention # the number of samples or the number of classes. 
- # XXX Non API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(1, 10)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1249,10 +1235,9 @@ def check_fit2d_1sample(name, estimator_orig, strict_mode=True): @ignore_warnings -def check_fit2d_1feature(name, estimator_orig, strict_mode=True): +def check_fit2d_1feature(name, estimator_orig, api_only=False): # check fitting a 2d array with only 1 feature either works or returns # informative message - # XXX non API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(10, 1)) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -1281,9 +1266,8 @@ def check_fit2d_1feature(name, estimator_orig, strict_mode=True): @ignore_warnings -def check_fit1d(name, estimator_orig, strict_mode=True): +def check_fit1d(name, estimator_orig, api_only=False): # check fitting 1d X array raises a ValueError - # XXX Pure API check rnd = np.random.RandomState(0) X = 3 * rnd.uniform(size=(20)) y = X.astype(int) @@ -1302,7 +1286,7 @@ def check_fit1d(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_transformer_general(name, transformer, readonly_memmap=False, - strict_mode=True): + api_only=False): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) @@ -1316,7 +1300,7 @@ def check_transformer_general(name, transformer, readonly_memmap=False, @ignore_warnings(category=FutureWarning) -def check_transformer_data_not_an_array(name, transformer, strict_mode=True): +def check_transformer_data_not_an_array(name, transformer, api_only=False): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) @@ -1332,10 +1316,9 @@ def check_transformer_data_not_an_array(name, transformer, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_transformers_unfitted(name, transformer, strict_mode=True): +def check_transformers_unfitted(name, transformer, api_only=False): # Make sure the unfitted transformer raises an error when transform is # called - # XXX: non API check X, y = _regression_dataset() transformer = clone(transformer) @@ -1349,14 +1332,13 @@ def check_transformers_unfitted(name, transformer, strict_mode=True): transformer.transform(X) -def _check_transformer(name, transformer_orig, X, y, strict_mode=True): +def _check_transformer(name, transformer_orig, X, y, api_only=False): # Check that: # - fit_transform returns n_samples transformed samples - # - fit_transform and transform give equivalent results. - # - fit_transform gives the same results twice # - an error is raised if transform is called with an incorrect number of # features - # XXX: Only make first and last checks part of API + # - fit_transform and transform give equivalent results. 
+ # - fit_transform gives the same results twice n_samples, n_features = np.asarray(X).shape transformer = clone(transformer_orig) set_random_state(transformer) @@ -1385,6 +1367,22 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): assert X_pred.shape[0] == n_samples if hasattr(transformer, 'transform'): + + # raises error on malformed input for transform + if hasattr(X, 'shape') and \ + not transformer._get_tags()["stateless"] and \ + X.ndim == 2 and X.shape[1] > 1: + + with raises( + ValueError, + err_msg=f"The transformer {name} does not raise an error " + "when the number of features in transform is different from " + "the number of features in fit." + ): + transformer.transform(X[:, :-1]) + if api_only: + return + if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) @@ -1421,25 +1419,11 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): assert _num_samples(X_pred2) == n_samples assert _num_samples(X_pred3) == n_samples - # raises error on malformed input for transform - if hasattr(X, 'shape') and \ - not transformer._get_tags()["stateless"] and \ - X.ndim == 2 and X.shape[1] > 1: - - with raises( - ValueError, - err_msg=f"The transformer {name} does not raise an error " - "when the number of features in transform is different from " - "the number of features in fit." - ): - transformer.transform(X[:, :-1]) - @ignore_warnings -def check_pipeline_consistency(name, estimator_orig, strict_mode=True): +def check_pipeline_consistency(name, estimator_orig, api_only=False): # check that make_pipeline(est) gives results as est for scores and # transforms - # XXX: full API if estimator_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' @@ -1468,10 +1452,9 @@ def check_pipeline_consistency(name, estimator_orig, strict_mode=True): @ignore_warnings -def check_fit_score_takes_y(name, estimator_orig, strict_mode=True): +def check_fit_score_takes_y(name, estimator_orig, api_only=False): # check that all estimators accept an optional y # in fit and score so they can be used in pipelines - # XXX : full API check rnd = np.random.RandomState(0) n_samples = 30 X = rnd.uniform(size=(n_samples, 3)) @@ -1498,9 +1481,8 @@ def check_fit_score_takes_y(name, estimator_orig, strict_mode=True): @ignore_warnings -def check_estimators_dtypes(name, estimator_orig, strict_mode=True): +def check_estimators_dtypes(name, estimator_orig, api_only=False): # Check that methods can handle X input of different float and int dtypes - # XXX not an API check rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) X_train_32 = _pairwise_estimator_convert_X(X_train_32, estimator_orig) @@ -1523,11 +1505,10 @@ def check_estimators_dtypes(name, estimator_orig, strict_mode=True): def check_transformer_preserve_dtypes( - name, transformer_orig, strict_mode=True + name, transformer_orig, api_only=False ): # check that dtype are preserved meaning if input X is of some dtype # X_transformed should be from the same dtype. - # XXX: not an API check X, y = make_blobs( n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], @@ -1558,10 +1539,9 @@ def check_transformer_preserve_dtypes( @ignore_warnings(category=FutureWarning) def check_estimators_empty_data_messages(name, estimator_orig, - strict_mode=True): + api_only=False): # Make sure that a ValueError is raised when fit is called on data with no # sample or no features. - # XXX: API or not? 
e = clone(estimator_orig) set_random_state(e, 1) @@ -1580,16 +1560,15 @@ def check_estimators_empty_data_messages(name, estimator_orig, # and ignored by unsupervised models y = _enforce_estimator_tags_y(e, np.array([1, 0, 1])) msg = (r"0 feature\(s\) \(shape=\(3, 0\)\) while a minimum of \d* " - "is required.") + "is required.") if not api_only else None with raises(ValueError, match=msg): e.fit(X_zero_features, y) @ignore_warnings(category=FutureWarning) -def check_estimators_nan_inf(name, estimator_orig, strict_mode=True): +def check_estimators_nan_inf(name, estimator_orig, api_only=False): # Checks that fit, predict and transform raise an error if X contains nans # or inf. - # XXX: probably not API? rnd = np.random.RandomState(0) X_train_finite = _pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig) @@ -1638,9 +1617,9 @@ def check_estimators_nan_inf(name, estimator_orig, strict_mode=True): @ignore_warnings -def check_nonsquare_error(name, estimator_orig, strict_mode=True): - # Check that error is raised when non-square data is provided in fit - # XXX: API +def check_nonsquare_error(name, estimator_orig, api_only=False): + # Check that error is raised when non-square data is provided in fit for a + # pairwise estimator X, y = make_blobs(n_samples=20, n_features=10) estimator = clone(estimator_orig) @@ -1654,10 +1633,9 @@ def check_nonsquare_error(name, estimator_orig, strict_mode=True): @ignore_warnings -def check_estimators_pickle(name, estimator_orig, strict_mode=True): +def check_estimators_pickle(name, estimator_orig, api_only=False): # Test that we can pickle all estimators and that the pickled estimator # gives the same predictions - # XXX: Non API check check_methods = ["predict", "transform", "decision_function", "predict_proba"] @@ -1701,10 +1679,9 @@ def check_estimators_pickle(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_estimators_partial_fit_n_features(name, estimator_orig, - strict_mode=True): + api_only=False): # check that an error is raised when number of features changes between # calls to partial_fit. - # XXX: non API check if not hasattr(estimator_orig, 'partial_fit'): return estimator = clone(estimator_orig) @@ -1730,12 +1707,11 @@ def check_estimators_partial_fit_n_features(name, estimator_orig, @ignore_warnings(category=FutureWarning) -def check_classifier_multioutput(name, estimator, strict_mode=True): +def check_classifier_multioutput(name, estimator, api_only=False): # Make sure that the output of predict_proba and decision_function is # correct for multiouput classification (multilabel, multiclass). Also # checks that predict_proba and decision_function have consistent # predictions, i.e. the orders are consistent. - # XXX: full API check n_samples, n_labels, n_classes = 42, 5, 3 tags = estimator._get_tags() estimator = clone(estimator) @@ -1760,9 +1736,10 @@ def check_classifier_multioutput(name, estimator, strict_mode=True): "multioutput data is incorrect. Expected {}, got {}." 
.format((n_samples, n_classes), decision.shape)) - dec_pred = (decision > 0).astype(int) - dec_exp = estimator.classes_[dec_pred] - assert_array_equal(dec_exp, y_pred) + if not api_only: + dec_pred = (decision > 0).astype(int) + dec_exp = estimator.classes_[dec_pred] + assert_array_equal(dec_exp, y_pred) if hasattr(estimator, "predict_proba"): y_prob = estimator.predict_proba(X) @@ -1773,16 +1750,21 @@ def check_classifier_multioutput(name, estimator, strict_mode=True): "The shape of the probability for multioutput data is" " incorrect. Expected {}, got {}." .format((n_samples, 2), y_prob[i].shape)) - assert_array_equal( - np.argmax(y_prob[i], axis=1).astype(int), - y_pred[:, i] - ) + if not api_only: + assert_array_equal( + np.argmax(y_prob[i], axis=1).astype(int), + y_pred[:, i] + ) elif not tags['poor_score']: assert y_prob.shape == (n_samples, n_classes), ( "The shape of the probability for multioutput data is" " incorrect. Expected {}, got {}." .format((n_samples, n_classes), y_prob.shape)) - assert_array_equal(y_prob.round().astype(int), y_pred) + if not api_only: + assert_array_equal(y_prob.round().astype(int), y_pred) + + if api_only: + return if (hasattr(estimator, "decision_function") and hasattr(estimator, "predict_proba")): @@ -1793,10 +1775,9 @@ def check_classifier_multioutput(name, estimator, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_regressor_multioutput(name, estimator, strict_mode=True): +def check_regressor_multioutput(name, estimator, api_only=False): # Make sure that multioutput regressors output float64 predictions and that # the shape is correct. - # XXX: make the first check not an API check estimator = clone(estimator) n_samples = n_features = 10 @@ -1810,9 +1791,10 @@ def check_regressor_multioutput(name, estimator, strict_mode=True): estimator.fit(X, y) y_pred = estimator.predict(X) - assert y_pred.dtype == np.dtype('float64'), ( - "Multioutput predictions by a regressor are expected to be" - " floating-point precision. Got {} instead".format(y_pred.dtype)) + if not api_only: + assert y_pred.dtype == np.dtype('float64'), ( + "Multioutput predictions by a regressor are expected to be" + " floating-point precision. Got {} instead".format(y_pred.dtype)) assert y_pred.shape == y.shape, ( "The shape of the prediction for multioutput data is incorrect." 
" Expected {}, got {}.") @@ -1820,7 +1802,7 @@ def check_regressor_multioutput(name, estimator, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_clustering(name, clusterer_orig, readonly_memmap=False, - strict_mode=True): + api_only=False): clusterer = clone(clusterer_orig) X, y = make_blobs(n_samples=50, random_state=1) X, y = shuffle(X, y, random_state=7) @@ -1847,7 +1829,10 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False, pred = clusterer.labels_ assert pred.shape == (n_samples,) - # XXX: skip the rest when api_only is True + + if api_only: + return + assert adjusted_rand_score(pred, y) > 0.4 if clusterer._get_tags()['non_deterministic']: return @@ -1881,9 +1866,8 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False, @ignore_warnings(category=FutureWarning) def check_clusterer_compute_labels_predict(name, clusterer_orig, - strict_mode=True): + api_only=False): # Check that predict is invariant of compute_labels - # XXX: non API check X, y = make_blobs(n_samples=20, random_state=0) clusterer = clone(clusterer_orig) set_random_state(clusterer) @@ -1897,11 +1881,10 @@ def check_clusterer_compute_labels_predict(name, clusterer_orig, @ignore_warnings(category=FutureWarning) -def check_classifiers_one_label(name, classifier_orig, strict_mode=True): +def check_classifiers_one_label(name, classifier_orig, api_only=False): # Check that a classifier can fit when there's only 1 class, or that it # raises a proper error. If it can fit, we also make sure that it can # predict. - # XXX: non API check error_string_fit = "Classifier can't train when only one class is present." error_string_predict = ("Classifier can't predict when only one class is " "present.") @@ -1928,7 +1911,7 @@ def check_classifiers_one_label(name, classifier_orig, strict_mode=True): @ignore_warnings # Warnings are raised by decision function def check_classifiers_train(name, classifier_orig, readonly_memmap=False, - X_dtype='float64', strict_mode=True): + X_dtype='float64', api_only=False): X_m, y_m = make_blobs(n_samples=300, random_state=0) X_m = X_m.astype(X_dtype) X_m, y_m = shuffle(X_m, y_m, random_state=7) @@ -1979,7 +1962,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, assert y_pred.shape == (n_samples,) # training set performance - if not tags['poor_score']: # XXX: not API + if not tags['poor_score'] and not api_only: assert accuracy_score(y, y_pred) > 0.83 # raises error on malformed input for predict @@ -2009,11 +1992,13 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, assert decision.shape == (n_samples,) else: assert decision.shape == (n_samples, 1) - dec_pred = (decision.ravel() > 0).astype(int) - assert_array_equal(dec_pred, y_pred) # XXX not API + if not api_only: + dec_pred = (decision.ravel() > 0).astype(int) + assert_array_equal(dec_pred, y_pred) else: assert decision.shape == (n_samples, n_classes) - assert_array_equal(np.argmax(decision, axis=1), y_pred) # XXX not API + if not api_only: + assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input for decision_function if not tags["no_validation"]: @@ -2038,10 +2023,11 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, # predict_proba agrees with predict y_prob = classifier.predict_proba(X) assert y_prob.shape == (n_samples, n_classes) - assert_array_equal(np.argmax(y_prob, axis=1), y_pred)# XXX not API - # check that probas for all classes sum to one - 
assert_array_almost_equal(np.sum(y_prob, axis=1),# XXX not API - np.ones(n_samples)) + if not api_only: + assert_array_equal(np.argmax(y_prob, axis=1), y_pred) + # check that probas for all classes sum to one + assert_array_almost_equal(np.sum(y_prob, axis=1), + np.ones(n_samples)) if not tags["no_validation"]: # raises error on malformed input for predict_proba if _is_pairwise(classifier_orig): @@ -2056,7 +2042,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, err_msg=msg.format(name, "predict_proba"), ): classifier.predict_proba(X.T) - if hasattr(classifier, "predict_log_proba"):# XXX not API + if hasattr(classifier, "predict_log_proba") and not api_only: # predict_log_proba is a transformation of predict_proba y_log_prob = classifier.predict_log_proba(X) assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9) @@ -2064,7 +2050,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, def check_outlier_corruption(num_outliers, expected_outliers, decision, - strict_mode=True): + api_only=False): # Check for deviation from the precise given contamination level that may # be due to ties in the anomaly scores. if num_outliers < expected_outliers: @@ -2085,7 +2071,7 @@ def check_outlier_corruption(num_outliers, expected_outliers, decision, def check_outliers_train(name, estimator_orig, readonly_memmap=True, - strict_mode=True): + api_only=False): n_samples = 300 X, _ = make_blobs(n_samples=n_samples, random_state=0) X = shuffle(X, random_state=7) @@ -2117,10 +2103,11 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True, with raises(ValueError): estimator.predict(X.T) - # decision_function agrees with predict XXX not API - dec_pred = (decision >= 0).astype(int) - dec_pred[dec_pred == 0] = -1 - assert_array_equal(dec_pred, y_pred) + # decision_function agrees with predict + if not api_only: + dec_pred = (decision >= 0).astype(int) + dec_pred[dec_pred == 0] = -1 + assert_array_equal(dec_pred, y_pred) # raises error on malformed input for decision_function with raises(ValueError): @@ -2135,9 +2122,9 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True, estimator.score_samples(X.T) # contamination parameter (not for OneClassSVM which has the nu parameter) - # XXX: not API if (hasattr(estimator, 'contamination') - and not hasattr(estimator, 'novelty')): + and not hasattr(estimator, 'novelty') + and not api_only): # proportion of outliers equal to contamination parameter when not # set to 'auto'. This is true for the training set and cannot thus be # checked as follows for estimators with a novelty parameter such as @@ -2167,9 +2154,8 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True, @ignore_warnings(category=(FutureWarning)) def check_classifiers_multilabel_representation_invariance( - name, classifier_orig, strict_mode=True): + name, classifier_orig, api_only=False): # check different target representations for multilabel classifiers - # XXX: pure API check X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, n_labels=3, @@ -2204,9 +2190,8 @@ def check_classifiers_multilabel_representation_invariance( @ignore_warnings(category=FutureWarning) def check_estimators_fit_returns_self(name, estimator_orig, - readonly_memmap=False, strict_mode=True): + readonly_memmap=False, api_only=False): # Check that self is returned when calling fit. 
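    # A conforming estimator simply ends fit with ``return self``, which is
    # what makes method chaining such as ``est.fit(X, y).predict(X)`` work.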
- # XXX pure API check X, y = make_blobs(random_state=0, n_samples=21) # some want non-negative input X -= X.min() @@ -2223,11 +2208,10 @@ def check_estimators_fit_returns_self(name, estimator_orig, @ignore_warnings -def check_estimators_unfitted(name, estimator_orig, strict_mode=True): +def check_estimators_unfitted(name, estimator_orig, api_only=False): # Check that predict raises an exception in an unfitted estimator. # Unfitted estimators should raise a NotFittedError. # Common test for Regressors, Classifiers and Outlier detection estimators - # XXX pure API X, y = _regression_dataset() estimator = clone(estimator_orig) @@ -2239,10 +2223,9 @@ def check_estimators_unfitted(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_supervised_y_2d(name, estimator_orig, strict_mode=True): +def check_supervised_y_2d(name, estimator_orig, api_only=False): # Check that estimators that don't support multi-ouput raise a warning if y # is not 1d, and that they just ravel y - # XXX pure API check tags = estimator_orig._get_tags() rnd = np.random.RandomState(0) n_samples = 30 @@ -2277,7 +2260,7 @@ def check_supervised_y_2d(name, estimator_orig, strict_mode=True): @ignore_warnings def check_classifiers_predictions(X, y, name, classifier_orig, - strict_mode=True): + api_only=False): classes = np.unique(y) classifier = clone(classifier_orig) if name == 'BernoulliNB': @@ -2323,10 +2306,9 @@ def _choose_check_classifiers_labels(name, y, y_names): return y if name in ["LabelPropagation", "LabelSpreading"] else y_names -def check_classifiers_classes(name, classifier_orig, strict_mode=True): +def check_classifiers_classes(name, classifier_orig, api_only=False): # Check that decision function > 0 => pos class # Also checks the classes_ attribute. - # XXX pure API check X_multiclass, y_multiclass = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) X_multiclass, y_multiclass = shuffle(X_multiclass, y_multiclass, @@ -2364,10 +2346,9 @@ def check_classifiers_classes(name, classifier_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_regressors_int(name, regressor_orig, strict_mode=True): +def check_regressors_int(name, regressor_orig, api_only=False): # Check that regressors give same prediction when y is encoded as int or # float - # XXX: API check ? X, _ = _regression_dataset() X = _pairwise_estimator_convert_X(X[:50], regressor_orig) rnd = np.random.RandomState(0) @@ -2396,13 +2377,12 @@ def check_regressors_int(name, regressor_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_regressors_train(name, regressor_orig, readonly_memmap=False, - X_dtype=np.float64, strict_mode=True): + X_dtype=np.float64, api_only=False): # Check that regressors: # - raise an error when X and y have different number of samples # - accept lists as input to fit # - predict n_samples predictions # - have a score > .5 on simple data - # XXX: all API checks except the last one X, y = _regression_dataset() X = X.astype(X_dtype) X = _pairwise_estimator_convert_X(X, regressor_orig) @@ -2444,17 +2424,15 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, # TODO: find out why PLS and CCA fail. 
RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - # XXX: non API - if not regressor._get_tags()["poor_score"]: + if not regressor._get_tags()["poor_score"] and not api_only: assert regressor.score(X, y_) > 0.5 @ignore_warnings def check_regressors_no_decision_function(name, regressor_orig, - strict_mode=True): + api_only=False): # check that regressors don't have a decision_function, predict_proba, or # predict_log_proba method. - # XXX: full API check rng = np.random.RandomState(0) regressor = clone(regressor_orig) @@ -2469,11 +2447,10 @@ def check_regressors_no_decision_function(name, regressor_orig, @ignore_warnings(category=FutureWarning) -def check_class_weight_classifiers(name, classifier_orig, strict_mode=True): +def check_class_weight_classifiers(name, classifier_orig, api_only=False): # Make sure that classifiers take class_weight into account by creating a # very noisy balanced dataset. We make sure that passing a very imbalanced # class_weights helps recovering a good score. - # XXX: full non-API check if classifier_orig._get_tags()['binary_only']: problems = [2] @@ -2521,8 +2498,7 @@ def check_class_weight_classifiers(name, classifier_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_class_weight_balanced_classifiers(name, classifier_orig, X_train, y_train, X_test, y_test, weights, - strict_mode=True): - # XXX: it's never ever used, just ignore + api_only=False): classifier = clone(classifier_orig) if hasattr(classifier, "n_iter"): classifier.set_params(n_iter=100) @@ -2542,11 +2518,10 @@ def check_class_weight_balanced_classifiers(name, classifier_orig, X_train, @ignore_warnings(category=FutureWarning) def check_class_weight_balanced_linear_classifier(name, Classifier, - strict_mode=True): + api_only=False): # Check that class_weight='balanced' is equivalent to manually passing # class proportions. 
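    # As a reminder, 'balanced' weights amount to
    # n_samples / (n_classes * np.bincount(y)), i.e. weights inversely
    # proportional to the class frequencies, so every class contributes
    # equally to the loss.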
# this is run on classes, not instances, though this should be changed - # XXX: non API check X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = np.array([1, 1, 1, -1, -1]) @@ -2583,9 +2558,8 @@ def check_class_weight_balanced_linear_classifier(name, Classifier, @ignore_warnings(category=FutureWarning) -def check_estimators_overwrite_params(name, estimator_orig, strict_mode=True): +def check_estimators_overwrite_params(name, estimator_orig, api_only=False): # Check that calling fit does not alter the output of get_params - # XXX: full API check X, y = make_blobs(random_state=0, n_samples=21) # some want non-negative input X -= X.min() @@ -2620,11 +2594,10 @@ def check_estimators_overwrite_params(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_no_attributes_set_in_init(name, estimator_orig, strict_mode=True): +def check_no_attributes_set_in_init(name, estimator_orig, api_only=False): # Check that: # - init does not set any attribute apart from the parameters # - all parameters of init are set as attributes - # XXX: full API check estimator = clone(estimator_orig) if hasattr(type(estimator).__init__, "deprecated_original"): return @@ -2657,10 +2630,9 @@ def check_no_attributes_set_in_init(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_sparsify_coefficients(name, estimator_orig, strict_mode=True): +def check_sparsify_coefficients(name, estimator_orig, api_only=False): # Check that sparsified coefs produce the same predictions as the # originals coefs - # XXX: full non API check X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, -2], [2, 2], [-2, -2]]) y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]) @@ -2684,10 +2656,9 @@ def check_sparsify_coefficients(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_classifier_data_not_an_array(name, estimator_orig, strict_mode=True): +def check_classifier_data_not_an_array(name, estimator_orig, api_only=False): # Check that estimator yields same predictions whether an array was passed # or not - # XXX: full API X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1], [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]]) X = _pairwise_estimator_convert_X(X, estimator_orig) @@ -2699,10 +2670,9 @@ def check_classifier_data_not_an_array(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_regressor_data_not_an_array(name, estimator_orig, strict_mode=True): +def check_regressor_data_not_an_array(name, estimator_orig, api_only=False): # Check that estimator yields same predictions whether an array was passed # or not - # XXX: full API X, y = _regression_dataset() X = _pairwise_estimator_convert_X(X, estimator_orig) y = _enforce_estimator_tags_y(estimator_orig, y) @@ -2713,7 +2683,7 @@ def check_regressor_data_not_an_array(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type, - strict_mode=True): + api_only=False): if name in CROSS_DECOMPOSITION: raise SkipTest("Skipping check_estimators_data_not_an_array " "for cross decomposition module as estimators " @@ -2755,10 +2725,10 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type, assert_allclose(pred1, pred2, atol=1e-2, err_msg=name) -def check_parameters_default_constructible(name, Estimator, strict_mode=True): +def check_parameters_default_constructible(name, 
Estimator, api_only=False): # Check that the estimator's default parameters are immutable (sort of). - # Also check that get_params returns exactly the default parameters values - # XXX: full API check + # Also check that get_params returns exactly the default parameters values + # on an unfitted estimator Estimator = Estimator.__class__ @@ -2884,10 +2854,9 @@ def _enforce_estimator_tags_x(estimator, X): @ignore_warnings(category=FutureWarning) def check_non_transformer_estimators_n_iter(name, estimator_orig, - strict_mode=True): + api_only=False): # Test that estimators that are not transformers with a parameter # max_iter, return the attribute of n_iter_ at least 1. - # XXX: full API # These models are dependent on external solvers like # libsvm and accessing the iter parameter is non-trivial. @@ -2919,10 +2888,9 @@ def check_non_transformer_estimators_n_iter(name, estimator_orig, @ignore_warnings(category=FutureWarning) -def check_transformer_n_iter(name, estimator_orig, strict_mode=True): +def check_transformer_n_iter(name, estimator_orig, api_only=False): # Test that transformers with a parameter max_iter, return the # attribute of n_iter_ at least 1. - # XXX: full API estimator = clone(estimator_orig) if hasattr(estimator, "max_iter"): if name in CROSS_DECOMPOSITION: @@ -2946,9 +2914,8 @@ def check_transformer_n_iter(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_get_params_invariance(name, estimator_orig, strict_mode=True): +def check_get_params_invariance(name, estimator_orig, api_only=False): # Checks that get_params(deep=False) is a subset of get_params(deep=True) - # XXX: full API e = clone(estimator_orig) shallow_params = e.get_params(deep=False) @@ -2959,10 +2926,9 @@ def check_get_params_invariance(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) -def check_set_params(name, estimator_orig, strict_mode=True): +def check_set_params(name, estimator_orig, api_only=False): # Check that get_params() returns the same thing # before and after set_params() with some fuzz - # XXX: full API check estimator = clone(estimator_orig) orig_params = estimator.get_params(deep=False) @@ -3015,25 +2981,23 @@ def check_set_params(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_classifiers_regression_target(name, estimator_orig, - strict_mode=True): + api_only=False): # Check if classifier throws an exception when fed regression targets - # XXX API check X, y = _regression_dataset() X = X + 1 + abs(X.min(axis=0)) # be sure that X is non-negative e = clone(estimator_orig) - msg = "Unknown label type: " + match = None if api_only else "Unknown label type: " if not e._get_tags()["no_validation"]: - with raises(ValueError, match=msg): + with raises(ValueError, match=match): e.fit(X, y) @ignore_warnings(category=FutureWarning) -def check_decision_proba_consistency(name, estimator_orig, strict_mode=True): +def check_decision_proba_consistency(name, estimator_orig, api_only=False): # Check whether an estimator having both decision_function and # predict_proba methods has outputs with perfect rank correlation. 
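    # For instance, a binary logistic model exposes
    # predict_proba = sigmoid(decision_function); any such strictly
    # monotonic mapping preserves the ordering of the samples, which is
    # what the rankdata comparison below verifies.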
-    # XXX: fulll non API check
    centers = [(2, 2), (4, 4)]
    X, y = make_blobs(n_samples=100, random_state=0, n_features=4,
@@ -3053,7 +3017,7 @@ def check_decision_proba_consistency(name, estimator_orig, strict_mode=True):
        assert_array_equal(rankdata(a), rankdata(b))


-def check_outliers_fit_predict(name, estimator_orig, strict_mode=True):
+def check_outliers_fit_predict(name, estimator_orig, api_only=False):
    # Check fit_predict for outlier detectors.

    n_samples = 300
@@ -3076,9 +3040,7 @@ def check_outliers_fit_predict(name, estimator_orig, strict_mode=True):
        y_pred_2 = estimator.fit(X).predict(X)
        assert_array_equal(y_pred, y_pred_2)

-    # XXX: next check isn't API check
-
-    if hasattr(estimator, "contamination"):
+    if hasattr(estimator, "contamination") and not api_only:
        # proportion of outliers equal to contamination parameter when not
        # set to 'auto'
        expected_outliers = 30
@@ -3103,22 +3065,18 @@ def check_outliers_fit_predict(name, estimator_orig, strict_mode=True):
            estimator.fit_predict(X)


-def check_fit_non_negative(name, estimator_orig, strict_mode=True):
-    # Check that proper warning is raised for negative X
+def check_fit_non_negative(name, estimator_orig, api_only=False):
+    # Check that a proper error is raised for negative X
    # when tag requires_positive_X is present
-    # XXX: full non API check + remove if else
    X = np.array([[-1., 1], [-1., 1]])
    y = np.array([1, 2])
    estimator = clone(estimator_orig)
-    if strict_mode:
-        with raises(ValueError, match="Negative values in data passed to"):
-            estimator.fit(X, y)
-    else:  # Don't check error message if strict mode is off
-        with raises(ValueError):
-            estimator.fit(X, y)
+    match = None if api_only else "Negative values in data passed to"
+    with raises(ValueError, match=match):
+        estimator.fit(X, y)


-def check_fit_idempotent(name, estimator_orig, strict_mode=True):
+def check_fit_idempotent(name, estimator_orig, api_only=False):
    # Check that est.fit(X) is the same as est.fit(X).fit(X). Ideally we would
    # check that the estimated parameters during training (e.g. coefs_) are
    # the same, but having a universal comparison function for those
@@ -3126,8 +3084,6 @@ def check_fit_idempotent(name, estimator_orig, strict_mode=True):
    # predict(), predict_proba(), decision_function() and transform() return
    # the same results.

-    # XXX full API check
-
    check_methods = ["predict", "transform", "decision_function",
                     "predict_proba"]
    rng = np.random.RandomState(0)
@@ -3175,7 +3131,7 @@ def check_fit_idempotent(name, estimator_orig, strict_mode=True):
    )


-def check_n_features_in(name, estimator_orig, strict_mode=True):
+def check_n_features_in(name, estimator_orig, api_only=False):
    # Make sure that n_features_in_ attribute doesn't exist until fit is
    # called, and that its value is correct.
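    # A hedged sketch of a conforming fit (illustration only, not code from
    # this patch):
    #
    #     def fit(self, X, y=None):
    #         X = check_array(X)
    #         self.n_features_in_ = X.shape[1]
    #         return self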
@@ -3213,7 +3169,7 @@ def check_n_features_in(name, estimator_orig, strict_mode=True): ) -def check_requires_y_none(name, estimator_orig, strict_mode=True): +def check_requires_y_none(name, estimator_orig, api_only=False): # Make sure that an estimator with requires_y=True fails gracefully when # given y=None @@ -3245,7 +3201,7 @@ def check_requires_y_none(name, estimator_orig, strict_mode=True): warnings.warn(warning_msg, FutureWarning) -def check_n_features_in_after_fitting(name, estimator_orig, strict_mode=True): +def check_n_features_in_after_fitting(name, estimator_orig, api_only=False): # Make sure that n_features_in are checked after fitting tags = estimator_orig._get_tags() @@ -3299,7 +3255,25 @@ def check_n_features_in_after_fitting(name, estimator_orig, strict_mode=True): estimator.partial_fit(X_bad, y) -# set of checks that are completely strict, i.e. they have no non-strict part -_FULLY_STRICT_CHECKS = set([ - 'check_n_features_in', +# set of checks that do not check API-compatibility. They are ignored when +# api_only is True. +_NON_API_CHECKS = set([ + 'check_estimator_sparse_data', + 'check_sample_weights_invariance', + 'check_complex_data', + 'check_methods_subset_invariance', + 'check_methods_sample_order_invariance', + 'check_fit2d_1sample', + 'check_fit2d_1featureucheck_estimators_dtypes', + 'check_estimators_dtypes', + 'check_transformer_preserve_dtypes', + 'check_estimators_nan_inf', + 'check_estimators_pickle', + 'check_clusterer_compute_labels_predict', + 'check_classifiers_one_label', + 'check_regressors_int', + 'check_class_weight_classifiers', + 'check_class_weight_balanced_linear_classifier', + 'check_sparsify_coefficients', + 'check_decision_proba_consistency', ]) From 6fda30c3cb7621d9dee285aa2f3c179fecb1b5b1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 9 Oct 2020 10:37:01 -0400 Subject: [PATCH 05/21] ooops --- sklearn/model_selection/_split.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index b99985cf91eca..c48cdc486e2b0 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1744,20 +1744,15 @@ def _iter_indices(self, X, y, groups=None): test = [] for i in range(n_classes): - # permutation = rng.permutation(class_counts[i]) - # perm_indices_class_i = class_indices[i].take(permutation, - # mode='clip') - perm_indices_class_i = class_indices[i] - - + permutation = rng.permutation(class_counts[i]) + perm_indices_class_i = class_indices[i].take(permutation, + mode='clip') train.extend(perm_indices_class_i[:n_i[i]]) test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]) - # train = rng.permutation(train) - # test = rng.permutation(test) - train = np.array(train) - test = np.array(test) + train = rng.permutation(train) + test = rng.permutation(test) yield train, test From db71e0f92f92d2d9fc7125e3859a6011a13a9a66 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 9 Oct 2020 11:23:53 -0400 Subject: [PATCH 06/21] some more --- doc/developers/develop.rst | 43 ++++++++++++++++++++++++------- doc/glossary.rst | 4 ++- doc/whats_new/v0.24.rst | 6 +++++ sklearn/utils/estimator_checks.py | 6 ++--- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index b7b5d2ac0316f..858fd92d2e69b 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -249,22 +249,16 @@ Rolling your own estimator If you want to implement a new estimator that is 
scikit-learn-compatible, whether it is just for you or for contributing it to
scikit-learn, there are several internals of scikit-learn that you should be
aware of in addition to
-the scikit-learn API outlined above. You can check whether your estimator
-adheres to the scikit-learn interface and standards by running
-:func:`~sklearn.utils.estimator_checks.check_estimator` on an instance. The
-:func:`~sklearn.utils.estimator_checks.parametrize_with_checks` pytest
-decorator can also be used (see its docstring for details and possible
-interactions with `pytest`)::
-
-    >>> from sklearn.utils.estimator_checks import check_estimator
-    >>> from sklearn.svm import LinearSVC
-    >>> check_estimator(LinearSVC())  # passes
+the scikit-learn API outlined above.

 The main motivation to make a class compatible to the scikit-learn estimator
 interface might be that you want to use it together with model evaluation and
 selection tools such as :class:`model_selection.GridSearchCV` and
 :class:`pipeline.Pipeline`.

+Checking the compatibility of your estimator with scikit-learn is described
+in :ref:`checking_compatibility`.
+
 Before detailing the required interface below, we describe two ways to
 achieve the correct interface more easily.

@@ -499,6 +493,35 @@ patterns.
 The :mod:`sklearn.utils.multiclass` module contains useful functions
 for working with multiclass and multilabel problems.

+.. _checking_compatibility:
+
+Checking the estimator's compatibility
+--------------------------------------
+
+You can check whether your estimator adheres to the scikit-learn interface
+and standards by running
+:func:`~sklearn.utils.estimator_checks.check_estimator` on an instance.
+
+The :func:`~sklearn.utils.estimator_checks.parametrize_with_checks` pytest
+decorator can also be used (see its docstring for details and possible
+interactions with `pytest`)::
+
+    >>> from sklearn.utils.estimator_checks import check_estimator
+    >>> from sklearn.svm import LinearSVC
+    >>> check_estimator(LinearSVC())  # passes
+
+Both :func:`~sklearn.utils.estimator_checks.check_estimator` and
+:func:`~sklearn.utils.estimator_checks.parametrize_with_checks` expose an
+`api_only` parameter: when True, the check suite will only consider pure
+API-compatibility checks. Some more advanced checks will be ignored, such as
+ensuring that error messages are informative, or ensuring that a classifier
+is able to properly discriminate classes on a simple problem. We recommend
+leaving this parameter set to False to guarantee robust and user-friendly
+estimators.
+
+The kind of checks that the check suite will run can also be partially
+controlled by setting estimator tags, as described below.
+
 .. _estimator_tags:

 Estimator Tags
diff --git a/doc/glossary.rst b/doc/glossary.rst
index 8530e966486aa..624a2a73c0a07 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -142,7 +142,9 @@ General Concepts
         We provide limited backwards compatibility assurances for the
         estimator checks: we may add extra requirements on estimators
         tested with this function, usually when these were informally
-        assumed but not formally tested.
+        assumed but not formally tested. In particular, checks that are
+        not API-related (i.e. those that are ignored when `api_only` is
+        True) may enforce backward-incompatible requirements.

         Despite this informal contract with our users, the software is
         provided as is, as stated in the license.
When a release inadvertently diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index b52fbfc14bd40..ba305c5da03b7 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -621,6 +621,12 @@ Changelog estimator methods are invariant if applied to the same dataset with different sample order :pr:`17598` by :user:`Jason Ngo `. +- |Feature| :func:`~utils.estimator_checks.check_estimator` and + :func:`~utils.estimator_checks.parametrize_with_checks` now expose an + `api_only` parameter which allows to control whether the check suite should + only check for pure API-compatibility, or also run more advanced checks. + :pr:`TODO` and :pr:`17361` by `Nicolas Hug`_. + Miscellaneous ............. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 267c5677831c6..a76b17b7a48bf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1365,7 +1365,7 @@ def _check_transformer(name, transformer_orig, X, y, api_only=False): else: # check for consistent n_samples assert X_pred.shape[0] == n_samples - + if hasattr(transformer, 'transform'): # raises error on malformed input for transform @@ -2451,7 +2451,7 @@ def check_class_weight_classifiers(name, classifier_orig, api_only=False): # Make sure that classifiers take class_weight into account by creating a # very noisy balanced dataset. We make sure that passing a very imbalanced # class_weights helps recovering a good score. - + if classifier_orig._get_tags()['binary_only']: problems = [2] else: @@ -3039,7 +3039,7 @@ def check_outliers_fit_predict(name, estimator_orig, api_only=False): if hasattr(estimator, 'predict'): y_pred_2 = estimator.fit(X).predict(X) assert_array_equal(y_pred, y_pred_2) - + if hasattr(estimator, "contamination") and not api_only: # proportion of outliers equal to contamination parameter when not # set to 'auto' From b4b813820402a21c21a9b5e9441ff354f3952d84 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 9 Oct 2020 11:43:54 -0400 Subject: [PATCH 07/21] whatsnew --- doc/whats_new/v0.24.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index ba305c5da03b7..8550dea9afd8f 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -625,7 +625,7 @@ Changelog :func:`~utils.estimator_checks.parametrize_with_checks` now expose an `api_only` parameter which allows to control whether the check suite should only check for pure API-compatibility, or also run more advanced checks. - :pr:`TODO` and :pr:`17361` by `Nicolas Hug`_. + :pr:`18582` and :pr:`17361` by `Nicolas Hug`_. Miscellaneous ............. From 41393fa9915cada6a80659addc530456977a47b8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 17 Oct 2020 09:07:30 -0400 Subject: [PATCH 08/21] addressed comments --- sklearn/tests/test_common.py | 2 +- sklearn/utils/estimator_checks.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index c662b8fc55c11..8eac4f25a82f3 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -221,7 +221,7 @@ class MyNMFWithBadErrorMessage(NMF): # Same as NMF but raises an uninformative error message if X has negative # value. This estimator would fail the check suite with api_only=False, # specifically it would fail check_fit_non_negative because its error - # message doesn't match + # message doesn't match the expected one. 
def __init__(self): # declare init to avoid deprecation warning since default has changed diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5d4417d8d7105..fa48d71f2f963 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1297,7 +1297,7 @@ def check_transformer_general(name, transformer, readonly_memmap=False, if readonly_memmap: X, y = create_memmap_backed_data([X, y]) - _check_transformer(name, transformer, X, y) + _check_transformer(name, transformer, X, y, api_only=api_only) @ignore_warnings(category=FutureWarning) @@ -1311,9 +1311,10 @@ def check_transformer_data_not_an_array(name, transformer, api_only=False): X = _pairwise_estimator_convert_X(X, transformer) this_X = _NotAnArray(X) this_y = _NotAnArray(np.asarray(y)) - _check_transformer(name, transformer, this_X, this_y) + _check_transformer(name, transformer, this_X, this_y, api_only=api_only) # try the same with some list - _check_transformer(name, transformer, X.tolist(), y.tolist()) + _check_transformer(name, transformer, X.tolist(), y.tolist(), + api_only=api_only) @ignore_warnings(category=FutureWarning) @@ -3269,8 +3270,7 @@ def check_n_features_in_after_fitting(name, estimator_orig, api_only=False): 'check_methods_subset_invariance', 'check_methods_sample_order_invariance', 'check_fit2d_1sample', - 'check_fit2d_1featureucheck_estimators_dtypes', - 'check_estimators_dtypes', + 'check_fit2d_1feature', 'check_transformer_preserve_dtypes', 'check_estimators_nan_inf', 'check_estimators_pickle', From cb66293d245de4f0f0f44c29cce141ad6f931713 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 28 Oct 2020 18:01:15 -0400 Subject: [PATCH 09/21] make pickle full API check --- sklearn/utils/estimator_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index aa9d81bba5ad0..f61ffcbc2d7a8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3272,7 +3272,6 @@ def check_n_features_in_after_fitting(name, estimator_orig, api_only=False): 'check_fit2d_1feature', 'check_transformer_preserve_dtypes', 'check_estimators_nan_inf', - 'check_estimators_pickle', 'check_clusterer_compute_labels_predict', 'check_classifiers_one_label', 'check_regressors_int', From 1ff888766a2fe90937af8ed67923aecba95fc980 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 28 Oct 2020 18:02:35 -0400 Subject: [PATCH 10/21] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/utils/estimator_checks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index bd2566c12dcce..9d84d1de60f15 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1382,6 +1382,7 @@ def _check_transformer(name, transformer_orig, X, y, api_only=False): ): transformer.transform(X[:, :-1]) if api_only: + # The remaining asserts are non-API asserts return if name in CROSS_DECOMPOSITION: @@ -1769,6 +1770,7 @@ def check_classifier_multioutput(name, estimator, api_only=False): assert_array_equal(y_prob.round().astype(int), y_pred) if api_only: + # The remaining asserts are non-API asserts return if (hasattr(estimator, "decision_function") and @@ -1836,6 +1838,8 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False, assert pred.shape == (n_samples,) if api_only: + # The remaining asserts are non-API asserts + return assert adjusted_rand_score(pred, y) > 0.4 From 
a68194be052e781e01151a5c009996774ac22931 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 9 Nov 2020 16:59:51 +0100 Subject: [PATCH 11/21] TST reintroduce _safe_tags for estimator not inheriting from BaseEstimator --- sklearn/utils/estimator_checks.py | 97 ++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 33 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3cd19967ba9c1..98fd46a0b776d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -32,6 +32,7 @@ from ..base import ( clone, ClusterMixin, + _DEFAULT_TAGS, is_classifier, is_regressor, is_outlier_detector, @@ -66,9 +67,39 @@ CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'] +def _safe_tags(estimator, key=None): + """Safely get estimator tags for common checks. + + :class:`~sklearn.BaseEstimator` provides the estimator tags machinery. + However, if a compatible estimator does not inherit from this base class, + we should default to the default tag. + + Parameters + ---------- + estimator : estimator object + The estimator from which to get the tag. + key : str, default=None + Tag name to get. By default (`None`), all tags are returned. + + Returns + ------- + tags : dict + The estimator tags. + """ + if hasattr(estimator, "_get_tags"): + if key is not None: + return estimator._get_tags().get(key, _DEFAULT_TAGS[key]) + tags = estimator._get_tags() + return {key: tags.get(key, _DEFAULT_TAGS[key]) + for key in _DEFAULT_TAGS.keys()} + if key is not None: + return _DEFAULT_TAGS[key] + return _DEFAULT_TAGS + + def _yield_checks(estimator): name = estimator.__class__.__name__ - tags = estimator._get_tags() + tags = _safe_tags(estimator) pairwise = _is_pairwise(estimator) yield check_no_attributes_set_in_init @@ -116,7 +147,7 @@ def _yield_checks(estimator): def _yield_classifier_checks(classifier): - tags = classifier._get_tags() + tags = _safe_tags(classifier) # test classifiers can handle non-array data and pandas objects yield check_classifier_data_not_an_array @@ -170,7 +201,7 @@ def check_supervised_y_no_nan(name, estimator_orig, strict_mode=True): def _yield_regressor_checks(regressor): - tags = regressor._get_tags() + tags = _safe_tags(regressor) # TODO: test with intercept # TODO: test with multiple responses # basic testing @@ -196,7 +227,7 @@ def _yield_regressor_checks(regressor): def _yield_transformer_checks(transformer): - tags = transformer._get_tags() + tags = _safe_tags(transformer) # All transformers should either deal with sparse data or raise an # exception with type TypeError and an intelligible error message if not tags["no_validation"]: @@ -206,7 +237,7 @@ def _yield_transformer_checks(transformer): if tags["preserves_dtype"]: yield check_transformer_preserve_dtypes yield partial(check_transformer_general, readonly_memmap=True) - if not transformer._get_tags()["stateless"]: + if not _safe_tags(transformer, key="stateless"): yield check_transformers_unfitted # Dependent on external solvers and hence accessing the iter # param is non-trivial. 
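    # To illustrate the fallback behaviour that _safe_tags provides,
    # consider the sketch below (NotScikitLearnNative is hypothetical and
    # only used for illustration):
    #
    #     class NotScikitLearnNative:
    #         # A compatible estimator that does not inherit from
    #         # BaseEstimator and therefore has no _get_tags method.
    #         def fit(self, X, y=None):
    #             return self
    #
    #     # Falls back to _DEFAULT_TAGS instead of raising AttributeError:
    #     _safe_tags(NotScikitLearnNative(), key="binary_only")
    #     # Estimators inheriting from BaseEstimator still go through
    #     # _get_tags:
    #     _safe_tags(LogisticRegression(), key="binary_only")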
@@ -243,13 +274,13 @@ def _yield_outliers_checks(estimator): # test outlier detectors can handle non-array data yield check_classifier_data_not_an_array # test if NotFittedError is raised - if estimator._get_tags()["requires_fit"]: + if _safe_tags(estimator, key="requires_fit"): yield check_estimators_unfitted def _yield_all_checks(estimator): name = estimator.__class__.__name__ - tags = estimator._get_tags() + tags = _safe_tags(estimator) if "2darray" not in tags["X_types"]: warnings.warn("Can't test estimator {} which requires input " " of type {}".format(name, tags["X_types"]), @@ -421,7 +452,7 @@ def _should_be_skipped_or_marked(estimator, check, strict_mode): check_name = (check.func.__name__ if isinstance(check, partial) else check.__name__) - xfail_checks = estimator._get_tags()['_xfail_checks'] or {} + xfail_checks = _safe_tags(estimator, key='_xfail_checks') or {} if check_name in xfail_checks: return True, xfail_checks[check_name] @@ -772,7 +803,7 @@ def check_estimator_sparse_data(name, estimator_orig, strict_mode=True): with ignore_warnings(category=FutureWarning): estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) - tags = estimator_orig._get_tags() + tags = _safe_tags(estimator_orig) for matrix_format, X in _generate_sparse_matrix(X_csr): # catch deprecation warnings with ignore_warnings(category=FutureWarning): @@ -829,7 +860,7 @@ def check_sample_weights_pandas_series(name, estimator_orig, strict_mode=True): X = pd.DataFrame(_pairwise_estimator_convert_X(X, estimator_orig)) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) - if estimator._get_tags()["multioutput_only"]: + if _safe_tags(estimator, key="multioutput_only"): y = pd.DataFrame(y) try: estimator.fit(X, y, sample_weight=weights) @@ -854,7 +885,7 @@ def check_sample_weights_not_an_array(name, estimator_orig, strict_mode=True): X = _NotAnArray(_pairwise_estimator_convert_X(X, estimator_orig)) y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = _NotAnArray([1] * 12) - if estimator._get_tags()["multioutput_only"]: + if _safe_tags(estimator, key="multioutput_only"): y = _NotAnArray(y.data.reshape(-1, 1)) estimator.fit(X, y, sample_weight=weights) @@ -959,7 +990,7 @@ def check_dtype_object(name, estimator_orig, strict_mode=True): rng = np.random.RandomState(0) X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) - tags = estimator_orig._get_tags() + tags = _safe_tags(estimator_orig) y = (X[:, 0] * 4).astype(int) estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1179,7 +1210,7 @@ def check_methods_sample_order_invariance( X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - if estimator_orig._get_tags()['binary_only']: + if _safe_tags(estimator_orig, key='binary_only'): y[y == 2] = 1 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1368,7 +1399,7 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) - if transformer_orig._get_tags()['non_deterministic']: + if _safe_tags(transformer_orig, key='non_deterministic'): msg = name + ' is non deterministic' raise SkipTest(msg) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): @@ -1399,7 +1430,7 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): # raises error on malformed input for transform if hasattr(X, 'shape') 
and \ - not transformer._get_tags()["stateless"] and \ + not _safe_tags(transformer, key="stateless") and \ X.ndim == 2 and X.shape[1] > 1: # If it's not an array, it does not have a 'T' property @@ -1414,7 +1445,7 @@ def _check_transformer(name, transformer_orig, X, y, strict_mode=True): @ignore_warnings def check_pipeline_consistency(name, estimator_orig, strict_mode=True): - if estimator_orig._get_tags()['non_deterministic']: + if _safe_tags(estimator_orig, key='non_deterministic'): msg = name + ' is non deterministic' raise SkipTest(msg) @@ -1508,7 +1539,7 @@ def check_transformer_preserve_dtypes( X -= X.min() X = _pairwise_estimator_convert_X(X, transformer_orig) - for dtype in transformer_orig._get_tags()["preserves_dtype"]: + for dtype in _safe_tags(transformer_orig, key="preserves_dtype"): X_cast = X.astype(dtype) transformer = clone(transformer_orig) set_random_state(transformer) @@ -1634,7 +1665,7 @@ def check_estimators_pickle(name, estimator_orig, strict_mode=True): X -= X.min() X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) - tags = estimator_orig._get_tags() + tags = _safe_tags(estimator_orig) # include NaN values when the estimator should deal with them if tags['allow_nan']: # set randomly 10 elements to np.nan @@ -1696,7 +1727,7 @@ def check_estimators_partial_fit_n_features(name, estimator_orig, @ignore_warnings(category=FutureWarning) def check_classifier_multioutput(name, estimator, strict_mode=True): n_samples, n_labels, n_classes = 42, 5, 3 - tags = estimator._get_tags() + tags = _safe_tags(estimator) estimator = clone(estimator) X, y = make_multilabel_classification(random_state=42, n_samples=n_samples, @@ -1804,7 +1835,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False, pred = clusterer.labels_ assert pred.shape == (n_samples,) assert adjusted_rand_score(pred, y) > 0.4 - if clusterer._get_tags()['non_deterministic']: + if _safe_tags(clusterer, key='non_deterministic'): return set_random_state(clusterer) with warnings.catch_warnings(record=True): @@ -1896,7 +1927,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b]) problems = [(X_b, y_b)] - tags = classifier_orig._get_tags() + tags = _safe_tags(classifier_orig) if not tags['binary_only']: problems.append((X_m, y_m)) @@ -2187,7 +2218,7 @@ def check_estimators_unfitted(name, estimator_orig, strict_mode=True): @ignore_warnings(category=FutureWarning) def check_supervised_y_2d(name, estimator_orig, strict_mode=True): - tags = estimator_orig._get_tags() + tags = _safe_tags(estimator_orig) rnd = np.random.RandomState(0) n_samples = 30 X = _pairwise_estimator_convert_X( @@ -2291,7 +2322,7 @@ def check_classifiers_classes(name, classifier_orig, strict_mode=True): y_names_binary = np.take(labels_binary, y_binary) problems = [(X_binary, y_binary, y_names_binary)] - if not classifier_orig._get_tags()['binary_only']: + if not _safe_tags(classifier_orig, key='binary_only'): problems.append((X_multiclass, y_multiclass, y_names_multiclass)) for X, y, y_names in problems: @@ -2377,7 +2408,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, # TODO: find out why PLS and CCA fail. 
RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - if not regressor._get_tags()["poor_score"]: + if not _safe_tags(regressor, key="poor_score"): assert regressor.score(X, y_) > 0.5 @@ -2402,7 +2433,7 @@ def check_regressors_no_decision_function(name, regressor_orig, @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig, strict_mode=True): - if classifier_orig._get_tags()['binary_only']: + if _safe_tags(classifier_orig, key='binary_only'): problems = [2] else: problems = [2, 3] @@ -2441,7 +2472,7 @@ def check_class_weight_classifiers(name, classifier_orig, strict_mode=True): y_pred = classifier.predict(X_test) # XXX: Generally can use 0.89 here. On Windows, LinearSVC gets # 0.88 (Issue #9111) - if not classifier_orig._get_tags()['poor_score']: + if not _safe_tags(classifier_orig, key='poor_score'): assert np.mean(y_pred == 0) > 0.87 @@ -2761,16 +2792,16 @@ def param_filter(p): def _enforce_estimator_tags_y(estimator, y): # Estimators with a `requires_positive_y` tag only accept strictly positive # data - if estimator._get_tags()["requires_positive_y"]: + if _safe_tags(estimator, key="requires_positive_y"): # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. y += 1 + abs(y.min()) # Estimators with a `binary_only` tag only accept up to two unique y values - if estimator._get_tags()["binary_only"] and y.size > 0: + if _safe_tags(estimator, key="binary_only") and y.size > 0: y = np.where(y == y.flat[0], y, y.flat[0] + 1) # Estimators in mono_output_task_error raise ValueError if y is of 1-D # Convert into a 2-D y for those estimators. - if estimator._get_tags()["multioutput_only"]: + if _safe_tags(estimator, key="multioutput_only"): return np.reshape(y, (-1, 1)) return y @@ -2782,11 +2813,11 @@ def _enforce_estimator_tags_x(estimator, X): X = X.dot(X.T) # Estimators with `1darray` in `X_types` tag only accept # X of shape (`n_samples`,) - if '1darray' in estimator._get_tags()['X_types']: + if '1darray' in _safe_tags(estimator, key='X_types'): X = X[:, 0] # Estimators with a `requires_positive_X` tag only accept # strictly positive data - if estimator._get_tags()['requires_positive_X']: + if _safe_tags(estimator, key='requires_positive_X'): X -= X.min() return X @@ -2928,7 +2959,7 @@ def check_classifiers_regression_target(name, estimator_orig, X = X + 1 + abs(X.min(axis=0)) # be sure that X is non-negative e = clone(estimator_orig) msg = "Unknown label type: " - if not e._get_tags()["no_validation"]: + if not _safe_tags(e, keyy="no_validation"): with raises(ValueError, match=msg): e.fit(X, y) @@ -3145,7 +3176,7 @@ def check_requires_y_none(name, estimator_orig, strict_mode=True): def check_n_features_in_after_fitting(name, estimator_orig, strict_mode=True): # Make sure that n_features_in are checked after fitting - tags = estimator_orig._get_tags() + tags = _safe_tags(estimator_orig) if "2darray" not in tags["X_types"] or tags["no_validation"]: return From 36f1c5c68f3ff16a5b9fa936e100de2bc7a59ffa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 9 Nov 2020 17:06:29 +0100 Subject: [PATCH 12/21] typo --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 98fd46a0b776d..ac30f66d41866 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2959,7 +2959,7 @@ def check_classifiers_regression_target(name, 
estimator_orig,
     X = X + 1 + abs(X.min(axis=0))  # be sure that X is non-negative
     e = clone(estimator_orig)
     msg = "Unknown label type: "
-    if not _safe_tags(e, keyy="no_validation"):
+    if not _safe_tags(e, key="no_validation"):
         with raises(ValueError, match=msg):
             e.fit(X, y)

From 9e540141319126dce275b76a996679d45495de2e Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 11 Nov 2020 13:20:26 +0100
Subject: [PATCH 13/21] TST implement minimal classifier

---
 sklearn/utils/tests/test_estimator_checks.py | 41 +++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index ecbf7cb7be7f4..9a069224f88ba 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -23,6 +23,7 @@ from sklearn.utils.estimator_checks import check_regressor_data_not_an_array
 from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.estimator_checks import check_outlier_corruption
+from sklearn.utils.estimator_checks import parametrize_with_checks
 from sklearn.utils.fixes import np_version, parse_version
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression, SGDClassifier
@@ -418,8 +419,8 @@ def test_check_estimator():
     # check that we have a set_params and can clone
     msg = "Passing a class was deprecated"
     assert_raises_regex(TypeError, msg, check_estimator, object)
-    msg = "object has no attribute '_get_tags'"
-    assert_raises_regex(AttributeError, msg, check_estimator, object())
+    # msg = "object has no attribute '_get_tags'"
+    # assert_raises_regex(AttributeError, msg, check_estimator, object())
     msg = (
         "Parameter 'p' of estimator 'HasMutableParameters' is of type "
         "object which is not allowed"
     )
@@ -620,6 +621,42 @@ def test_check_estimator_pairwise():
     check_estimator(est)


+class MinimalEstimator:
+
+    # Our minimal contract requires that the following are implemented
+    _get_param_names = BaseEstimator._get_param_names  # used by get_params
+    set_params = BaseEstimator.set_params
+    get_params = BaseEstimator.get_params
+    __setstate__ = BaseEstimator.__setstate__
+    __getstate__ = BaseEstimator.__getstate__
+
+    def fit(self, X, y):
+        return self
+
+
+class MinimalClassifier(MinimalEstimator):
+
+    def fit(self, X, y):
+        self.classes_ = np.unique(y)
+        return super().fit(X, y)
+
+    def predict_proba(self, X):
+        proba_shape = (len(X), self.classes_.size)
+        y_proba = np.zeros(shape=proba_shape, dtype=np.float64)
+        y_proba[:, 0] = 1.0
+        return y_proba
+
+    def predict(self, X):
+        y_proba = self.predict_proba(X)
+        y_pred = y_proba.argmax(axis=1)
+        return self.classes_[y_pred]
+
+
+@parametrize_with_checks([MinimalClassifier()], strict_mode=False)
+def test_check_estimator_minimal(estimator, check):
+    check(estimator)
+
+
 def test_check_classifier_data_not_an_array():
     assert_raises_regex(AssertionError,
                         'Not equal to tolerance',

From eaca564fff2482539ea8ed822ec579698aca2eb5 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 11 Nov 2020 14:06:15 +0100
Subject: [PATCH 14/21] allow pickling

---
 sklearn/utils/tests/test_estimator_checks.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index 84098d052d551..14555069a73e4 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -627,8 +627,15 @@ class MinimalEstimator:
_get_param_names = BaseEstimator._get_param_names # used by get_params set_params = BaseEstimator.set_params get_params = BaseEstimator.get_params - # __setstate__ = BaseEstimator.__setstate__ - # __getstate__ = BaseEstimator.__getstate__ + + def __getstate__(self): + state = self.__dict__.copy() + # only because we are within scikit-learn source code + from sklearn import __version__ + return dict(state.items(), _sklearn_version=__version__) + + def __setstate__(self, state): + self.__dict__.update(state) def fit(self, X, y): X = check_array(X) From a06dfc469bcccb7334fc7420e3bb6d4bab671797 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Nov 2020 11:45:50 +0100 Subject: [PATCH 15/21] remove base class --- sklearn/utils/estimator_checks.py | 3 +- sklearn/utils/tests/test_estimator_checks.py | 61 ++++++++++++++------ 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index cf3b2d52f4230..1e3d684b219e9 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1700,7 +1700,8 @@ def check_estimators_pickle(name, estimator_orig, api_only=False): # pickle and unpickle! pickled_estimator = pickle.dumps(estimator) - if estimator.__module__.startswith('sklearn.'): + module_name = estimator.__module__ + if module_name.startswith('sklearn.') and "test_" not in module_name: assert b"version" in pickled_estimator unpickled_estimator = pickle.loads(pickled_estimator) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 14555069a73e4..c1e1c468574f3 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -621,18 +621,16 @@ def test_check_estimator_pairwise(): check_estimator(est) -class MinimalEstimator: +class MinimalClassifier: - # Our minimal required supposed that the following are implemented - _get_param_names = BaseEstimator._get_param_names # used by get_params - set_params = BaseEstimator.set_params - get_params = BaseEstimator.get_params + def get_params(self, **params): + return {} + + def set_params(self, deep=True): + return self def __getstate__(self): - state = self.__dict__.copy() - # only because we are within scikit-learn source code - from sklearn import __version__ - return dict(state.items(), _sklearn_version=__version__) + return self.__dict__.copy() def __setstate__(self, state): self.__dict__.update(state) @@ -640,14 +638,8 @@ def __setstate__(self, state): def fit(self, X, y): X = check_array(X) self.n_features_in_ = X.shape[1] - return self - - -class MinimalClassifier(MinimalEstimator): - - def fit(self, X, y): self.classes_ = np.unique(y) - return super().fit(X, y) + return self def predict_proba(self, X): check_is_fitted(self) @@ -666,11 +658,25 @@ def score(self, X, y): return 1.0 -class MinimalRegressor(MinimalEstimator): +class MinimalRegressor: + + def get_params(self, **params): + return {} + + def set_params(self, deep=True): + return self + + def __getstate__(self): + return self.__dict__.copy() + + def __setstate__(self, state): + self.__dict__.update(state) def fit(self, X, y): + X = check_array(X) + self.n_features_in_ = X.shape[1] self._mean = np.mean(y) - return super().fit(X, y) + return self def predict(self, X): X = check_array(X) @@ -680,7 +686,24 @@ def score(self, X, y): return 1.0 -class MinimalTransformer(MinimalEstimator): +class MinimalTransformer: + + def get_params(self, **params): + return {} + + def 
set_params(self, deep=True): + return self + + def __getstate__(self): + return self.__dict__.copy() + + def __setstate__(self, state): + self.__dict__.update(state) + + def fit(self, X, y): + X = check_array(X) + self.n_features_in_ = X.shape[1] + return self def transform(self, X, y=None): check_is_fitted(self) From 111ef8ebda12004acb84385aab84cdf81f022865 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Nov 2020 12:03:43 +0100 Subject: [PATCH 16/21] fix issue with id --- sklearn/utils/tests/test_estimator_checks.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index c1e1c468574f3..c174fd6beb56a 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -623,6 +623,11 @@ def test_check_estimator_pairwise(): class MinimalClassifier: + def __repr__(self): + # Only required when using pytest-xdist to get an id not associated + # with the memory location + return self.__name__ + def get_params(self, **params): return {} @@ -660,6 +665,11 @@ def score(self, X, y): class MinimalRegressor: + def __repr__(self): + # Only required when using pytest-xdist to get an id not associated + # with the memory location + return self.__name__ + def get_params(self, **params): return {} @@ -688,6 +698,11 @@ def score(self, X, y): class MinimalTransformer: + def __repr__(self): + # Only required when using pytest-xdist to get an id not associated + # with the memory location + return self.__name__ + def get_params(self, **params): return {} From 3383bda3669cbf9802dcbc7646408d23114f1e28 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Nov 2020 12:06:34 +0100 Subject: [PATCH 17/21] fix --- sklearn/utils/tests/test_estimator_checks.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index c174fd6beb56a..bce33e6d44f93 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -626,7 +626,7 @@ class MinimalClassifier: def __repr__(self): # Only required when using pytest-xdist to get an id not associated # with the memory location - return self.__name__ + return self.__class__.__name__ def get_params(self, **params): return {} @@ -660,7 +660,8 @@ def predict(self, X): return self.classes_[y_pred] def score(self, X, y): - return 1.0 + from sklearn.metrics import accuracy_score + return accuracy_score(y, self.predict(X)) class MinimalRegressor: @@ -668,7 +669,7 @@ class MinimalRegressor: def __repr__(self): # Only required when using pytest-xdist to get an id not associated # with the memory location - return self.__name__ + return self.__class__.__name__ def get_params(self, **params): return {} @@ -693,7 +694,8 @@ def predict(self, X): return np.ones(shape=(X.shape[0],)) * self._mean def score(self, X, y): - return 1.0 + from sklearn.metrics import r2_score + return r2_score(y, self.predict(X)) class MinimalTransformer: @@ -701,7 +703,7 @@ class MinimalTransformer: def __repr__(self): # Only required when using pytest-xdist to get an id not associated # with the memory location - return self.__name__ + return self.__class__.__name__ def get_params(self, **params): return {} From 425746d5662a72ebd8335d6368637ced36833624 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Nov 2020 12:09:03 +0100 Subject: [PATCH 18/21] create most frequent 
for classifier --- sklearn/utils/tests/test_estimator_checks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index bce33e6d44f93..2f7a23bc8afec 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -643,7 +643,8 @@ def __setstate__(self, state): def fit(self, X, y): X = check_array(X) self.n_features_in_ = X.shape[1] - self.classes_ = np.unique(y) + self.classes_, counts = np.unique(y, return_counts=True) + self._most_frequent_class = self.classes_[counts.argmax()] return self def predict_proba(self, X): @@ -651,7 +652,7 @@ def predict_proba(self, X): X = check_array(X) proba_shape = (X.shape[0], self.classes_.size) y_proba = np.zeros(shape=proba_shape, dtype=np.float64) - y_proba[:, 0] = 1.0 + y_proba[:, self._most_frequent_class] = 1.0 return y_proba def predict(self, X): From c012e093efea2a658cac00f536196949a10032d4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Nov 2020 12:39:35 +0100 Subject: [PATCH 19/21] iter --- sklearn/utils/tests/test_estimator_checks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 2f7a23bc8afec..bead3db69de96 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -622,6 +622,7 @@ def test_check_estimator_pairwise(): class MinimalClassifier: + _estimator_type = "classifier" def __repr__(self): # Only required when using pytest-xdist to get an id not associated @@ -666,6 +667,7 @@ def score(self, X, y): class MinimalRegressor: + _estimator_type = "regressor" def __repr__(self): # Only required when using pytest-xdist to get an id not associated From 9bf5cbcabf1d0ec07f74a70b7485ecbd50b85ee9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Nov 2020 13:30:04 +0100 Subject: [PATCH 20/21] iter --- sklearn/utils/estimator_checks.py | 2 +- sklearn/utils/tests/test_estimator_checks.py | 24 ++++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 56907be9136ff..67d269d62f168 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2331,7 +2331,7 @@ def check_classifiers_predictions(X, y, name, classifier_orig, (classifier, ", ".join(map(str, y_exp)), ", ".join(map(str, y_pred)))) - if name != "ComplementNB": + if not api_only and name != "ComplementNB": # This is a pathological data set for ComplementNB. 
# For some specific cases 'ComplementNB' predicts less classes # than expected diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index bead3db69de96..61e7099d64084 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -21,7 +21,6 @@ from sklearn.utils.estimator_checks import check_no_attributes_set_in_init from sklearn.utils.estimator_checks import check_classifier_data_not_an_array from sklearn.utils.estimator_checks import check_regressor_data_not_an_array -from sklearn.utils.validation import check_is_fitted from sklearn.utils.estimator_checks import check_outlier_corruption from sklearn.utils.estimator_checks import parametrize_with_checks from sklearn.utils.fixes import np_version, parse_version @@ -33,7 +32,12 @@ from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression from sklearn.svm import SVC, NuSVC from sklearn.neighbors import KNeighborsRegressor -from sklearn.utils.validation import check_array +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import ( + check_array, + check_is_fitted, + check_X_y, +) from sklearn.utils import all_estimators from sklearn.exceptions import SkipTestWarning @@ -642,18 +646,21 @@ def __setstate__(self, state): self.__dict__.update(state) def fit(self, X, y): - X = check_array(X) + X, y = check_X_y(X, y) + check_classification_targets(y) self.n_features_in_ = X.shape[1] self.classes_, counts = np.unique(y, return_counts=True) - self._most_frequent_class = self.classes_[counts.argmax()] + self._most_frequent_class_idx = counts.argmax() return self def predict_proba(self, X): check_is_fitted(self) X = check_array(X) + if X.shape[1] != self.n_features_in_: + raise ValueError proba_shape = (X.shape[0], self.classes_.size) y_proba = np.zeros(shape=proba_shape, dtype=np.float64) - y_proba[:, self._most_frequent_class] = 1.0 + y_proba[:, self._most_frequent_class_idx] = 1.0 return y_proba def predict(self, X): @@ -687,13 +694,16 @@ def __setstate__(self, state): self.__dict__.update(state) def fit(self, X, y): - X = check_array(X) + X, y = check_X_y(X, y) self.n_features_in_ = X.shape[1] self._mean = np.mean(y) return self def predict(self, X): + check_is_fitted(self) X = check_array(X) + if X.shape[1] != self.n_features_in_: + raise ValueError return np.ones(shape=(X.shape[0],)) * self._mean def score(self, X, y): @@ -720,7 +730,7 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) - def fit(self, X, y): + def fit(self, X, y=None): X = check_array(X) self.n_features_in_ = X.shape[1] return self From d254c880297ef01b8b3177c34d976402033f2221 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Nov 2020 13:41:50 +0100 Subject: [PATCH 21/21] iter --- sklearn/utils/estimator_checks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 67d269d62f168..ff3e32978dbb8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2382,12 +2382,16 @@ def check_classifiers_classes(name, classifier_orig, api_only=False): for X, y, y_names in problems: for y_names_i in [y_names, y_names.astype('O')]: y_ = _choose_check_classifiers_labels(name, y, y_names_i) - check_classifiers_predictions(X, y_, name, classifier_orig) + check_classifiers_predictions( + X, y_, name, classifier_orig, api_only + ) labels_binary = [-1, 1] 
y_names_binary = np.take(labels_binary, y_binary) y_binary = _choose_check_classifiers_labels(name, y_binary, y_names_binary) - check_classifiers_predictions(X_binary, y_binary, name, classifier_orig) + check_classifiers_predictions( + X_binary, y_binary, name, classifier_orig, api_only + ) @ignore_warnings(category=FutureWarning)