From 352b38c7d0a85cfd0f3f6759be34347046aeaa77 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Fri, 26 Oct 2018 14:51:27 +0200
Subject: [PATCH 1/6] add check_partial_fit and change in common tests accordingly

---
 sklearn/utils/estimator_checks.py | 20 +++++++++++---------
 sklearn/utils/validation.py       | 23 +++++++++++++++++++++++
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 5c226ac8ba8e7..4069153b02568 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -7,6 +7,7 @@
 import pickle
 from copy import deepcopy
 from functools import partial
+import pytest
 
 import numpy as np
 from scipy import sparse
@@ -126,7 +127,6 @@ def _yield_classifier_checks(name, classifier):
     # test classifiers trained on a single label always return this label
     yield check_classifiers_one_label
     yield check_classifiers_classes
-    yield check_estimators_partial_fit_n_features
     # basic consistency testing
     yield check_classifiers_train
     yield partial(check_classifiers_train, readonly_memmap=True)
@@ -179,7 +179,6 @@ def _yield_regressor_checks(name, regressor):
     yield check_regressors_train
     yield partial(check_regressors_train, readonly_memmap=True)
     yield check_regressor_data_not_an_array
-    yield check_estimators_partial_fit_n_features
    yield check_regressors_no_decision_function
     yield check_supervised_y_2d
     yield check_supervised_y_no_nan
@@ -220,7 +219,6 @@ def _yield_clustering_checks(name, clusterer):
     # let's not test that here.
     yield check_clustering
     yield partial(check_clustering, readonly_memmap=True)
-    yield check_estimators_partial_fit_n_features
     yield check_non_transformer_estimators_n_iter
 
 
@@ -268,6 +266,7 @@ def _yield_all_checks(name, estimator):
     yield check_dict_unchanged
     yield check_dont_overwrite_parameters
     yield check_fit_idempotent
+    yield check_estimators_partial_fit_n_features
 
 
 def check_estimator(Estimator):
@@ -1261,12 +1260,15 @@ def check_estimators_partial_fit_n_features(name, estimator_orig):
     except NotImplementedError:
         return
 
-    with assert_raises(ValueError,
-                       msg="The estimator {} does not raise an"
-                           " error when the number of features"
-                           " changes between calls to "
-                           "partial_fit.".format(name)):
-        estimator.partial_fit(X[:, :-1], y)
+    try:
+        with pytest.raises(ValueError, match="Number of input features has "
+                                             "changed .* between calls to "
+                                             "partial_fit"):
+            estimator.partial_fit(X[:, :-1], y)
+    except pytest.fail.Exception:
+        raise AssertionError("The estimator {} does not raise an appropriate "
+                             "error when the number of features changes "
+                             "between calls to partial_fit.".format(name))
 
 
 @ignore_warnings(category=(DeprecationWarning, FutureWarning))
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 3ae1b283ccef5..9793241d7f639 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -971,3 +971,26 @@ def check_non_negative(X, whom):
 
     if X_min < 0:
         raise ValueError("Negative values in data passed to %s" % whom)
+
+
+def check_partial_fit_n_features(X, components, estimator):
+    """
+    Check if number of features is preserved between calls to partial_fit.
+
+    Parameters
+    ----------
+    X : array-like
+        Input data for the new call to partial_fit.
+
+    components : array-like
+        Fitted attribute of an estimator which has the same number of features
+        as the input data from the first fit.
+
+    estimator : estimator instance
+        Estimator instance for which the check is performed.
+    """
+    if X.shape[-1] != components.shape[-1]:
+        raise ValueError("Number of input features has changed from {0} to {1}"
+                         " between calls to partial_fit of {2}."
+                         "".format(components.shape[-1], X.shape[-1],
+                                   type(estimator).__name__))

From ef0b8722b8dfc6361c7db88688a389fbcf559020 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Fri, 26 Oct 2018 14:52:13 +0200
Subject: [PATCH 2/6] add check_partial_fit... to every partial_fit method

---
 sklearn/cluster/birch.py                        |  9 +++------
 sklearn/cluster/k_means_.py                     |  3 ++-
 sklearn/decomposition/dict_learning.py          |  3 ++-
 sklearn/decomposition/incremental_pca.py        | 10 +++-------
 sklearn/decomposition/online_lda.py             | 10 +++-------
 sklearn/linear_model/stochastic_gradient.py     | 13 ++++++-------
 sklearn/naive_bayes.py                          | 12 +++++-------
 sklearn/neural_network/multilayer_perceptron.py |  2 +-
 sklearn/neural_network/rbm.py                   |  4 +++-
 sklearn/preprocessing/data.py                   | 16 +++++++++++++++-
 10 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py
index 188eff02b6f02..8cb054ce84852 100644
--- a/sklearn/cluster/birch.py
+++ b/sklearn/cluster/birch.py
@@ -14,7 +14,7 @@
 from ..externals.six.moves import xrange
 from ..utils import check_array
 from ..utils.extmath import row_norms, safe_sparse_dot
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, check_partial_fit_n_features
 from ..exceptions import NotFittedError, ConvergenceWarning
 from .hierarchical import AgglomerativeClustering
 
@@ -546,11 +546,8 @@ def _check_fit(self, X):
         # Should raise an error if one does not fit before predicting.
         if not (is_fitted or has_partial_fit):
             raise NotFittedError("Fit training data before predicting")
-
-        if is_fitted and X.shape[1] != self.subcluster_centers_.shape[1]:
-            raise ValueError(
-                "Training data and predicted data do "
-                "not have same number of features.")
+        if is_fitted:
+            check_partial_fit_n_features(X, self.subcluster_centers_, self)
 
     def predict(self, X):
         """
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index b79db75e0e720..eba3258875218 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -27,7 +27,7 @@
 from ..utils import check_array
 from ..utils import gen_batches
 from ..utils import check_random_state
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, check_partial_fit_n_features
 from ..utils.validation import FLOAT_DTYPES
 from ..utils import Parallel
 from ..utils import delayed
@@ -1694,6 +1694,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
             random_reassign = False
             distances = None
         else:
+            check_partial_fit_n_features(X, self.cluster_centers_, self)
             # The lower the minimum count is, the more we do random
             # reassignment, however, we don't want to do random
             # reassignment too often, to allow for building up counts
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index f39e26e083cee..f7b86619710e7 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -19,7 +19,7 @@
 from ..utils import (check_array, check_random_state, gen_even_slices,
                      gen_batches)
 from ..utils.extmath import randomized_svd, row_norms
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, check_partial_fit_n_features
 from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars
 
 
@@ -1412,6 +1412,7 @@ def partial_fit(self, X, y=None, iter_offset=None):
         self.random_state_ = check_random_state(self.random_state)
         X = check_array(X)
         if hasattr(self, 'components_'):
+            check_partial_fit_n_features(X, self.components_, self)
             dict_init = self.components_
         else:
             dict_init = self.dict_init
diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py
index 779ebf42b20f1..52210b1fc66c3 100644
--- a/sklearn/decomposition/incremental_pca.py
+++ b/sklearn/decomposition/incremental_pca.py
@@ -11,6 +11,7 @@
 from .base import _BasePCA
 from ..utils import check_array, gen_batches
 from ..utils.extmath import svd_flip, _incremental_mean_and_var
+from ..utils.validation import check_partial_fit_n_features
 
 
 class IncrementalPCA(_BasePCA):
@@ -224,6 +225,8 @@ def partial_fit(self, X, y=None, check_input=True):
         n_samples, n_features = X.shape
         if not hasattr(self, 'components_'):
             self.components_ = None
+        elif self.components_ is not None:
+            check_partial_fit_n_features(X, self.components_, self)
 
         if self.n_components is None:
             if self.components_ is None:
@@ -241,13 +244,6 @@ def partial_fit(self, X, y=None, check_input=True):
         else:
             self.n_components_ = self.n_components
 
-        if (self.components_ is not None) and (self.components_.shape[0] !=
-                                               self.n_components_):
-            raise ValueError("Number of input features has changed from %i "
-                             "to %i between calls to partial_fit! Try "
-                             "setting n_components to a fixed value." %
-                             (self.components_.shape[0], self.n_components_))
-
         # This is the first partial_fit
         if not hasattr(self, 'n_samples_seen_'):
             self.n_samples_seen_ = 0
diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index 4c0f8625771c7..0624258d07c41 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -19,7 +19,7 @@
 from ..utils import (check_random_state, check_array,
                      gen_batches, gen_even_slices)
 from ..utils.fixes import logsumexp
-from ..utils.validation import check_non_negative
+from ..utils.validation import check_non_negative, check_partial_fit_n_features
 from ..utils import Parallel, delayed, effective_n_jobs
 from ..externals.six.moves import xrange
 from ..exceptions import NotFittedError
@@ -493,12 +493,8 @@ def partial_fit(self, X, y=None):
         # initialize parameters or check
         if not hasattr(self, 'components_'):
             self._init_latent_vars(n_features)
-
-        if n_features != self.components_.shape[1]:
-            raise ValueError(
-                "The provided data has %d dimensions while "
-                "the model was trained with feature size %d."
-                % (n_features, self.components_.shape[1]))
+        else:
+            check_partial_fit_n_features(X, self.components_, self)
 
         n_jobs = effective_n_jobs(self.n_jobs)
         with Parallel(n_jobs=n_jobs, verbose=max(0,
diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py
index 146d9623f22e7..f67a49a131ce6 100644
--- a/sklearn/linear_model/stochastic_gradient.py
+++ b/sklearn/linear_model/stochastic_gradient.py
@@ -18,7 +18,7 @@
 from ..utils import check_array, check_random_state, check_X_y
 from ..utils.extmath import safe_sparse_dot
 from ..utils.multiclass import _check_partial_fit_first_call
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, check_partial_fit_n_features
 from ..exceptions import ConvergenceWarning
 from ..externals import six
 from ..model_selection import StratifiedShuffleSplit, ShuffleSplit
@@ -533,9 +533,8 @@ def _partial_fit(self, X, y, alpha, C,
         if getattr(self, "coef_", None) is None or coef_init is not None:
             self._allocate_parameter_mem(n_classes, n_features,
                                          coef_init, intercept_init)
-        elif n_features != self.coef_.shape[-1]:
-            raise ValueError("Number of features %d does not match previous "
-                             "data %d." % (n_features, self.coef_.shape[-1]))
+        else:
+            check_partial_fit_n_features(X, self.coef_, self)
 
         self.loss_function_ = self._get_loss_function(loss)
         if not hasattr(self, "t_"):
@@ -1144,9 +1143,9 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
         if getattr(self, "coef_", None) is None:
             self._allocate_parameter_mem(1, n_features, coef_init,
                                          intercept_init)
-        elif n_features != self.coef_.shape[-1]:
-            raise ValueError("Number of features %d does not match previous "
-                             "data %d." % (n_features, self.coef_.shape[-1]))
+        else:
+            check_partial_fit_n_features(X, self.coef_, self)
+
 
         if self.average > 0 and getattr(self, "average_coef_", None) is None:
             self.average_coef_ = np.zeros(n_features,
                                           dtype=np.float64,
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index dced4fbdb3dd2..cb650e6593700 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -30,7 +30,7 @@
 from .utils.extmath import safe_sparse_dot
 from .utils.fixes import logsumexp
 from .utils.multiclass import _check_partial_fit_first_call
-from .utils.validation import check_is_fitted
+from .utils.validation import check_is_fitted, check_partial_fit_n_features
 from .externals import six
 
 __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB']
@@ -382,9 +382,8 @@ def _partial_fit(self, X, y, classes=None, _refit=False,
             self.class_prior_ = np.zeros(len(self.classes_),
                                          dtype=np.float64)
         else:
-            if X.shape[1] != self.theta_.shape[1]:
-                msg = "Number of features %d does not match previous data %d."
-                raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
+            check_partial_fit_n_features(X, self.theta_, self)
+
 
             # Put epsilon back in each time
             self.sigma_[:, :] -= self.epsilon_
@@ -527,9 +526,8 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
             self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
             self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                            dtype=np.float64)
-        elif n_features != self.coef_.shape[1]:
-            msg = "Number of features %d does not match previous data %d."
-            raise ValueError(msg % (n_features, self.coef_.shape[-1]))
+        else:
+            check_partial_fit_n_features(X, self.coef_, self)
 
         Y = label_binarize(y, classes=self.classes_)
         if Y.shape[1] == 1:
diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index de559dc67e18f..4754b3ec22d41 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -24,7 +24,7 @@
 from ..utils import check_array, check_X_y, column_or_1d
 from ..exceptions import ConvergenceWarning
 from ..utils.extmath import safe_sparse_dot
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, check_partial_fit_n_features
 from ..utils.multiclass import _check_partial_fit_first_call, unique_labels
 from ..utils.multiclass import type_of_target
 
diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py
index 1361bffe0d240..6a76e568938c7 100644
--- a/sklearn/neural_network/rbm.py
+++ b/sklearn/neural_network/rbm.py
@@ -21,7 +21,7 @@
 from ..utils import gen_even_slices
 from ..utils.extmath import safe_sparse_dot
 from ..utils.extmath import log_logistic
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, check_partial_fit_n_features
 
 
 class BernoulliRBM(BaseEstimator, TransformerMixin):
@@ -243,6 +243,8 @@ def partial_fit(self, X, y=None):
                 (self.n_components, X.shape[1])
             ),
                 order='F')
+        else:
+            check_partial_fit_n_features(X, self.components_, self)
 if not hasattr(self, 'intercept_hidden_'):
             self.intercept_hidden_ = np.zeros(self.n_components, )
         if not hasattr(self, 'intercept_visible_'):
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 48e78302e0594..5482e8bb23f35 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -31,7 +31,7 @@
     mean_variance_axis, incr_mean_variance_axis, min_max_axis)
 from ..utils.validation import (check_is_fitted, check_random_state,
-                                FLOAT_DTYPES)
+                                FLOAT_DTYPES, check_partial_fit_n_features)
 
 from ._csr_polynomial_expansion import _csr_polynomial_expansion
 
@@ -358,6 +358,8 @@ def partial_fit(self, X, y=None):
             self.n_samples_seen_ = X.shape[0]
         # Next steps
         else:
+            check_partial_fit_n_features(X, self.scale_, self)
+
             data_min = np.minimum(self.data_min_, data_min)
             data_max = np.maximum(self.data_max_, data_max)
             self.n_samples_seen_ += X.shape[0]
@@ -652,6 +654,16 @@ def partial_fit(self, X, y=None):
             self.n_samples_seen_ = np.repeat(self.n_samples_seen_,
                                              X.shape[1]).astype(np.int64)
 
+        # if first pass: store number of features
+        if not hasattr(self, "mean_"):
+            self._n_features_ = X.shape[1]
+
+        # check number of features consistency for next passes
+        if hasattr(self, "mean_") and self.mean_ is not None:
+            check_partial_fit_n_features(X, self.mean_, self)
+        if hasattr(self, "scale_") and self.scale_ is not None:
+            check_partial_fit_n_features(X, self.scale_, self)
+
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
@@ -911,6 +923,8 @@ def partial_fit(self, X, y=None):
             self.n_samples_seen_ = X.shape[0]
         # Next passes
         else:
+            check_partial_fit_n_features(X, self.scale_, self)
+
             max_abs = np.maximum(self.max_abs_, max_abs)
             self.n_samples_seen_ += X.shape[0]

From 08062873a9ed58e9ff75d3af3057fdfe17da2082 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Fri, 26 Oct 2018 18:30:14 +0200
Subject: [PATCH 3/6] add mlp

---
 sklearn/neural_network/multilayer_perceptron.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 4754b3ec22d41..ff41de7600ec5 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -340,6 +340,8 @@ def _fit(self, X, y, incremental=False):
                                            incremental):
             # First time training the model
             self._initialize(y, layer_units)
+        else:
+            check_partial_fit_n_features(X, self.coefs_[0].T, self)
 
         # lbfgs does not support mini-batches
         if self.solver == 'lbfgs':

From f1bfca330cbc236e6b783668d7797ca47f4aa354 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Mon, 29 Oct 2018 10:11:41 +0100
Subject: [PATCH 4/6] remove duplicate test + fix transform test

---
 sklearn/decomposition/tests/test_online_lda.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py
index 0abc2efe75ec2..887f9de0cd15c 100644
--- a/sklearn/decomposition/tests/test_online_lda.py
+++ b/sklearn/decomposition/tests/test_online_lda.py
@@ -142,21 +142,6 @@ def test_lda_fit_transform(method):
     assert_array_almost_equal(X_fit, X_trans, 4)
 
 
-def test_lda_partial_fit_dim_mismatch():
-    # test `n_features` mismatch in `partial_fit`
-    rng = np.random.RandomState(0)
-    n_components = rng.randint(3, 6)
-    n_col = rng.randint(6, 10)
-    X_1 = np.random.randint(4, size=(10, n_col))
-    X_2 = np.random.randint(4, size=(10, n_col + 1))
-    lda = LatentDirichletAllocation(n_components=n_components,
-                                    learning_offset=5., total_samples=20,
-                                    random_state=rng)
-    lda.partial_fit(X_1)
-    assert_raises_regexp(ValueError, r"^The provided data has",
-                         lda.partial_fit, X_2)
-
-
 def test_invalid_params():
     # test `_check_params` method
     X = np.ones((5, 10))
@@ -202,7 +187,7 @@ def test_lda_transform_mismatch():
                                     random_state=rng)
     lda.partial_fit(X)
     assert_raises_regexp(ValueError, r"^The provided data has",
-                         lda.partial_fit, X_2)
+                         lda.transform, X_2)
 
 
 @if_safe_multiprocessing_with_blas

From 6b6b6edcb5324fb5f297c52430bb4a76093431fb Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Mon, 29 Oct 2018 10:12:32 +0100
Subject: [PATCH 5/6] incremental_pca un-remove component mismatch err msg

---
 sklearn/decomposition/incremental_pca.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py
index 52210b1fc66c3..a21bf7c078b5a 100644
--- a/sklearn/decomposition/incremental_pca.py
+++ b/sklearn/decomposition/incremental_pca.py
@@ -244,6 +244,13 @@ def partial_fit(self, X, y=None, check_input=True):
         else:
             self.n_components_ = self.n_components
 
+        if (self.components_ is not None) and (self.components_.shape[0] !=
+                                               self.n_components_):
+            raise ValueError("Number of components has changed from %i "
+                             "to %i between calls to partial_fit! Try "
+                             "setting n_components to a fixed value."
+                             % (self.components_.shape[0], self.n_components_))
+
         # This is the first partial_fit
         if not hasattr(self, 'n_samples_seen_'):
             self.n_samples_seen_ = 0

From 627d2ee7596b66c8bdfd584d3656acded9d6e942 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Tue, 30 Oct 2018 10:57:37 +0100
Subject: [PATCH 6/6] undo pytest dependency, use assert_raises_regex

---
 sklearn/utils/estimator_checks.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 4069153b02568..8bb76692f0c79 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -7,7 +7,6 @@
 import pickle
 from copy import deepcopy
 from functools import partial
-import pytest
 
 import numpy as np
 from scipy import sparse
@@ -1260,15 +1259,14 @@ def check_estimators_partial_fit_n_features(name, estimator_orig):
     except NotImplementedError:
         return
 
-    try:
-        with pytest.raises(ValueError, match="Number of input features has "
-                                             "changed .* between calls to "
-                                             "partial_fit"):
-            estimator.partial_fit(X[:, :-1], y)
-    except pytest.fail.Exception:
-        raise AssertionError("The estimator {} does not raise an appropriate "
-                             "error when the number of features changes "
-                             "between calls to partial_fit.".format(name))
+    match = ("Number of input features has changed .* between "
+             "calls to partial_fit")
+    msg = ("The estimator {} does not raise an appropriate error when "
+           "the number of features changes between calls to "
+           "partial_fit.".format(name))
+
+    with assert_raises_regex(ValueError, match, msg=msg):
+        estimator.partial_fit(X[:, :-1], y)
 
 
 @ignore_warnings(category=(DeprecationWarning, FutureWarning))
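
Note (not part of the patch series): a minimal sketch of the behaviour these patches standardize, assuming a scikit-learn build with the series applied. MiniBatchKMeans stands in here for any estimator exposing partial_fit; the expected message is the one produced by check_partial_fit_n_features added in PATCH 1/6.

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    rng = np.random.RandomState(0)
    X_first = rng.rand(20, 4)    # first batch: 4 features
    X_second = rng.rand(20, 3)   # second batch: only 3 features

    est = MiniBatchKMeans(n_clusters=2, random_state=0)
    est.partial_fit(X_first)
    try:
        est.partial_fit(X_second)   # feature count changed: 4 -> 3
    except ValueError as e:
        print(e)
    # Expected message (after this series):
    # Number of input features has changed from 4 to 3
    # between calls to partial_fit of MiniBatchKMeans.

Before the series, each estimator raised its own differently worded ValueError (or none at all); after it, every partial_fit goes through the single helper in sklearn/utils/validation.py, and the common test check_estimators_partial_fit_n_features asserts the shared message for all estimators.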