From bb8f4c22f497e7c015cf0078f1c9bf0133f0c78c Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 8 Jul 2020 15:58:46 +0200 Subject: [PATCH 01/10] document and deprecate minibatchkmeans attributes --- sklearn/cluster/_kmeans.py | 48 +++++++++++++++++++-------- sklearn/cluster/tests/test_k_means.py | 13 ++++++++ 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 842c44721da1c..917e6ddb3e559 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -26,6 +26,7 @@ from ..utils import check_array from ..utils import gen_batches from ..utils import check_random_state +from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads from ..exceptions import ConvergenceWarning @@ -1517,6 +1518,12 @@ class MiniBatchKMeans(KMeans): defined as the sum of square distances of samples to their nearest neighbor. + counts_ : ndarray of shape (n_clusters,) + Weigth sum of each cluster. + + init_size_ : int + The effective number of samples used for the initialization. + See Also -------- KMeans @@ -1574,6 +1581,18 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.init_size = init_size self.reassignment_ratio = reassignment_ratio + @deprecated("The attribute 'counts_' is deprecated in 0.24 and will be " + "removed in 0.26.") + @property + def counts_(self): + return self._counts + + @deprecated("The attribute 'init_size_' is deprecated in 0.24 and will be " + "removed in 0.26.") + @property + def init_size_(self): + return self._init_size + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1642,9 +1661,10 @@ def fit(self, X, y=None, sample_weight=None): init_size = 3 * self.batch_size if init_size > n_samples: init_size = n_samples - self.init_size_ = init_size + self._init_size = init_size - validation_indices = random_state.randint(0, n_samples, init_size) + validation_indices = random_state.randint(0, n_samples, + self._init_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] @@ -1666,7 +1686,7 @@ def fit(self, X, y=None, sample_weight=None): X, self.n_clusters, self.init, random_state=random_state, x_squared_norms=x_squared_norms, - init_size=init_size) + init_size=self._init_size) # Compute the label assignment on the init dataset _mini_batch_step( @@ -1685,7 +1705,7 @@ def fit(self, X, y=None, sample_weight=None): % (init_idx + 1, n_init, inertia)) if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers - self.counts_ = weight_sums + self._counts = weight_sums best_inertia = inertia # Empty context to be used inplace by the convergence check routine @@ -1702,7 +1722,7 @@ def fit(self, X, y=None, sample_weight=None): batch_inertia, centers_squared_diff = _mini_batch_step( X[minibatch_indices], sample_weight[minibatch_indices], x_squared_norms[minibatch_indices], - self.cluster_centers_, self.counts_, + self.cluster_centers_, self._counts, old_center_buffer, tol > 0.0, distances=distances, # Here we randomly choose whether to perform # random reassignment: the choice is done as a function @@ -1710,7 +1730,7 @@ def fit(self, X, y=None, sample_weight=None): # counts, in order to force this reassignment to happen # every once in a while random_reassign=((iteration_idx + 1) - % (10 + int(self.counts_.min())) == 0), + % (10 + int(self._counts.min())) == 0), random_state=random_state, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) @@ -1795,18 +1815,18 @@ def partial_fit(self, X, y=None, sample_weight=None): sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) x_squared_norms = row_norms(X, squared=True) - self.random_state_ = getattr(self, "random_state_", + self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) - if (not hasattr(self, 'counts_') + if (not hasattr(self, '_counts') or not hasattr(self, 'cluster_centers_')): # this is the first call partial_fit on this object: # initialize the cluster centers self.cluster_centers_ = _init_centroids( X, self.n_clusters, self.init, - random_state=self.random_state_, + random_state=self._random_state, x_squared_norms=x_squared_norms, init_size=self.init_size) - self.counts_ = np.zeros(self.n_clusters, + self._counts = np.zeros(self.n_clusters, dtype=sample_weight.dtype) random_reassign = False distances = None @@ -1814,8 +1834,8 @@ def partial_fit(self, X, y=None, sample_weight=None): # The lower the minimum count is, the more we do random # reassignment, however, we don't want to do random # reassignment too often, to allow for building up counts - random_reassign = self.random_state_.randint( - 10 * (1 + self.counts_.min())) == 0 + random_reassign = self._random_state.randint( + 10 * (1 + self._counts.min())) == 0 distances = np.zeros(X.shape[0], dtype=X.dtype) # Raise error if partial_fit called on data with different number @@ -1826,10 +1846,10 @@ def partial_fit(self, X, y=None, sample_weight=None): "data %d." % (X.shape[1], self.cluster_centers_.shape[1])) _mini_batch_step(X, sample_weight, x_squared_norms, - self.cluster_centers_, self.counts_, + self.cluster_centers_, self._counts, np.zeros(0, dtype=X.dtype), 0, random_reassign=random_reassign, distances=distances, - random_state=self.random_state_, + random_state=self._random_state, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 955a52dfdb413..d40a224581321 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -1108,3 +1108,16 @@ def test_sample_weight_unchanged(): # internally, sample_weight is rescale to sum up to n_samples = 3 assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) + + +@pytest.mark.parametrize("attr", ["counts_", "init_size_"]) +def test_minibatch_kmeans_deprecated_attributes(attr): + # check that we raise a deprecation warning when accessing `init_size_` + # FIXME: remove in 0.26 + depr_msg = (f"The attribute '{attr}' is deprecated in 0.24 and will be " + f"removed in 0.26.") + km = MiniBatchKMeans(n_clusters=2, n_init=1, init='random', random_state=0) + km.fit(X) + + with pytest.warns(FutureWarning, match=depr_msg): + getattr(km, attr) From 68faa0e3cbfb2b187fb7b8608488142fbaf9be46 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 8 Jul 2020 16:26:29 +0200 Subject: [PATCH 02/10] fix mypy --- sklearn/cluster/_kmeans.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 917e6ddb3e559..fe8ee963e9b83 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1521,9 +1521,15 @@ class MiniBatchKMeans(KMeans): counts_ : ndarray of shape (n_clusters,) Weigth sum of each cluster. + .. deprecated:: 0.24 + This attribute is deprecated in 0.24 and will be removed in 0.26. + init_size_ : int The effective number of samples used for the initialization. + .. deprecated:: 0.24 + This attribute is deprecated in 0.24 and will be removed in 0.26. + See Also -------- KMeans @@ -1581,14 +1587,14 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.init_size = init_size self.reassignment_ratio = reassignment_ratio - @deprecated("The attribute 'counts_' is deprecated in 0.24 and will be " - "removed in 0.26.") + @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore + " and will be removed in 0.26.") @property def counts_(self): return self._counts - @deprecated("The attribute 'init_size_' is deprecated in 0.24 and will be " - "removed in 0.26.") + @deprecated("The attribute 'init_size_' is deprecated in " # type: ignore + "0.24 and will be removed in 0.26.") @property def init_size_(self): return self._init_size From 9fbc07891d142239116231a3c27ea63631ee4047 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 9 Jul 2020 14:04:57 +0200 Subject: [PATCH 03/10] use private attrs in tests --- sklearn/cluster/tests/test_k_means.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index d40a224581321..7a11755169a27 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -465,7 +465,7 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), False, distances=np.zeros(X.shape[0]), random_reassign=True, random_state=42, @@ -485,7 +485,7 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), False, distances=np.zeros(X.shape[0]), random_reassign=True, random_state=42, @@ -543,7 +543,7 @@ def test_minibatch_default_init_size(): mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, batch_size=10, random_state=42, n_init=1).fit(X) - assert mb_k_means.init_size_ == 3 * mb_k_means.batch_size + assert mb_k_means._init_size == 3 * mb_k_means.batch_size _check_fitted_model(mb_k_means) @@ -558,7 +558,7 @@ def test_minibatch_set_init_size(): init_size=666, random_state=42, n_init=1).fit(X) assert mb_k_means.init_size == 666 - assert mb_k_means.init_size_ == n_samples + assert mb_k_means._init_size == n_samples _check_fitted_model(mb_k_means) From 4eb30c36b3e85a038c9bd61bd4faab031dc823a0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 12:07:30 +0200 Subject: [PATCH 04/10] what's new --- doc/whats_new/v0.24.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 85b5a12e7b20a..465ab7dfbc6df 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -54,13 +54,17 @@ Changelog :user:`Lucy Liu `. :mod:`sklearn.cluster` -......................... +...................... - |Fix| Fixed a bug in :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` where the reported inertia was incorrectly weighted by the sample weights. :pr:`17848` by :user:`Jérémie du Boisberranger `. +- |API| :class:`cluster.MiniBatchKMeans` attributes, `counts_` and + `init_size_`, are deprecated and will be removed in 0.26. :pr:`17864` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.covariance` ......................... From d92f80c1b9c7b15186567c118c552516a3e674b6 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 12:23:33 +0200 Subject: [PATCH 05/10] document n_iter_ --- sklearn/cluster/_kmeans.py | 3 +++ sklearn/tests/test_docstring_parameters.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f80fe2c4ca198..5ef86ee8bf4eb 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1532,6 +1532,9 @@ class MiniBatchKMeans(KMeans): defined as the sum of square distances of samples to their nearest neighbor. + n_iter_ : int + Number of batches processed. + counts_ : ndarray of shape (n_clusters,) Weigth sum of each cluster. diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 67b8fdb2c0c25..96ba9c6d204c7 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -235,7 +235,6 @@ def test_fit_docstring_attributes(name, Estimator): IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'CategoricalNB', 'KernelCenterer', 'LarsCV', 'Lasso', 'LassoLarsCV', 'LassoLarsIC', - 'MiniBatchKMeans', 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSSVD', 'PassiveAggressiveClassifier'} From 5a6bb0f9e4c591069707336c98ffaed5e2994051 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 12:35:24 +0200 Subject: [PATCH 06/10] remove working estimators from blacklist --- sklearn/tests/test_docstring_parameters.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 96ba9c6d204c7..ec6b530aef1fb 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -232,12 +232,10 @@ def test_fit_docstring_attributes(name, Estimator): with ignore_warnings(category=FutureWarning): assert hasattr(est, attr.name) - IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'CategoricalNB', - 'KernelCenterer', + IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'LarsCV', 'Lasso', 'LassoLarsCV', 'LassoLarsIC', 'OrthogonalMatchingPursuit', - 'PLSCanonical', 'PLSSVD', - 'PassiveAggressiveClassifier'} + 'PLSCanonical', 'PLSSVD'} if Estimator.__name__ in IGNORED: pytest.xfail( From e85d262ed5388e96da8c40df184fb5f4ed529530 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 12:43:26 +0200 Subject: [PATCH 07/10] .. --- sklearn/tests/test_docstring_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index ec6b530aef1fb..a48af83b15a7a 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -233,7 +233,7 @@ def test_fit_docstring_attributes(name, Estimator): assert hasattr(est, attr.name) IGNORED = {'BayesianRidge', 'Birch', 'CCA', - 'LarsCV', 'Lasso', 'LassoLarsCV', 'LassoLarsIC', + 'LarsCV', 'Lasso', 'LassoLarsIC', 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSSVD'} From 2c080960c51344639cd9bbd609d4dab5d9eeb6b1 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 10 Jul 2020 19:04:22 +0200 Subject: [PATCH 08/10] bad conflict resolution --- sklearn/cluster/_kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 5ef86ee8bf4eb..24c1e0e714122 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1857,7 +1857,7 @@ def partial_fit(self, X, y=None, sample_weight=None): order='C', accept_large_sparse=False, reset=is_first_call_to_partial_fit) - self.random_state_ = getattr(self, "random_state_", + self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) @@ -1876,7 +1876,7 @@ def partial_fit(self, X, y=None, sample_weight=None): # initialize the cluster centers self.cluster_centers_ = _init_centroids( X, self.n_clusters, init, - random_state=self.random_state_, + random_state=self._random_state, x_squared_norms=x_squared_norms, init_size=self.init_size) self._counts = np.zeros(self.n_clusters, From 1a45080df05a032cdc5578ad41d3a4d5d7ee12cf Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Sat, 11 Jul 2020 17:09:36 +0200 Subject: [PATCH 09/10] deprecate random_state_ --- sklearn/cluster/_kmeans.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 24c1e0e714122..d0c9ba84a0fa4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1616,6 +1616,12 @@ def counts_(self): def init_size_(self): return self._init_size + @deprecated("The attribute 'random_state_' is deprecated " # type: ignore + "in 0.24 and will be removed in 0.26.") + @property + def random_state_(self): + return getattr(self, "_random_state", None) + def _check_params(self, X): super()._check_params(X) From 59f1dff8c07e18107782768381384ba31299fd7a Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Sat, 11 Jul 2020 17:10:44 +0200 Subject: [PATCH 10/10] with a test --- sklearn/cluster/tests/test_k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 31ee89d5ff73d..5d3829610f203 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -933,7 +933,7 @@ def test_n_jobs_deprecated(n_jobs): kmeans.fit(X) -@pytest.mark.parametrize("attr", ["counts_", "init_size_"]) +@pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) def test_minibatch_kmeans_deprecated_attributes(attr): # check that we raise a deprecation warning when accessing `init_size_` # FIXME: remove in 0.26