diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index b7f53eb3198c6..7b1e3109a9501 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -54,7 +54,7 @@ Changelog :user:`Lucy Liu `. :mod:`sklearn.cluster` -....................... +...................... - |Fix| Fixed a bug in :class:`cluster.MeanShift` with `bin_seeding=True`. When the estimated bandwidth is 0, the behavior is equivalent to @@ -66,6 +66,10 @@ Changelog weighted by the sample weights. :pr:`17848` by :user:`Jérémie du Boisberranger `. +- |API| :class:`cluster.MiniBatchKMeans` attributes, `counts_` and + `init_size_`, are deprecated and will be removed in 0.26. :pr:`17864` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.covariance` ......................... diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7b9a31d8d720b..d0c9ba84a0fa4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -26,6 +26,7 @@ from ..utils import check_array from ..utils import gen_batches from ..utils import check_random_state +from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads from ..exceptions import ConvergenceWarning @@ -1531,6 +1532,21 @@ class MiniBatchKMeans(KMeans): defined as the sum of square distances of samples to their nearest neighbor. + n_iter_ : int + Number of batches processed. + + counts_ : ndarray of shape (n_clusters,) + Weight sum of each cluster. + + .. deprecated:: 0.24 + This attribute is deprecated in 0.24 and will be removed in 0.26. + + init_size_ : int + The effective number of samples used for the initialization. + + .. deprecated:: 0.24 + This attribute is deprecated in 0.24 and will be removed in 0.26. 
+ See Also -------- KMeans @@ -1588,6 +1604,24 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.init_size = init_size self.reassignment_ratio = reassignment_ratio + @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore + " and will be removed in 0.26.") + @property + def counts_(self): + return self._counts + + @deprecated("The attribute 'init_size_' is deprecated in " # type: ignore + "0.24 and will be removed in 0.26.") + @property + def init_size_(self): + return self._init_size + + @deprecated("The attribute 'random_state_' is deprecated " # type: ignore + "in 0.24 and will be removed in 0.26.") + @property + def random_state_(self): + return getattr(self, "_random_state", None) + def _check_params(self, X): super()._check_params(X) @@ -1619,8 +1653,6 @@ def _check_params(self, X): RuntimeWarning, stacklevel=2) self._init_size = 3 * self.n_clusters self._init_size = min(self._init_size, X.shape[0]) - # FIXME: init_size_ will be deprecated and this line will be removed - self.init_size_ = self._init_size # reassignment_ratio if self.reassignment_ratio < 0: @@ -1727,7 +1759,7 @@ def fit(self, X, y=None, sample_weight=None): % (init_idx + 1, self._n_init, inertia)) if best_inertia is None or inertia < best_inertia: self.cluster_centers_ = cluster_centers - self.counts_ = weight_sums + self._counts = weight_sums best_inertia = inertia # Empty context to be used inplace by the convergence check routine @@ -1744,7 +1776,7 @@ def fit(self, X, y=None, sample_weight=None): batch_inertia, centers_squared_diff = _mini_batch_step( X[minibatch_indices], sample_weight[minibatch_indices], x_squared_norms[minibatch_indices], - self.cluster_centers_, self.counts_, + self.cluster_centers_, self._counts, old_center_buffer, tol > 0.0, distances=distances, # Here we randomly choose whether to perform # random reassignment: the choice is done as a function @@ -1752,7 +1784,7 @@ def fit(self, X, y=None, sample_weight=None): # counts, in 
order to force this reassignment to happen # every once in a while random_reassign=((iteration_idx + 1) - % (10 + int(self.counts_.min())) == 0), + % (10 + int(self._counts.min())) == 0), random_state=random_state, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) @@ -1831,7 +1863,7 @@ def partial_fit(self, X, y=None, sample_weight=None): order='C', accept_large_sparse=False, reset=is_first_call_to_partial_fit) - self.random_state_ = getattr(self, "random_state_", + self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) @@ -1850,10 +1882,10 @@ def partial_fit(self, X, y=None, sample_weight=None): # initialize the cluster centers self.cluster_centers_ = _init_centroids( X, self.n_clusters, init, - random_state=self.random_state_, + random_state=self._random_state, x_squared_norms=x_squared_norms, init_size=self.init_size) - self.counts_ = np.zeros(self.n_clusters, + self._counts = np.zeros(self.n_clusters, dtype=sample_weight.dtype) random_reassign = False distances = None @@ -1861,15 +1893,15 @@ def partial_fit(self, X, y=None, sample_weight=None): # The lower the minimum count is, the more we do random # reassignment, however, we don't want to do random # reassignment too often, to allow for building up counts - random_reassign = self.random_state_.randint( - 10 * (1 + self.counts_.min())) == 0 + random_reassign = self._random_state.randint( + 10 * (1 + self._counts.min())) == 0 distances = np.zeros(X.shape[0], dtype=X.dtype) _mini_batch_step(X, sample_weight, x_squared_norms, - self.cluster_centers_, self.counts_, + self.cluster_centers_, self._counts, np.zeros(0, dtype=X.dtype), 0, random_reassign=random_reassign, distances=distances, - random_state=self.random_state_, + random_state=self._random_state, reassignment_ratio=self.reassignment_ratio, verbose=self.verbose) diff --git a/sklearn/cluster/tests/test_k_means.py 
b/sklearn/cluster/tests/test_k_means.py index 39b2dd5920d67..5d3829610f203 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -434,7 +434,7 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), False, distances=np.zeros(X.shape[0]), random_reassign=True, random_state=42, @@ -454,7 +454,7 @@ def test_minibatch_reassign(): # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, - mb_k_means.counts_, + mb_k_means._counts, np.zeros(X.shape[1], np.double), False, distances=np.zeros(X.shape[0]), random_reassign=True, random_state=42, @@ -529,7 +529,7 @@ def test_minibatch_set_init_size(): init_size=666, random_state=42, n_init=1).fit(X) assert mb_k_means.init_size == 666 - assert mb_k_means.init_size_ == n_samples + assert mb_k_means._init_size == n_samples _check_fitted_model(mb_k_means) @@ -933,6 +933,19 @@ def test_n_jobs_deprecated(n_jobs): kmeans.fit(X) +@pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) +def test_minibatch_kmeans_deprecated_attributes(attr): + # check that accessing a deprecated attribute raises a FutureWarning + # FIXME: remove in 0.26 + depr_msg = (f"The attribute '{attr}' is deprecated in 0.24 and will be " + f"removed in 0.26.") + km = MiniBatchKMeans(n_clusters=2, n_init=1, init='random', random_state=0) + km.fit(X) + + with pytest.warns(FutureWarning, match=depr_msg): + getattr(km, attr) + + def test_warning_elkan_1_cluster(): X, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=0) kmeans = KMeans(n_clusters=1, n_init=1, init='random', random_state=0, diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 
a4eb056d81e42..a48af83b15a7a 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -232,13 +232,10 @@ def test_fit_docstring_attributes(name, Estimator): with ignore_warnings(category=FutureWarning): assert hasattr(est, attr.name) - IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'CategoricalNB', - 'KernelCenterer', + IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'LarsCV', 'Lasso', 'LassoLarsIC', - 'MiniBatchKMeans', 'OrthogonalMatchingPursuit', - 'PLSCanonical', 'PLSSVD', - 'PassiveAggressiveClassifier'} + 'PLSCanonical', 'PLSSVD'} if Estimator.__name__ in IGNORED: pytest.xfail(