Skip to content

Commit 5d2107e

Browse files
authored
DOC document and deprecate missing attributes in MiniBatchKMeans (#17864)
1 parent 89e49b6 commit 5d2107e

File tree

4 files changed

+67
-21
lines changed

4 files changed

+67
-21
lines changed

doc/whats_new/v0.24.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ Changelog
5454
:user:`Lucy Liu <lucyleeow>`.
5555

5656
:mod:`sklearn.cluster`
57-
.......................
57+
......................
5858

5959
- |Fix| Fixed a bug in :class:`cluster.MeanShift` with `bin_seeding=True`. When
6060
the estimated bandwidth is 0, the behavior is equivalent to
@@ -66,6 +66,10 @@ Changelog
6666
weighted by the sample weights. :pr:`17848` by
6767
:user:`Jérémie du Boisberranger <jeremiedbb>`.
6868

69+
- |API| :class:`cluster.MiniBatchKMeans` attributes `counts_`, `init_size_`
70+
and `random_state_` are deprecated and will be removed in 0.26. :pr:`17864` by
71+
:user:`Jérémie du Boisberranger <jeremiedbb>`.
72+
6973
:mod:`sklearn.covariance`
7074
.........................
7175

sklearn/cluster/_kmeans.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from ..utils import check_array
2727
from ..utils import gen_batches
2828
from ..utils import check_random_state
29+
from ..utils import deprecated
2930
from ..utils.validation import check_is_fitted, _check_sample_weight
3031
from ..utils._openmp_helpers import _openmp_effective_n_threads
3132
from ..exceptions import ConvergenceWarning
@@ -1531,6 +1532,21 @@ class MiniBatchKMeans(KMeans):
15311532
defined as the sum of square distances of samples to their nearest
15321533
neighbor.
15331534
1535+
n_iter_ : int
1536+
Number of batches processed.
1537+
1538+
counts_ : ndarray of shape (n_clusters,)
1539+
Weight sum of each cluster.
1540+
1541+
.. deprecated:: 0.24
1542+
This attribute is deprecated in 0.24 and will be removed in 0.26.
1543+
1544+
init_size_ : int
1545+
The effective number of samples used for the initialization.
1546+
1547+
.. deprecated:: 0.24
1548+
This attribute is deprecated in 0.24 and will be removed in 0.26.
1549+
15341550
See Also
15351551
--------
15361552
KMeans
@@ -1588,6 +1604,24 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100,
15881604
self.init_size = init_size
15891605
self.reassignment_ratio = reassignment_ratio
15901606

1607+
@deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore
1608+
" and will be removed in 0.26.")
1609+
@property
1610+
def counts_(self):
1611+
return self._counts
1612+
1613+
@deprecated("The attribute 'init_size_' is deprecated in " # type: ignore
1614+
"0.24 and will be removed in 0.26.")
1615+
@property
1616+
def init_size_(self):
1617+
return self._init_size
1618+
1619+
@deprecated("The attribute 'random_state_' is deprecated " # type: ignore
1620+
"in 0.24 and will be removed in 0.26.")
1621+
@property
1622+
def random_state_(self):
1623+
return getattr(self, "_random_state", None)
1624+
15911625
def _check_params(self, X):
15921626
super()._check_params(X)
15931627

@@ -1619,8 +1653,6 @@ def _check_params(self, X):
16191653
RuntimeWarning, stacklevel=2)
16201654
self._init_size = 3 * self.n_clusters
16211655
self._init_size = min(self._init_size, X.shape[0])
1622-
# FIXME: init_size_ will be deprecated and this line will be removed
1623-
self.init_size_ = self._init_size
16241656

16251657
# reassignment_ratio
16261658
if self.reassignment_ratio < 0:
@@ -1727,7 +1759,7 @@ def fit(self, X, y=None, sample_weight=None):
17271759
% (init_idx + 1, self._n_init, inertia))
17281760
if best_inertia is None or inertia < best_inertia:
17291761
self.cluster_centers_ = cluster_centers
1730-
self.counts_ = weight_sums
1762+
self._counts = weight_sums
17311763
best_inertia = inertia
17321764

17331765
# Empty context to be used inplace by the convergence check routine
@@ -1744,15 +1776,15 @@ def fit(self, X, y=None, sample_weight=None):
17441776
batch_inertia, centers_squared_diff = _mini_batch_step(
17451777
X[minibatch_indices], sample_weight[minibatch_indices],
17461778
x_squared_norms[minibatch_indices],
1747-
self.cluster_centers_, self.counts_,
1779+
self.cluster_centers_, self._counts,
17481780
old_center_buffer, tol > 0.0, distances=distances,
17491781
# Here we randomly choose whether to perform
17501782
# random reassignment: the choice is done as a function
17511783
# of the iteration index, and the minimum number of
17521784
# counts, in order to force this reassignment to happen
17531785
# every once in a while
17541786
random_reassign=((iteration_idx + 1)
1755-
% (10 + int(self.counts_.min())) == 0),
1787+
% (10 + int(self._counts.min())) == 0),
17561788
random_state=random_state,
17571789
reassignment_ratio=self.reassignment_ratio,
17581790
verbose=self.verbose)
@@ -1831,7 +1863,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
18311863
order='C', accept_large_sparse=False,
18321864
reset=is_first_call_to_partial_fit)
18331865

1834-
self.random_state_ = getattr(self, "random_state_",
1866+
self._random_state = getattr(self, "_random_state",
18351867
check_random_state(self.random_state))
18361868
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
18371869

@@ -1850,26 +1882,26 @@ def partial_fit(self, X, y=None, sample_weight=None):
18501882
# initialize the cluster centers
18511883
self.cluster_centers_ = _init_centroids(
18521884
X, self.n_clusters, init,
1853-
random_state=self.random_state_,
1885+
random_state=self._random_state,
18541886
x_squared_norms=x_squared_norms, init_size=self.init_size)
18551887

1856-
self.counts_ = np.zeros(self.n_clusters,
1888+
self._counts = np.zeros(self.n_clusters,
18571889
dtype=sample_weight.dtype)
18581890
random_reassign = False
18591891
distances = None
18601892
else:
18611893
# The lower the minimum count is, the more we do random
18621894
# reassignment, however, we don't want to do random
18631895
# reassignment too often, to allow for building up counts
1864-
random_reassign = self.random_state_.randint(
1865-
10 * (1 + self.counts_.min())) == 0
1896+
random_reassign = self._random_state.randint(
1897+
10 * (1 + self._counts.min())) == 0
18661898
distances = np.zeros(X.shape[0], dtype=X.dtype)
18671899

18681900
_mini_batch_step(X, sample_weight, x_squared_norms,
1869-
self.cluster_centers_, self.counts_,
1901+
self.cluster_centers_, self._counts,
18701902
np.zeros(0, dtype=X.dtype), 0,
18711903
random_reassign=random_reassign, distances=distances,
1872-
random_state=self.random_state_,
1904+
random_state=self._random_state,
18731905
reassignment_ratio=self.reassignment_ratio,
18741906
verbose=self.verbose)
18751907

sklearn/cluster/tests/test_k_means.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ def test_minibatch_reassign():
434434
# Turn on verbosity to smoke test the display code
435435
_mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1),
436436
mb_k_means.cluster_centers_,
437-
mb_k_means.counts_,
437+
mb_k_means._counts,
438438
np.zeros(X.shape[1], np.double),
439439
False, distances=np.zeros(X.shape[0]),
440440
random_reassign=True, random_state=42,
@@ -454,7 +454,7 @@ def test_minibatch_reassign():
454454
# Turn on verbosity to smoke test the display code
455455
_mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1),
456456
mb_k_means.cluster_centers_,
457-
mb_k_means.counts_,
457+
mb_k_means._counts,
458458
np.zeros(X.shape[1], np.double),
459459
False, distances=np.zeros(X.shape[0]),
460460
random_reassign=True, random_state=42,
@@ -529,7 +529,7 @@ def test_minibatch_set_init_size():
529529
init_size=666, random_state=42,
530530
n_init=1).fit(X)
531531
assert mb_k_means.init_size == 666
532-
assert mb_k_means.init_size_ == n_samples
532+
assert mb_k_means._init_size == n_samples
533533
_check_fitted_model(mb_k_means)
534534

535535

@@ -933,6 +933,19 @@ def test_n_jobs_deprecated(n_jobs):
933933
kmeans.fit(X)
934934

935935

936+
@pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"])
937+
def test_minibatch_kmeans_deprecated_attributes(attr):
938+
# check that we raise a deprecation warning when accessing a deprecated attribute
939+
# FIXME: remove in 0.26
940+
depr_msg = (f"The attribute '{attr}' is deprecated in 0.24 and will be "
941+
f"removed in 0.26.")
942+
km = MiniBatchKMeans(n_clusters=2, n_init=1, init='random', random_state=0)
943+
km.fit(X)
944+
945+
with pytest.warns(FutureWarning, match=depr_msg):
946+
getattr(km, attr)
947+
948+
936949
def test_warning_elkan_1_cluster():
937950
X, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=0)
938951
kmeans = KMeans(n_clusters=1, n_init=1, init='random', random_state=0,

sklearn/tests/test_docstring_parameters.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -232,13 +232,10 @@ def test_fit_docstring_attributes(name, Estimator):
232232
with ignore_warnings(category=FutureWarning):
233233
assert hasattr(est, attr.name)
234234

235-
IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'CategoricalNB',
236-
'KernelCenterer',
235+
IGNORED = {'BayesianRidge', 'Birch', 'CCA',
237236
'LarsCV', 'Lasso', 'LassoLarsIC',
238-
'MiniBatchKMeans',
239237
'OrthogonalMatchingPursuit',
240-
'PLSCanonical', 'PLSSVD',
241-
'PassiveAggressiveClassifier'}
238+
'PLSCanonical', 'PLSSVD'}
242239

243240
if Estimator.__name__ in IGNORED:
244241
pytest.xfail(

0 commit comments

Comments
 (0)