TST enable tests K-means for MacOS #21052

Merged: 8 commits, merged Sep 15, 2021
60 changes: 3 additions & 57 deletions sklearn/cluster/tests/test_k_means.py
@@ -339,19 +339,6 @@ def test_fortran_aligned_data(Estimator):
)
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
# check that fit.predict gives same result as fit_predict
# There's a very small chance of failure with elkan on unstructured dataset
# because predict method uses fast euclidean distances computation which
# may cause small numerical instabilities.
# NB: This test is largely redundant with respect to test_predict and
# test_predict_equal_labels. This test has the added effect of
# testing idempotence of the fitting procedure which appears to
# be where it fails on some MacOS setups.
if sys.platform == "darwin":
pytest.xfail(
"Known failures on MacOS, See "
"https://github.com/scikit-learn/scikit-learn/issues/12644"
)

rng = np.random.RandomState(seed)

X = make_blobs(n_samples=1000, n_features=10, centers=10, random_state=rng)[
@@ -365,12 +352,7 @@ def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):

labels_1 = kmeans.fit(X).predict(X)
labels_2 = kmeans.fit_predict(X)

# Due to randomness in the order in which chunks of data are processed when
# using more than one thread, the absolute values of the labels can be
# different between the 2 strategies but they should correspond to the same
# clustering.
assert v_measure_score(labels_1, labels_2) == pytest.approx(1, abs=1e-15)
assert_array_equal(labels_1, labels_2)


def test_minibatch_kmeans_verbose():
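
The substantive change in this hunk is the assertion at the end: the old test only required the two labelings to describe the same clustering (a v_measure_score of 1 is invariant to a permutation of the cluster ids), while the new test requires the label arrays to match element-wise. A minimal sketch of the difference, using made-up label arrays rather than anything from the diff:

```python
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.metrics import v_measure_score

labels_1 = np.array([0, 0, 1, 1, 2, 2])
labels_2 = np.array([2, 2, 0, 0, 1, 1])  # same partition, permuted cluster ids

# Permutation-invariant check (old assertion style): passes, the partitions agree.
assert np.isclose(v_measure_score(labels_1, labels_2), 1.0)

# Strict element-wise check (new assertion style): fails here, because it also
# requires the cluster ids themselves to match.
try:
    assert_array_equal(labels_1, labels_2)
except AssertionError:
    print("labels differ element-wise even though the clustering is the same")
```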
@@ -633,21 +615,10 @@ def test_score_max_iter(Estimator):
def test_predict(Estimator, algorithm, init, dtype, array_constr):
# Check the predict method and the equivalence between fit.predict and
# fit_predict.

# There's a very small chance of failure with elkan on unstructured dataset
# because predict method uses fast euclidean distances computation which
# may cause small numerical instabilities.
if sys.platform == "darwin":
pytest.xfail(
"Known failures on MacOS, See "
"https://github.com/scikit-learn/scikit-learn/issues/12644"
)

X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0)
X = array_constr(X)

# With n_init = 1
km = Estimator(n_clusters=10, init=init, n_init=1, random_state=0)
km = Estimator(n_clusters=10, init=init, n_init=10, random_state=0)
if algorithm is not None:
km.set_params(algorithm=algorithm)
km.fit(X)
@@ -665,31 +636,6 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr):
pred = km.predict(km.cluster_centers_)
assert_array_equal(pred, np.arange(10))

# With n_init > 1
# Due to randomness in the order in which chunks of data are processed when
# using more than one thread, there might be different rounding errors for
# the computation of the inertia between 2 runs. This might result in a
# different ranking of 2 inits, hence a different labeling, even if they
# give the same clustering. We only check the labels up to a permutation.

km = Estimator(n_clusters=10, init=init, n_init=10, random_state=0)
if algorithm is not None:
km.set_params(algorithm=algorithm)
km.fit(X)
labels = km.labels_

# re-predict labels for training set using predict
pred = km.predict(X)
assert_allclose(v_measure_score(pred, labels), 1)

# re-predict labels for training set using fit_predict
pred = km.fit_predict(X)
assert_allclose(v_measure_score(pred, labels), 1)

# predict centroid labels
pred = km.predict(km.cluster_centers_)
assert_allclose(v_measure_score(pred, np.arange(10)), 1)


@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
def test_dense_sparse(Estimator):
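
The test_predict hunks above drop the macOS xfail and the separate n_init > 1 branch, keeping a single fit with n_init=10 followed by exact label comparisons, including the check that predicting the fitted centers recovers labels 0 through 9. A standalone sketch of that centroid self-prediction property (the estimator and blob parameters mirror the test, but this is an illustration, not the test code itself): each fitted center is its own nearest centroid, so predicting the centers returns 0..n_clusters-1 in order.

```python
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0)

km = KMeans(n_clusters=10, n_init=10, random_state=0).fit(X)

# Each fitted center is closest to itself, so predicting the centers yields 0..9.
assert_array_equal(km.predict(km.cluster_centers_), np.arange(10))
```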
Expand Down Expand Up @@ -745,7 +691,7 @@ def test_integer_input(Estimator, array_constr, dtype, init):
assert km.cluster_centers_.dtype == np.float64

expected_labels = [0, 1, 1, 0, 0, 1]
assert_allclose(v_measure_score(km.labels_, expected_labels), 1)
assert_array_equal(km.labels_, expected_labels)

# Same with partial_fit (#14314)
if Estimator is MiniBatchKMeans:
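
For the last hunk, a hedged sketch of the behaviour test_integer_input relies on: KMeans accepts integer input, stores its cluster centers as float64, and with a fixed random_state produces deterministic labels that can now be compared exactly. The data and parameters below are illustrative, not copied from the test:

```python
import numpy as np
from sklearn.cluster import KMeans

# Integer input is accepted; centers are computed and stored as float64.
X = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 11]], dtype=np.int64)

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
assert km.cluster_centers_.dtype == np.float64
print(km.labels_)  # e.g. [0, 1, 1, 0, 0, 1], up to a permutation of the two ids
```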