From 60a305f10391cd2be14a8792d9dce3bdb8bc4d55 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 13:16:23 +0200 Subject: [PATCH 1/8] TST enable tests K-means for MacOS --- sklearn/cluster/tests/test_k_means.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 4e71cd1f35645..7f99d345ed774 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -339,19 +339,6 @@ def test_fortran_aligned_data(Estimator): ) def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol): # check that fit.predict gives same result as fit_predict - # There's a very small chance of failure with elkan on unstructured dataset - # because predict method uses fast euclidean distances computation which - # may cause small numerical instabilities. - # NB: This test is largely redundant with respect to test_predict and - # test_predict_equal_labels. This test has the added effect of - # testing idempotence of the fittng procesdure which appears to - # be where it fails on some MacOS setups. - if sys.platform == "darwin": - pytest.xfail( - "Known failures on MacOS, See " - "https://github.com/scikit-learn/scikit-learn/issues/12644" - ) - rng = np.random.RandomState(seed) X = make_blobs(n_samples=1000, n_features=10, centers=10, random_state=rng)[ @@ -633,16 +620,6 @@ def test_score_max_iter(Estimator): def test_predict(Estimator, algorithm, init, dtype, array_constr): # Check the predict method and the equivalence between fit.predict and # fit_predict. - - # There's a very small chance of failure with elkan on unstructured dataset - # because predict method uses fast euclidean distances computation which - # may cause small numerical instabilities. 
- if sys.platform == "darwin": - pytest.xfail( - "Known failures on MacOS, See " - "https://github.com/scikit-learn/scikit-learn/issues/12644" - ) - X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) X = array_constr(X) From 393b9da19a1b244e281d780a242930e979d67f73 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 13:18:35 +0200 Subject: [PATCH 2/8] [cd build] build everything From 10ac47fc5ec5aef176b7a9e370adfe2ba041e764 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 14:40:23 +0200 Subject: [PATCH 3/8] address jeremie comments --- sklearn/cluster/tests/test_k_means.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 7f99d345ed774..764c6e5612bfb 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -642,13 +642,6 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): pred = km.predict(km.cluster_centers_) assert_array_equal(pred, np.arange(10)) - # With n_init > 1 - # Due to randomness in the order in which chunks of data are processed when - # using more than one thread, there might be different rounding errors for - # the computation of the inertia between 2 runs. This might result in a - # different ranking of 2 inits, hence a different labeling, even if they - # give the same clustering. We only check the labels up to a permutation. 
- km = Estimator(n_clusters=10, init=init, n_init=10, random_state=0) if algorithm is not None: km.set_params(algorithm=algorithm) @@ -657,11 +650,11 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): # re-predict labels for training set using predict pred = km.predict(X) - assert_allclose(v_measure_score(pred, labels), 1) + assert_array_equal(pred, labels) # re-predict labels for training set using fit_predict pred = km.fit_predict(X) - assert_allclose(v_measure_score(pred, labels), 1) + assert_array_equal(pred, labels) # predict centroid labels pred = km.predict(km.cluster_centers_) From 7b7d7cc8ca8d792b216b9f46fd4c96a61756a215 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 14:42:03 +0200 Subject: [PATCH 4/8] whoops --- sklearn/cluster/tests/test_k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 764c6e5612bfb..62e0e828c19ba 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -658,7 +658,7 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): # predict centroid labels pred = km.predict(km.cluster_centers_) - assert_allclose(v_measure_score(pred, np.arange(10)), 1) + assert_array_equal(pred, np.arange(10)) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) From b6224047f0a385947d6b43e9692af3fbb7eccdbd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 15:09:01 +0200 Subject: [PATCH 5/8] [cd build] trigger CD From 2bbd2db24219e04cb569721d40cd1f75e88e262c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 16:01:38 +0200 Subject: [PATCH 6/8] [cd build] clean up --- sklearn/cluster/tests/test_k_means.py | 32 ++++----------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 
62e0e828c19ba..021516d740b0d 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -352,12 +352,7 @@ def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol): labels_1 = kmeans.fit(X).predict(X) labels_2 = kmeans.fit_predict(X) - - # Due to randomness in the order in which chunks of data are processed when - # using more than one thread, the absolute values of the labels can be - # different between the 2 strategies but they should correspond to the same - # clustering. - assert v_measure_score(labels_1, labels_2) == pytest.approx(1, abs=1e-15) + assert_array_equal(labels_1, labels_2) def test_minibatch_kmeans_verbose(): @@ -623,25 +618,6 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0) X = array_constr(X) - # With n_init = 1 - km = Estimator(n_clusters=10, init=init, n_init=1, random_state=0) - if algorithm is not None: - km.set_params(algorithm=algorithm) - km.fit(X) - labels = km.labels_ - - # re-predict labels for training set using predict - pred = km.predict(X) - assert_array_equal(pred, labels) - - # re-predict labels for training set using fit_predict - pred = km.fit_predict(X) - assert_array_equal(pred, labels) - - # predict centroid labels - pred = km.predict(km.cluster_centers_) - assert_array_equal(pred, np.arange(10)) - km = Estimator(n_clusters=10, init=init, n_init=10, random_state=0) if algorithm is not None: km.set_params(algorithm=algorithm) @@ -715,7 +691,7 @@ def test_integer_input(Estimator, array_constr, dtype, init): assert km.cluster_centers_.dtype == np.float64 expected_labels = [0, 1, 1, 0, 0, 1] - assert_allclose(v_measure_score(km.labels_, expected_labels), 1) + assert_array_equal(km.labels_, expected_labels) # Same with partial_fit (#14314) if Estimator is MiniBatchKMeans: @@ -774,8 +750,8 @@ def test_k_means_function(): assert cluster_centers.shape == (n_clusters, n_features) 
assert np.unique(labels).shape[0] == n_clusters - # check that the labels assignment are perfect (up to a permutation) - assert_allclose(v_measure_score(true_labels, labels), 1.0) + # check that the labels assignment are perfect + assert_array_equal(true_labels, labels) assert inertia > 0.0 From a293e108bbdf41565c6b68299a5c7813600d26b2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 16:29:19 +0200 Subject: [PATCH 7/8] Update test_k_means.py --- sklearn/cluster/tests/test_k_means.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 021516d740b0d..cbac3974c1e11 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -750,8 +750,8 @@ def test_k_means_function(): assert cluster_centers.shape == (n_clusters, n_features) assert np.unique(labels).shape[0] == n_clusters - # check that the labels assignment are perfect - assert_array_equal(true_labels, labels) + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) assert inertia > 0.0 From 207c9df5499c79ec749e2051820752f3ca7bdaa8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 15 Sep 2021 16:36:34 +0200 Subject: [PATCH 8/8] [cd build] check CD