From 83013911df8578877db30d6b94f1b6bd2a72c6a3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Oct 2018 13:20:21 +0200 Subject: [PATCH 1/5] fix missing assert --- sklearn/cluster/tests/test_k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 5994c770db9c9..8ea2153b079cc 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -714,7 +714,7 @@ def test_full_vs_elkan(): km1.fit(X) km2.fit(X) - homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0 + assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0 def test_n_init(): From da049f2c2847cfad810371746aaac15bdb67e0f5 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Oct 2018 13:27:05 +0200 Subject: [PATCH 2/5] refactor kmeans init tests --- sklearn/cluster/tests/test_k_means.py | 78 +++++---------------------- 1 file changed, 13 insertions(+), 65 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 8ea2153b079cc..57c3fdf9eb2db 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -181,12 +181,6 @@ def _check_fitted_model(km): % km.n_clusters, km.fit, [[0., 1.]]) -def test_k_means_plus_plus_init(): - km = KMeans(init="k-means++", n_clusters=n_clusters, - random_state=42).fit(X) - _check_fitted_model(km) - - def test_k_means_new_centers(): # Explore the part of the code where a new center is reassigned X = np.array([[0, 0, 1, 1], @@ -229,24 +223,6 @@ def test_k_means_precompute_distances_flag(): assert_raises(ValueError, km.fit, X) -def test_k_means_plus_plus_init_sparse(): - km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42) - km.fit(X_csr) - _check_fitted_model(km) - - -def test_k_means_random_init(): - km = KMeans(init="random", n_clusters=n_clusters, random_state=42) - km.fit(X) - 
_check_fitted_model(km) - - -def test_k_means_random_init_sparse(): - km = KMeans(init="random", n_clusters=n_clusters, random_state=42) - km.fit(X_csr) - _check_fitted_model(km) - - def test_k_means_plus_plus_init_not_precomputed(): km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42, precompute_distances=False).fit(X) @@ -259,10 +235,12 @@ def test_k_means_random_init_not_precomputed(): _check_fitted_model(km) -def test_k_means_perfect_init(): - km = KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42, - n_init=1) - km.fit(X) +@pytest.mark.parametrize('representation', ['dense', 'sparse']) +@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) +def test_k_means_init(representation, init): + data = {'dense': X, 'sparse': X_csr}[representation] + km = KMeans(init=init, n_clusters=n_clusters, random_state=42, n_init=1) + km.fit(data) _check_fitted_model(km) @@ -315,13 +293,6 @@ def test_k_means_fortran_aligned_data(): assert_array_equal(km.labels_, labels) -def test_mb_k_means_plus_plus_init_dense_array(): - mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters, - random_state=42) - mb_k_means.fit(X) - _check_fitted_model(mb_k_means) - - def test_mb_kmeans_verbose(): mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters, random_state=42, verbose=1) @@ -333,13 +304,6 @@ def test_mb_kmeans_verbose(): sys.stdout = old_stdout -def test_mb_k_means_plus_plus_init_sparse_matrix(): - mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters, - random_state=42) - mb_k_means.fit(X_csr) - _check_fitted_model(mb_k_means) - - def test_minibatch_init_with_large_k(): mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20) # Check that a warning is raised, as the number clusters is larger @@ -347,35 +311,19 @@ def test_minibatch_init_with_large_k(): assert_warns(RuntimeWarning, mb_k_means.fit, X) -def test_minibatch_k_means_random_init_dense_array(): - # increase n_init 
to make random init stable enough - mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters, - random_state=42, n_init=10).fit(X) - _check_fitted_model(mb_k_means) - - -def test_minibatch_k_means_random_init_sparse_csr(): - # increase n_init to make random init stable enough - mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters, - random_state=42, n_init=10).fit(X_csr) - _check_fitted_model(mb_k_means) - - -def test_minibatch_k_means_perfect_init_dense_array(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - random_state=42, n_init=1).fit(X) - _check_fitted_model(mb_k_means) - - def test_minibatch_k_means_init_multiple_runs_with_explicit_centers(): mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42, n_init=10) assert_warns(RuntimeWarning, mb_k_means.fit, X) -def test_minibatch_k_means_perfect_init_sparse_csr(): - mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, - random_state=42, n_init=1).fit(X_csr) +@pytest.mark.parametrize('representation', ['dense', 'sparse']) +@pytest.mark.parametrize('init', ["random", 'k-means++', centers.copy()]) +def test_minibatch_k_means_init(representation, init): + data = {'dense': X, 'sparse': X_csr}[representation] + mb_k_means = MiniBatchKMeans(init=init, n_clusters=n_clusters, + random_state=42, n_init=10) + mb_k_means.fit(data) _check_fitted_model(mb_k_means) From da1fd5af1d9ef2b834a38463672b37f1740f43c0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Oct 2018 13:52:47 +0200 Subject: [PATCH 3/5] parametrize more tests --- sklearn/cluster/tests/test_k_means.py | 76 ++++++++------------------- 1 file changed, 22 insertions(+), 54 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 57c3fdf9eb2db..15644f67c6e3c 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -533,64 +533,40 @@ def 
test_predict(): assert_array_equal(pred, km.labels_) -def test_score(): - - km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1) - s1 = km1.fit(X).score(X) - km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42, n_init=1) - s2 = km2.fit(X).score(X) - assert_greater(s2, s1) - +@pytest.mark.parametrize('algo', ['full', 'elkan']) +def test_score(algo): + # Check that fitting k-means with more iterations gives better score km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1, - algorithm='elkan') + algorithm=algo) s1 = km1.fit(X).score(X) km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42, n_init=1, - algorithm='elkan') + algorithm=algo) s2 = km2.fit(X).score(X) assert_greater(s2, s1) -def test_predict_minibatch_dense_input(): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, random_state=40).fit(X) - - # sanity check: predict centroid labels - pred = mb_k_means.predict(mb_k_means.cluster_centers_) - assert_array_equal(pred, np.arange(n_clusters)) - - # sanity check: re-predict labeling for training set samples - pred = mb_k_means.predict(X) - assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) - - -def test_predict_minibatch_kmeanspp_init_sparse_input(): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', - n_init=10).fit(X_csr) +@pytest.mark.parametrize('representation', ['dense', 'sparse']) +@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) +def test_predict_minibatch(representation, init): + data = {'dense': X, 'sparse': X_csr}[representation] + mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init, + n_init=10, random_state=0).fit(data) # sanity check: re-predict labeling for training set samples - assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_) + assert_array_equal(mb_k_means.predict(data), mb_k_means.labels_) # sanity check: predict centroid labels pred = mb_k_means.predict(mb_k_means.cluster_centers_)
assert_array_equal(pred, np.arange(n_clusters)) - # check that models trained on sparse input also works for dense input at - # predict time - assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) - - -def test_predict_minibatch_random_init_sparse_input(): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='random', - n_init=10).fit(X_csr) - - # sanity check: re-predict labeling for training set samples - assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_) - - # sanity check: predict centroid labels - pred = mb_k_means.predict(mb_k_means.cluster_centers_) - assert_array_equal(pred, np.arange(n_clusters)) +@pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) +def test_predict_minibatch_dense_sparse(init): # check that models trained on sparse input also works for dense input at # predict time + mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init, + n_init=10, random_state=0).fit(X_csr) + assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) @@ -642,25 +618,17 @@ def test_fit_transform(): assert_array_almost_equal(X1, X2) -def test_predict_equal_labels(): +@pytest.mark.parametrize('algo', ['full', 'elkan']) +def test_predict_equal_labels(algo): km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1, - algorithm='full') - km.fit(X) - assert_array_equal(km.predict(X), km.labels_) - - km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1, - algorithm='elkan') + algorithm='algo') km.fit(X) assert_array_equal(km.predict(X), km.labels_) def test_full_vs_elkan(): - - km1 = KMeans(algorithm='full', random_state=13) - km2 = KMeans(algorithm='elkan', random_state=13) - - km1.fit(X) - km2.fit(X) + km1 = KMeans(algorithm='full', random_state=13).fit(X) + km2 = KMeans(algorithm='elkan', random_state=13).fit(X) assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0 From 06dc5fd7403921e099e8332901c4c2f85edaf072 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 12 Oct 2018 
14:02:11 +0200 Subject: [PATCH 4/5] typo --- sklearn/cluster/tests/test_k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 15644f67c6e3c..147f798975fa1 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -621,7 +621,7 @@ def test_fit_transform(): @pytest.mark.parametrize('algo', ['full', 'elkan']) def test_predict_equal_labels(algo): km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1, - algorithm='algo') + algorithm=algo) km.fit(X) assert_array_equal(km.predict(X), km.labels_) From a81dbb6b63a134a188342f912c6c71e42522fafc Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Sat, 13 Oct 2018 22:56:00 +0200 Subject: [PATCH 5/5] ids in parametrize --- sklearn/cluster/tests/test_k_means.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 147f798975fa1..6483959532630 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -235,10 +235,9 @@ def test_k_means_random_init_not_precomputed(): _check_fitted_model(km) -@pytest.mark.parametrize('representation', ['dense', 'sparse']) +@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) @pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) -def test_k_means_init(representation, init): - data = {'dense': X, 'sparse': X_csr}[representation] +def test_k_means_init(data, init): km = KMeans(init=init, n_clusters=n_clusters, random_state=42, n_init=1) km.fit(data) _check_fitted_model(km) @@ -317,10 +316,9 @@ def test_minibatch_k_means_init_multiple_runs_with_explicit_centers(): assert_warns(RuntimeWarning, mb_k_means.fit, X) -@pytest.mark.parametrize('representation', ['dense', 'sparse']) +@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) 
@pytest.mark.parametrize('init', ["random", 'k-means++', centers.copy()]) -def test_minibatch_k_means_init(representation, init): - data = {'dense': X, 'sparse': X_csr}[representation] +def test_minibatch_k_means_init(data, init): mb_k_means = MiniBatchKMeans(init=init, n_clusters=n_clusters, random_state=42, n_init=10) mb_k_means.fit(data) @@ -545,10 +543,9 @@ def test_score(algo): assert_greater(s2, s1) -@pytest.mark.parametrize('representation', ['dense', 'sparse']) +@pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) @pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) -def test_predict_minibatch(representation, init): - data = {'dense': X, 'sparse': X_csr}[representation] +def test_predict_minibatch(data, init): mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init, n_init=10, random_state=0).fit(data)