From 61a19660ea333b5f0ad355a8c2db86630ed31df3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 3 Mar 2022 17:21:22 +0100 Subject: [PATCH 01/10] TST Adapt test_mutual_info.py to test implementations on 32bit datasets --- .../tests/test_mutual_info.py | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index bb98dfaee4db9..25938da87800b 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -7,12 +7,15 @@ from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.feature_selection import mutual_info_regression, mutual_info_classif +DTYPES = (np.float64, np.float32) -def test_compute_mi_dd(): + +@pytest.mark.parametrize("dtype", DTYPES) +def test_compute_mi_dd(dtype): # In discrete case computations are straightforward and can be done # by hand on given vectors. - x = np.array([0, 1, 1, 0, 0]) - y = np.array([1, 0, 0, 0, 1]) + x = np.array([0, 1, 1, 0, 0], dtype=dtype) + y = np.array([1, 0, 0, 0, 1], dtype=dtype) H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) @@ -21,7 +24,8 @@ def test_compute_mi_dd(): assert_almost_equal(_compute_mi(x, y, True, True), I_xy) -def test_compute_mi_cc(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_compute_mi_cc(dtype): # For two continuous variables a good approach is to test on bivariate # normal distribution, where mutual information is known. @@ -43,7 +47,7 @@ def test_compute_mi_cc(): I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov)) rng = check_random_state(0) - Z = rng.multivariate_normal(mean, cov, size=1000) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(dtype) x, y = Z[:, 0], Z[:, 1] @@ -54,7 +58,8 @@ def test_compute_mi_cc(): assert_almost_equal(I_computed, I_theory, 1) -def test_compute_mi_cd(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_compute_mi_cd(dtype): # To test define a joint distribution as follows: # p(x, y) = p(x) p(y | x) # X ~ Bernoulli(p) @@ -76,10 +81,10 @@ def test_compute_mi_cd(): for p in [0.3, 0.5, 0.7]: x = rng.uniform(size=n_samples) > p - y = np.empty(n_samples) + y = np.empty(n_samples, dtype) mask = x == 0 - y[mask] = rng.uniform(-1, 1, size=np.sum(mask)) - y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)) + y[mask] = rng.uniform(-1, 1, size=np.sum(mask)).astype(dtype) + y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)).astype(dtype) I_theory = -0.5 * ( (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5) @@ -91,15 +96,16 @@ def test_compute_mi_cd(): assert_almost_equal(I_computed, I_theory, 1) -def test_compute_mi_cd_unique_label(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_compute_mi_cd_unique_label(dtype): # Test that adding unique label doesn't change MI. n_samples = 100 x = np.random.uniform(size=n_samples) > 0.5 - y = np.empty(n_samples) + y = np.empty(n_samples, dtype) mask = x == 0 - y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)) - y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)).astype(dtype) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)).astype(dtype) mi_1 = _compute_mi(x, y, True, False) @@ -111,9 +117,10 @@ def test_compute_mi_cd_unique_label(): # We are going test that feature ordering by MI matches our expectations. 
-def test_mutual_info_classif_discrete(): - X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]]) - y = np.array([0, 1, 2, 2, 1]) +@pytest.mark.parametrize("dtype", DTYPES) +def test_mutual_info_classif_discrete(dtype): + X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=dtype) + y = np.array([0, 1, 2, 2, 1], dtype=dtype) # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly # informative. @@ -121,7 +128,8 @@ def test_mutual_info_classif_discrete(): assert_array_equal(np.argsort(-mi), np.array([0, 2, 1])) -def test_mutual_info_regression(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_mutual_info_regression(dtype): # We generate sample from multivariate normal distribution, using # transformation from initially uncorrelated variables. The zero # variables after transformation is selected as the target vector, @@ -132,7 +140,7 @@ def test_mutual_info_regression(): mean = np.zeros(4) rng = check_random_state(0) - Z = rng.multivariate_normal(mean, cov, size=1000) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(dtype) X = Z[:, 1:] y = Z[:, 0] @@ -140,11 +148,12 @@ def test_mutual_info_regression(): assert_array_equal(np.argsort(-mi), np.array([1, 2, 0])) -def test_mutual_info_classif_mixed(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_mutual_info_classif_mixed(dtype): # Here the target is discrete and there are two continuous and one # discrete feature. The idea of this test is clear from the code. rng = check_random_state(0) - X = rng.rand(1000, 3) + X = rng.rand(1000, 3).astype(dtype) X[:, 1] += X[:, 0] y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int) X[:, 2] = X[:, 2] > 0.5 @@ -164,9 +173,10 @@ def test_mutual_info_classif_mixed(): assert mi_nn[2] == mi[2] -def test_mutual_info_options(): - X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=float) - y = np.array([0, 1, 2, 2, 1], dtype=float) +@pytest.mark.parametrize("dtype", DTYPES) +def test_mutual_info_options(dtype): + X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=dtype) + y = np.array([0, 1, 2, 2, 1], dtype=dtype) X_csr = csr_matrix(X) for mutual_info in (mutual_info_regression, mutual_info_classif): From 38b1c6589c0dd8842fde28800a4a4d4ce22ccac7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 9 Mar 2022 11:59:00 +0100 Subject: [PATCH 02/10] Apply comments from review Co-authored-by: Olivier Grisel --- .../tests/test_mutual_info.py | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index 25938da87800b..d4251776bf69d 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -10,18 +10,17 @@ DTYPES = (np.float64, np.float32) -@pytest.mark.parametrize("dtype", DTYPES) -def test_compute_mi_dd(dtype): +def test_compute_mi_dd(): # In discrete case computations are straightforward and can be done # by hand on given vectors. 
- x = np.array([0, 1, 1, 0, 0], dtype=dtype) - y = np.array([1, 0, 0, 0, 1], dtype=dtype) + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) I_xy = H_x + H_y - H_xy - assert_almost_equal(_compute_mi(x, y, True, True), I_xy) + assert_almost_equal(_compute_mi(x, y, x_discrete=True, y_discrete=True), I_xy) @pytest.mark.parametrize("dtype", DTYPES) @@ -54,7 +53,9 @@ def test_compute_mi_cc(dtype): # Theory and computed values won't be very close, assert that the # first figures after decimal point match. for n_neighbors in [3, 5, 7]: - I_computed = _compute_mi(x, y, False, False, n_neighbors) + I_computed = _compute_mi( + x, y, x_discrete=False, y_discrete=False, n_neighbors=n_neighbors + ) assert_almost_equal(I_computed, I_theory, 1) @@ -92,7 +93,9 @@ def test_compute_mi_cd(dtype): # Assert the same tolerance. for n_neighbors in [3, 5, 7]: - I_computed = _compute_mi(x, y, True, False, n_neighbors) + I_computed = _compute_mi( + x, y, x_discrete=True, y_discrete=False, n_neighbors=n_neighbors + ) assert_almost_equal(I_computed, I_theory, 1) @@ -107,11 +110,11 @@ def test_compute_mi_cd_unique_label(dtype): y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)).astype(dtype) y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)).astype(dtype) - mi_1 = _compute_mi(x, y, True, False) + mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False) x = np.hstack((x, 2)) y = np.hstack((y, 10)) - mi_2 = _compute_mi(x, y, True, False) + mi_2 = _compute_mi(x, y, x_discrete=True, y_discrete=False) assert mi_1 == mi_2 @@ -146,6 +149,9 @@ def test_mutual_info_regression(dtype): mi = mutual_info_regression(X, y, random_state=0) assert_array_equal(np.argsort(-mi), np.array([1, 2, 0])) + # XXX: should mutual_info_regression be fixed to avoid + # up-casting float32 inputs to float64? 
+ assert mi.dtype == np.float64 @pytest.mark.parametrize("dtype", DTYPES) @@ -176,10 +182,14 @@ def test_mutual_info_classif_mixed(dtype): @pytest.mark.parametrize("dtype", DTYPES) def test_mutual_info_options(dtype): X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=dtype) - y = np.array([0, 1, 2, 2, 1], dtype=dtype) + y = np.array([0, 1, 2, 2, 1]) X_csr = csr_matrix(X) - for mutual_info in (mutual_info_regression, mutual_info_classif): + for mutual_info, y_dtype in ( + (mutual_info_regression, dtype), + (mutual_info_classif, y.dtype), + ): + y = y.astype(dtype) with pytest.raises(ValueError): mutual_info(X_csr, y, discrete_features=False) with pytest.raises(ValueError): From bba8a121f0b003e6720b8ad8355138c51c334dea Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 17 Mar 2022 19:10:48 +0100 Subject: [PATCH 03/10] TST Use global_dtype --- .../tests/test_mutual_info.py | 50 +++++++++---------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index ed8df03130c44..419a1364d22ab 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -10,8 +10,6 @@ from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.feature_selection import mutual_info_regression, mutual_info_classif -DTYPES = (np.float64, np.float32) - def test_compute_mi_dd(): # In discrete case computations are straightforward and can be done @@ -61,8 +59,7 @@ def test_compute_mi_cc(global_dtype): assert_allclose(I_computed, I_theory, rtol=1e-1) -@pytest.mark.parametrize("dtype", DTYPES) -def test_compute_mi_cd(dtype): +def test_compute_mi_cd(global_dtype): # To test define a joint distribution as follows: # p(x, y) = p(x) p(y | x) # X ~ Bernoulli(p) @@ -84,10 +81,10 @@ def test_compute_mi_cd(dtype): for p in [0.3, 0.5, 0.7]: x = rng.uniform(size=n_samples) > p - y = np.empty(n_samples, dtype) + y = np.empty(n_samples, global_dtype) mask = x == 0 - y[mask] = rng.uniform(-1, 1, size=np.sum(mask)).astype(dtype) - y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)).astype(dtype) + y[mask] = rng.uniform(-1, 1, size=np.sum(mask)).astype(global_dtype) + y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)).astype(global_dtype) I_theory = -0.5 * ( (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5) @@ -101,16 +98,15 @@ def test_compute_mi_cd(dtype): assert_allclose(I_computed, I_theory, rtol=1e-1) -@pytest.mark.parametrize("dtype", DTYPES) -def test_compute_mi_cd_unique_label(dtype): +def test_compute_mi_cd_unique_label(global_dtype): # Test that adding unique label doesn't change MI. n_samples = 100 x = np.random.uniform(size=n_samples) > 0.5 - y = np.empty(n_samples, dtype) + y = np.empty(n_samples, global_dtype) mask = x == 0 - y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)).astype(dtype) - y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)).astype(dtype) + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)).astype(global_dtype) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)).astype(global_dtype) mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False) @@ -122,10 +118,11 @@ def test_compute_mi_cd_unique_label(dtype): # We are going test that feature ordering by MI matches our expectations. 
-@pytest.mark.parametrize("dtype", DTYPES) -def test_mutual_info_classif_discrete(dtype): - X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=dtype) - y = np.array([0, 1, 2, 2, 1], dtype=dtype) +def test_mutual_info_classif_discrete(global_dtype): + X = np.array( + [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype + ) + y = np.array([0, 1, 2, 2, 1], dtype=global_dtype) # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly # informative. @@ -133,8 +130,7 @@ def test_mutual_info_classif_discrete(dtype): assert_array_equal(np.argsort(-mi), np.array([0, 2, 1])) -@pytest.mark.parametrize("dtype", DTYPES) -def test_mutual_info_regression(dtype): +def test_mutual_info_regression(global_dtype): # We generate sample from multivariate normal distribution, using # transformation from initially uncorrelated variables. The zero # variables after transformation is selected as the target vector, @@ -145,7 +141,7 @@ def test_mutual_info_regression(dtype): mean = np.zeros(4) rng = check_random_state(0) - Z = rng.multivariate_normal(mean, cov, size=1000).astype(dtype) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype) X = Z[:, 1:] y = Z[:, 0] @@ -156,12 +152,11 @@ def test_mutual_info_regression(dtype): assert mi.dtype == np.float64 -@pytest.mark.parametrize("dtype", DTYPES) -def test_mutual_info_classif_mixed(dtype): +def test_mutual_info_classif_mixed(global_dtype): # Here the target is discrete and there are two continuous and one # discrete feature. The idea of this test is clear from the code. rng = check_random_state(0) - X = rng.rand(1000, 3).astype(dtype) + X = rng.rand(1000, 3).astype(global_dtype) X[:, 1] += X[:, 0] y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int) X[:, 2] = X[:, 2] > 0.5 @@ -181,17 +176,18 @@ def test_mutual_info_classif_mixed(dtype): assert mi_nn[2] == mi[2] -@pytest.mark.parametrize("dtype", DTYPES) -def test_mutual_info_options(dtype): - X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=dtype) +def test_mutual_info_options(global_dtype): + X = np.array( + [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype + ) y = np.array([0, 1, 2, 2, 1]) X_csr = csr_matrix(X) for mutual_info, y_dtype in ( - (mutual_info_regression, dtype), + (mutual_info_regression, global_dtype), (mutual_info_classif, y.dtype), ): - y = y.astype(dtype) + y = y.astype(global_dtype) with pytest.raises(ValueError): mutual_info(X_csr, y, discrete_features=False) with pytest.raises(ValueError): From 33cc917dd7217e3dc93689af19f038874617b72c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 18 Mar 2022 15:51:21 +0100 Subject: [PATCH 04/10] TST Review comments --- sklearn/feature_selection/tests/test_mutual_info.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index 419a1364d22ab..dc265c6eb62b6 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -83,7 +83,7 @@ def test_compute_mi_cd(global_dtype): y = np.empty(n_samples, global_dtype) mask = x == 0 - y[mask] = rng.uniform(-1, 1, size=np.sum(mask)).astype(global_dtype) + y[mask] = rng.uniform(-1, 1, size=np.sum(mask)) y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)).astype(global_dtype) I_theory = -0.5 * ( @@ -106,7 +106,7 @@ def test_compute_mi_cd_unique_label(global_dtype): y = np.empty(n_samples, 
global_dtype) mask = x == 0 y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)).astype(global_dtype) - y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)).astype(global_dtype) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False) @@ -122,7 +122,7 @@ def test_mutual_info_classif_discrete(global_dtype): X = np.array( [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype ) - y = np.array([0, 1, 2, 2, 1], dtype=global_dtype) + y = np.array([0, 1, 2, 2, 1]) # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly # informative. @@ -206,8 +206,8 @@ def test_mutual_info_options(global_dtype): mi_5 = mutual_info(X, y, discrete_features=[True, False, True], random_state=0) mi_6 = mutual_info(X, y, discrete_features=[0, 2], random_state=0) - assert_array_equal(mi_1, mi_2) - assert_array_equal(mi_3, mi_4) - assert_array_equal(mi_5, mi_6) + assert_allclose(mi_1, mi_2) + assert_allclose(mi_3, mi_4) + assert_allclose(mi_5, mi_6) assert not np.allclose(mi_1, mi_3) From 61022e194d2462c58591f73b7d6a5b024132a7b6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 23 Mar 2022 14:41:30 +0100 Subject: [PATCH 05/10] Address comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- .../tests/test_mutual_info.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index dc265c6eb62b6..b038218366ed9 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -84,7 +84,7 @@ def test_compute_mi_cd(global_dtype): y = np.empty(n_samples, global_dtype) mask = x == 0 y[mask] = rng.uniform(-1, 1, size=np.sum(mask)) - y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)).astype(global_dtype) + y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)) I_theory = -0.5 * ( (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5) @@ -105,7 +105,7 @@ def test_compute_mi_cd_unique_label(global_dtype): y = np.empty(n_samples, global_dtype) mask = x == 0 - y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)).astype(global_dtype) + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)) y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False) @@ -114,14 +114,12 @@ def test_compute_mi_cd_unique_label(global_dtype): y = np.hstack((y, 10)) mi_2 = _compute_mi(x, y, x_discrete=True, y_discrete=False) - assert mi_1 == mi_2 + assert_allclose(mi_1, mi_2) # We are going test that feature ordering by MI matches our expectations. 
def test_mutual_info_classif_discrete(global_dtype): - X = np.array( - [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype - ) + X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]]) y = np.array([0, 1, 2, 2, 1]) # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly @@ -180,14 +178,10 @@ def test_mutual_info_options(global_dtype): X = np.array( [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype ) - y = np.array([0, 1, 2, 2, 1]) + y = np.array([0, 1, 2, 2, 1], dtype=global_dtype) X_csr = csr_matrix(X) - for mutual_info, y_dtype in ( - (mutual_info_regression, global_dtype), - (mutual_info_classif, y.dtype), - ): - y = y.astype(global_dtype) + for mutual_info in (mutual_info_regression, mutual_info_classif): with pytest.raises(ValueError): mutual_info(X_csr, y, discrete_features=False) with pytest.raises(ValueError): From b31cccbc652e26843b4a03aba39c602928555b13 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 3 Mar 2022 16:27:48 +0100 Subject: [PATCH 06/10] TST Adapt test_mean_shift.py to test implementations on 32bit datasets --- sklearn/cluster/tests/test_mean_shift.py | 46 +++++++++++++++--------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index cdd1134156173..f63b2d4740c64 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -32,6 +32,8 @@ random_state=11, ) +DTYPES = (np.float64, np.float32) + def test_estimate_bandwidth(): # Test estimate_bandwidth @@ -39,10 +41,11 @@ def test_estimate_bandwidth(): assert 0.9 <= bandwidth <= 1.5 -def test_estimate_bandwidth_1sample(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_estimate_bandwidth_1sample(dtype): # Test estimate_bandwidth when n_samples=1 and quantile<1, so that # n_neighbors is set to 1. 
- bandwidth = estimate_bandwidth(X, n_samples=1, quantile=0.3) + bandwidth = estimate_bandwidth(X.astype(dtype), n_samples=1, quantile=0.3) assert bandwidth == pytest.approx(0.0, abs=1e-5) @@ -50,10 +53,11 @@ def test_estimate_bandwidth_1sample(): "bandwidth, cluster_all, expected, first_cluster_label", [(1.2, True, 3, 0), (1.2, False, 4, -1)], ) -def test_mean_shift(bandwidth, cluster_all, expected, first_cluster_label): +@pytest.mark.parametrize("dtype", DTYPES) +def test_mean_shift(dtype, bandwidth, cluster_all, expected, first_cluster_label): # Test MeanShift algorithm ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all) - labels = ms.fit(X).labels_ + labels = ms.fit(X.astype(dtype)).labels_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) assert n_clusters_ == expected @@ -82,7 +86,8 @@ def test_estimate_bandwidth_with_sparse_matrix(): estimate_bandwidth(X) -def test_parallel(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_parallel(dtype): centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 X, _ = make_blobs( n_samples=50, @@ -93,6 +98,8 @@ def test_parallel(): random_state=11, ) + X = X.astype(dtype) + ms1 = MeanShift(n_jobs=2) ms1.fit(X) @@ -103,11 +110,13 @@ def test_parallel(): assert_array_equal(ms1.labels_, ms2.labels_) -def test_meanshift_predict(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_meanshift_predict(dtype): # Test MeanShift.predict ms = MeanShift(bandwidth=1.2) - labels = ms.fit_predict(X) - labels2 = ms.predict(X) + Y = X.astype(dtype) + labels = ms.fit_predict(Y) + labels2 = ms.predict(Y) assert_array_equal(labels, labels2) @@ -128,22 +137,25 @@ def test_unfitted(): assert not hasattr(ms, "labels_") -def test_cluster_intensity_tie(): - X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]]) +@pytest.mark.parametrize("dtype", DTYPES) +def test_cluster_intensity_tie(dtype): + X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=dtype) c1 = MeanShift(bandwidth=2).fit(X) - X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]]) + X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=dtype) c2 = MeanShift(bandwidth=2).fit(X) assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0]) assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1]) -def test_bin_seeds(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_bin_seeds(dtype): # Test the bin seeding technique which can be used in the mean shift # algorithm # Data is just 6 points in the plane X = np.array( - [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]] + [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]], + dtype=dtype, ) # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be @@ -174,6 +186,7 @@ def test_bin_seeds(): cluster_std=0.1, random_state=0, ) + X = X.astype(dtype) test_bins = get_bin_seeds(X, 1) assert_array_equal(test_bins, [[0, 0], [1, 1]]) @@ -191,9 +204,10 @@ def test_max_iter(max_iter): assert np.allclose(c1, c2) -def test_mean_shift_zero_bandwidth(): +@pytest.mark.parametrize("dtype", DTYPES) +def test_mean_shift_zero_bandwidth(dtype): # Check that mean shift works when the estimated bandwidth is 0. - X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1) + X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1).astype(dtype) # estimate_bandwidth with default args returns 0 on this dataset bandwidth = estimate_bandwidth(X) @@ -206,7 +220,7 @@ def test_mean_shift_zero_bandwidth(): # to no binning. 
ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X) ms_nobinning = MeanShift(bin_seeding=False).fit(X) - expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2]) + expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2], dtype=dtype) assert v_measure_score(ms_binning.labels_, expected_labels) == 1 assert v_measure_score(ms_nobinning.labels_, expected_labels) == 1 From ab3466690f75baa569f0ca08cc6280c8dd55e864 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 17 Mar 2022 15:59:43 +0100 Subject: [PATCH 07/10] TST Use global_dtype --- sklearn/cluster/tests/test_mean_shift.py | 45 ++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index f63b2d4740c64..eeb86b1131319 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -32,8 +32,6 @@ random_state=11, ) -DTYPES = (np.float64, np.float32) - def test_estimate_bandwidth(): # Test estimate_bandwidth @@ -41,11 +39,10 @@ def test_estimate_bandwidth(): assert 0.9 <= bandwidth <= 1.5 -@pytest.mark.parametrize("dtype", DTYPES) -def test_estimate_bandwidth_1sample(dtype): +def test_estimate_bandwidth_1sample(global_dtype): # Test estimate_bandwidth when n_samples=1 and quantile<1, so that # n_neighbors is set to 1. - bandwidth = estimate_bandwidth(X.astype(dtype), n_samples=1, quantile=0.3) + bandwidth = estimate_bandwidth(X.astype(global_dtype), n_samples=1, quantile=0.3) assert bandwidth == pytest.approx(0.0, abs=1e-5) @@ -53,11 +50,12 @@ def test_estimate_bandwidth_1sample(dtype): "bandwidth, cluster_all, expected, first_cluster_label", [(1.2, True, 3, 0), (1.2, False, 4, -1)], ) -@pytest.mark.parametrize("dtype", DTYPES) -def test_mean_shift(dtype, bandwidth, cluster_all, expected, first_cluster_label): +def test_mean_shift( + global_dtype, bandwidth, cluster_all, expected, first_cluster_label +): # Test MeanShift algorithm ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all) - labels = ms.fit(X.astype(dtype)).labels_ + labels = ms.fit(X.astype(global_dtype)).labels_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) assert n_clusters_ == expected @@ -86,8 +84,7 @@ def test_estimate_bandwidth_with_sparse_matrix(): estimate_bandwidth(X) -@pytest.mark.parametrize("dtype", DTYPES) -def test_parallel(dtype): +def test_parallel(global_dtype): centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 X, _ = make_blobs( n_samples=50, @@ -98,7 +95,7 @@ def test_parallel(dtype): random_state=11, ) - X = X.astype(dtype) + X = X.astype(global_dtype) ms1 = MeanShift(n_jobs=2) ms1.fit(X) @@ -110,11 +107,10 @@ def test_parallel(dtype): assert_array_equal(ms1.labels_, ms2.labels_) -@pytest.mark.parametrize("dtype", DTYPES) -def test_meanshift_predict(dtype): +def test_meanshift_predict(global_dtype): # Test MeanShift.predict ms = MeanShift(bandwidth=1.2) - Y = X.astype(dtype) + Y = X.astype(global_dtype) labels = ms.fit_predict(Y) labels2 = ms.predict(Y) assert_array_equal(labels, labels2) @@ -137,25 +133,23 @@ def test_unfitted(): assert not hasattr(ms, "labels_") -@pytest.mark.parametrize("dtype", DTYPES) -def test_cluster_intensity_tie(dtype): - X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=dtype) +def test_cluster_intensity_tie(global_dtype): + X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=global_dtype) c1 = MeanShift(bandwidth=2).fit(X) - X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=dtype) + X 
= np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=global_dtype) c2 = MeanShift(bandwidth=2).fit(X) assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0]) assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1]) -@pytest.mark.parametrize("dtype", DTYPES) -def test_bin_seeds(dtype): +def test_bin_seeds(global_dtype): # Test the bin seeding technique which can be used in the mean shift # algorithm # Data is just 6 points in the plane X = np.array( [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]], - dtype=dtype, + dtype=global_dtype, ) # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be @@ -186,7 +180,7 @@ def test_bin_seeds(dtype): cluster_std=0.1, random_state=0, ) - X = X.astype(dtype) + X = X.astype(global_dtype) test_bins = get_bin_seeds(X, 1) assert_array_equal(test_bins, [[0, 0], [1, 1]]) @@ -204,10 +198,9 @@ def test_max_iter(max_iter): assert np.allclose(c1, c2) -@pytest.mark.parametrize("dtype", DTYPES) -def test_mean_shift_zero_bandwidth(dtype): +def test_mean_shift_zero_bandwidth(global_dtype): # Check that mean shift works when the estimated bandwidth is 0. - X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1).astype(dtype) + X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1).astype(global_dtype) # estimate_bandwidth with default args returns 0 on this dataset bandwidth = estimate_bandwidth(X) @@ -220,7 +213,7 @@ def test_mean_shift_zero_bandwidth(dtype): # to no binning. ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X) ms_nobinning = MeanShift(bin_seeding=False).fit(X) - expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2], dtype=dtype) + expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2], dtype=global_dtype) assert v_measure_score(ms_binning.labels_, expected_labels) == 1 assert v_measure_score(ms_nobinning.labels_, expected_labels) == 1 From 70cde82a3233c0ae3a70389e0c8e77b6ba90833b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 17 Mar 2022 16:06:46 +0100 Subject: [PATCH 08/10] Address review comments Co-authored-by: Olivier Grisel --- sklearn/cluster/tests/test_mean_shift.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index eeb86b1131319..860ad477e1530 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -109,10 +109,11 @@ def test_parallel(global_dtype): def test_meanshift_predict(global_dtype): # Test MeanShift.predict + global X ms = MeanShift(bandwidth=1.2) - Y = X.astype(global_dtype) - labels = ms.fit_predict(Y) - labels2 = ms.predict(Y) + X = X.astype(global_dtype) + labels = ms.fit_predict(X) + labels2 = ms.predict(X) assert_array_equal(labels, labels2) @@ -213,8 +214,8 @@ def test_mean_shift_zero_bandwidth(global_dtype): # to no binning. 
ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X) ms_nobinning = MeanShift(bin_seeding=False).fit(X) - expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2], dtype=global_dtype) + expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2]) - assert v_measure_score(ms_binning.labels_, expected_labels) == 1 - assert v_measure_score(ms_nobinning.labels_, expected_labels) == 1 + assert v_measure_score(ms_binning.labels_, expected_labels) == pytest.approx(1) + assert v_measure_score(ms_nobinning.labels_, expected_labels) == pytest.approx(1) assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_) From 063b93508b7ea2a07d14eae2cb1862821999386d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 23 Mar 2022 12:17:36 +0100 Subject: [PATCH 09/10] Assert review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/cluster/tests/test_mean_shift.py | 39 +++++++++++-------- .../tests/test_mutual_info.py | 10 +++-- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index 860ad477e1530..6e2651c406e33 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -10,7 +10,6 @@ from scipy import sparse from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.cluster import MeanShift @@ -42,8 +41,12 @@ def test_estimate_bandwidth(): def test_estimate_bandwidth_1sample(global_dtype): # Test estimate_bandwidth when n_samples=1 and quantile<1, so that # n_neighbors is set to 1. - bandwidth = estimate_bandwidth(X.astype(global_dtype), n_samples=1, quantile=0.3) - assert bandwidth == pytest.approx(0.0, abs=1e-5) + bandwidth = estimate_bandwidth( + X.astype(global_dtype, copy=False), n_samples=1, quantile=0.3 + ) + + assert bandwidth.dtype == X.dtype + assert_allclose(bandwidth, 0.0, atol=1e-5) @pytest.mark.parametrize( @@ -54,14 +57,15 @@ def test_mean_shift( global_dtype, bandwidth, cluster_all, expected, first_cluster_label ): # Test MeanShift algorithm + X_ = X.astype(global_dtype, copy=False) ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all) - labels = ms.fit(X.astype(global_dtype)).labels_ + labels = ms.fit(X_).labels_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) assert n_clusters_ == expected assert labels_unique[0] == first_cluster_label - cluster_centers, labels_mean_shift = mean_shift(X, cluster_all=cluster_all) + cluster_centers, labels_mean_shift = mean_shift(X_, cluster_all=cluster_all) labels_mean_shift_unique = np.unique(labels_mean_shift) n_clusters_mean_shift = len(labels_mean_shift_unique) assert n_clusters_mean_shift == expected @@ -95,7 +99,7 @@ def test_parallel(global_dtype): random_state=11, ) - X = X.astype(global_dtype) + X = X.astype(global_dtype, copy=False) ms1 = MeanShift(n_jobs=2) ms1.fit(X) @@ -103,17 +107,16 @@ def test_parallel(global_dtype): ms2 = MeanShift() ms2.fit(X) - assert_array_almost_equal(ms1.cluster_centers_, ms2.cluster_centers_) + assert_allclose(ms1.cluster_centers_, ms2.cluster_centers_) assert_array_equal(ms1.labels_, ms2.labels_) def test_meanshift_predict(global_dtype): # Test MeanShift.predict - global X ms = MeanShift(bandwidth=1.2) - X = X.astype(global_dtype) - labels = ms.fit_predict(X) - labels2 = ms.predict(X) + X_ = X.astype(global_dtype, 
copy=False) + labels = ms.fit_predict(X_) + labels2 = ms.predict(X_) assert_array_equal(labels, labels2) @@ -171,7 +174,7 @@ def test_bin_seeds(global_dtype): # we bail and use the whole data here. with warnings.catch_warnings(record=True): test_bins = get_bin_seeds(X, 0.01, 1) - assert_array_almost_equal(test_bins, X) + assert_allclose(test_bins, X) # tight clusters around [0, 0] and [1, 1], only get two bins X, _ = make_blobs( @@ -181,7 +184,7 @@ def test_bin_seeds(global_dtype): cluster_std=0.1, random_state=0, ) - X = X.astype(global_dtype) + X = X.astype(global_dtype, copy=False) test_bins = get_bin_seeds(X, 1) assert_array_equal(test_bins, [[0, 0], [1, 1]]) @@ -201,7 +204,11 @@ def test_max_iter(max_iter): def test_mean_shift_zero_bandwidth(global_dtype): # Check that mean shift works when the estimated bandwidth is 0. - X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1).astype(global_dtype) + X = ( + np.array([1, 1, 1, 2, 2, 2, 3, 3]) + .reshape(-1, 1) + .astype(global_dtype, copy=False) + ) # estimate_bandwidth with default args returns 0 on this dataset bandwidth = estimate_bandwidth(X) @@ -216,6 +223,6 @@ def test_mean_shift_zero_bandwidth(global_dtype): ms_nobinning = MeanShift(bin_seeding=False).fit(X) expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2]) - assert v_measure_score(ms_binning.labels_, expected_labels) == pytest.approx(1) - assert v_measure_score(ms_nobinning.labels_, expected_labels) == pytest.approx(1) + assert_allclose(v_measure_score(ms_binning.labels_, expected_labels), 1) + assert_allclose(v_measure_score(ms_nobinning.labels_, expected_labels), 1) assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index b038218366ed9..af2b733efd62d 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -119,7 +119,9 @@ def test_compute_mi_cd_unique_label(global_dtype): # We are going test that feature ordering by MI matches our expectations. def test_mutual_info_classif_discrete(global_dtype): - X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]]) + X = np.array( + [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype + ) y = np.array([0, 1, 2, 2, 1]) # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly @@ -139,7 +141,7 @@ def test_mutual_info_regression(global_dtype): mean = np.zeros(4) rng = check_random_state(0) - Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False) X = Z[:, 1:] y = Z[:, 0] @@ -154,7 +156,7 @@ def test_mutual_info_classif_mixed(global_dtype): # Here the target is discrete and there are two continuous and one # discrete feature. The idea of this test is clear from the code. 
rng = check_random_state(0) - X = rng.rand(1000, 3).astype(global_dtype) + X = rng.rand(1000, 3).astype(global_dtype, copy=False) X[:, 1] += X[:, 0] y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int) X[:, 2] = X[:, 2] > 0.5 @@ -204,4 +206,4 @@ def test_mutual_info_options(global_dtype): assert_allclose(mi_3, mi_4) assert_allclose(mi_5, mi_6) - assert not np.allclose(mi_1, mi_3) + assert not np.allclose(mi_1, mi_3) From 8a95627de322535b9ca8820a04ebbba447e27e50 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 23 Mar 2022 18:11:14 +0100 Subject: [PATCH 10/10] TST Undo changes made to other files --- sklearn/cluster/tests/test_mean_shift.py | 59 +++++++++--------------- 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index 6e2651c406e33..cdd1134156173 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -10,6 +10,7 @@ from scipy import sparse from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.cluster import MeanShift @@ -38,34 +39,27 @@ def test_estimate_bandwidth(): assert 0.9 <= bandwidth <= 1.5 -def test_estimate_bandwidth_1sample(global_dtype): +def test_estimate_bandwidth_1sample(): # Test estimate_bandwidth when n_samples=1 and quantile<1, so that # n_neighbors is set to 1. - bandwidth = estimate_bandwidth( - X.astype(global_dtype, copy=False), n_samples=1, quantile=0.3 - ) - - assert bandwidth.dtype == X.dtype - assert_allclose(bandwidth, 0.0, atol=1e-5) + bandwidth = estimate_bandwidth(X, n_samples=1, quantile=0.3) + assert bandwidth == pytest.approx(0.0, abs=1e-5) @pytest.mark.parametrize( "bandwidth, cluster_all, expected, first_cluster_label", [(1.2, True, 3, 0), (1.2, False, 4, -1)], ) -def test_mean_shift( - global_dtype, bandwidth, cluster_all, expected, first_cluster_label -): +def test_mean_shift(bandwidth, cluster_all, expected, first_cluster_label): # Test MeanShift algorithm - X_ = X.astype(global_dtype, copy=False) ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all) - labels = ms.fit(X_).labels_ + labels = ms.fit(X).labels_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) assert n_clusters_ == expected assert labels_unique[0] == first_cluster_label - cluster_centers, labels_mean_shift = mean_shift(X_, cluster_all=cluster_all) + cluster_centers, labels_mean_shift = mean_shift(X, cluster_all=cluster_all) labels_mean_shift_unique = np.unique(labels_mean_shift) n_clusters_mean_shift = len(labels_mean_shift_unique) assert n_clusters_mean_shift == expected @@ -88,7 +82,7 @@ def test_estimate_bandwidth_with_sparse_matrix(): estimate_bandwidth(X) -def test_parallel(global_dtype): +def test_parallel(): centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 X, _ = make_blobs( n_samples=50, @@ -99,24 +93,21 @@ def test_parallel(global_dtype): random_state=11, ) - X = X.astype(global_dtype, copy=False) - ms1 = MeanShift(n_jobs=2) ms1.fit(X) ms2 = MeanShift() ms2.fit(X) - assert_allclose(ms1.cluster_centers_, ms2.cluster_centers_) + assert_array_almost_equal(ms1.cluster_centers_, ms2.cluster_centers_) assert_array_equal(ms1.labels_, ms2.labels_) -def test_meanshift_predict(global_dtype): +def test_meanshift_predict(): # Test MeanShift.predict ms = MeanShift(bandwidth=1.2) - X_ = X.astype(global_dtype, copy=False) - labels = ms.fit_predict(X_) - labels2 = ms.predict(X_) + labels 
= ms.fit_predict(X) + labels2 = ms.predict(X) assert_array_equal(labels, labels2) @@ -137,23 +128,22 @@ def test_unfitted(): assert not hasattr(ms, "labels_") -def test_cluster_intensity_tie(global_dtype): - X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=global_dtype) +def test_cluster_intensity_tie(): + X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]]) c1 = MeanShift(bandwidth=2).fit(X) - X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=global_dtype) + X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]]) c2 = MeanShift(bandwidth=2).fit(X) assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0]) assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1]) -def test_bin_seeds(global_dtype): +def test_bin_seeds(): # Test the bin seeding technique which can be used in the mean shift # algorithm # Data is just 6 points in the plane X = np.array( - [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]], - dtype=global_dtype, + [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]] ) # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be @@ -174,7 +164,7 @@ def test_bin_seeds(global_dtype): # we bail and use the whole data here. with warnings.catch_warnings(record=True): test_bins = get_bin_seeds(X, 0.01, 1) - assert_allclose(test_bins, X) + assert_array_almost_equal(test_bins, X) # tight clusters around [0, 0] and [1, 1], only get two bins X, _ = make_blobs( @@ -184,7 +174,6 @@ def test_bin_seeds(global_dtype): cluster_std=0.1, random_state=0, ) - X = X.astype(global_dtype, copy=False) test_bins = get_bin_seeds(X, 1) assert_array_equal(test_bins, [[0, 0], [1, 1]]) @@ -202,13 +191,9 @@ def test_max_iter(max_iter): assert np.allclose(c1, c2) -def test_mean_shift_zero_bandwidth(global_dtype): +def test_mean_shift_zero_bandwidth(): # Check that mean shift works when the estimated bandwidth is 0. - X = ( - np.array([1, 1, 1, 2, 2, 2, 3, 3]) - .reshape(-1, 1) - .astype(global_dtype, copy=False) - ) + X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1) # estimate_bandwidth with default args returns 0 on this dataset bandwidth = estimate_bandwidth(X) @@ -223,6 +208,6 @@ def test_mean_shift_zero_bandwidth(global_dtype): ms_nobinning = MeanShift(bin_seeding=False).fit(X) expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2]) - assert_allclose(v_measure_score(ms_binning.labels_, expected_labels), 1) - assert_allclose(v_measure_score(ms_nobinning.labels_, expected_labels), 1) + assert v_measure_score(ms_binning.labels_, expected_labels) == 1 + assert v_measure_score(ms_nobinning.labels_, expected_labels) == 1 assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_)
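
A note on the `global_dtype` fixture that patches 03 and 07 switch to: it is defined in scikit-learn's shared conftest rather than in the test files touched here, which is why these diffs only remove the local `DTYPES` tuples. The sketch below is a minimal, hypothetical equivalent; the fixture name and the way it is consumed match the diffs above, but the exact definition and the `SKLEARN_RUN_FLOAT32_TESTS` opt-in are assumptions about the real conftest, which may differ.

    # conftest.py (hypothetical sketch, not part of this series)
    import os

    import numpy as np
    import pytest

    # Run float32 variants only when explicitly requested, so the
    # default test run stays on float64 only.
    _dtypes = [np.float64]
    if os.environ.get("SKLEARN_RUN_FLOAT32_TESTS", "0") == "1":
        _dtypes.append(np.float32)

    @pytest.fixture(params=_dtypes)
    def global_dtype(request):
        # Every test requesting this fixture is collected once per dtype.
        return request.param

With such a fixture, `pytest sklearn/feature_selection/tests/test_mutual_info.py` runs the float64 variants by default and both variants when the environment variable is set, which is the behavior the series converges on.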
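The hand computation that `test_compute_mi_dd` relies on, I(X;Y) = H(X) + H(Y) - H(X,Y) in nats, can be checked independently of scikit-learn. This standalone snippet reproduces the expected value on the same vectors used in the test:

    import numpy as np

    def entropy(labels):
        # Empirical entropy (natural log) of a discrete sample.
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return -np.sum(p * np.log(p))

    x = np.array([0, 1, 1, 0, 0])
    y = np.array([1, 0, 0, 0, 1])

    H_x, H_y = entropy(x), entropy(y)
    # Joint entropy via paired labels.
    H_xy = entropy([f"{a},{b}" for a, b in zip(x, y)])
    print(H_x + H_y - H_xy)  # ~0.291 nats, the I_xy the test asserts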
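Finally, the open question recorded in patch 02 (`# XXX: should mutual_info_regression be fixed to avoid up-casting float32 inputs to float64?`) is easy to observe directly. The snippet below is only an illustration of the behavior the series pins down with `assert mi.dtype == np.float64`, not a proposed fix:

    import numpy as np
    from sklearn.feature_selection import mutual_info_regression

    rng = np.random.RandomState(0)
    X = rng.rand(200, 3).astype(np.float32)
    y = (X[:, 0] + 0.1 * rng.rand(200)).astype(np.float32)

    mi = mutual_info_regression(X, y, random_state=0)
    # At the time of this series the estimate comes back as float64
    # even for float32 inputs, which is what the XXX comment questions.
    print(mi.dtype)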