From b7df9fc1d70128a16abbbd811ee5512c406de504 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 27 Oct 2020 12:01:40 +0100 Subject: [PATCH 01/11] Raise error, add a test. --- sklearn/neighbors/_binary_tree.pxi | 2 +- sklearn/neighbors/tests/test_ball_tree.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index a632477c936fb..0fda9a7abc38c 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -1058,7 +1058,7 @@ cdef class BinaryTree: n_samples = data.shape[0] n_features = data.shape[1] - self.data_arr = np.asarray(data, dtype=DTYPE, order='C') + self.data_arr = check_array(data, dtype=DTYPE, order='C') self.leaf_size = leaf_size self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 8da703dbe207d..ff3b142423250 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -65,3 +65,13 @@ def test_query_haversine(): assert_array_almost_equal(dist1, dist2) assert_array_almost_equal(ind1, ind2) + + +def test_array_object_type(): + X = [(1,2,3), (2,5), (5,5,1,2)] + Y = np.array(X) + with pytest.raises( + ValueError, + match="setting an array element with a sequence" + ): + BallTree(Y) From 51173167aaf919992803d8340aa120a36fce26ce Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 27 Oct 2020 12:07:37 +0100 Subject: [PATCH 02/11] Temporary what's new. --- doc/whats_new/v0.24.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index f2a05c3e73228..ede1d05180e45 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -574,6 +574,11 @@ Changelog the data intrinsic dimensionality is too high for tree-based methods. :pr:`17148` by :user:`Geoffrey Bolmier `. +- |Fix| :class:`neighbors.BinaryTree` + will raise a `ValueError` when fitting on data array having points with + different dimensions. + :pr:`` by :user:`Chiara Marmo `. + - |Fix| :class:`neighbors.NearestCentroid` with a numerical `shrink_threshold` will raise a `ValueError` when fitting on data with all constant features. :pr:`18370` by :user:`Trevor Waite `. From cec423f204a354926447f102abdb2636dbef9b0b Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 27 Oct 2020 12:13:56 +0100 Subject: [PATCH 03/11] Fix lint error. Final what's new. --- doc/whats_new/v0.24.rst | 2 +- sklearn/neighbors/tests/test_ball_tree.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index ede1d05180e45..300db944cfb08 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -577,7 +577,7 @@ Changelog - |Fix| :class:`neighbors.BinaryTree` will raise a `ValueError` when fitting on data array having points with different dimensions. - :pr:`` by :user:`Chiara Marmo `. + :pr:`18691` by :user:`Chiara Marmo `. - |Fix| :class:`neighbors.NearestCentroid` with a numerical `shrink_threshold` will raise a `ValueError` when fitting on data with all constant features. diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index ff3b142423250..8d04d61a3f613 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -68,7 +68,7 @@ def test_query_haversine(): def test_array_object_type(): - X = [(1,2,3), (2,5), (5,5,1,2)] + X = [(1, 2, 3), (2, 5), (5, 5, 1, 2)] Y = np.array(X) with pytest.raises( ValueError, From 8fbc490c39e644e67dcb01a3296d6f9f43b10e6c Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 27 Oct 2020 16:57:25 +0100 Subject: [PATCH 04/11] Add dtype in tests, following NEP 34. --- sklearn/neighbors/tests/test_ball_tree.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 8d04d61a3f613..353bf483a7cd5 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -69,7 +69,8 @@ def test_query_haversine(): def test_array_object_type(): X = [(1, 2, 3), (2, 5), (5, 5, 1, 2)] - Y = np.array(X) + # dtype parameter enforced by NEP 34 + Y = np.array(X, dtype=object) with pytest.raises( ValueError, match="setting an array element with a sequence" From bce45b509e824f783d2b605f8e4cc5ecfb88332e Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 28 Oct 2020 12:18:59 +0100 Subject: [PATCH 05/11] Move validation before shape call. --- sklearn/neighbors/_binary_tree.pxi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 0fda9a7abc38c..68f425da027f2 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -1055,10 +1055,11 @@ cdef class BinaryTree: if leaf_size < 1: raise ValueError("leaf_size must be greater than or equal to 1") + self.data_arr = check_array(data, dtype=DTYPE, order='C') + n_samples = data.shape[0] n_features = data.shape[1] - self.data_arr = check_array(data, dtype=DTYPE, order='C') self.leaf_size = leaf_size self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ From dfbdeda9a24aa2d2100ea8782a8754425150919f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 28 Oct 2020 15:01:45 +0100 Subject: [PATCH 06/11] Address comments. Add tests. --- sklearn/neighbors/_binary_tree.pxi | 13 ++++++------- sklearn/neighbors/tests/test_ball_tree.py | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 68f425da027f2..7117082ca27f2 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -1049,18 +1049,17 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): # validate data - if data.size == 0: + self.data_arr = check_array(data, dtype=DTYPE, order='C') + if self.data_arr.size == 0: raise ValueError("X is an empty array") + n_samples = self.data_arr.shape[0] + n_features = self.data_arr.shape[1] + if leaf_size < 1: raise ValueError("leaf_size must be greater than or equal to 1") - - self.data_arr = check_array(data, dtype=DTYPE, order='C') - - n_samples = data.shape[0] - n_features = data.shape[1] - self.leaf_size = leaf_size + self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 353bf483a7cd5..7da5e94269ae6 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -6,6 +6,8 @@ from sklearn.neighbors._ball_tree import BallTree from sklearn.neighbors import DistanceMetric from sklearn.utils import check_random_state +from sklearn.utils.validation import check_array +from sklearn.utils._testing import _convert_container rng = np.random.RandomState(10) V_mahalanobis = rng.rand(3, 3) @@ -31,15 +33,19 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): + X, Y = check_array(X), check_array(Y) D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) ind = np.argsort(D, axis=1)[:, :k] dist = D[np.arange(Y.shape[0])[:, None], ind] return dist, ind -@pytest.mark.parametrize('metric', - itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) -def test_ball_tree_query_metrics(metric): +@pytest.mark.parametrize( + 'metric', + itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS) +) +@pytest.mark.parametrize("array_type", ["list", "array"]) +def test_ball_tree_query_metrics(metric, array_type): rng = check_random_state(0) if metric in BOOLEAN_METRICS: X = rng.random_sample((40, 10)).round(0) @@ -47,6 +53,8 @@ def test_ball_tree_query_metrics(metric): elif metric in DISCRETE_METRICS: X = (4 * rng.random_sample((40, 10))).round(0) Y = (4 * rng.random_sample((10, 10))).round(0) + X = _convert_container(X, array_type) + Y = _convert_container(Y, array_type) k = 5 @@ -68,8 +76,8 @@ def test_query_haversine(): def test_array_object_type(): + """Check that we do not accept object dtype array.""" X = [(1, 2, 3), (2, 5), (5, 5, 1, 2)] - # dtype parameter enforced by NEP 34 Y = np.array(X, dtype=object) with pytest.raises( ValueError, From f2868e567552081019dbb15ac7a996a3b4cf0dc5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 28 Oct 2020 15:26:24 +0100 Subject: [PATCH 07/11] Address comments. --- sklearn/neighbors/_binary_tree.pxi | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 7117082ca27f2..ddb96a78da443 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -1052,9 +1052,7 @@ cdef class BinaryTree: self.data_arr = check_array(data, dtype=DTYPE, order='C') if self.data_arr.size == 0: raise ValueError("X is an empty array") - - n_samples = self.data_arr.shape[0] - n_features = self.data_arr.shape[1] + n_samples, n_features = self.data_arr.shape if leaf_size < 1: raise ValueError("leaf_size must be greater than or equal to 1") @@ -1069,7 +1067,7 @@ cdef class BinaryTree: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, **DOC_DICT)) - self.dist_metric._validate_data(data) + self.dist_metric._validate_data(self.data_arr) # determine number of levels in the tree, and from this # the number of nodes in the tree. This results in leaf nodes From 23a7c22840b46ba411c625d0088d5d4489abe3be Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 28 Oct 2020 15:46:32 +0100 Subject: [PATCH 08/11] Revert some comments. --- sklearn/neighbors/_binary_tree.pxi | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index ddb96a78da443..b1d1290f3b759 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -1052,7 +1052,9 @@ cdef class BinaryTree: self.data_arr = check_array(data, dtype=DTYPE, order='C') if self.data_arr.size == 0: raise ValueError("X is an empty array") - n_samples, n_features = self.data_arr.shape + + n_samples = self.data_arr.shape[0] + n_features = self.data_arr.shape[1] if leaf_size < 1: raise ValueError("leaf_size must be greater than or equal to 1") From c66f8bdb4fb2ffb5d9a3d1116547c40c11cd8064 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 28 Oct 2020 21:19:05 +0100 Subject: [PATCH 09/11] Reduce variables in test. --- sklearn/neighbors/tests/test_ball_tree.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 7da5e94269ae6..ae88c71ff497b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -77,10 +77,9 @@ def test_query_haversine(): def test_array_object_type(): """Check that we do not accept object dtype array.""" - X = [(1, 2, 3), (2, 5), (5, 5, 1, 2)] - Y = np.array(X, dtype=object) + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises( ValueError, match="setting an array element with a sequence" ): - BallTree(Y) + BallTree(X) From 796ae8b5c56503ca674d84aa7551f8ab1e23fe69 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 30 Oct 2020 15:58:52 +0100 Subject: [PATCH 10/11] Add a case test for KDTree. --- sklearn/neighbors/tests/test_kd_tree.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index c5d30d0b179d3..1161c2757e593 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,6 +1,20 @@ +import numpy as np +import pytest + +from sklearn.neighbors._kd_tree import KDTree + DIMENSION = 3 METRICS = {'euclidean': {}, 'manhattan': {}, 'chebyshev': {}, 'minkowski': dict(p=3)} + +def test_array_object_type(): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises( + ValueError, + match="setting an array element with a sequence" + ): + KDTree(X) From 5a99ed9a2dd16026813c62f9030674b37fcbdaef Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 30 Oct 2020 16:00:55 +0100 Subject: [PATCH 11/11] Fix lint error. --- sklearn/neighbors/tests/test_kd_tree.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 1161c2757e593..8b013cae522b8 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -10,6 +10,7 @@ 'chebyshev': {}, 'minkowski': dict(p=3)} + def test_array_object_type(): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)