
MNT: Remove duplicated data validation done in internally used BinaryTrees #19418


Closed
Changes from all commits (30 commits)
011c8d2  TST Adds failing test (thomasjpfan, Nov 2, 2020)
59b4a8e  ENH Makes copies of global configuration per thread (thomasjpfan, Nov 2, 2020)
f6a1357  TST Skip loky tests for joblib version < 0.12 (thomasjpfan, Nov 2, 2020)
bddc33f  Merge remote-tracking branch 'upstream/master' into thread_safe_config (thomasjpfan, Nov 13, 2020)
e7de9a0  Merge remote-tracking branch 'upstream/master' into thread_safe_config (thomasjpfan, Nov 18, 2020)
02e738f  Merge remote-tracking branch 'upstream/master' into thread_safe_config (thomasjpfan, Nov 25, 2020)
99a4ce1  ENH Small refactor (thomasjpfan, Nov 26, 2020)
966498d  DOC Update docstring of test (thomasjpfan, Nov 26, 2020)
df291d9  DOC Update names (thomasjpfan, Nov 26, 2020)
1828e00  Remove data validation in NeighborsBase's BinaryTrees (jjerphan, Feb 9, 2021)
e6959e3  fixup! Remove data validation in NeighborsBase's BinaryTrees (jjerphan, Feb 9, 2021)
d9c900c  Remove data validation in KernelDensity's BinaryTrees (jjerphan, Feb 9, 2021)
d88970c  fixup! Remove data validation in KernelDensity's BinaryTrees (jjerphan, Feb 9, 2021)
aae2ad3  Remove data validation in feature_selection's KDTrees (jjerphan, Feb 9, 2021)
32a5cff  Merge branch 'main' into remove_validation_internal_trees (jjerphan, Apr 7, 2021)
bba6465  Merge remote-tracking branch 'upstream/main' into thread_safe_config (thomasjpfan, Apr 9, 2021)
ba93b54  DOC Updates changelog and docstring about threadsafeness (thomasjpfan, Apr 9, 2021)
ea516c4  DOC Update test's docstring (thomasjpfan, Apr 9, 2021)
6fe0619  ENH Cleaner code (thomasjpfan, Apr 9, 2021)
91d18c1  DOC Better docstring (thomasjpfan, Apr 9, 2021)
8e733cb  Merge remote-tracking branch 'upstream/main' into thread_safe_config (thomasjpfan, Apr 9, 2021)
8dd54e4  Merge branch 'thread_safe_config' into remove_validation_internal_trees (jjerphan, Apr 9, 2021)
7cec6bd  Merge correction (jjerphan, Apr 9, 2021)
22a6a2f  Merge branch 'main' into remove_validation_internal_trees (jjerphan, Apr 27, 2021)
156eff7  fixup! Merge branch 'main' into remove_validation_internal_trees (jjerphan, May 5, 2021)
dcc9995  Remove data validation on _compute_mi_cd's KDTrees (jjerphan, May 9, 2021)
ff32212  Merge branch 'main' into remove_validation_internal_trees (jjerphan, May 9, 2021)
903125f  Merge branch 'main' into remove_validation_internal_trees (jjerphan, May 21, 2021)
c882688  Move context managers right before the parallel call site (jjerphan, Jun 14, 2021)
9cb0d6b  Merge branch 'main' into remove_validation_internal_trees (jjerphan, Aug 5, 2021)
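
For context, the whole change relies on scikit-learn's global assume_finite configuration flag: check_array consults it before running its finiteness scan, so wrapping an already-validated call site in config_context(assume_finite=True) drops only the redundant pass over the data. A minimal sketch of that mechanism, using only the public sklearn.config_context and sklearn.utils.check_array APIs (not part of this diff):

# sketch_assume_finite.py
import numpy as np

import sklearn
from sklearn.utils import check_array

X = np.array([[0.0, np.nan]])

try:
    check_array(X)  # default config: the finiteness scan rejects NaN
except ValueError as exc:
    print("rejected:", exc)

with sklearn.config_context(assume_finite=True):
    check_array(X)  # the finiteness scan is skipped; no error is raised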
47 changes: 29 additions & 18 deletions sklearn/feature_selection/_mutual_info.py
@@ -5,6 +5,7 @@
 from scipy.sparse import issparse
 from scipy.special import digamma

+from .. import config_context
 from ..metrics.cluster import mutual_info_score
 from ..neighbors import NearestNeighbors, KDTree
 from ..preprocessing import scale
@@ -57,22 +58,27 @@ def _compute_mi_cc(x, y, n_neighbors):
     radius = nn.kneighbors()[0]
     radius = np.nextafter(radius[:, -1], 0)

-    # KDTree is explicitly fit to allow for the querying of number of
-    # neighbors within a specified radius
-    kd = KDTree(x, metric="chebyshev")
-    nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
-    nx = np.array(nx) - 1.0
-
-    kd = KDTree(y, metric="chebyshev")
-    ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
-    ny = np.array(ny) - 1.0
-
-    mi = (
-        digamma(n_samples)
-        + digamma(n_neighbors)
-        - np.mean(digamma(nx + 1))
-        - np.mean(digamma(ny + 1))
-    )
+    with config_context(assume_finite=True):
+        # We remove the validation of x done at the beginning of
+        # KDTree.__init__ and KDTree.query_radius as x already got validated
+        # at the beginning of feature_selection._estimate_mi.
+
+        # KDTree is explicitly fit to allow for the querying of number of
+        # neighbors within a specified radius
+        kd = KDTree(x, metric="chebyshev")
+        nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
+        nx = np.array(nx) - 1.0
+
+        kd = KDTree(y, metric="chebyshev")
+        ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
+        ny = np.array(ny) - 1.0
+
+        mi = (
+            digamma(n_samples)
+            + digamma(n_neighbors)
+            - np.mean(digamma(nx + 1))
+            - np.mean(digamma(ny + 1))
+        )

     return max(0, mi)

@@ -136,8 +142,13 @@ def _compute_mi_cd(c, d, n_neighbors):
     c = c[mask]
     radius = radius[mask]

-    kd = KDTree(c)
-    m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
+    with config_context(assume_finite=True):
+        # We remove the validation of c done at the beginning of
+        # KDTree.__init__ and KDTree.query_radius as c already got validated
+        # at the beginning of feature_selection._estimate_mi.
+        kd = KDTree(c)
+        m_all = kd.query_radius(c, radius, count_only=True,
+                                return_distance=False)
     m_all = np.array(m_all) - 1.0

     mi = (
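
The pattern above, in isolation: once the caller has validated x, the KDTree constructor and query_radius can run under assume_finite=True without repeating the finiteness scan. A minimal sketch (not part of the diff; it assumes, as this PR does, that the BinaryTree's internal check_array honors the global flag):

# sketch_kdtree_pattern.py
import numpy as np

from sklearn import config_context
from sklearn.neighbors import KDTree
from sklearn.utils import check_array

x = check_array(np.random.RandomState(0).randn(1000, 2))  # validated once

with config_context(assume_finite=True):
    # Both calls consult the global config and skip re-validation of x.
    kd = KDTree(x, metric="chebyshev")
    counts = kd.query_radius(x, r=0.1, count_only=True)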
72 changes: 44 additions & 28 deletions sklearn/neighbors/_base.py
@@ -19,6 +19,7 @@

 from ._ball_tree import BallTree
 from ._kd_tree import KDTree
+from .. import config_context
 from ..base import BaseEstimator, MultiOutputMixin
 from ..base import is_classifier
 from ..metrics import pairwise_distances_chunked
@@ -542,24 +543,28 @@ def _fit(self, X, y=None):
         else:
             self._fit_method = "brute"

-        if self._fit_method == "ball_tree":
-            self._tree = BallTree(
-                X,
-                self.leaf_size,
-                metric=self.effective_metric_,
-                **self.effective_metric_params_,
-            )
-        elif self._fit_method == "kd_tree":
-            self._tree = KDTree(
-                X,
-                self.leaf_size,
-                metric=self.effective_metric_,
-                **self.effective_metric_params_,
-            )
-        elif self._fit_method == "brute":
-            self._tree = None
-        else:
-            raise ValueError("algorithm = '%s' not recognized" % self.algorithm)
+        with config_context(assume_finite=True):
+            # In the following cases, we remove the validation of X done at
+            # the beginning of the BinaryTree's constructors as X already got
+            # validated when calling this method, NeighborsBase._fit.
jjerphan (Member, Author) commented on lines +546 to +549:

The checks' running time (represented by the small spike at the beginning) is negligible here.
[Screenshot: giltracer trace from 2021-08-06 showing a small spike at the start of the run]

Script:
# profile.py
from sklearn.datasets import make_regression
from sklearn.neighbors import NearestNeighbors


def main(args=None):
    X_train, _ = make_regression(n_samples=100_000, n_features=10)
    X_test, _ = make_regression(n_samples=100_000, n_features=10)

    nn = NearestNeighbors(algorithm='kd_tree').fit(X_train)
    nn.kneighbors(X_test, n_neighbors=2)


if __name__ == "__main__":
    main()
giltracer --state-detect profile.py
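
A complementary way to estimate the avoided cost without a profiler (a hedged sketch, not from the PR; absolute numbers vary by machine): time a bare check_array pass over data of the same shape as in the script above. Each redundant validation is roughly one such pass.

# sketch_validation_cost.py
import numpy as np
from timeit import timeit

from sklearn.utils import check_array

X = np.random.RandomState(0).randn(100_000, 10)

# One redundant validation is roughly one full pass over the data.
per_call = timeit(lambda: check_array(X), number=100) / 100
print(f"check_array on shape (100000, 10): {per_call * 1e3:.2f} ms per call")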

if self._fit_method == "ball_tree":
self._tree = BallTree(
X,
self.leaf_size,
metric=self.effective_metric_,
**self.effective_metric_params_,
)
elif self._fit_method == "kd_tree":
self._tree = KDTree(
X,
self.leaf_size,
metric=self.effective_metric_,
**self.effective_metric_params_,
)
elif self._fit_method == "brute":
self._tree = None
else:
raise ValueError("algorithm = '%s' not recognized" % self.algorithm)

if self.n_neighbors is not None:
if self.n_neighbors <= 0:
@@ -770,12 +775,18 @@ class from an array representing our data set and ask who's
                 parallel_kwargs = {"backend": "threading"}
             else:
                 parallel_kwargs = {"prefer": "threads"}
-            chunked_results = Parallel(n_jobs, **parallel_kwargs)(
-                delayed(_tree_query_parallel_helper)(
-                    self._tree, X[s], n_neighbors, return_distance
-                )
-                for s in gen_even_slices(X.shape[0], n_jobs)
-            )
+
+            with config_context(assume_finite=True):
+                # We remove the validation of the query points
+                # (in *parallel_kwargs) done at the beginning of
+                # BinaryTree.query as those points already got
+                # validated in the caller.
jjerphan (Member, Author) commented on lines +779 to +783:

The checks' running time (represented by the small spike at the beginning) is negligible here too.
[Screenshot: giltracer trace from 2021-08-06 showing a small spike at the start of the run]

Script: same profiling script as above (profile.py, run with giltracer --state-detect profile.py).
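
Why the context manager sits immediately before the Parallel call site (commit c882688): with the per-thread configuration from the thread_safe_config commits merged above, the active config has to be captured in the calling thread and re-entered in whichever thread runs the task. A hedged sketch of such a capture-and-restore wrapper follows; the helper name with_current_config is hypothetical, and scikit-learn's actual mechanism is its own wrapper around joblib.delayed.

# sketch_config_capture.py
import functools

import sklearn

def with_current_config(func):
    # Hypothetical helper, for illustration only.
    config = sklearn.get_config()  # snapshot taken in the calling thread

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Re-enter the captured config in whatever thread runs the task.
        with sklearn.config_context(**config):
            return func(*args, **kwargs)

    return wrapper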

+            chunked_results = Parallel(n_jobs, **parallel_kwargs)(
+                delayed(_tree_query_parallel_helper)(
+                    self._tree, X[s], n_neighbors, return_distance
+                )
+                for s in gen_even_slices(X.shape[0], n_jobs)
+            )
         else:
             raise ValueError("internal: _fit_method not recognized")

@@ -1108,12 +1119,17 @@ class from an array representing our data set and ask who's
             else:
                 parallel_kwargs = {"prefer": "threads"}

-            chunked_results = Parallel(n_jobs, **parallel_kwargs)(
-                delayed_query(
-                    self._tree, X[s], radius, return_distance, sort_results=sort_results
-                )
-                for s in gen_even_slices(X.shape[0], n_jobs)
-            )
+            with config_context(assume_finite=True):
+                # We remove the validation of the query points
+                # (in *parallel_kwargs) done at the beginning of
+                # BinaryTree.query_radius as those points already
+                # got validated in the caller.
jjerphan (Member, Author) commented on lines +1122 to +1126:

The checks' running time is similarly negligible here.

Script: same profiling script as above (profile.py, run with giltracer --state-detect profile.py).

+            chunked_results = Parallel(n_jobs, **parallel_kwargs)(
+                delayed_query(
+                    self._tree, X[s], radius, return_distance, sort_results=sort_results
+                )
+                for s in gen_even_slices(X.shape[0], n_jobs)
+            )
         if return_distance:
             neigh_ind, neigh_dist = tuple(zip(*chunked_results))
             results = np.hstack(neigh_dist), np.hstack(neigh_ind)
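
What makes the wrapping safe in the first place: after the thread_safe_config commits merged into this branch, the configuration is thread-local, so assume_finite=True set around an internal call cannot leak into unrelated threads. A small sketch of that behavior (assuming the per-thread copy semantics those commits describe):

# sketch_thread_local_config.py
import threading

import sklearn

def report(label):
    print(label, sklearn.get_config()["assume_finite"])

with sklearn.config_context(assume_finite=True):
    report("caller thread")  # True: set by the surrounding context

    # A freshly spawned thread starts from its own copy of the defaults,
    # so the caller's assume_finite=True does not leak into it.
    t = threading.Thread(target=report, args=("new thread",))
    t.start()
    t.join()  # prints: new thread False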
24 changes: 15 additions & 9 deletions sklearn/neighbors/_kde.py
@@ -6,6 +6,8 @@

 import numpy as np
 from scipy.special import gammainc
+
+from .. import config_context
 from ..base import BaseEstimator
 from ..utils import check_random_state
 from ..utils.validation import _check_sample_weight, check_is_fitted
@@ -227,15 +229,19 @@ def score_samples(self, X):
         else:
             N = self.tree_.sum_weight
         atol_N = self.atol * N
-        log_density = self.tree_.kernel_density(
-            X,
-            h=self.bandwidth,
-            kernel=self.kernel,
-            atol=atol_N,
-            rtol=self.rtol,
-            breadth_first=self.breadth_first,
-            return_log=True,
-        )
+        with config_context(assume_finite=True):
+            # We remove the validation of X done at the beginning of
+            # BinaryTree.kernel_density as X already got validated at the
+            # beginning of this method, KernelDensity.score_samples.
+            log_density = self.tree_.kernel_density(
+                X,
+                h=self.bandwidth,
+                kernel=self.kernel,
+                atol=atol_N,
+                rtol=self.rtol,
+                breadth_first=self.breadth_first,
+                return_log=True,
+            )
         log_density -= np.log(N)
         return log_density

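
End to end, the KernelDensity change is invisible to callers: score_samples validates X once, and the internal tree query then runs with validation suppressed. A short usage sketch (standard public API, not part of the diff):

# sketch_kde_usage.py
import numpy as np

from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X = rng.randn(500, 2)

kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(X)
# X is validated once inside score_samples; the internal
# BinaryTree.kernel_density call no longer re-validates it.
log_density = kde.score_samples(X[:5])
print(log_density)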