Skip to content

Commit 1639749

Browse files
authored
Merge branch 'main' into MAINT-parameters-validation-for-covariance.empirical_covariance
2 parents c08abba + 0266481 commit 1639749

File tree

10 files changed

+134
-54
lines changed

10 files changed

+134
-54
lines changed

SECURITY.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
| Version | Supported |
66
| --------- | ------------------ |
7-
| 1.1.3 | :white_check_mark: |
8-
| < 1.1.3 | :x: |
7+
| 1.2.0 | :white_check_mark: |
8+
| < 1.2.0 | :x: |
99

1010
## Reporting a Vulnerability
1111

build_tools/azure/install.sh

-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ setup_ccache() {
2929

3030
pre_python_environment_install() {
3131
if [[ "$DISTRIB" == "ubuntu" ]]; then
32-
sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test
3332
sudo apt-get update
3433
sudo apt-get install python3-scipy python3-matplotlib \
3534
libatlas3-base libatlas-base-dev python3-virtualenv ccache

doc/computing/parallelism.rst

+7
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,13 @@ When this environment variable is set to a non zero value, the `Cython`
299299
derivative, `boundscheck` is set to `True`. This is useful for finding
300300
segfaults.
301301

302+
`SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS`
303+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
304+
305+
When this environment variable is set to a non zero value, the debug symbols
306+
will be included in the compiled C extensions. Only debug symbols for POSIX
307+
systems are configured.
308+
302309
`SKLEARN_PAIRWISE_DIST_CHUNK_SIZE`
303310
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
304311

doc/developers/maintainer.rst

+2
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,8 @@ The following GitHub checklist might be helpful in a release PR::
310310
* [ ] upload the wheels and source tarball to PyPI
311311
* [ ] https://github.com/scikit-learn/scikit-learn/releases publish (except for RC)
312312
* [ ] announce on the mailing list, Twitter, and LinkedIn
313+
* [ ] update symlink for stable in
314+
https://github.com/scikit-learn/scikit-learn.github.io (only major/minor)
313315
* [ ] update SECURITY.md in main branch (except for RC)
314316

315317
Merging Pull Requests

setup.py

+12
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@
9898
"sklearn.metrics._pairwise_distances_reduction._argkmin",
9999
"sklearn.metrics._pairwise_distances_reduction._radius_neighbors",
100100
"sklearn.metrics._pairwise_fast",
101+
"sklearn.neighbors._ball_tree",
102+
"sklearn.neighbors._kd_tree",
101103
"sklearn.neighbors._partition_nodes",
102104
"sklearn.tree._splitter",
103105
"sklearn.tree._utils",
@@ -509,6 +511,16 @@ def configure_extension_modules():
509511
default_extra_compile_args = [f"/{optimization_level}"]
510512
default_libraries = []
511513

514+
build_with_debug_symbols = (
515+
os.environ.get("SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS", "0") != "0"
516+
)
517+
if os.name == "posix":
518+
if build_with_debug_symbols:
519+
default_extra_compile_args.append("-g")
520+
else:
521+
# Setting -g0 will strip symbols, reducing the binary size of extensions
522+
default_extra_compile_args.append("-g0")
523+
512524
cython_exts = []
513525
for submodule, extensions in extension_config.items():
514526
submodule_parts = submodule.split(".")

sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp

+2-5
Original file line numberDiff line numberDiff line change
@@ -330,11 +330,8 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
330330
metric_kwargs=None,
331331
):
332332
if (
333-
metric_kwargs is not None and
334-
len(metric_kwargs) > 0 and (
335-
"Y_norm_squared" not in metric_kwargs or
336-
"X_norm_squared" not in metric_kwargs
337-
)
333+
isinstance(metric_kwargs, dict) and
334+
(metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"})
338335
):
339336
warnings.warn(
340337
f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "

sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp

+2-5
Original file line numberDiff line numberDiff line change
@@ -336,11 +336,8 @@ cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}
336336
metric_kwargs=None,
337337
):
338338
if (
339-
metric_kwargs is not None and
340-
len(metric_kwargs) > 0 and (
341-
"Y_norm_squared" not in metric_kwargs or
342-
"X_norm_squared" not in metric_kwargs
343-
)
339+
isinstance(metric_kwargs, dict) and
340+
(metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"})
344341
):
345342
warnings.warn(
346343
f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't "

sklearn/metrics/_ranking.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
import warnings
2323
from functools import partial
24-
from numbers import Integral
24+
from numbers import Real
2525

2626
import numpy as np
2727
from scipy.sparse import csr_matrix, issparse
@@ -723,7 +723,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):
723723
y_score : ndarray of shape (n_samples,)
724724
Estimated probabilities or output of a decision function.
725725
726-
pos_label : int or str, default=None
726+
pos_label : int, float, bool or str, default=None
727727
The label of the positive class.
728728
729729
sample_weight : array-like of shape (n_samples,), default=None
@@ -908,7 +908,7 @@ def precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight
908908
{
909909
"y_true": ["array-like"],
910910
"y_score": ["array-like"],
911-
"pos_label": [Integral, str, None],
911+
"pos_label": [Real, str, "boolean", None],
912912
"sample_weight": ["array-like", None],
913913
"drop_intermediate": ["boolean"],
914914
}
@@ -933,7 +933,7 @@ def roc_curve(
933933
class, confidence values, or non-thresholded measure of decisions
934934
(as returned by "decision_function" on some classifiers).
935935
936-
pos_label : int or str, default=None
936+
pos_label : int, float, bool or str, default=None
937937
The label of the positive class.
938938
When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},
939939
``pos_label`` is set to 1, otherwise an error will be raised.

sklearn/metrics/tests/test_pairwise_distances_reduction.py

+67-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import itertools
22
import re
3+
import warnings
34
from collections import defaultdict
45

56
import numpy as np
@@ -620,19 +621,44 @@ def test_argkmin_factory_method_wrong_usages():
620621
with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
621622
ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric)
622623

624+
# A UserWarning must be raised in this case.
623625
unused_metric_kwargs = {"p": 3}
624626

625-
message = (
626-
r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this"
627-
r" case \("
628-
r"EuclideanArgKmin64."
629-
)
627+
message = r"Some metric_kwargs have been passed \({'p': 3}\) but"
630628

631629
with pytest.warns(UserWarning, match=message):
632630
ArgKmin.compute(
633631
X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs
634632
)
635633

634+
# A UserWarning must be raised in this case.
635+
metric_kwargs = {
636+
"p": 3, # unused
637+
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
638+
}
639+
640+
message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"
641+
642+
with pytest.warns(UserWarning, match=message):
643+
ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
644+
645+
# No user warning must be raised in this case.
646+
metric_kwargs = {
647+
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
648+
}
649+
with warnings.catch_warnings():
650+
warnings.simplefilter("error", category=UserWarning)
651+
ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
652+
653+
# No user warning must be raised in this case.
654+
metric_kwargs = {
655+
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
656+
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
657+
}
658+
with warnings.catch_warnings():
659+
warnings.simplefilter("error", category=UserWarning)
660+
ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
661+
636662

637663
def test_radius_neighbors_factory_method_wrong_usages():
638664
rng = np.random.RandomState(1)
@@ -683,16 +709,48 @@ def test_radius_neighbors_factory_method_wrong_usages():
683709

684710
unused_metric_kwargs = {"p": 3}
685711

686-
message = (
687-
r"Some metric_kwargs have been passed \({'p': 3}\) but aren't usable for this"
688-
r" case \(EuclideanRadiusNeighbors64"
689-
)
712+
# A UserWarning must be raised in this case.
713+
message = r"Some metric_kwargs have been passed \({'p': 3}\) but"
690714

691715
with pytest.warns(UserWarning, match=message):
692716
RadiusNeighbors.compute(
693717
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs
694718
)
695719

720+
# A UserWarning must be raised in this case.
721+
metric_kwargs = {
722+
"p": 3, # unused
723+
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
724+
}
725+
726+
message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"
727+
728+
with pytest.warns(UserWarning, match=message):
729+
RadiusNeighbors.compute(
730+
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
731+
)
732+
733+
# No user warning must be raised in this case.
734+
metric_kwargs = {
735+
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
736+
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
737+
}
738+
with warnings.catch_warnings():
739+
warnings.simplefilter("error", category=UserWarning)
740+
RadiusNeighbors.compute(
741+
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
742+
)
743+
744+
# No user warning must be raised in this case.
745+
metric_kwargs = {
746+
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
747+
}
748+
with warnings.catch_warnings():
749+
warnings.simplefilter("error", category=UserWarning)
750+
RadiusNeighbors.compute(
751+
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
752+
)
753+
696754

697755
@pytest.mark.parametrize(
698756
"n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]

sklearn/neighbors/_binary_tree.pxi

+36-28
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ from ..utils._typedefs import DTYPE, ITYPE
166166
from ..utils._heap cimport heap_push
167167
from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort
168168

169+
# TODO: use cnp.PyArray_ENABLEFLAGS when Cython>=3.0 is used.
169170
cdef extern from "numpy/arrayobject.h":
170171
void PyArray_ENABLEFLAGS(cnp.ndarray arr, int flags)
171172

@@ -511,8 +512,8 @@ cdef class NeighborsHeap:
511512
n_nbrs : int
512513
the size of each heap.
513514
"""
514-
cdef cnp.ndarray distances_arr
515-
cdef cnp.ndarray indices_arr
515+
cdef DTYPE_t[:, ::1] distances_arr
516+
cdef ITYPE_t[:, ::1] indices_arr
516517

517518
cdef DTYPE_t[:, ::1] distances
518519
cdef ITYPE_t[:, ::1] indices
@@ -538,7 +539,7 @@ cdef class NeighborsHeap:
538539
"""
539540
if sort:
540541
self._sort()
541-
return self.distances_arr, self.indices_arr
542+
return self.distances_arr.base, self.indices_arr.base
542543

543544
cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1:
544545
"""Return the largest distance in the given row"""
@@ -643,8 +644,8 @@ cdef class NodeHeap:
643644
644645
heap[i].val < min(heap[2 * i + 1].val, heap[2 * i + 2].val)
645646
"""
646-
cdef cnp.ndarray data_arr
647-
cdef NodeHeapData_t[::1] data
647+
cdef NodeHeapData_t[:] data_arr
648+
cdef NodeHeapData_t[:] data
648649
cdef ITYPE_t n
649650

650651
def __cinit__(self):
@@ -660,13 +661,16 @@ cdef class NodeHeap:
660661

661662
cdef int resize(self, ITYPE_t new_size) except -1:
662663
"""Resize the heap to be either larger or smaller"""
663-
cdef NodeHeapData_t *data_ptr
664-
cdef NodeHeapData_t *new_data_ptr
665-
cdef ITYPE_t i
666-
cdef ITYPE_t size = self.data.shape[0]
667-
cdef cnp.ndarray new_data_arr = np.zeros(new_size,
668-
dtype=NodeHeapData)
669-
cdef NodeHeapData_t[::1] new_data = new_data_arr
664+
cdef:
665+
NodeHeapData_t *data_ptr
666+
NodeHeapData_t *new_data_ptr
667+
ITYPE_t i
668+
ITYPE_t size = self.data.shape[0]
669+
NodeHeapData_t[:] new_data_arr = np.zeros(
670+
new_size,
671+
dtype=NodeHeapData,
672+
)
673+
NodeHeapData_t[:] new_data = new_data_arr
670674

671675
if size > 0 and new_size > 0:
672676
data_ptr = &self.data[0]
@@ -769,11 +773,11 @@ VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS)
769773
# Binary Tree class
770774
cdef class BinaryTree:
771775

772-
cdef cnp.ndarray data_arr
773-
cdef cnp.ndarray sample_weight_arr
774-
cdef cnp.ndarray idx_array_arr
775-
cdef cnp.ndarray node_data_arr
776-
cdef cnp.ndarray node_bounds_arr
776+
cdef const DTYPE_t[:, ::1] data_arr
777+
cdef const DTYPE_t[::1] sample_weight_arr
778+
cdef const ITYPE_t[::1] idx_array_arr
779+
cdef const NodeData_t[::1] node_data_arr
780+
cdef const DTYPE_t[:, :, ::1] node_bounds_arr
777781

778782
cdef readonly const DTYPE_t[:, ::1] data
779783
cdef readonly const DTYPE_t[::1] sample_weight
@@ -869,7 +873,7 @@ cdef class BinaryTree:
869873
# Allocate tree-specific data
870874
allocate_data(self, self.n_nodes, n_features)
871875
self._recursive_build(
872-
node_data=self.node_data_arr,
876+
node_data=self.node_data_arr.base,
873877
i_node=0,
874878
idx_start=0,
875879
idx_end=n_samples
@@ -905,15 +909,15 @@ cdef class BinaryTree:
905909
"""
906910
if self.sample_weight is not None:
907911
# pass the numpy array
908-
sample_weight_arr = self.sample_weight_arr
912+
sample_weight_arr = self.sample_weight_arr.base
909913
else:
910914
# pass None to avoid confusion with the empty place holder
911915
# of size 1 from __cinit__
912916
sample_weight_arr = None
913-
return (self.data_arr,
914-
self.idx_array_arr,
915-
self.node_data_arr,
916-
self.node_bounds_arr,
917+
return (self.data_arr.base,
918+
self.idx_array_arr.base,
919+
self.node_data_arr.base,
920+
self.node_bounds_arr.base,
917921
int(self.leaf_size),
918922
int(self.n_levels),
919923
int(self.n_nodes),
@@ -993,8 +997,12 @@ cdef class BinaryTree:
993997
arrays: tuple of array
994998
Arrays for storing tree data, index, node data and node bounds.
995999
"""
996-
return (self.data_arr, self.idx_array_arr,
997-
self.node_data_arr, self.node_bounds_arr)
1000+
return (
1001+
self.data_arr.base,
1002+
self.idx_array_arr.base,
1003+
self.node_data_arr.base,
1004+
self.node_bounds_arr.base,
1005+
)
9981006

9991007
cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
10001008
ITYPE_t size) nogil except -1:
@@ -1340,14 +1348,14 @@ cdef class BinaryTree:
13401348
# make a new numpy array that wraps the existing data
13411349
indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_INTP, indices[i])
13421350
# make sure the data will be freed when the numpy array is garbage collected
1343-
PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_OWNDATA)
1351+
PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA)
13441352
# make sure the data is not freed twice
13451353
indices[i] = NULL
13461354

13471355
# make a new numpy array that wraps the existing data
13481356
distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_DOUBLE, distances[i])
13491357
# make sure the data will be freed when the numpy array is garbage collected
1350-
PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_OWNDATA)
1358+
PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA)
13511359
# make sure the data is not freed twice
13521360
distances[i] = NULL
13531361

@@ -1360,7 +1368,7 @@ cdef class BinaryTree:
13601368
# make a new numpy array that wraps the existing data
13611369
indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_INTP, indices[i])
13621370
# make sure the data will be freed when the numpy array is garbage collected
1363-
PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_OWNDATA)
1371+
PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA)
13641372
# make sure the data is not freed twice
13651373
indices[i] = NULL
13661374

0 commit comments

Comments
 (0)