Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
bb9ec11
MAINT Remove -Wcpp warnings when compiling sklearn.cluster._hierarchi…
OmarManzoor Nov 14, 2022
f183b42
Merge remote-tracking branch 'upstream/main' into cython_hierarchical…
OmarManzoor Nov 14, 2022
c4376ef
Merge remote-tracking branch 'upstream/main' into cython_hierarchical…
OmarManzoor Nov 15, 2022
faaf0f7
Improvements in compute_ward_dist
OmarManzoor Nov 15, 2022
661777a
Addressed PR improvements
OmarManzoor Nov 15, 2022
d17d03c
Merge remote-tracking branch 'upstream/main' into cython_hierarchical…
OmarManzoor Nov 15, 2022
913f2a4
Merge branch 'main' into cython_hierarchical_fast
glemaitre Nov 17, 2022
e8a245b
Addressed PR suggestions
OmarManzoor Nov 17, 2022
f702547
Merge branch 'cython_hierarchical_fast' of https://github.com/OmarMan…
OmarManzoor Nov 17, 2022
b1db35a
Merge remote-tracking branch 'upstream/main' into cython_hierarchical…
OmarManzoor Nov 17, 2022
6dd1728
Merge branch 'main' into cython_hierarchical_fast
jjerphan Nov 18, 2022
ec48a91
Remove aliases
OmarManzoor Nov 18, 2022
fb5a085
Merge branch 'cython_hierarchical_fast' of https://github.com/OmarMan…
OmarManzoor Nov 18, 2022
8083d81
PR suggestion
OmarManzoor Nov 18, 2022
c856fb2
Use def instead of cpdef for functions only used inside python
OmarManzoor Nov 18, 2022
8f0a5bf
Merge main into branch
OmarManzoor Nov 23, 2022
c5ae2d8
Merge remote-tracking branch 'upstream/main' into cython_hierarchical…
OmarManzoor Nov 26, 2022
2eda8ff
Use np.asarray in place of the base attribute in _single_linkage_label
OmarManzoor Nov 26, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
"sklearn.__check_build._check_build",
"sklearn._loss._loss",
"sklearn.cluster._dbscan_inner",
"sklearn.cluster._hierarchical_fast",
"sklearn.cluster._k_means_common",
"sklearn.cluster._k_means_lloyd",
"sklearn.cluster._k_means_elkan",
Expand Down
83 changes: 43 additions & 40 deletions sklearn/cluster/_hierarchical_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@ import numpy as np
cimport numpy as cnp
cimport cython

ctypedef cnp.float64_t DOUBLE
ctypedef cnp.npy_intp INTP
ctypedef cnp.int8_t INT8

cnp.import_array()

from ..metrics._dist_metrics cimport DistanceMetric
Expand All @@ -29,15 +25,17 @@ from numpy.math cimport INFINITY
###############################################################################
# Utilities for computing the ward momentum

def compute_ward_dist(cnp.ndarray[DOUBLE, ndim=1, mode='c'] m_1,
cnp.ndarray[DOUBLE, ndim=2, mode='c'] m_2,
cnp.ndarray[INTP, ndim=1, mode='c'] coord_row,
cnp.ndarray[INTP, ndim=1, mode='c'] coord_col,
cnp.ndarray[DOUBLE, ndim=1, mode='c'] res):
cdef INTP size_max = coord_row.shape[0]
cdef INTP n_features = m_2.shape[1]
cdef INTP i, j, row, col
cdef DOUBLE pa, n
def compute_ward_dist(
const cnp.float64_t[::1] m_1,
const cnp.float64_t[:, ::1] m_2,
const cnp.intp_t[::1] coord_row,
const cnp.intp_t[::1] coord_col,
cnp.float64_t[::1] res
):
cdef cnp.intp_t size_max = coord_row.shape[0]
cdef cnp.intp_t n_features = m_2.shape[1]
cdef cnp.intp_t i, j, row, col
cdef cnp.float64_t pa, n

for i in range(size_max):
row = coord_row[i]
Expand All @@ -47,13 +45,12 @@ def compute_ward_dist(cnp.ndarray[DOUBLE, ndim=1, mode='c'] m_1,
for j in range(n_features):
pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2
res[i] = pa * n
return res


###############################################################################
# Utilities for cutting and exploring a hierarchical tree

def _hc_get_descendent(INTP node, children, INTP n_leaves):
def _hc_get_descendent(cnp.intp_t node, children, cnp.intp_t n_leaves):
"""
Function returning all the descendent leaves of a set of nodes in the tree.

Expand Down Expand Up @@ -82,7 +79,7 @@ def _hc_get_descendent(INTP node, children, INTP n_leaves):
# It is actually faster to do the accounting of the number of
# elements is the list ourselves: len is a lengthy operation on a
# chained list
cdef INTP i, n_indices = 1
cdef cnp.intp_t i, n_indices = 1

while n_indices:
i = ind.pop()
Expand All @@ -95,7 +92,7 @@ def _hc_get_descendent(INTP node, children, INTP n_leaves):
return descendent


def hc_get_heads(cnp.ndarray[INTP, ndim=1] parents, copy=True):
def hc_get_heads(cnp.intp_t[:] parents, copy=True):
"""Returns the heads of the forest, as defined by parents.

Parameters
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can change the INTP below

Expand All @@ -111,7 +108,7 @@ def hc_get_heads(cnp.ndarray[INTP, ndim=1] parents, copy=True):
The indices in the 'parents' of the tree heads

"""
cdef INTP parent, node0, node, size
cdef cnp.intp_t parent, node0, node, size
if copy:
parents = np.copy(parents)
size = parents.size
Expand All @@ -127,8 +124,12 @@ def hc_get_heads(cnp.ndarray[INTP, ndim=1] parents, copy=True):
return parents


def _get_parents(nodes, heads, cnp.ndarray[INTP, ndim=1] parents,
cnp.ndarray[INT8, ndim=1, mode='c'] not_visited):
def _get_parents(
nodes,
heads,
const cnp.intp_t[:] parents,
cnp.int8_t[::1] not_visited
):
"""Returns the heads of the given nodes, as defined by parents.

Modifies 'heads' and 'not_visited' in-place.
Expand All @@ -145,7 +146,7 @@ def _get_parents(nodes, heads, cnp.ndarray[INTP, ndim=1] parents,
The tree nodes to consider (modified inplace)

"""
cdef INTP parent, node
cdef cnp.intp_t parent, node

for node in nodes:
parent = parents[node]
Expand All @@ -155,7 +156,6 @@ def _get_parents(nodes, heads, cnp.ndarray[INTP, ndim=1] parents,
if not_visited[node]:
not_visited[node] = 0
heads.append(node)
return heads


###############################################################################
Expand All @@ -166,9 +166,13 @@ def _get_parents(nodes, heads, cnp.ndarray[INTP, ndim=1] parents,
# as keys and edge weights as values.


def max_merge(IntFloatDict a, IntFloatDict b,
cnp.ndarray[ITYPE_t, ndim=1] mask,
ITYPE_t n_a, ITYPE_t n_b):
def max_merge(
IntFloatDict a,
IntFloatDict b,
const cnp.intp_t[:] mask,
cnp.intp_t n_a,
cnp.intp_t n_b
):
"""Merge two IntFloatDicts with the max strategy: when the same key is
present in the two dicts, the max of the two values is used.

Expand Down Expand Up @@ -219,9 +223,13 @@ def max_merge(IntFloatDict a, IntFloatDict b,
return out_obj


def average_merge(IntFloatDict a, IntFloatDict b,
cnp.ndarray[ITYPE_t, ndim=1] mask,
ITYPE_t n_a, ITYPE_t n_b):
def average_merge(
IntFloatDict a,
IntFloatDict b,
const cnp.intp_t[:] mask,
cnp.intp_t n_a,
cnp.intp_t n_b
):
"""Merge two IntFloatDicts with the average strategy: when the
same key is present in the two dicts, the weighted average of the two
values is used.
Expand Down Expand Up @@ -354,8 +362,7 @@ cdef class UnionFind(object):
return n


cpdef cnp.ndarray[DTYPE_t, ndim=2] _single_linkage_label(
cnp.ndarray[DTYPE_t, ndim=2] L):
def _single_linkage_label(const cnp.float64_t[:, :] L):
"""
Convert an linkage array or MST to a tree by labelling clusters at merges.
This is done by using a Union find structure to keep track of merges
Expand All @@ -375,14 +382,12 @@ cpdef cnp.ndarray[DTYPE_t, ndim=2] _single_linkage_label(
A tree in the format used by scipy.cluster.hierarchy.
"""

cdef cnp.ndarray[DTYPE_t, ndim=2] result_arr
cdef DTYPE_t[:, ::1] result
cdef DTYPE_t[:, ::1] result_arr

cdef ITYPE_t left, left_cluster, right, right_cluster, index
cdef DTYPE_t delta

result_arr = np.zeros((L.shape[0], 4), dtype=DTYPE)
result = result_arr
U = UnionFind(L.shape[0] + 1)

for index in range(L.shape[0]):
Expand All @@ -394,14 +399,14 @@ cpdef cnp.ndarray[DTYPE_t, ndim=2] _single_linkage_label(
left_cluster = U.fast_find(left)
right_cluster = U.fast_find(right)

result[index][0] = left_cluster
result[index][1] = right_cluster
result[index][2] = delta
result[index][3] = U.size[left_cluster] + U.size[right_cluster]
result_arr[index][0] = left_cluster
result_arr[index][1] = right_cluster
result_arr[index][2] = delta
result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster]

U.union(left_cluster, right_cluster)

return result_arr
return np.asarray(result_arr)


@cython.wraparound(True)
Expand Down Expand Up @@ -472,8 +477,6 @@ def mst_linkage_core(
cnp.int8_t[:] in_tree = np.zeros(n_samples, dtype=np.int8)
DTYPE_t[:, ::1] result = np.zeros((n_samples - 1, 3))

cnp.ndarray label_filter

ITYPE_t current_node = 0
ITYPE_t new_node
ITYPE_t i
Expand Down