ENH Add parameters V, VI, p, w to AgglomerativeClustering #30528

Open · wants to merge 2 commits into base: main
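For context, a quick sketch of the usage this PR is meant to enable: metric keyword arguments such as `p` are forwarded down to `DistanceMetric`. Note that, per the diff below, the forwarded arguments only take effect on the fast single-linkage path (no precomputed matrix), and this assumes the estimator's `metric` validation accepts the extra scipy metric names:

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.RandomState(0)
X = rng.rand(20, 3)

# Minkowski with p=3; `p` travels through the new `extra` plumbing
# down to DistanceMetric.get_metric (sketch, untested).
model = AgglomerativeClustering(
    n_clusters=2, linkage="single", metric="minkowski", p=3
)
labels = model.fit_predict(X)
```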
112 changes: 82 additions & 30 deletions sklearn/cluster/_agglomerative.py
@@ -43,7 +43,7 @@
# For non fully-connected graphs


- def _fix_connectivity(X, connectivity, affinity):
+ def _fix_connectivity(X, connectivity, metric):
"""
Fixes the connectivity matrix.

@@ -65,7 +65,7 @@ def _fix_connectivity(X, connectivity, affinity):
be symmetric and only the upper triangular half is used.
Default is `None`, i.e., the Ward algorithm is unstructured.

- affinity : {"euclidean", "precomputed"}, default="euclidean"
+ metric : {"euclidean", "precomputed"}, default="euclidean"
Which affinity to use. At the moment `precomputed` and
``euclidean`` are supported. `euclidean` uses the
negative squared Euclidean distance between points.
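Reviewer side note: the `connectivity` argument handled here usually comes from a neighbors graph built by the caller; standard, pre-existing usage looks like this:

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

rng = np.random.RandomState(0)
X = rng.rand(30, 2)

# Sparse connectivity constraint: merges are only considered along
# edges of the 5-nearest-neighbors graph.
connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
model = AgglomerativeClustering(n_clusters=3, connectivity=connectivity).fit(X)
```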
@@ -112,7 +112,7 @@ def _fix_connectivity(X, connectivity, affinity):
graph=connectivity,
n_connected_components=n_connected_components,
component_labels=labels,
- metric=affinity,
+ metric=metric,
mode="connectivity",
)

@@ -185,10 +185,13 @@ def _single_linkage_tree(
"connectivity": ["array-like", "sparse matrix", None],
"n_clusters": [Interval(Integral, 1, None, closed="left"), None],
"return_distance": ["boolean"],
"extra": [None],
},
prefer_skip_nested_validation=True,
)
- def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
+ def ward_tree(
+ X, *, connectivity=None, n_clusters=None, return_distance=False, extra=None
+ ):
"""Ward clustering based on a Feature matrix.

Recursively merges the pair of clusters that minimally increases
@@ -224,6 +227,9 @@ def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
return_distance : bool, default=False
If `True`, return the distance between the clusters.

+ extra : dict, default=None
+     Additional keyword arguments for the metric. Ignored by `ward_tree`,
+     which always uses the Euclidean metric; accepted for signature
+     compatibility with `linkage_tree`.

Returns
-------
children : ndarray of shape (n_nodes-1, 2)
@@ -319,7 +325,7 @@ def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
return children_, 1, n_samples, None

connectivity, n_connected_components = _fix_connectivity(
- X, connectivity, affinity="euclidean"
+ X, connectivity, metric="euclidean"
)
if n_clusters is None:
n_nodes = 2 * n_samples - 1
@@ -422,14 +428,15 @@ def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
return children, n_connected_components, n_leaves, parent


- # single average and complete linkage
+ # single, average and complete linkage
def linkage_tree(
X,
connectivity=None,
n_clusters=None,
linkage="complete",
affinity="euclidean",
metric="euclidean",
return_distance=False,
+ extra=None,
):
"""Linkage agglomerative clustering based on a Feature matrix.

@@ -469,7 +476,7 @@ def linkage_tree(
- "single" uses the minimum of the distances between all
observations of the two sets.

- affinity : str or callable, default='euclidean'
+ metric : str or callable, default='euclidean'
Which metric to use. Can be 'euclidean', 'manhattan', or any
distance known to paired distance (see metric.pairwise).

@@ -524,7 +531,7 @@
% (linkage_choices.keys(), linkage)
) from e

if affinity == "cosine" and np.any(~np.any(X, axis=1)):
if metric == "cosine" and np.any(~np.any(X, axis=1)):
raise ValueError("Cosine affinity cannot be used when X contains zero vectors")

if connectivity is None:
@@ -543,7 +550,7 @@
stacklevel=2,
)

if affinity == "precomputed":
if metric == "precomputed":
# for the linkage function of hierarchy to work on precomputed
# data, provide as first argument an ndarray of the shape returned
# by sklearn.metrics.pairwise_distances.
@@ -553,23 +560,23 @@
)
i, j = np.triu_indices(X.shape[0], k=1)
X = X[i, j]
elif affinity == "l2":
elif metric == "l2":
# Translate to something understood by scipy
affinity = "euclidean"
elif affinity in ("l1", "manhattan"):
affinity = "cityblock"
elif callable(affinity):
X = affinity(X)
metric = "euclidean"
elif metric in ("l1", "manhattan"):
metric = "cityblock"
elif callable(metric):
X = metric(X)
i, j = np.triu_indices(X.shape[0], k=1)
X = X[i, j]
if (
linkage == "single"
and affinity != "precomputed"
and not callable(affinity)
and affinity in METRIC_MAPPING64
and metric != "precomputed"
and not callable(metric)
and metric in METRIC_MAPPING64
):
# We need the fast cythonized metric from neighbors
- dist_metric = DistanceMetric.get_metric(affinity)
+ dist_metric = DistanceMetric.get_metric(metric, **(extra or {}))

# The Cython routines used require contiguous arrays
X = np.ascontiguousarray(X, dtype=np.double)
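The forwarding above relies on `DistanceMetric.get_metric` accepting metric keyword arguments, which it already does in the public API. A standalone check:

```python
import numpy as np
from sklearn.metrics import DistanceMetric

X = np.array([[0.0, 0.0], [1.0, 2.0], [3.0, 1.0]])

# Standardized Euclidean requires its variance vector V up front.
dist = DistanceMetric.get_metric("seuclidean", V=X.var(axis=0))
D = dist.pairwise(X)  # (3, 3) symmetric distance matrix
```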
@@ -581,7 +588,9 @@
# Convert edge list into standard hierarchical clustering format
out = _hierarchical.single_linkage_label(mst)
else:
- out = hierarchy.linkage(X, method=linkage, metric=affinity)
+ # TODO: hierarchy.linkage does not provide a way to pass extra
+ # arguments to pdist like V for "seuclidean".
+ out = hierarchy.linkage(X, method=linkage, metric=metric)
children_ = out[:, :2].astype(int, copy=False)

if return_distance:
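On the TODO above: until `hierarchy.linkage` grows a way to pass pdist kwargs, callers can precompute the condensed distance matrix themselves. A workaround sketch using plain scipy:

```python
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

rng = np.random.RandomState(0)
X = rng.rand(10, 4)

# Compute standardized Euclidean distances with an explicit V, then
# hand the condensed matrix directly to hierarchy.linkage.
condensed = pdist(X, metric="seuclidean", V=X.var(axis=0))
Z = hierarchy.linkage(condensed, method="average")
```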
@@ -590,7 +599,7 @@
return children_, 1, n_samples, None

connectivity, n_connected_components = _fix_connectivity(
- X, connectivity, affinity=affinity
+ X, connectivity, metric=metric
)
connectivity = connectivity.tocoo()
# Put the diagonal to zero
@@ -600,13 +609,13 @@
connectivity.data = connectivity.data[diag_mask]
del diag_mask

if affinity == "precomputed":
if metric == "precomputed":
distances = X[connectivity.row, connectivity.col].astype(np.float64, copy=False)
else:
# FIXME We compute all the distances, while we could have only computed
# the "interesting" distances
distances = paired_distances(
- X[connectivity.row], X[connectivity.col], metric=affinity
+ X[connectivity.row], X[connectivity.col], metric=metric
)
connectivity.data = distances

@@ -836,11 +845,11 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):

- 'ward' minimizes the variance of the clusters being merged.
- 'average' uses the average of the distances of each observation of
  the two sets.
- 'complete' or 'maximum' linkage uses the maximum distances between
  all observations of the two sets.
- 'single' uses the minimum of the distances between all observations
  of the two sets.

.. versionadded:: 0.20
Added the 'single' option
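For readers comparing the four criteria above, a quick side-by-side on toy 1-D data (existing API, nothing PR-specific):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.array([[0.0], [0.4], [1.9], [2.0], [4.0]])

# Same data, different merge criteria; compare the induced partitions.
for linkage in ("ward", "average", "complete", "single"):
    labels = AgglomerativeClustering(n_clusters=2, linkage=linkage).fit_predict(X)
    print(f"{linkage:>8}: {labels}")
```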
@@ -865,6 +874,26 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
For an example of dendrogram visualization, see
:ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py`.

+ V : ndarray, optional
+     The variance vector for the standardized Euclidean metric
+     ("seuclidean").
+
+     .. versionadded:: 1.7
+
+ VI : ndarray, optional
+     The inverse of the covariance matrix for the Mahalanobis metric.
+
+     .. versionadded:: 1.7
+
+ p : float, optional
+     The p-norm to apply for the Minkowski metric, weighted and
+     unweighted.
+
+     .. versionadded:: 1.7
+
+ w : ndarray, optional
+     The weight vector for metrics that support weights (e.g., Minkowski).
+
+     .. versionadded:: 1.7
+
Attributes
----------
n_clusters_ : int
@@ -899,7 +928,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
The children of each non-leaf node. Values less than `n_samples`
correspond to leaves of the tree which are the original samples.
A node `i` greater than or equal to `n_samples` is a non-leaf
node and has children `children_[i - n_samples]`. Alternatively
at the i-th iteration, children[i][0] and children[i][1]
are merged to form node `n_samples + i`.
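To make the new parameters concrete, a usage sketch for the Mahalanobis case. This assumes the PR as diffed (the kwargs only reach the single-linkage `DistanceMetric` path) and that `metric` validation admits "mahalanobis":

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.RandomState(0)
X = rng.rand(50, 3)

# Mahalanobis distance needs the inverse covariance matrix, passed as VI.
VI = np.linalg.inv(np.cov(X, rowvar=False))
model = AgglomerativeClustering(
    n_clusters=3, linkage="single", metric="mahalanobis", VI=VI
)
labels = model.fit_predict(X)
```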

@@ -939,6 +968,10 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
"linkage": [StrOptions(set(_TREE_BUILDERS.keys()))],
"distance_threshold": [Interval(Real, 0, None, closed="left"), None],
"compute_distances": ["boolean"],
"V": ["array-like", None],
"VI": ["array-like", None],
"p": [None],
"w": ["array-like", None],
}

def __init__(
Expand All @@ -952,6 +985,10 @@ def __init__(
linkage="ward",
distance_threshold=None,
compute_distances=False,
+ V=None,
+ VI=None,
+ p=None,
+ w=None,
):
self.n_clusters = n_clusters
self.distance_threshold = distance_threshold
@@ -961,11 +998,15 @@ def __init__(
self.linkage = linkage
self.metric = metric
self.compute_distances = compute_distances
+ self.V = V
+ self.VI = VI
+ self.p = p
+ self.w = w

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Fit the hierarchical clustering from features, or distance matrix.

Parameters
----------
X : array-like, shape (n_samples, n_features) or \
Expand Down Expand Up @@ -1048,17 +1089,28 @@ def _fit(self, X):
kwargs = {}
if self.linkage != "ward":
kwargs["linkage"] = self.linkage
kwargs["affinity"] = self.metric
kwargs["metric"] = self.metric

distance_threshold = self.distance_threshold

return_distance = (distance_threshold is not None) or self.compute_distances

+ extra_metric_kwargs = {}
+ if self.V is not None:
+ extra_metric_kwargs["V"] = self.V
+ if self.VI is not None:
+ extra_metric_kwargs["VI"] = self.VI
+ if self.p is not None:
+ extra_metric_kwargs["p"] = self.p
+ if self.w is not None:
+ extra_metric_kwargs["w"] = self.w

out = memory.cache(tree_builder)(
X,
connectivity=connectivity,
n_clusters=n_clusters,
return_distance=return_distance,
+ extra=extra_metric_kwargs,
**kwargs,
)
(self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[
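A possible simplification of the `extra_metric_kwargs` block above, if the reviewers prefer it; behaviorally equivalent (sketch):

```python
# Collect only the metric kwargs the user actually set (hypothetical
# alternative to the four explicit `if` statements above).
extra_metric_kwargs = {
    name: value
    for name, value in {"V": self.V, "VI": self.VI, "p": self.p, "w": self.w}.items()
    if value is not None
}
```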
12 changes: 6 additions & 6 deletions sklearn/cluster/tests/test_hierarchical.py
@@ -68,12 +68,12 @@ def test_linkage_misc():
# test hierarchical clustering on a precomputed distances matrix
dis = cosine_distances(X)

res = linkage_tree(dis, affinity="precomputed")
assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
res = linkage_tree(dis, metric="precomputed")
assert_array_equal(res[0], linkage_tree(X, metric="cosine")[0])

# test hierarchical clustering on a precomputed distances matrix
- res = linkage_tree(X, affinity=manhattan_distances)
- assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
+ res = linkage_tree(X, metric=manhattan_distances)
+ assert_array_equal(res[0], linkage_tree(X, metric="manhattan")[0])


def test_structured_linkage_tree():
@@ -143,7 +143,7 @@ def test_zero_cosine_linkage_tree():
X = np.array([[0, 1], [0, 0]])
msg = "Cosine affinity cannot be used when X contains zero vectors"
with pytest.raises(ValueError, match=msg):
linkage_tree(X, affinity="cosine")
linkage_tree(X, metric="cosine")


@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)])
@@ -719,7 +719,7 @@ def increment(self, *args, **kwargs):

fa = FakeAffinity()

- linkage_tree(X, connectivity=connectivity, affinity=fa.increment)
+ linkage_tree(X, connectivity=connectivity, metric=fa.increment)

assert fa.counter == 3
