From ff859efeb1b974ad57807e97253f8c44591db2b6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 6 Aug 2018 12:01:42 +1000 Subject: [PATCH 1/2] FIX IndexError due to imprecision in KMeans++ Fixes #8583 I'm not sure how to test this. --- sklearn/cluster/k_means_.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 444d8fca29f7b..47b5950638db9 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -111,6 +111,9 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): rand_vals = random_state.random_sample(n_local_trials) * current_pot candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals) + # XXX: numerical imprecision can result in a candidate_id out of range + np.clip(candidate_ids, None, len(closest_dist_sq) - 1, + out=candidate_ids) # Compute distances to center candidates distance_to_candidates = euclidean_distances( From 69d93f53cbd7e9ce62838c10789f189942833a22 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 6 Aug 2018 16:42:43 +1000 Subject: [PATCH 2/2] Fix silly error --- sklearn/cluster/k_means_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 47b5950638db9..2aeabc995163e 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -112,7 +112,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals) # XXX: numerical imprecision can result in a candidate_id out of range - np.clip(candidate_ids, None, len(closest_dist_sq) - 1, + np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids) # Compute distances to center candidates