From e0959d88411b055d911d7e24a3a7eb5129364b37 Mon Sep 17 00:00:00 2001
From: Martin Royer
Date: Wed, 7 Nov 2018 16:46:46 +0100
Subject: [PATCH 1/9] halfway into class-ification

---
 pecok/__init__.py   |   2 +-
 pecok/admm.py       |   4 +-
 pecok/clustering.py | 270 +++++++++++++++++++++++++++++++++++++++++---
 pecok/gamma.py      |   4 +-
 tests/test.py       |  10 +-
 5 files changed, 264 insertions(+), 26 deletions(-)

diff --git a/pecok/__init__.py b/pecok/__init__.py
index 214df5c..2e2b868 100644
--- a/pecok/__init__.py
+++ b/pecok/__init__.py
@@ -1,2 +1,2 @@
-from .clustering import pecok_clustering, kmeanz_clustering
+from .clustering import KMeanz, Pecok
diff --git a/pecok/admm.py b/pecok/admm.py
index 491b551..137dc71 100644
--- a/pecok/admm.py
+++ b/pecok/admm.py
@@ -48,7 +48,7 @@ def proj_Snp_imp(Y):
     return Y
 
 
-def pecok_admm(relational_data, n_struct, n_iter_max=-1, rho=5, mat_init=None, verbose=False, eps_residual=1e-4):
+def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, verbose=False, eps_residual=1e-4):
     """Implementation of Alternating Direction Method of Multipliers
 
     Parameters
@@ -71,7 +71,7 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
         n_iter = n_iter + 1
         oldXbar = Xbar
 
-        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_struct)
+        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
         Y = proj_positive(Xbar - V)
         Z = proj_Snp_imp(Xbar - W)
         Xbar = (X + Y + Z)/3
diff --git a/pecok/clustering.py b/pecok/clustering.py
index 8f1e884..c60bd8b 100644
--- a/pecok/clustering.py
+++ b/pecok/clustering.py
@@ -1,30 +1,266 @@
-"""Clustering wrapper"""
+"""PECOK clustering"""
 
-# author: Martin Royer
+# author: Martin Royer
 # License: MIT
 
 import numpy as np
 
-from sklearn.cluster import AgglomerativeClustering
-from scipy.linalg import svd as LINsvd
-from scipy.sparse.linalg import svds as SPAsvd
+from scipy.linalg import svd as lin_svd
+from scipy.sparse.linalg import svds as spa_svd
+
+from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
+from sklearn.cluster import AgglomerativeClustering
 
 from .gamma import gamma_hat
 from .admm import pecok_admm
 
 
-def corrected_relational(obs, int_corr):
-    return (obs.dot(obs.T) - gamma_hat(obs, int_corr=int_corr)) / obs.shape[1]
-
+###############################################################################
+# shared correction
 
-def pecok_clustering(obs, n_struct, int_corr=4, **kwargs):
-    gram_corrected = corrected_relational(obs, int_corr=int_corr)
-    U, _, V = SPAsvd(gram_corrected, k=n_struct)
-    Bhat = pecok_admm(gram_corrected, n_struct=n_struct, mat_init=U.dot(V), **kwargs)
-    return AgglomerativeClustering(linkage='ward', n_clusters=n_struct).fit(Bhat).labels_
+def _corrected_relational(obs, corr):
+    return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
 
-def kmeanz_clustering(obs, n_struct, int_corr=4):
-    gram_corrected = corrected_relational(obs, int_corr=int_corr)
-    U, s, _ = LINsvd(gram_corrected)
+def kmeanz(X, n_clusters, corr):
+    gram_corrected = _corrected_relational(X, corr=corr)
+    U, s, _ = lin_svd(gram_corrected, compute_uv=True)
     approx = U.dot(np.diag(np.sqrt(s)))
-    return AgglomerativeClustering(linkage='ward', n_clusters=n_struct).fit(approx).labels_
+    return approx, AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(approx)
+
+
+def pecok_clustering(obs, n_clusters, corr=4, **kwargs):
+    gram_corrected = _corrected_relational(obs, corr=corr)
+    U, _, V = spa_svd(gram_corrected, k=n_clusters)
+    Bhat = pecok_admm(gram_corrected, n_clusters=n_clusters, mat_init=U.dot(V), **kwargs)
+    return AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(Bhat)
+
+
+class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
+    """PeCoK clustering
+    Read more in [my thesis]
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of centroids to generate.
+    corr : int, optional, default: 0
+        Method for correcting for bias, defaults to 0:
+
+        0: no correction
+        1: bad correction
+        4: good correction
+    init : @TODO{'k-means++', 'random' or an ndarray}
+        Method for initialization, defaults to 'k-means++':
+
+        'k-means++' : selects initial cluster centers for k-means
+        clustering in a smart way to speed up convergence. See section
+        Notes in k_init for more details.
+
+    Attributes
+    ----------
+    corrected_points_: array, [n_points, n_features]
+        Points representation in Z-space
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers in Z-space
+    labels_ :
+        Labels of each point
+    inertia_ : float
+        Sum of squared distances (in Z-space) of samples to their closest cluster center.
+
+    Examples
+    --------
+    TODO
+    >>> from sklearn.cluster import KMeans
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
+    >>> kmeans.labels_
+    array([0, 0, 0, 1, 1, 1], dtype=int32)
+    >>> kmeans.predict([[0, 0], [4, 4]])
+    array([0, 1], dtype=int32)
+    >>> kmeans.cluster_centers_
+    array([[1., 2.],
+           [4., 2.]])
+    """
+
+    def __init__(self, n_clusters=8, corr=0, init='notyet',
+                 verbose=0, random_state=None, copy_x=True,
+                 n_jobs=None, algorithm='auto'):
+
+        self.n_clusters = n_clusters
+        self.corr = corr
+        self.init = init
+        self.verbose = verbose
+        self.random_state = random_state
+        self.copy_x = copy_x
+        self.n_jobs = n_jobs
+        self.algorithm = algorithm
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute PECOK clustering.
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Training instances to cluster. It must be noted that the data
+            will be converted to C ordering, which will cause a memory
+            copy if the given data is not C-contiguous.
+        y : Ignored
+            not used, present here for API consistency by convention.
+        sample_weight : array-like, shape (n_samples,), optional
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight (default: None)
+        """
+
+        # random_state = check_random_state(self.random_state)
+
+        hc_ = \
+            pecok_clustering(
+                X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
+                # init=self.init, n_init=self.n_init,
+                # max_iter=self.max_iter, verbose=self.verbose,
+                # precompute_distances=self.precompute_distances,
+                # tol=self.tol, random_state=random_state, copy_x=self.copy_x,
+                # n_jobs=self.n_jobs, algorithm=self.algorithm,
+                # return_n_iter=True)
+        self.labels_ = hc_.labels_
+        return self
+
+    # def predict(self, X, sample_weight=None):
+    #     """Predict the closest cluster each sample in X belongs to.
+    #     In the vector quantization literature, `cluster_centers_` is called
+    #     the code book and each value returned by `predict` is the index of
+    #     the closest code in the code book.
+    #     Parameters
+    #     ----------
+    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+    #         New data to predict.
+    #     sample_weight : array-like, shape (n_samples,), optional
+    #         The weights for each observation in X. If None, all observations
+    #         are assigned equal weight (default: None)
+    #     Returns
+    #     -------
+    #     labels : array, shape [n_samples,]
+    #         Index of the cluster each sample belongs to.
+    #     """
+    #     check_is_fitted(self, 'cluster_centers_')
+    #
+    #     X = self._check_test_data(X)
+    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
+
+
+class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
+    """K-MeanZ clustering
+    Read more in [my thesis]
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of centroids to generate.
+    corr : int, optional, default: 0
+        Method for correcting for bias, defaults to 0:
+
+        0: no correction
+        1: bad correction
+        4: good correction
+    init : @TODO{'k-means++', 'random' or an ndarray}
+        Method for initialization, defaults to 'k-means++':
+
+        'k-means++' : selects initial cluster centers for k-means
+        clustering in a smart way to speed up convergence. See section
+        Notes in k_init for more details.
+
+    Attributes
+    ----------
+    corrected_points_: array, [n_points, n_features]
+        Points representation in Z-space
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers in Z-space
+    labels_ :
+        Labels of each point
+    inertia_ : float
+        Sum of squared distances (in Z-space) of samples to their closest cluster center.
+
+    Examples
+    --------
+    TODO
+    >>> from sklearn.cluster import KMeans
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
+    >>> kmeans.labels_
+    array([0, 0, 0, 1, 1, 1], dtype=int32)
+    >>> kmeans.predict([[0, 0], [4, 4]])
+    array([0, 1], dtype=int32)
+    >>> kmeans.cluster_centers_
+    array([[1., 2.],
+           [4., 2.]])
+    """
+
+    def __init__(self, n_clusters=8, corr=0, init='notyet',
+                 verbose=0, random_state=None, copy_x=True,
+                 n_jobs=None, algorithm='auto'):
+
+        self.n_clusters = n_clusters
+        self.corr = corr
+        self.init = init
+        self.verbose = verbose
+        self.random_state = random_state
+        self.copy_x = copy_x
+        self.n_jobs = n_jobs
+        self.algorithm = algorithm
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute K-MeanZ clustering.
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Training instances to cluster. It must be noted that the data
+            will be converted to C ordering, which will cause a memory
+            copy if the given data is not C-contiguous.
+        y : Ignored
+            not used, present here for API consistency by convention.
+        sample_weight : array-like, shape (n_samples,), optional
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight (default: None)
+        """
+
+        # random_state = check_random_state(self.random_state)
+
+        self.corrected_points_, hc_ = \
+            kmeanz(
+                X, n_clusters=self.n_clusters, corr=self.corr)
+                # init=self.init, n_init=self.n_init,
+                # max_iter=self.max_iter, verbose=self.verbose,
+                # precompute_distances=self.precompute_distances,
+                # tol=self.tol, random_state=random_state, copy_x=self.copy_x,
+                # n_jobs=self.n_jobs, algorithm=self.algorithm,
+                # return_n_iter=True)
+        self.labels_ = hc_.labels_
+        return self
+
+    # def predict(self, X, sample_weight=None):
+    #     """Predict the closest cluster each sample in X belongs to.
+    #     In the vector quantization literature, `cluster_centers_` is called
+    #     the code book and each value returned by `predict` is the index of
+    #     the closest code in the code book.
+    #     Parameters
+    #     ----------
+    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+    #         New data to predict.
+    #     sample_weight : array-like, shape (n_samples,), optional
+    #         The weights for each observation in X. If None, all observations
+    #         are assigned equal weight (default: None)
+    #     Returns
+    #     -------
+    #     labels : array, shape [n_samples,]
+    #         Index of the cluster each sample belongs to.
+    #     """
+    #     check_is_fitted(self, 'cluster_centers_')
+    #
+    #     X = self._check_test_data(X)
+    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
+
diff --git a/pecok/gamma.py b/pecok/gamma.py
index 3076db0..6616fc9 100644
--- a/pecok/gamma.py
+++ b/pecok/gamma.py
@@ -89,7 +89,7 @@ def cross_diag(X):
     return np.diag(X.dot(X.T))
 
 
-def gamma_hat(X, int_corr):
+def gamma_hat(X, corr):
     ghat = {
         0: no_correction,
         1: gamma_hat2_robust,
@@ -97,5 +97,5 @@ def gamma_hat(X, int_corr):
         3: gamma_hat3,
         4: gamma_hat4,
         8: cross_diag,
-    }.get(int_corr, no_correction)(X)
+    }.get(corr, no_correction)(X)
     return np.diag(ghat)
diff --git a/tests/test.py b/tests/test.py
index 2755954..8a2808d 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -8,7 +8,7 @@
 # import pyper
 import numpy as np
 
-from pecok import pecok_clustering, kmeanz_clustering
+from pecok import KMeanz, Pecok
 from sklearn.cluster import KMeans
 
 
@@ -42,9 +42,11 @@ def hdclassif(obs, n_struct):
 
 methods = [
     [lambda X, K: KMeans(n_clusters=K, init='k-means++', n_init=100).fit(X).labels_, 'kmeans++'],
+    [lambda X, K: KMeanz(n_clusters=K, corr=2).fit(X).labels_, 'KMeanz'],
+    [lambda X, K: Pecok(n_clusters=K, corr=2).fit(X).labels_, 'Pecok'],
     # [hdclassif, 'HDDC'],
-    [pecok_clustering, 'pecok'],
-    [kmeanz_clustering, 'kmeanz']
+#    [pecok_clustering, 'pecok'],
+#    [kmeanz_clustering, 'kmeanz']
 ]
 
 print("\nVAR CLUSTERING\n\n")
@@ -66,8 +68,8 @@
 
 print("\nPOINT CLUSTERING\n\n")
 
-n_var = 100
 n_obs = 10
+n_var = 100
 truth = np.asmatrix(np.concatenate((np.repeat(0, n_obs//2), np.repeat(1, n_obs//2))))
 X = np.zeros((n_obs, n_var))
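This first patch replaces the functional entry points (`pecok_clustering`, `kmeanz_clustering`) with scikit-learn-style estimators whose `fit` stores a `labels_` attribute, as the updated tests show. A minimal sketch of the new calling convention, assuming the patched package is importable as `pecok` (the data below is illustrative, not taken from the test suite):

```python
import numpy as np
from pecok import KMeanz, Pecok  # class-based API introduced above

rng = np.random.RandomState(432)
# two groups of five 50-dimensional points with well-separated means
X = np.vstack((rng.normal(-1, .5, size=(5, 50)),
               rng.normal(1, .5, size=(5, 50))))

# fit() runs the corrected-Gram pipeline and stores the partition in labels_
print(Pecok(n_clusters=2, corr=4).fit(X).labels_)
print(KMeanz(n_clusters=2, corr=2).fit(X).labels_)
```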
From cf8810b1d417fdcd9719fff35f58416884e6bd68 Mon Sep 17 00:00:00 2001
From: Martin Royer
Date: Sun, 27 Jan 2019 16:21:53 +0100
Subject: [PATCH 2/9] some tests

---
 tests/demo.py                     |  14 ++++
 tests/illustration_comparaison.py | 124 ++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 tests/demo.py
 create mode 100644 tests/illustration_comparaison.py

diff --git a/tests/demo.py b/tests/demo.py
new file mode 100644
index 0000000..1ce540f
--- /dev/null
+++ b/tests/demo.py
@@ -0,0 +1,14 @@
+import numpy as np
+np.seed(232123123)
+n = 10
+p = 100
+X = np.zeros((n, p))
+snr = 0.3
+X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2,p))
+X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
+
+from sklearn.cluster import KMeans
+KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_
+from pecok import PeCoK
+PeCoK(n_clusters=2, corr=0).fit(X).labels_
+PeCoK(n_clusters=2, corr=4).fit(X).labels_
diff --git a/tests/illustration_comparaison.py b/tests/illustration_comparaison.py
new file mode 100644
index 0000000..04318c3
--- /dev/null
+++ b/tests/illustration_comparaison.py
@@ -0,0 +1,124 @@
+print(__doc__)
+
+import time
+import warnings
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn import cluster, datasets, mixture
+from sklearn.preprocessing import StandardScaler
+from itertools import cycle, islice
+
+from pecok import KMeanZ
+
+
+
+np.random.seed(0)
+
+random_state = 170
+n_samples = 500
+blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
+
+# Anisotropically distributed data
+# X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
+# transformation = [[0.6, -0.6], [-0.4, 0.8]]
+# X_aniso = np.dot(X, transformation)
+# aniso = (X_aniso, y)
+
+# blobs with varied variances
+varied = datasets.make_blobs(n_samples=n_samples, centers=np.matrix([[0,0],[1,0]]),
+                             cluster_std=[.1, .5],
+                             random_state=random_state)
+
+# ============
+# Set up cluster parameters
+# ============
+plt.figure(figsize=(9 * 2 + 3, 12.5))
+plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
+                    hspace=.01)
+
+plot_num = 1
+
+default_base = {'quantile': .3,
+                'eps': .3,
+                'damping': .9,
+                'preference': -200,
+                'n_neighbors': 10,
+                'n_clusters': 2}
+
+datasets = [
+    # (noisy_circles, {'damping': .77, 'preference': -240,
+    #                  'quantile': .2, 'n_clusters': 2}),
+    # (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
+    (varied, {'eps': .18, 'n_neighbors': 2})]
+
+for i_dataset, (dataset, algo_params) in enumerate(datasets):
+    # update parameters with dataset-specific values
+    params = default_base.copy()
+    params.update(algo_params)
+
+    X, y = dataset
+    X = StandardScaler().fit_transform(X)
+
+    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
+    spectral = cluster.SpectralClustering(
+        n_clusters=params['n_clusters'], eigen_solver='arpack',
+        affinity="nearest_neighbors")
+    kmeanz = KMeanZ(
+        n_clusters=params['n_clusters'], corr=2
+    )
+    gmm = mixture.GaussianMixture(
+        n_components=1, covariance_type='full')
+
+    clustering_algorithms = (
+        ('MiniBatchKMeans', two_means),
+        ('SpectralClustering', spectral),
+        ('KMeanZ', kmeanz)
+        # ('GaussianMixture', gmm)
+    )
+
+    for name, algorithm in clustering_algorithms:
+        t0 = time.time()
+
+        # catch warnings related to kneighbors_graph
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="the number of connected components of the " +
+                "connectivity matrix is [0-9]{1,2}" +
+                " > 1. Completing it to avoid stopping the tree early.",
+                category=UserWarning)
+            warnings.filterwarnings(
+                "ignore",
+                message="Graph is not fully connected, spectral embedding" +
+                " may not work as expected.",
+                category=UserWarning)
+            algorithm.fit(X)
+
+        t1 = time.time()
+        if hasattr(algorithm, 'labels_'):
+            y_pred = algorithm.labels_.astype(np.int)
+        else:
+            y_pred = algorithm.predict(X)
+
+        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
+        if i_dataset == 0:
+            plt.title(name, size=18)
+
+        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
+                                             '#f781bf', '#a65628', '#984ea3',
+                                             '#999999', '#e41a1c', '#dede00']),
+                                      int(max(y_pred) + 1))))
+        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
+
+        plt.xlim(-2.5, 2.5)
+        plt.ylim(-2.5, 2.5)
+        plt.xticks(())
+        plt.yticks(())
+        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
+                 transform=plt.gca().transAxes, size=15,
+                 horizontalalignment='right')
+        plot_num += 1
+
+plt.show()
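Note that `tests/demo.py` as committed here calls `np.seed` and imports `PeCoK`, neither of which exists (both are repaired in patch 5 below). The data-generation idea can still be checked independently of the package by scoring a labeling against the ground truth with the adjusted Rand index; a sketch using scikit-learn only:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

np.random.seed(232123123)
n, p, snr = 10, 100, 0.3
# same heteroscedastic two-group design as demo.py: one tight group, one noisy group
X = np.zeros((n, p))
X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2, p))
X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2, p))
truth = np.repeat([0, 1], n // 2)

pred = KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_
print(adjusted_rand_score(truth, pred))  # 1.0 would mean perfect recovery
```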
From fa18d214e9973669db5f3e6e026dd21bcc821c0e Mon Sep 17 00:00:00 2001
From: martinroyer-datashape
Date: Wed, 10 Jul 2019 12:39:56 +0200
Subject: [PATCH 3/9] quick refac

---
 pecok/admm.py       | 30 +++++++++++++++---------------
 pecok/clustering.py | 14 +++++++-------
 pecok/gamma.py      | 26 +++++++++++++-------------
 tests/test.py       | 21 +++------------------
 4 files changed, 38 insertions(+), 53 deletions(-)

diff --git a/pecok/admm.py b/pecok/admm.py
index 137dc71..85e5b57 100644
--- a/pecok/admm.py
+++ b/pecok/admm.py
@@ -4,46 +4,46 @@
 # License: MIT
 
 import numpy as np
-from scipy import linalg as la
+from scipy import linalg
 
 
-def operator_lstarllstarinv_sym(u, v):
+def _operator_lstarllstarinv_sym(u, v):
     """Operator \widetilde{L}^*_{sym} on (u,v) in R^{p+1} -> R^{p*p}"""
     temp = u.repeat(u.size).reshape((u.size, u.size))
     return (temp + temp.T)/2 + np.diag(np.repeat(v, u.size))
 
 
-def proj_lin_Hsymmetric(Y, n_struct):
+def _proj_lin_Hsymmetric(Y, n_struct):
     """Projection onto \Pi_{\mathcal{A}sym}(Y)"""
     n_samples,_ = Y.shape
     x = np.sum(Y, 1) - 1
     y = np.trace(Y) - n_struct
     invx = (x-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
     invy = (y-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
-    Y = Y - operator_lstarllstarinv_sym(invx, invy)
+    Y = Y - _operator_lstarllstarinv_sym(invx, invy)
     return Y
 
 
-def proj_positive(x, thresh=0):
+def _proj_positive(x, thresh=0):
     """Project onto component-positive matrix"""
     x[x < thresh] = 0
     return x
 
 
-def proj_Snp_imp(Y):
+def _proj_Snp_imp(Y):
     """Improved projection onto semi-definite positive matrix"""
     n_samples,_ = Y.shape
-    eig_vals = la.eigh(Y, eigvals_only=True)
+    eig_vals = linalg.eigh(Y, eigvals_only=True)
     n_val_neg = np.sum(eig_vals<0)
     if n_val_neg == 0:
         return Y
     if n_val_neg == n_samples:
         return np.zeros((n_samples,n_samples))
     if n_val_neg < n_samples-n_val_neg:
-        eig_vals, v = la.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
         Y = Y + v.dot(np.diag(eig_vals)).dot(v.T)
     else:
-        eig_vals, v = la.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
         Y = v.dot(np.diag(eig_vals)).dot(v.T)
     return Y
@@ -71,9 +71,9 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
         n_iter = n_iter + 1
         oldXbar = Xbar
 
-        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
-        Y = proj_positive(Xbar - V)
-        Z = proj_Snp_imp(Xbar - W)
+        X = _proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
+        Y = _proj_positive(Xbar - V)
+        Z = _proj_Snp_imp(Xbar - W)
         Xbar = (X + Y + Z)/3
 
         U = U + X - Xbar
@@ -82,7 +82,7 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
         res_dual = rho * np.linalg.norm(Xbar-oldXbar)
         res_primal = np.linalg.norm((X-Xbar, Z-Xbar, Y-Xbar))
-        if not (is_primal_high(eps_residual, res_primal, X, Y, Z) or is_dual_high(eps_residual, res_dual, Y, Z)):
+        if not (_is_primal_high(eps_residual, res_primal, X, Y, Z) or _is_dual_high(eps_residual, res_dual, Y, Z)):
             break
     if verbose:
         print("ADMM ends -- n_iter=%i, rho=%2.2f" % (n_iter, rho))
@@ -90,9 +90,9 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
     return Z
 
 
-def is_primal_high(eps_residual, res_primal, X, Y, Z):
+def _is_primal_high(eps_residual, res_primal, X, Y, Z):
     return res_primal > eps_residual * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))
 
 
-def is_dual_high(eps_residual, res_dual, Y, Z):
+def _is_dual_high(eps_residual, res_dual, Y, Z):
     return res_dual > eps_residual * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))
diff --git a/pecok/clustering.py b/pecok/clustering.py
index c60bd8b..b6ca2d7 100644
--- a/pecok/clustering.py
+++ b/pecok/clustering.py
@@ -1,6 +1,6 @@
 """PECOK clustering"""
 
-# author: Martin Royer
+# author: Martin Royer
 # License: MIT
 
 import numpy as np
@@ -21,14 +21,14 @@ def _corrected_relational(obs, corr):
     return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
 
-def kmeanz(X, n_clusters, corr):
+def _kmeanz(X, n_clusters, corr):
     gram_corrected = _corrected_relational(X, corr=corr)
     U, s, _ = lin_svd(gram_corrected, compute_uv=True)
     approx = U.dot(np.diag(np.sqrt(s)))
     return approx, AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(approx)
 
 
-def pecok_clustering(obs, n_clusters, corr=4, **kwargs):
+def _pecok_clustering(obs, n_clusters, corr=4, **kwargs):
     gram_corrected = _corrected_relational(obs, corr=corr)
     U, _, V = spa_svd(gram_corrected, k=n_clusters)
     Bhat = pecok_admm(gram_corrected, n_clusters=n_clusters, mat_init=U.dot(V), **kwargs)
@@ -37,7 +37,7 @@
 class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
     """PeCoK clustering
-    Read more in [my thesis]
+    Read more in [my thesis: http://www.theses.fr/2018SACLS442]
     Parameters
     ----------
     n_clusters : int, optional, default: 8
@@ -116,7 +116,7 @@ def fit(self, X, y=None, sample_weight=None):
 
         hc_ = \
-            pecok_clustering(
+            _pecok_clustering(
                 X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
                 # init=self.init, n_init=self.n_init,
                 # max_iter=self.max_iter, verbose=self.verbose,
@@ -152,7 +152,7 @@
 class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
     """K-MeanZ clustering
-    Read more in [my thesis]
+    Read more in [my thesis: http://www.theses.fr/2018SACLS442]
     Parameters
     ----------
     n_clusters : int, optional, default: 8
@@ -231,7 +231,7 @@ def fit(self, X, y=None, sample_weight=None):
 
         self.corrected_points_, hc_ = \
-            kmeanz(
+            _kmeanz(
                 X, n_clusters=self.n_clusters, corr=self.corr)
                 # init=self.init, n_init=self.n_init,
diff --git a/pecok/gamma.py b/pecok/gamma.py
index 6616fc9..fe9f925 100644
--- a/pecok/gamma.py
+++ b/pecok/gamma.py
@@ -8,7 +8,7 @@
 
 
 
-def gamma_hat2(X):
+def _gamma_hat2(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -21,7 +21,7 @@ def gamma_hat2(X):
     return gamma
 
 
-def gamma_hat2_robust(X):
+def _gamma_hat2_robust(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -34,7 +34,7 @@ def gamma_hat2_robust(X):
     return gamma
 
 
-def gamma_hat3(X):
+def _gamma_hat3(X):
     """Gamma_hat3 estimator from PECOK supplement, in O(n_samples^3 * n_features)
 
     Parameters
@@ -53,7 +53,7 @@ def gamma_hat3(X):
     return gamma
 
 
-def gamma_hat4(X):
+def _gamma_hat4(X):
     """Gamma_hat4 estimator from PECOK, in O(n_samples^4 * n_features)
 
     Parameters
@@ -81,21 +81,21 @@ def gamma_hat4(X):
     return np.asarray(gamma)
 
 
-def no_correction(X):
+def _no_correction(X):
     return np.zeros(X.shape[0])
 
 
-def cross_diag(X):
+def _cross_diag(X):
     return np.diag(X.dot(X.T))
 
 
 def gamma_hat(X, corr):
     ghat = {
-        0: no_correction,
-        1: gamma_hat2_robust,
-        2: gamma_hat2,
-        3: gamma_hat3,
-        4: gamma_hat4,
-        8: cross_diag,
-    }.get(corr, no_correction)(X)
+        0: _no_correction,
+        1: _gamma_hat2_robust,
+        2: _gamma_hat2,
+        3: _gamma_hat3,
+        4: _gamma_hat4,
+        8: _cross_diag,
+    }.get(corr, _no_correction)(X)
     return np.diag(ghat)
diff --git a/tests/test.py b/tests/test.py
index 8a2808d..6d39ca1 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -6,13 +6,12 @@
 from functools import wraps
 import timeit
 
-# import pyper
 import numpy as np
 
 from pecok import KMeanz, Pecok
 from sklearn.cluster import KMeans
 
 
-def timethis(f):
+def _timethis(f):
     @wraps(f)
     def wrap(*args, **kw):
         ts = timeit.default_timer()
@@ -22,20 +21,6 @@ def wrap(*args, **kw):
     return wrap
 
 
-def hdclassif(obs, n_struct):
-    n,_ = obs.shape
-    myR = pyper.R()
-    myR.run('library(HDclassif)')
-    myR.assign('obs', obs)
-    myR.assign('n_struct', n_struct)
-    try:
-        myR.run('res <- hddc(obs, K=n_struct, model=c(1,2,7,9))')
-        result = np.array(myR['res$class'])
-    except:
-        print("fail hdclassif")
-        result = np.zeros(n)
-    return result
-
 seed = 432
 np.random.seed(seed)
 print("seed is %i" % seed)
@@ -47,7 +32,7 @@ print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(mat_data.T, 2)
+    job_result, job_time = _timethis(method)(mat_data.T, 2)
     print(method_name.ljust(15), job_result)
     print("job_time: %.2f (s)".ljust(15) % job_time)
@@ -66,7 +51,7 @@ print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(X, 2)
+    job_result, job_time = _timethis(method)(X, 2)
     print(method_name.ljust(15), job_result)
     print("job_time: %.2f (s)".ljust(15) % job_time)
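After this refactor the public surface of `pecok.gamma` is the single dispatcher `gamma_hat`; the underscore-prefixed estimators are implementation details. A sketch of the dispatch behavior, assuming the module stays importable at `pecok.gamma` (it is imported that way by `clustering.py` above):

```python
import numpy as np
from pecok.gamma import gamma_hat

rng = np.random.RandomState(0)
X = rng.normal(size=(6, 40))  # 6 samples, 40 features

# corr selects the bias-correction estimator; unknown keys fall back to no correction
for corr in (0, 2, 4, 99):
    Gamma = gamma_hat(X, corr=corr)
    off_diag = np.count_nonzero(Gamma - np.diag(np.diag(Gamma)))
    print(corr, Gamma.shape, off_diag)  # always a diagonal (6, 6) matrix, so 0
```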
From c721e278f6b6c9e24455ecf5e9ab7f1558390ada Mon Sep 17 00:00:00 2001
From: martinroyer-datashape
Date: Wed, 10 Jul 2019 14:44:10 +0200
Subject: [PATCH 4/9] minimal, basic .fit(X) sklearn framework

---
 pecok/clustering.py | 106 +++++++-------------------------------------
 tests/test.py       |  39 +++++++---------
 2 files changed, 31 insertions(+), 114 deletions(-)

diff --git a/pecok/clustering.py b/pecok/clustering.py
index b6ca2d7..d14d036 100644
--- a/pecok/clustering.py
+++ b/pecok/clustering.py
@@ -14,9 +14,6 @@ from .admm import pecok_admm
 
 
-###############################################################################
-# shared correction
-
 def _corrected_relational(obs, corr):
     return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
@@ -59,30 +56,18 @@ class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
 
     Attributes
     ----------
-    corrected_points_: array, [n_points, n_features]
-        Points representation in Z-space
-    cluster_centers_ : array, [n_clusters, n_features]
-        Coordinates of cluster centers in Z-space
     labels_ :
         Labels of each point
-    inertia_ : float
-        Sum of squared distances (in Z-space) of samples to their closest cluster center.
 
     Examples
     --------
-    TODO
-    >>> from sklearn.cluster import KMeans
-    >>> import numpy as np
-    >>> X = np.array([[1, 2], [1, 4], [1, 0],
-    ...               [4, 2], [4, 4], [4, 0]])
-    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
-    >>> kmeans.labels_
-    array([0, 0, 0, 1, 1, 1], dtype=int32)
-    >>> kmeans.predict([[0, 0], [4, 4]])
-    array([0, 1], dtype=int32)
-    >>> kmeans.cluster_centers_
-    array([[1., 2.],
-           [4., 2.]])
+    >>> from pecok import Pecok
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> pecok = Pecok(n_clusters=2).fit(X)
+    >>> pecok.labels_
+    array([1, 1, 1, 0, 0, 0], dtype=int32)
     """
 
     def __init__(self, n_clusters=8, corr=0, init='notyet',
@@ -98,8 +83,6 @@ def fit(self, X, y=None, sample_weight=None):
             are assigned equal weight (default: None)
         """
 
-        # random_state = check_random_state(self.random_state)
-
         hc_ = \
             _pecok_clustering(
                 X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
@@ -127,28 +110,6 @@ def fit(self, X, y=None, sample_weight=None):
         self.labels_ = hc_.labels_
         return self
 
-    # def predict(self, X, sample_weight=None):
-    #     """Predict the closest cluster each sample in X belongs to.
-    #     In the vector quantization literature, `cluster_centers_` is called
-    #     the code book and each value returned by `predict` is the index of
-    #     the closest code in the code book.
-    #     Parameters
-    #     ----------
-    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
-    #         New data to predict.
-    #     sample_weight : array-like, shape (n_samples,), optional
-    #         The weights for each observation in X. If None, all observations
-    #         are assigned equal weight (default: None)
-    #     Returns
-    #     -------
-    #     labels : array, shape [n_samples,]
-    #         Index of the cluster each sample belongs to.
-    #     """
-    #     check_is_fitted(self, 'cluster_centers_')
-    #
-    #     X = self._check_test_data(X)
-    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
-
 
 class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
     """K-MeanZ clustering
@@ -176,28 +137,18 @@ class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
 
     Attributes
     ----------
     corrected_points_: array, [n_points, n_features]
         Points representation in Z-space
-    cluster_centers_ : array, [n_clusters, n_features]
-        Coordinates of cluster centers in Z-space
     labels_ :
         Labels of each point
-    inertia_ : float
-        Sum of squared distances (in Z-space) of samples to their closest cluster center.
 
     Examples
     --------
-    TODO
-    >>> from sklearn.cluster import KMeans
-    >>> import numpy as np
-    >>> X = np.array([[1, 2], [1, 4], [1, 0],
-    ...               [4, 2], [4, 4], [4, 0]])
-    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
-    >>> kmeans.labels_
-    array([0, 0, 0, 1, 1, 1], dtype=int32)
-    >>> kmeans.predict([[0, 0], [4, 4]])
-    array([0, 1], dtype=int32)
-    >>> kmeans.cluster_centers_
-    array([[1., 2.],
-           [4., 2.]])
+    >>> from pecok import KMeanz
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> kmeanz = KMeanz(n_clusters=2).fit(X)
+    >>> kmeanz.labels_
+    array([1, 1, 1, 0, 0, 0], dtype=int32)
     """
 
     def __init__(self, n_clusters=8, corr=0, init='notyet',
@@ -215,8 +166,6 @@ def fit(self, X, y=None, sample_weight=None):
             are assigned equal weight (default: None)
         """
 
-        # random_state = check_random_state(self.random_state)
-
         self.corrected_points_, hc_ = \
             _kmeanz(
                 X, n_clusters=self.n_clusters, corr=self.corr)
@@ -244,26 +190,3 @@ def fit(self, X, y=None, sample_weight=None):
         self.labels_ = hc_.labels_
         return self
 
-    # def predict(self, X, sample_weight=None):
-    #     """Predict the closest cluster each sample in X belongs to.
-    #     In the vector quantization literature, `cluster_centers_` is called
-    #     the code book and each value returned by `predict` is the index of
-    #     the closest code in the code book.
-    #     Parameters
-    #     ----------
-    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
-    #         New data to predict.
-    #     sample_weight : array-like, shape (n_samples,), optional
-    #         The weights for each observation in X. If None, all observations
-    #         are assigned equal weight (default: None)
-    #     Returns
-    #     -------
-    #     labels : array, shape [n_samples,]
-    #         Index of the cluster each sample belongs to.
-    #     """
-    #     check_is_fitted(self, 'cluster_centers_')
-    #
-    #     X = self._check_test_data(X)
-    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
-
diff --git a/tests/test.py b/tests/test.py
index 6d39ca1..2afe9ad 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -3,22 +3,12 @@
 # author: Martin Royer
 # License: MIT
 
-from functools import wraps
+from functools import partial
 import timeit
 
 import numpy as np
 
 from pecok import KMeanz, Pecok
-from sklearn.cluster import KMeans
-
-
-def _timethis(f):
-    @wraps(f)
-    def wrap(*args, **kw):
-        ts = timeit.default_timer()
-        result = f(*args, **kw)
-        te = timeit.default_timer()
-        return result, te-ts
-    return wrap
+from sklearn.cluster import KMeans, AgglomerativeClustering
 
 
 seed = 432
@@ -26,12 +16,10 @@ np.random.seed(seed)
 print("seed is %i" % seed)
 
 methods = [
-    [lambda X, K: KMeans(n_clusters=K, init='k-means++', n_init=100).fit(X).labels_, 'kmeans++'],
-    [lambda X, K: KMeanz(n_clusters=K, corr=2).fit(X).labels_, 'KMeanz'],
-    [lambda X, K: Pecok(n_clusters=K, corr=2).fit(X).labels_, 'Pecok'],
-    # [hdclassif, 'HDDC'],
-#    [pecok_clustering, 'pecok'],
-#    [kmeanz_clustering, 'kmeanz']
+    [partial(KMeans, init="k-means++", n_init=100), "k-means++"],
+    [partial(AgglomerativeClustering, linkage='ward'), "Hierarchical"],
+    [partial(KMeanz, corr=2), "KMeanz"],
+    [partial(Pecok, corr=2), "Pecok"],
 ]
 
 print("\nVAR CLUSTERING\n\n")
@@ -39,9 +27,9 @@
 n_obs = 100
 
 truth = np.asmatrix(np.concatenate((np.repeat(0, n_var//2), np.repeat(1, n_var//2))))
-membership = truth.T.dot(np.matrix([1, 0])) + (1-truth).T.dot(np.matrix([0, 1]))
+membership = truth.T.dot(np.array([1, 0])[np.newaxis,:]) + (1-truth).T.dot(np.array([0, 1])[np.newaxis,:])
 stds = np.ones(n_var)
 stds[:(n_var//2)] = 0.1
 sigma = membership.dot(0.1*np.identity(2)).dot(membership.T) + np.diag(stds)
@@ -47,9 +35,11 @@
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = _timethis(method)(mat_data.T, 2)
+    ts = timeit.default_timer()
+    job_result = method(n_clusters=2).fit(mat_data.T).labels_
+    te = timeit.default_timer()
     print(method_name.ljust(15), job_result)
-    print("job_time: %.2f (s)".ljust(15) % job_time)
+    print("job_time: %.2f (s)".ljust(15) % (te-ts))
@@ -66,8 +54,9 @@
 
 print("\nPOINT CLUSTERING\n\n")
 
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = _timethis(method)(X, 2)
+    ts = timeit.default_timer()
+    job_result = method(n_clusters=2).fit(X).labels_
+    te = timeit.default_timer()
     print(method_name.ljust(15), job_result)
-    print("job_time: %.2f (s)".ljust(15) % job_time)
-# print("pecok:".ljust(15), pecok_clustering(X, n_struct=2, rho=100, n_iter_max=3000, verbose=True).labels_)
+    print("job_time: %.2f (s)".ljust(15) % (te-ts))
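With a minimal `fit` in place and `ClusterMixin` among the bases, the estimators also gain `fit_predict` for free, and `KMeanz` exposes its Z-space embedding through `corrected_points_`. A sketch on illustrative data (the embedding is square because it lives in sample space, one row per observation):

```python
import numpy as np
from pecok import KMeanz

rng = np.random.RandomState(1)
# ten 30-dimensional points in two shifted groups
X = np.repeat([[0.], [2.]], 5, axis=0) + rng.normal(scale=.1, size=(10, 30))

est = KMeanz(n_clusters=2, corr=4)
labels = est.fit_predict(X)          # ClusterMixin: calls fit(X), returns labels_
print(labels)
print(est.corrected_points_.shape)   # (10, 10)
```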
From 947b862fe6542297522e3709bc8dedb7b18b7dd3 Mon Sep 17 00:00:00 2001
From: martinroyer-datashape
Date: Wed, 10 Jul 2019 14:53:25 +0200
Subject: [PATCH 5/9] fix test & demos

---
 tests/demo.py                     | 11 ++++++-----
 tests/illustration_comparaison.py | 13 +++++--------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/tests/demo.py b/tests/demo.py
index 1ce540f..8ccdc52 100644
--- a/tests/demo.py
+++ b/tests/demo.py
@@ -1,5 +1,6 @@
 import numpy as np
-np.seed(232123123)
+
+np.random.seed(232123123)
 n = 10
 p = 100
 X = np.zeros((n, p))
@@ -8,7 +9,7 @@
 X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
 
 from sklearn.cluster import KMeans
-KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_
-from pecok import PeCoK
-PeCoK(n_clusters=2, corr=0).fit(X).labels_
-PeCoK(n_clusters=2, corr=4).fit(X).labels_
+print(KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_)
+from pecok import Pecok
+print(Pecok(n_clusters=2, corr=0).fit(X).labels_)
+print(Pecok(n_clusters=2, corr=4).fit(X).labels_)
diff --git a/tests/illustration_comparaison.py b/tests/illustration_comparaison.py
index 04318c3..57f4a0a 100644
--- a/tests/illustration_comparaison.py
+++ b/tests/illustration_comparaison.py
@@ -1,5 +1,3 @@
-print(__doc__)
-
 import time
 import warnings
 
@@ -10,8 +8,7 @@
 from sklearn.preprocessing import StandardScaler
 from itertools import cycle, islice
 
-from pecok import KMeanZ
-
+from pecok import KMeanz
 
 np.random.seed(0)
 
@@ -27,8 +24,8 @@
 # aniso = (X_aniso, y)
 
 # blobs with varied variances
-varied = datasets.make_blobs(n_samples=n_samples, centers=np.matrix([[0,0],[1,0]]),
-                             cluster_std=[.1, .5],
+varied = datasets.make_blobs(n_samples=n_samples, centers=np.array([[0,0],[1,0]]),
+                             cluster_std=[.05, .5],
                              random_state=random_state)
 
 # ============
@@ -65,7 +62,7 @@
     spectral = cluster.SpectralClustering(
         n_clusters=params['n_clusters'], eigen_solver='arpack',
         affinity="nearest_neighbors")
-    kmeanz = KMeanZ(
+    kmeanz = KMeanz(
         n_clusters=params['n_clusters'], corr=2
     )
    gmm = mixture.GaussianMixture(
@@ -74,7 +71,7 @@
     clustering_algorithms = (
         ('MiniBatchKMeans', two_means),
         ('SpectralClustering', spectral),
-        ('KMeanZ', kmeanz)
+        ('KMeanz', kmeanz)
         # ('GaussianMixture', gmm)
     )
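With the class name repaired, the comparison script's varied-variance setting can also be exercised without matplotlib; a sketch with the same generator settings, a smaller sample to keep the n-by-n SVD cheap, and a label-swap-invariant agreement score:

```python
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from pecok import KMeanz

X, y = datasets.make_blobs(n_samples=100, centers=np.array([[0, 0], [1, 0]]),
                           cluster_std=[.05, .5], random_state=170)
X = StandardScaler().fit_transform(X)

labels = KMeanz(n_clusters=2, corr=2).fit(X).labels_
print(max(np.mean(labels == y), np.mean(labels != y)))  # agreement up to label swap
```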
From 0051724fe2c02f18cadbdfb355683ad9f5e47d91 Mon Sep 17 00:00:00 2001
From: martinroyer <16647869+martinroyer@users.noreply.github.com>
Date: Wed, 10 Jul 2019 15:05:00 +0200
Subject: [PATCH 6/9] minimal examples in readme

---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 1b634d2..d757c86 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,24 @@ You can find it on PyPI and install it with:
 pip install pecok
 ```
 
+# Minimal example
+
+```python
+
+import numpy as np
+np.random.seed(232123123)
+n = 10
+p = 100
+X = np.zeros((n, p))
+snr = 0.3
+X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2,p))
+X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
+
+from pecok import Pecok, KMeanz
+Pecok(n_clusters=2, corr=4).fit(X).labels_
+KMeans(n_clusters=2, corr=2).fit(X).labels_
+```
+
 
 ### Clustering algorithms

From 53b268ca69b86dcd4a512c1c3cd37cd549c35207 Mon Sep 17 00:00:00 2001
From: martinroyer <16647869+martinroyer@users.noreply.github.com>
Date: Wed, 10 Jul 2019 15:11:59 +0200
Subject: [PATCH 7/9] improved readme

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d757c86..9c64cef 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 
 # Description
 
-pecok is a python package containing a few clustering algorithms (unsupervised learning) for a given number of clusters. It is based on the PECOK paper (https://arxiv.org/abs/1606.05100)
+pecok is a python package containing a few clustering algorithms (unsupervised learning) for a given number of clusters. It is based on the PECOK paper https://arxiv.org/abs/1606.05100 for variable clustering that later got extended here: https://arxiv.org/abs/1508.01939 (to appear in Annals of Statistics). See also https://papers.nips.cc/paper/6776-adaptive-clustering-through-semidefinite-programming.
 
 # Installing pecok
 
@@ -35,15 +35,15 @@ X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
 
 from pecok import Pecok, KMeanz
 Pecok(n_clusters=2, corr=4).fit(X).labels_
-KMeans(n_clusters=2, corr=2).fit(X).labels_
+KMeanz(n_clusters=2, corr=2).fit(X).labels_
 ```
 
 
-### Clustering algorithms
+## Clustering algorithms
 
-Currently available algorithms:
+Currently available algorithms (with sklearn object framework implementing `fit` routine) are:
 
- * **pecok_clustering**: the main clustering algorithm described in [Bunea, Giraud, Royer, Verzelen ('17)]
+ * **Pecok**: the main clustering algorithm described in [Bunea, Giraud, Royer, Verzelen ('17)]
 
   Parameters:
@@ -56,7 +56,7 @@ Currently available algorithms:
  |**n_iter_max** = -1| if positive, sets the stop condition for maximum number of iteration of the ADMM algorithm used to approximate the solution of the SDP.|
  |**verbose** = False| yields print for time and residuals value at ADMM stop time.|
 
- * **kmeanz_clustering**: efficient variant for main clustering algorithm, see my PhD thesis
+ * **KMeanz**: efficient variant for main clustering algorithm, introduced in my PhD thesis: http://www.theses.fr/2018SACLS442
 
   Parameters:

From f53e16f6d5ed2d53228da8d00d877aa894dbeb58 Mon Sep 17 00:00:00 2001
From: martinroyer
Date: Wed, 10 Jul 2019 15:44:29 +0200
Subject: [PATCH 8/9] version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c984f0b..5c86af3 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
     install_requires=[
         'numpy', 'scipy', 'scikit-learn'
     ],
-    version = '1.0.2',
+    version = '1.0.3',
     description = 'Implementation of Pecok',
     author='Martin Royer',
     author_email='martinpierreroyer@gmail.com',
From ba83845b8d566fd98fb9a5dc2bfc088aaeae54f3 Mon Sep 17 00:00:00 2001
From: martinroyer <16647869+martinroyer@users.noreply.github.com>
Date: Mon, 29 Jul 2019 08:55:55 +0200
Subject: [PATCH 9/9] Update README.md

update readme
---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 9c64cef..0ebc8ba 100644
--- a/README.md
+++ b/README.md
@@ -49,9 +49,9 @@ Currently available algorithms (with sklearn object framework implementing `fit`
 
  | **name** | **description** |
  | --- | --- |
- |**obs** | data matrix (n \times p), clustering is applied to the lines of **obs**.|
- |**n_struct** | number of structures to separate the data into.|
- |**int_corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxy for the correction, we only recommend 2 and 3. |
+ |**X** | data matrix (n \times p), clustering is applied to the lines of **X**.|
+ |**n_clusters** | number of clusters to separate the data into.|
+ |**corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxies for the correction; we only recommend 2 and 3. |
  |**rho** = 5| bias-variance tradeoff parameter in ADMM.|
  |**n_iter_max** = -1| if positive, sets the stop condition for maximum number of iteration of the ADMM algorithm used to approximate the solution of the SDP.|
  |**verbose** = False| yields print for time and residuals value at ADMM stop time.|
@@ -62,7 +62,7 @@ Currently available algorithms (with sklearn object framework implementing `fit`
 
  | **name** | **description** |
  | --- | --- |
- |**obs** | data matrix (n \times p), clustering is applied to the lines of **obs**.|
- |**n_struct** | number of structures to separate the data into.|
- |**int_corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxy for the correction, we only recommend 2 and 3. |
+ |**X** | data matrix (n \times p), clustering is applied to the lines of **X**.|
+ |**n_clusters** | number of clusters to separate the data into.|
+ |**corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxies for the correction; we only recommend 2 and 3. |
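One caveat on the tables above: the `rho`, `n_iter_max` and `verbose` rows describe knobs of the underlying ADMM solver (`pecok_admm`), but after patch 4 the public `Pecok` class forwards only `n_clusters`, `corr` and `verbose`. Tuning the other two currently means calling the private helper, sketched here under the assumption that using this underscore-prefixed, unstable interface is acceptable:

```python
import numpy as np
from pecok.clustering import _pecok_clustering  # private helper, may change

rng = np.random.RandomState(7)
X = rng.normal(size=(8, 50))
X[:4] += 1.5  # shift half the rows to create two clusters

# corr is consumed here; rho and n_iter_max reach pecok_admm through **kwargs
hc = _pecok_clustering(X, n_clusters=2, corr=2, rho=5, n_iter_max=1000, verbose=True)
print(hc.labels_)
```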