diff --git a/README.md b/README.md
index 1b634d2..0ebc8ba 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 # Description
 
-pecok is a python package containing a few clustering algorithms (unsupervised learning) for a given number of clusters. It is based on the PECOK paper (https://arxiv.org/abs/1606.05100)
+pecok is a Python package containing a few clustering algorithms (unsupervised learning) for a given number of clusters. It is based on the PECOK paper for variable clustering (https://arxiv.org/abs/1606.05100), later extended in https://arxiv.org/abs/1508.01939 (to appear in the Annals of Statistics). See also https://papers.nips.cc/paper/6776-adaptive-clustering-through-semidefinite-programming.
 
 # Installing pecok
 
@@ -20,31 +20,49 @@ You can find it on PyPI and install it with:
 pip install pecok
 ```
 
+# Minimal example
+
+```python
+import numpy as np
+np.random.seed(232123123)
+n = 10
+p = 100
+X = np.zeros((n, p))
+snr = 0.3
+X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2, p))
+X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2, p))
+
+from pecok import Pecok, KMeanz
+Pecok(n_clusters=2, corr=4).fit(X).labels_
+KMeanz(n_clusters=2, corr=2).fit(X).labels_
+```
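+
+The construction above puts the first half of the rows in one cluster and the second half in the other, so the labels can be checked against that ground truth with scikit-learn (already a pecok dependency). A quick sketch:
+
+```python
+from sklearn.metrics import adjusted_rand_score
+truth = [0] * (n//2) + [1] * (n//2)
+adjusted_rand_score(truth, Pecok(n_clusters=2, corr=4).fit(X).labels_)  # 1.0 means perfect recovery
+```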
+
-### Clustering algorithms
+## Clustering algorithms
 
-Currently available algorithms are:
+Currently available algorithms (scikit-learn-style estimators implementing the `fit` method) are:
 
- * **pecok_clustering**: the main clustering algorithm described in [Bunea, Giraud, Royer, Verzelen ('17)]
+ * **Pecok**: the main clustering algorithm described in [Bunea, Giraud, Royer, Verzelen ('17)]
 
 Parameters:
 
 | **name** | **description** |
 | --- | --- |
- |**obs** | data matrix (n \times p), clustering is applied to the lines of **obs**.|
- |**n_struct** | number of structures to separate the data into.|
- |**int_corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxy for the correction, we only recommend 2 and 3. |
+ |**X** | data matrix (n \times p); clustering is applied to the rows of **X**.|
+ |**n_clusters** | number of clusters to separate the data into.|
+ |**corr** = 4| correction to use, between 0 and 4. 0 means no correction; 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]; 1, 2 and 3 are more efficient proxies for the correction, of which we recommend only 2 and 3.|
 |**rho** = 5| bias-variance tradeoff parameter in ADMM.|
 |**n_iter_max** = -1| if positive, sets the stop condition for maximum number of iteration of the ADMM algorithm used to approximate the solution of the SDP.|
 |**verbose** = False| yields print for time and residuals value at ADMM stop time.|
 
- * **kmeanz_clustering**: efficient variant for main clustering algorithm, see my PhD thesis
+ * **KMeanz**: efficient variant of the main clustering algorithm, introduced in my PhD thesis: http://www.theses.fr/2018SACLS442
 
 Parameters:
 
 | **name** | **description** |
 | --- | --- |
- |**obs** | data matrix (n \times p), clustering is applied to the lines of **obs**.|
- |**n_struct** | number of structures to separate the data into.|
- |**int_corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxy for the correction, we only recommend 2 and 3. |
+ |**X** | data matrix (n \times p); clustering is applied to the rows of **X**.|
+ |**n_clusters** | number of clusters to separate the data into.|
+ |**corr** = 4| correction to use, between 0 and 4. 0 means no correction; 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]; 1, 2 and 3 are more efficient proxies for the correction, of which we recommend only 2 and 3.|
diff --git a/pecok/__init__.py b/pecok/__init__.py
index 214df5c..2e2b868 100644
--- a/pecok/__init__.py
+++ b/pecok/__init__.py
@@ -1,2 +1,2 @@
-from .clustering import pecok_clustering, kmeanz_clustering
+from .clustering import KMeanz, Pecok
diff --git a/pecok/admm.py b/pecok/admm.py
index 491b551..85e5b57 100644
--- a/pecok/admm.py
+++ b/pecok/admm.py
@@ -4,51 +4,51 @@
 # License: MIT
 
 import numpy as np
-from scipy import linalg as la
+from scipy import linalg
 
 
-def operator_lstarllstarinv_sym(u, v):
+def _operator_lstarllstarinv_sym(u, v):
-    """Operator \widetildetilde{L}^*_{sym} on (u,v) in R^{p+1} -> R^{p*p}"""
+    """Operator \widetilde{L}^*_{sym} on (u,v) in R^{p+1} -> R^{p*p}"""
     temp = u.repeat(u.size).reshape((u.size, u.size))
     return (temp + temp.T)/2 + np.diag(np.repeat(v, u.size))
 
 
-def proj_lin_Hsymmetric(Y, n_struct):
+def _proj_lin_Hsymmetric(Y, n_struct):
     """Projection onto \Pi_{\mathcal{A}sym}(Y)"""
     n_samples,_ = Y.shape
     x = np.sum(Y, 1) - 1
     y = np.trace(Y) - n_struct
     invx = (x-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
     invy = (y-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
-    Y = Y - operator_lstarllstarinv_sym(invx, invy)
+    Y = Y - _operator_lstarllstarinv_sym(invx, invy)
     return Y
 
 
-def proj_positive(x, thresh=0):
+def _proj_positive(x, thresh=0):
     """Project onto component-positive matrix"""
     x[x < thresh] = 0
     return x
 
 
-def proj_Snp_imp(Y):
+def _proj_Snp_imp(Y):
     """Improved projection onto semi-definite positive matrix"""
     n_samples,_ = Y.shape
-    eig_vals = la.eigh(Y, eigvals_only=True)
+    eig_vals = linalg.eigh(Y, eigvals_only=True)
     n_val_neg = np.sum(eig_vals<0)
     if n_val_neg == 0:
         return Y
     if n_val_neg == n_samples:
         return np.zeros((n_samples,n_samples))
     if n_val_neg < n_samples-n_val_neg:
-        eig_vals, v = la.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
         Y = Y + v.dot(np.diag(eig_vals)).dot(v.T)
     else:
-        eig_vals, v = la.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
         Y = v.dot(np.diag(eig_vals)).dot(v.T)
     return Y
 
 
-def pecok_admm(relational_data, n_struct, n_iter_max=-1, rho=5, mat_init=None, verbose=False, eps_residual=1e-4):
+def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, verbose=False, eps_residual=1e-4):
     """Implementation of Alternating Direction Method of Multipliers
 
     Parameters
@@ -71,9 +71,9 @@ def pecok_admm(relational_data, n_struct, n_iter_max=-1, rho=5, mat_init=None, v
         n_iter = n_iter + 1
         oldXbar = Xbar
 
-        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_struct)
-        Y = proj_positive(Xbar - V)
-        Z = proj_Snp_imp(Xbar - W)
+        X = _proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
+        Y = _proj_positive(Xbar - V)
+        Z = _proj_Snp_imp(Xbar - W)
         Xbar = (X + Y + Z)/3
 
         U = U + X - Xbar
@@ -82,7 +82,7 @@ def pecok_admm(relational_data, n_struct, n_iter_max=-1, rho=5, mat_init=None, v
         res_dual = rho * np.linalg.norm(Xbar-oldXbar)
         res_primal = np.linalg.norm((X-Xbar, Z-Xbar, Y-Xbar))
 
-        if not (is_primal_high(eps_residual, res_primal, X, Y, Z) or is_dual_high(eps_residual, res_dual, Y, Z)):
+        if not (_is_primal_high(eps_residual, res_primal, X, Y, Z) or _is_dual_high(eps_residual, res_dual, Y, Z)):
             break
     if verbose:
         print("ADMM ends -- n_iter=%i, rho=%2.2f" % (n_iter, rho))
@@ -90,9 +90,9 @@ def pecok_admm(relational_data, n_struct, n_iter_max=-1, rho=5, mat_init=None, v
     return Z
 
 
-def is_primal_high(eps_residual, res_primal, X, Y, Z):
+def _is_primal_high(eps_residual, res_primal, X, Y, Z):
     return res_primal > eps_residual * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))
 
 
-def is_dual_high(eps_residual, res_dual, Y, Z):
+def _is_dual_high(eps_residual, res_dual, Y, Z):
     return res_dual > eps_residual * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))
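diff --git a/tests/demo_admm.py b/tests/demo_admm.py
new file mode 100644
--- /dev/null
+++ b/tests/demo_admm.py
@@ -0,0 +1,22 @@
+"""Sketch of calling pecok_admm directly (illustration only, not a unit test).
+
+This mirrors what clustering._pecok_clustering does: build the bias-corrected
+Gram matrix, warm-start the ADMM with a truncated SVD, then cluster the
+relaxed partnership matrix Bhat with Ward linkage.
+"""
+import numpy as np
+from scipy.sparse.linalg import svds
+from sklearn.cluster import AgglomerativeClustering
+
+from pecok.admm import pecok_admm
+from pecok.gamma import gamma_hat
+
+np.random.seed(432)
+n, p = 10, 100
+X = np.vstack((np.random.normal(loc=.3, scale=1, size=(n//2, p)),
+               np.random.normal(loc=-.3, scale=.1, size=(n//2, p))))
+
+gram = (X.dot(X.T) - gamma_hat(X, corr=2)) / p  # bias-corrected Gram matrix
+U, _, V = svds(gram, k=2)                       # warm start, as in _pecok_clustering
+Bhat = pecok_admm(gram, n_clusters=2, mat_init=U.dot(V))
+print(AgglomerativeClustering(linkage='ward', n_clusters=2).fit(Bhat).labels_)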
diff --git a/pecok/clustering.py b/pecok/clustering.py
index 8f1e884..d14d036 100644
--- a/pecok/clustering.py
+++ b/pecok/clustering.py
@@ -1,30 +1,192 @@
-"""Clustering wrapper"""
+"""PECOK clustering"""
 
 # author: Martin Royer
 # License: MIT
 
 import numpy as np
-from sklearn.cluster import AgglomerativeClustering
-from scipy.linalg import svd as LINsvd
-from scipy.sparse.linalg import svds as SPAsvd
+from scipy.linalg import svd as lin_svd
+from scipy.sparse.linalg import svds as spa_svd
+
+from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
+from sklearn.cluster import AgglomerativeClustering
 
 from .gamma import gamma_hat
 from .admm import pecok_admm
 
 
-def corrected_relational(obs, int_corr):
-    return (obs.dot(obs.T) - gamma_hat(obs, int_corr=int_corr)) / obs.shape[1]
+def _corrected_relational(obs, corr):
+    return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
 
-def pecok_clustering(obs, n_struct, int_corr=4, **kwargs):
-    gram_corrected = corrected_relational(obs, int_corr=int_corr)
-    U, _, V = SPAsvd(gram_corrected, k=n_struct)
-    Bhat = pecok_admm(gram_corrected, n_struct=n_struct, mat_init=U.dot(V), **kwargs)
-    return AgglomerativeClustering(linkage='ward', n_clusters=n_struct).fit(Bhat).labels_
+def _kmeanz(X, n_clusters, corr):
+    gram_corrected = _corrected_relational(X, corr=corr)
+    U, s, _ = lin_svd(gram_corrected, compute_uv=True)
+    approx = U.dot(np.diag(np.sqrt(s)))
+    return approx, AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(approx)
 
 
-def kmeanz_clustering(obs, n_struct, int_corr=4):
-    gram_corrected = corrected_relational(obs, int_corr=int_corr)
-    U, s, _ = LINsvd(gram_corrected)
-    approx = U.dot(np.diag(np.sqrt(s)))
-    return AgglomerativeClustering(linkage='ward', n_clusters=n_struct).fit(approx).labels_
+def _pecok_clustering(obs, n_clusters, corr=4, **kwargs):
+    gram_corrected = _corrected_relational(obs, corr=corr)
+    U, _, V = spa_svd(gram_corrected, k=n_clusters)
+    Bhat = pecok_admm(gram_corrected, n_clusters=n_clusters, mat_init=U.dot(V), **kwargs)
+    return AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(Bhat)
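+
+
+# Both estimators share one pipeline: (1) build the bias-corrected Gram matrix
+# (X X^T - Gamma_hat) / p, (2) relax the cluster-membership problem (an SDP
+# solved by ADMM for Pecok, a square-root embedding of the Gram matrix for
+# KMeanz), (3) extract labels with Ward hierarchical clustering.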
+
+
+class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
+    """PeCoK clustering
+
+    Read more in [my thesis: http://www.theses.fr/2018SACLS442]
+
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form.
+    corr : int, optional, default: 0
+        Method for correcting the Gram matrix bias, between 0 and 4:
+
+        0: no correction
+        4: the correction from [Bunea, Giraud, Royer, Verzelen ('17)]
+        1, 2, 3: faster proxies for that correction (2 and 3 recommended)
+    init : @TODO{'k-means++', 'random' or an ndarray}
+        Method for initialization, defaults to 'k-means++':
+
+        'k-means++' : selects initial cluster centers for k-means
+        clustering in a smart way to speed up convergence. See section
+        Notes in k_init for more details.
+
+    Attributes
+    ----------
+    labels_ :
+        Labels of each point
+
+    Examples
+    --------
+    >>> from pecok import Pecok
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> pecok = Pecok(n_clusters=2).fit(X)
+    >>> pecok.labels_
+    array([1, 1, 1, 0, 0, 0])
+    """
+
+    def __init__(self, n_clusters=8, corr=0, init='notyet',
+                 verbose=0, random_state=None, copy_x=True,
+                 n_jobs=None, algorithm='auto'):
+
+        self.n_clusters = n_clusters
+        self.corr = corr
+        self.init = init
+        self.verbose = verbose
+        self.random_state = random_state
+        self.copy_x = copy_x
+        self.n_jobs = n_jobs
+        self.algorithm = algorithm
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute PECOK clustering.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Training instances to cluster. It must be noted that the data
+            will be converted to C ordering, which will cause a memory
+            copy if the given data is not C-contiguous.
+        y : Ignored
+            not used, present here for API consistency by convention.
+        sample_weight : array-like, shape (n_samples,), optional
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight (default: None)
+        """
+        hc_ = _pecok_clustering(
+            X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
+        self.labels_ = hc_.labels_
+        return self
+
+
+class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
+    """K-MeanZ clustering
+
+    Read more in [my thesis: http://www.theses.fr/2018SACLS442]
+
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form.
+    corr : int, optional, default: 0
+        Method for correcting the Gram matrix bias, between 0 and 4:
+
+        0: no correction
+        4: the correction from [Bunea, Giraud, Royer, Verzelen ('17)]
+        1, 2, 3: faster proxies for that correction (2 and 3 recommended)
+    init : @TODO{'k-means++', 'random' or an ndarray}
+        Method for initialization, defaults to 'k-means++':
+
+        'k-means++' : selects initial cluster centers for k-means
+        clustering in a smart way to speed up convergence. See section
+        Notes in k_init for more details.
+
+    Attributes
+    ----------
+    corrected_points_ : array, [n_samples, n_samples]
+        Points representation in Z-space
+    labels_ :
+        Labels of each point
+
+    Examples
+    --------
+    >>> from pecok import KMeanz
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> kmeanz = KMeanz(n_clusters=2).fit(X)
+    >>> kmeanz.labels_
+    array([1, 1, 1, 0, 0, 0])
+    """
+
+    def __init__(self, n_clusters=8, corr=0, init='notyet',
+                 verbose=0, random_state=None, copy_x=True,
+                 n_jobs=None, algorithm='auto'):
+
+        self.n_clusters = n_clusters
+        self.corr = corr
+        self.init = init
+        self.verbose = verbose
+        self.random_state = random_state
+        self.copy_x = copy_x
+        self.n_jobs = n_jobs
+        self.algorithm = algorithm
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute K-MeanZ clustering.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Training instances to cluster. It must be noted that the data
+            will be converted to C ordering, which will cause a memory
+            copy if the given data is not C-contiguous.
+        y : Ignored
+            not used, present here for API consistency by convention.
+        sample_weight : array-like, shape (n_samples,), optional
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight (default: None)
+        """
+        self.corrected_points_, hc_ = _kmeanz(
+            X, n_clusters=self.n_clusters, corr=self.corr)
+        self.labels_ = hc_.labels_
+        return self
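diff --git a/tests/demo_kmeanz.py b/tests/demo_kmeanz.py
new file mode 100644
--- /dev/null
+++ b/tests/demo_kmeanz.py
@@ -0,0 +1,17 @@
+"""Sketch of the KMeanz embedding (illustration only, not a unit test).
+
+KMeanz keeps the square-root embedding of the corrected Gram matrix in
+`corrected_points_`: one n_samples-dimensional coordinate vector per
+sample, which Ward linkage then clusters.
+"""
+import numpy as np
+from pecok import KMeanz
+
+np.random.seed(432)
+n, p = 10, 100
+X = np.vstack((np.random.normal(loc=.3, size=(n//2, p)),
+               np.random.normal(loc=-.3, size=(n//2, p))))
+
+kmeanz = KMeanz(n_clusters=2, corr=2).fit(X)
+print(kmeanz.corrected_points_.shape)  # (10, 10): the embedding lives in sample space
+print(kmeanz.labels_)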
diff --git a/pecok/gamma.py b/pecok/gamma.py
index 3076db0..fe9f925 100644
--- a/pecok/gamma.py
+++ b/pecok/gamma.py
@@ -8,7 +8,7 @@
 
 
 
-def gamma_hat2(X):
+def _gamma_hat2(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -21,7 +21,7 @@ def gamma_hat2(X):
     return gamma
 
 
-def gamma_hat2_robust(X):
+def _gamma_hat2_robust(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -34,7 +34,7 @@ def gamma_hat2_robust(X):
     return gamma
 
 
-def gamma_hat3(X):
+def _gamma_hat3(X):
     """Gamma_hat3 estimator from PECOK supplement, in O(n_samples^3 * n_features)
 
     Parameters
@@ -53,7 +53,7 @@ def gamma_hat3(X):
     return gamma
 
 
-def gamma_hat4(X):
+def _gamma_hat4(X):
     """Gamma_hat4 estimator from PECOK, in O(n_samples^4 * n_features)
 
     Parameters
@@ -81,21 +81,21 @@ def gamma_hat4(X):
     return np.asarray(gamma)
 
 
-def no_correction(X):
+def _no_correction(X):
     return np.zeros(X.shape[0])
 
 
-def cross_diag(X):
+def _cross_diag(X):
     return np.diag(X.dot(X.T))
 
 
-def gamma_hat(X, int_corr):
+def gamma_hat(X, corr):
     ghat = {
-        0: no_correction,
-        1: gamma_hat2_robust,
-        2: gamma_hat2,
-        3: gamma_hat3,
-        4: gamma_hat4,
-        8: cross_diag,
-    }.get(int_corr, no_correction)(X)
+        0: _no_correction,
+        1: _gamma_hat2_robust,
+        2: _gamma_hat2,
+        3: _gamma_hat3,
+        4: _gamma_hat4,
+        8: _cross_diag,
+    }.get(corr, _no_correction)(X)
     return np.diag(ghat)
diff --git a/setup.py b/setup.py
index c984f0b..5c86af3 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
     install_requires=[
         'numpy', 'scipy', 'scikit-learn'
     ],
-    version = '1.0.2',
+    version = '1.0.3',
     description = 'Implementation of Pecok',
     author='Martin Royer',
     author_email='martinpierreroyer@gmail.com',
diff --git a/tests/demo.py b/tests/demo.py
new file mode 100644
index 0000000..8ccdc52
--- /dev/null
+++ b/tests/demo.py
@@ -0,0 +1,15 @@
+import numpy as np
+
+np.random.seed(232123123)
+n = 10
+p = 100
+X = np.zeros((n, p))
+snr = 0.3
+X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2, p))
+X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2, p))
+
+from sklearn.cluster import KMeans
+print(KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_)
+from pecok import Pecok
+print(Pecok(n_clusters=2, corr=0).fit(X).labels_)
+print(Pecok(n_clusters=2, corr=4).fit(X).labels_)
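+
+# Sketch: the KMeanz variant follows the same call pattern (cf. README example).
+from pecok import KMeanz
+print(KMeanz(n_clusters=2, corr=2).fit(X).labels_)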
diff --git a/tests/illustration_comparaison.py b/tests/illustration_comparaison.py
new file mode 100644
index 0000000..57f4a0a
--- /dev/null
+++ b/tests/illustration_comparaison.py
@@ -0,0 +1,121 @@
+import time
+import warnings
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn import cluster, datasets, mixture
+from sklearn.preprocessing import StandardScaler
+from itertools import cycle, islice
+
+from pecok import KMeanz
+
+
+np.random.seed(0)
+
+random_state = 170
+n_samples = 500
+blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
+
+# Anisotropically distributed data
+# X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
+# transformation = [[0.6, -0.6], [-0.4, 0.8]]
+# X_aniso = np.dot(X, transformation)
+# aniso = (X_aniso, y)
+
+# blobs with varied variances
+varied = datasets.make_blobs(n_samples=n_samples, centers=np.array([[0,0],[1,0]]),
+                             cluster_std=[.05, .5],
+                             random_state=random_state)
+
+# ============
+# Set up cluster parameters
+# ============
+plt.figure(figsize=(9 * 2 + 3, 12.5))
+plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
+                    hspace=.01)
+
+plot_num = 1
+
+default_base = {'quantile': .3,
+                'eps': .3,
+                'damping': .9,
+                'preference': -200,
+                'n_neighbors': 10,
+                'n_clusters': 2}
+
+datasets = [
+    # (noisy_circles, {'damping': .77, 'preference': -240,
+    #                  'quantile': .2, 'n_clusters': 2}),
+    # (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
+    (varied, {'eps': .18, 'n_neighbors': 2})]
+
+for i_dataset, (dataset, algo_params) in enumerate(datasets):
+    # update parameters with dataset-specific values
+    params = default_base.copy()
+    params.update(algo_params)
+
+    X, y = dataset
+    X = StandardScaler().fit_transform(X)
+
+    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
+    spectral = cluster.SpectralClustering(
+        n_clusters=params['n_clusters'], eigen_solver='arpack',
+        affinity="nearest_neighbors")
+    kmeanz = KMeanz(
+        n_clusters=params['n_clusters'], corr=2
+    )
+    gmm = mixture.GaussianMixture(
+        n_components=1, covariance_type='full')
+
+    clustering_algorithms = (
+        ('MiniBatchKMeans', two_means),
+        ('SpectralClustering', spectral),
+        ('KMeanz', kmeanz)
+        # ('GaussianMixture', gmm)
+    )
+
+    for name, algorithm in clustering_algorithms:
+        t0 = time.time()
+
+        # catch warnings related to kneighbors_graph
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="the number of connected components of the " +
+                        "connectivity matrix is [0-9]{1,2}" +
+                        " > 1. Completing it to avoid stopping the tree early.",
+                category=UserWarning)
+            warnings.filterwarnings(
+                "ignore",
+                message="Graph is not fully connected, spectral embedding" +
+                        " may not work as expected.",
+                category=UserWarning)
+            algorithm.fit(X)
+
+        t1 = time.time()
+        if hasattr(algorithm, 'labels_'):
+            y_pred = algorithm.labels_.astype(int)
+        else:
+            y_pred = algorithm.predict(X)
+
+        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
+        if i_dataset == 0:
+            plt.title(name, size=18)
+
+        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
+                                             '#f781bf', '#a65628', '#984ea3',
+                                             '#999999', '#e41a1c', '#dede00']),
+                                      int(max(y_pred) + 1))))
+        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
+
+        plt.xlim(-2.5, 2.5)
+        plt.ylim(-2.5, 2.5)
+        plt.xticks(())
+        plt.yticks(())
+        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
+                 transform=plt.gca().transAxes, size=15,
+                 horizontalalignment='right')
+        plot_num += 1
+
+plt.show()
diff --git a/tests/test.py b/tests/test.py
index 2755954..2afe9ad 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -3,48 +3,23 @@
 # author: Martin Royer
 # License: MIT
 
-from functools import wraps
+from functools import partial
 import timeit
 
-# import pyper
 import numpy as np
 
-from pecok import pecok_clustering, kmeanz_clustering
-from sklearn.cluster import KMeans
+from pecok import KMeanz, Pecok
+from sklearn.cluster import KMeans, AgglomerativeClustering
 
 
-def timethis(f):
-    @wraps(f)
-    def wrap(*args, **kw):
-        ts = timeit.default_timer()
-        result = f(*args, **kw)
-        te = timeit.default_timer()
-        return result, te-ts
-    return wrap
-
-
-def hdclassif(obs, n_struct):
-    n,_ = obs.shape
-    myR = pyper.R()
-    myR.run('library(HDclassif)')
-    myR.assign('obs', obs)
-    myR.assign('n_struct', n_struct)
-    try:
-        myR.run('res <- hddc(obs, K=n_struct, model=c(1,2,7,9))')
-        result = np.array(myR['res$class'])
-    except:
-        print("fail hdclassif")
-        result = np.zeros(n)
-    return result
-
-
 seed = 432
 np.random.seed(seed)
 print("seed is %i" % seed)
 
 methods = [
-    [lambda X, K: KMeans(n_clusters=K, init='k-means++', n_init=100).fit(X).labels_, 'kmeans++'],
-#    [hdclassif, 'HDDC'],
-    [pecok_clustering, 'pecok'],
-    [kmeanz_clustering, 'kmeanz']
+    [partial(KMeans, init="k-means++", n_init=100), "k-means++"],
+    [partial(AgglomerativeClustering, linkage='ward'), "Hierarchical"],
+    [partial(KMeanz, corr=2), "KMeanz"],
+    [partial(Pecok, corr=2), "Pecok"],
 ]
 
 print("\nVAR CLUSTERING\n\n")
@@ -52,7 +27,7 @@ def hdclassif(obs, n_struct):
 n_obs = 100
 
 truth = np.asmatrix(np.concatenate((np.repeat(0, n_var//2), np.repeat(1, n_var//2))))
-membership = truth.T.dot(np.matrix([1, 0])) + (1-truth).T.dot(np.matrix([0, 1]))
+membership = truth.T.dot(np.array([1, 0])[np.newaxis,:]) + (1-truth).T.dot(np.array([0, 1])[np.newaxis,:])
 stds = np.ones(n_var)
 stds[:(n_var//2)] = 0.1
 sigma = membership.dot(0.1*np.identity(2)).dot(membership.T) + np.diag(stds)
 
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(mat_data.T, 2)
+    ts = timeit.default_timer()
+    job_result = method(n_clusters=2).fit(mat_data.T).labels_
+    te = timeit.default_timer()
     print(method_name.ljust(15), job_result)
-    print("job_time: %.2f (s)".ljust(15) % job_time)
+    print("job_time: %.2f (s)".ljust(15) % (te-ts))
 
 
 print("\nPOINT CLUSTERING\n\n")
 
-n_var = 100
 n_obs = 10
+n_var = 100
 
 truth = np.asmatrix(np.concatenate((np.repeat(0, n_obs//2), np.repeat(1, n_obs//2))))
 X = np.zeros((n_obs, n_var))
@@ -77,7 +54,8 @@ def hdclassif(obs, n_struct):
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(X, 2)
+    ts = timeit.default_timer()
+    job_result = method(n_clusters=2).fit(X).labels_
+    te = timeit.default_timer()
     print(method_name.ljust(15), job_result)
-    print("job_time: %.2f (s)".ljust(15) % job_time)
-#    print("pecok:".ljust(15), pecok_clustering(X, n_struct=2, rho=100, n_iter_max=3000, verbose=True).labels_)
+    print("job_time: %.2f (s)".ljust(15) % (te-ts))
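diff --git a/tests/demo_gamma.py b/tests/demo_gamma.py
new file mode 100644
--- /dev/null
+++ b/tests/demo_gamma.py
@@ -0,0 +1,13 @@
+"""Sketch of inspecting the gamma_hat bias correction (illustration only).
+
+Whatever the corr level, gamma_hat returns the (n_samples, n_samples)
+diagonal matrix that _corrected_relational subtracts from X X^T.
+"""
+import numpy as np
+from pecok.gamma import gamma_hat
+
+np.random.seed(432)
+X = np.random.normal(size=(10, 100))
+for corr in (0, 2, 4):
+    g = gamma_hat(X, corr)
+    print(corr, g.shape, np.allclose(g, np.diag(np.diag(g))))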