From e0959d88411b055d911d7e24a3a7eb5129364b37 Mon Sep 17 00:00:00 2001
From: Martin Royer
Date: Wed, 7 Nov 2018 16:46:46 +0100
Subject: [PATCH 1/9] halfway into class-ification

---
 pecok/__init__.py   |   2 +-
 pecok/admm.py       |   4 +-
 pecok/clustering.py | 270 +++++++++++++++++++++++++++++++++++++++++---
 pecok/gamma.py      |   4 +-
 tests/test.py       |  10 +-
 5 files changed, 264 insertions(+), 26 deletions(-)

diff --git a/pecok/__init__.py b/pecok/__init__.py
index 214df5c..2e2b868 100644
--- a/pecok/__init__.py
+++ b/pecok/__init__.py
@@ -1,2 +1,2 @@
-from .clustering import pecok_clustering, kmeanz_clustering
+from .clustering import KMeanz, Pecok
diff --git a/pecok/admm.py b/pecok/admm.py
index 491b551..137dc71 100644
--- a/pecok/admm.py
+++ b/pecok/admm.py
@@ -48,7 +48,7 @@ def proj_Snp_imp(Y):
     return Y
 
 
-def pecok_admm(relational_data, n_struct, n_iter_max=-1, rho=5, mat_init=None, verbose=False, eps_residual=1e-4):
+def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, verbose=False, eps_residual=1e-4):
     """Implementation of Alternating Direction Method of Multipliers
 
     Parameters
@@ -71,7 +71,7 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
         n_iter = n_iter + 1
         oldXbar = Xbar
 
-        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_struct)
+        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
         Y = proj_positive(Xbar - V)
         Z = proj_Snp_imp(Xbar - W)
         Xbar = (X + Y + Z)/3
diff --git a/pecok/clustering.py b/pecok/clustering.py
index 8f1e884..c60bd8b 100644
--- a/pecok/clustering.py
+++ b/pecok/clustering.py
@@ -1,30 +1,266 @@
-"""Clustering wrapper"""
+"""PECOK clustering"""
 
-# author: Martin Royer
+# author: Martin Royer
 # License: MIT
 
 import numpy as np
 
-from sklearn.cluster import AgglomerativeClustering
-from scipy.linalg import svd as LINsvd
-from scipy.sparse.linalg import svds as SPAsvd
+from scipy.linalg import svd as lin_svd
+from scipy.sparse.linalg import svds as spa_svd
+
+from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
+from sklearn.cluster import AgglomerativeClustering
 
 from .gamma import gamma_hat
 from .admm import pecok_admm
 
 
-def corrected_relational(obs, int_corr):
-    return (obs.dot(obs.T) - gamma_hat(obs, int_corr=int_corr)) / obs.shape[1]
-
+###############################################################################
+# shared correction
 
-def pecok_clustering(obs, n_struct, int_corr=4, **kwargs):
-    gram_corrected = corrected_relational(obs, int_corr=int_corr)
-    U, _, V = SPAsvd(gram_corrected, k=n_struct)
-    Bhat = pecok_admm(gram_corrected, n_struct=n_struct, mat_init=U.dot(V), **kwargs)
-    return AgglomerativeClustering(linkage='ward', n_clusters=n_struct).fit(Bhat).labels_
+def _corrected_relational(obs, corr):
+    return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
 
-def kmeanz_clustering(obs, n_struct, int_corr=4):
-    gram_corrected = corrected_relational(obs, int_corr=int_corr)
-    U, s, _ = LINsvd(gram_corrected)
+def kmeanz(X, n_clusters, corr):
+    gram_corrected = _corrected_relational(X, corr=corr)
+    U, s, _ = lin_svd(gram_corrected, compute_uv=True)
     approx = U.dot(np.diag(np.sqrt(s)))
-    return AgglomerativeClustering(linkage='ward', n_clusters=n_struct).fit(approx).labels_
+    return approx, AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(approx)
+
+
+def pecok_clustering(obs, n_clusters, corr=4, **kwargs):
+    gram_corrected = _corrected_relational(obs, corr=corr)
+    U, _, V = spa_svd(gram_corrected, k=n_clusters)
+    Bhat = pecok_admm(gram_corrected, n_clusters=n_clusters, mat_init=U.dot(V), **kwargs)
+    return AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(Bhat)
+
+
+class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
+    """PeCoK clustering
+    Read more in [my thesis]
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of centroids to generate.
+    corr : int, optional, default: 0
+        Method for correcting for bias, defaults to 0:
+
+        0: no correction
+        1: bad correction
+        4: good correction
+    init : @TODO{'k-means++', 'random' or an ndarray}
+        Method for initialization, defaults to 'k-means++':
+
+        'k-means++' : selects initial cluster centers for k-means
+        clustering in a smart way to speed up convergence. See section
+        Notes in k_init for more details.
+
+    Attributes
+    ----------
+    corrected_points_: array, [n_points, n_features]
+        Points representation in Z-space
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers in Z-space
+    labels_ :
+        Labels of each point
+    inertia_ : float
+        Sum of squared distances (in Z-space) of samples to their closest cluster center.
+
+    Examples
+    --------
+    TODO
+    >>> from sklearn.cluster import KMeans
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
+    >>> kmeans.labels_
+    array([0, 0, 0, 1, 1, 1], dtype=int32)
+    >>> kmeans.predict([[0, 0], [4, 4]])
+    array([0, 1], dtype=int32)
+    >>> kmeans.cluster_centers_
+    array([[1., 2.],
+           [4., 2.]])
+    """
+
+    def __init__(self, n_clusters=8, corr=0, init='notyet',
+                 verbose=0, random_state=None, copy_x=True,
+                 n_jobs=None, algorithm='auto'):
+
+        self.n_clusters = n_clusters
+        self.corr = corr
+        self.init = init
+        self.verbose = verbose
+        self.random_state = random_state
+        self.copy_x = copy_x
+        self.n_jobs = n_jobs
+        self.algorithm = algorithm
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute PECOK clustering.
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Training instances to cluster. It must be noted that the data
+            will be converted to C ordering, which will cause a memory
+            copy if the given data is not C-contiguous.
+        y : Ignored
+            not used, present here for API consistency by convention.
+        sample_weight : array-like, shape (n_samples,), optional
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight (default: None)
+        """
+
+        # random_state = check_random_state(self.random_state)
+
+        hc_ = \
+            pecok_clustering(
+                X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
+                # init=self.init, n_init=self.n_init,
+                # max_iter=self.max_iter, verbose=self.verbose,
+                # precompute_distances=self.precompute_distances,
+                # tol=self.tol, random_state=random_state, copy_x=self.copy_x,
+                # n_jobs=self.n_jobs, algorithm=self.algorithm,
+                # return_n_iter=True)
+        self.labels_ = hc_.labels_
+        return self
+
+    # def predict(self, X, sample_weight=None):
+    #     """Predict the closest cluster each sample in X belongs to.
+    #     In the vector quantization literature, `cluster_centers_` is called
+    #     the code book and each value returned by `predict` is the index of
+    #     the closest code in the code book.
+    #     Parameters
+    #     ----------
+    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+    #         New data to predict.
+    #     sample_weight : array-like, shape (n_samples,), optional
+    #         The weights for each observation in X. If None, all observations
+    #         are assigned equal weight (default: None)
+    #     Returns
+    #     -------
+    #     labels : array, shape [n_samples,]
+    #         Index of the cluster each sample belongs to.
+    #     """
+    #     check_is_fitted(self, 'cluster_centers_')
+    #
+    #     X = self._check_test_data(X)
+    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
+
+
+class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
+    """K-MeanZ clustering
+    Read more in [my thesis]
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of centroids to generate.
+    corr : int, optional, default: 0
+        Method for correcting for bias, defaults to 0:
+
+        0: no correction
+        1: bad correction
+        4: good correction
+    init : @TODO{'k-means++', 'random' or an ndarray}
+        Method for initialization, defaults to 'k-means++':
+
+        'k-means++' : selects initial cluster centers for k-means
+        clustering in a smart way to speed up convergence. See section
+        Notes in k_init for more details.
+
+    Attributes
+    ----------
+    corrected_points_: array, [n_points, n_features]
+        Points representation in Z-space
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers in Z-space
+    labels_ :
+        Labels of each point
+    inertia_ : float
+        Sum of squared distances (in Z-space) of samples to their closest cluster center.
+
+    Examples
+    --------
+    TODO
+    >>> from sklearn.cluster import KMeans
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
+    >>> kmeans.labels_
+    array([0, 0, 0, 1, 1, 1], dtype=int32)
+    >>> kmeans.predict([[0, 0], [4, 4]])
+    array([0, 1], dtype=int32)
+    >>> kmeans.cluster_centers_
+    array([[1., 2.],
+           [4., 2.]])
+    """
+
+    def __init__(self, n_clusters=8, corr=0, init='notyet',
+                 verbose=0, random_state=None, copy_x=True,
+                 n_jobs=None, algorithm='auto'):
+
+        self.n_clusters = n_clusters
+        self.corr = corr
+        self.init = init
+        self.verbose = verbose
+        self.random_state = random_state
+        self.copy_x = copy_x
+        self.n_jobs = n_jobs
+        self.algorithm = algorithm
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute K-MeanZ clustering.
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Training instances to cluster. It must be noted that the data
+            will be converted to C ordering, which will cause a memory
+            copy if the given data is not C-contiguous.
+        y : Ignored
+            not used, present here for API consistency by convention.
+        sample_weight : array-like, shape (n_samples,), optional
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight (default: None)
+        """
+
+        # random_state = check_random_state(self.random_state)
+
+        self.corrected_points_, hc_ = \
+            kmeanz(
+                X, n_clusters=self.n_clusters, corr=self.corr)
+                # init=self.init, n_init=self.n_init,
+                # max_iter=self.max_iter, verbose=self.verbose,
+                # precompute_distances=self.precompute_distances,
+                # tol=self.tol, random_state=random_state, copy_x=self.copy_x,
+                # n_jobs=self.n_jobs, algorithm=self.algorithm,
+                # return_n_iter=True)
+        self.labels_ = hc_.labels_
+        return self
+
+    # def predict(self, X, sample_weight=None):
+    #     """Predict the closest cluster each sample in X belongs to.
+    #     In the vector quantization literature, `cluster_centers_` is called
+    #     the code book and each value returned by `predict` is the index of
+    #     the closest code in the code book.
+    #     Parameters
+    #     ----------
+    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+    #         New data to predict.
+    #     sample_weight : array-like, shape (n_samples,), optional
+    #         The weights for each observation in X. If None, all observations
+    #         are assigned equal weight (default: None)
+    #     Returns
+    #     -------
+    #     labels : array, shape [n_samples,]
+    #         Index of the cluster each sample belongs to.
+    #     """
+    #     check_is_fitted(self, 'cluster_centers_')
+    #
+    #     X = self._check_test_data(X)
+    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
+
diff --git a/pecok/gamma.py b/pecok/gamma.py
index 3076db0..6616fc9 100644
--- a/pecok/gamma.py
+++ b/pecok/gamma.py
@@ -89,7 +89,7 @@ def cross_diag(X):
     return np.diag(X.dot(X.T))
 
 
-def gamma_hat(X, int_corr):
+def gamma_hat(X, corr):
     ghat = {
         0: no_correction,
         1: gamma_hat2_robust,
@@ -97,5 +97,5 @@ def gamma_hat(X, int_corr):
         3: gamma_hat3,
         4: gamma_hat4,
         8: cross_diag,
-    }.get(int_corr, no_correction)(X)
+    }.get(corr, no_correction)(X)
     return np.diag(ghat)
diff --git a/tests/test.py b/tests/test.py
index 2755954..8a2808d 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -8,7 +8,7 @@
 # import pyper
 import numpy as np
 
-from pecok import pecok_clustering, kmeanz_clustering
+from pecok import KMeanz, Pecok
 from sklearn.cluster import KMeans
 
 
@@ -42,9 +42,11 @@ def hdclassif(obs, n_struct):
 
 methods = [
     [lambda X, K: KMeans(n_clusters=K, init='k-means++', n_init=100).fit(X).labels_, 'kmeans++'],
+    [lambda X, K: KMeanz(n_clusters=K, corr=2).fit(X).labels_, 'KMeanz'],
+    [lambda X, K: Pecok(n_clusters=K, corr=2).fit(X).labels_, 'Pecok'],
     # [hdclassif, 'HDDC'],
-    [pecok_clustering, 'pecok'],
-    [kmeanz_clustering, 'kmeanz']
+#    [pecok_clustering, 'pecok'],
+#    [kmeanz_clustering, 'kmeanz']
 ]
 
 print("\nVAR CLUSTERING\n\n")
@@ -66,8 +68,8 @@
 
 print("\nPOINT CLUSTERING\n\n")
 
-n_var = 100
 n_obs = 10
+n_var = 100
 truth = np.asmatrix(np.concatenate((np.repeat(0, n_obs//2), np.repeat(1, n_obs//2))))
 X = np.zeros((n_obs, n_var))
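This first patch replaces the functional entry points (`pecok_clustering`, `kmeanz_clustering`) with scikit-learn-style estimators whose `fit` stores a `labels_` attribute, as the updated tests show. A minimal sketch of the new calling convention, assuming the patched package is importable as `pecok` (the data below is illustrative, not taken from the test suite):

```python
import numpy as np
from pecok import KMeanz, Pecok  # class-based API introduced above

rng = np.random.RandomState(432)
# two groups of five 50-dimensional points with well-separated means
X = np.vstack((rng.normal(-1, .5, size=(5, 50)),
               rng.normal(1, .5, size=(5, 50))))

# fit() runs the corrected-Gram pipeline and stores the partition in labels_
print(Pecok(n_clusters=2, corr=4).fit(X).labels_)
print(KMeanz(n_clusters=2, corr=2).fit(X).labels_)
```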
From cf8810b1d417fdcd9719fff35f58416884e6bd68 Mon Sep 17 00:00:00 2001
From: Martin Royer
Date: Sun, 27 Jan 2019 16:21:53 +0100
Subject: [PATCH 2/9] some tests

---
 tests/demo.py                     |  14 ++++
 tests/illustration_comparaison.py | 124 ++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 tests/demo.py
 create mode 100644 tests/illustration_comparaison.py

diff --git a/tests/demo.py b/tests/demo.py
new file mode 100644
index 0000000..1ce540f
--- /dev/null
+++ b/tests/demo.py
@@ -0,0 +1,14 @@
+import numpy as np
+np.seed(232123123)
+n = 10
+p = 100
+X = np.zeros((n, p))
+snr = 0.3
+X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2,p))
+X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
+
+from sklearn.cluster import KMeans
+KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_
+from pecok import PeCoK
+PeCoK(n_clusters=2, corr=0).fit(X).labels_
+PeCoK(n_clusters=2, corr=4).fit(X).labels_
diff --git a/tests/illustration_comparaison.py b/tests/illustration_comparaison.py
new file mode 100644
index 0000000..04318c3
--- /dev/null
+++ b/tests/illustration_comparaison.py
@@ -0,0 +1,124 @@
+print(__doc__)
+
+import time
+import warnings
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn import cluster, datasets, mixture
+from sklearn.preprocessing import StandardScaler
+from itertools import cycle, islice
+
+from pecok import KMeanZ
+
+
+
+np.random.seed(0)
+
+random_state = 170
+n_samples = 500
+blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
+
+# Anisotropically distributed data
+# X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
+# transformation = [[0.6, -0.6], [-0.4, 0.8]]
+# X_aniso = np.dot(X, transformation)
+# aniso = (X_aniso, y)
+
+# blobs with varied variances
+varied = datasets.make_blobs(n_samples=n_samples, centers=np.matrix([[0,0],[1,0]]),
+                             cluster_std=[.1, .5],
+                             random_state=random_state)
+
+# ============
+# Set up cluster parameters
+# ============
+plt.figure(figsize=(9 * 2 + 3, 12.5))
+plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
+                    hspace=.01)
+
+plot_num = 1
+
+default_base = {'quantile': .3,
+                'eps': .3,
+                'damping': .9,
+                'preference': -200,
+                'n_neighbors': 10,
+                'n_clusters': 2}
+
+datasets = [
+    # (noisy_circles, {'damping': .77, 'preference': -240,
+    #                  'quantile': .2, 'n_clusters': 2}),
+    # (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
+    (varied, {'eps': .18, 'n_neighbors': 2})]
+
+for i_dataset, (dataset, algo_params) in enumerate(datasets):
+    # update parameters with dataset-specific values
+    params = default_base.copy()
+    params.update(algo_params)
+
+    X, y = dataset
+    X = StandardScaler().fit_transform(X)
+
+    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
+    spectral = cluster.SpectralClustering(
+        n_clusters=params['n_clusters'], eigen_solver='arpack',
+        affinity="nearest_neighbors")
+    kmeanz = KMeanZ(
+        n_clusters=params['n_clusters'], corr=2
+    )
+    gmm = mixture.GaussianMixture(
+        n_components=1, covariance_type='full')
+
+    clustering_algorithms = (
+        ('MiniBatchKMeans', two_means),
+        ('SpectralClustering', spectral),
+        ('KMeanZ', kmeanz)
+        # ('GaussianMixture', gmm)
+    )
+
+    for name, algorithm in clustering_algorithms:
+        t0 = time.time()
+
+        # catch warnings related to kneighbors_graph
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="the number of connected components of the " +
+                "connectivity matrix is [0-9]{1,2}" +
+                " > 1. Completing it to avoid stopping the tree early.",
+                category=UserWarning)
+            warnings.filterwarnings(
+                "ignore",
+                message="Graph is not fully connected, spectral embedding" +
+                " may not work as expected.",
+                category=UserWarning)
+            algorithm.fit(X)
+
+        t1 = time.time()
+        if hasattr(algorithm, 'labels_'):
+            y_pred = algorithm.labels_.astype(np.int)
+        else:
+            y_pred = algorithm.predict(X)
+
+        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
+        if i_dataset == 0:
+            plt.title(name, size=18)
+
+        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
+                                             '#f781bf', '#a65628', '#984ea3',
+                                             '#999999', '#e41a1c', '#dede00']),
+                                      int(max(y_pred) + 1))))
+        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
+
+        plt.xlim(-2.5, 2.5)
+        plt.ylim(-2.5, 2.5)
+        plt.xticks(())
+        plt.yticks(())
+        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
+                 transform=plt.gca().transAxes, size=15,
+                 horizontalalignment='right')
+        plot_num += 1
+
+plt.show()
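Note that `tests/demo.py` as committed here calls `np.seed` and imports `PeCoK`, neither of which exists (both are repaired in patch 5 below). The data-generation idea can still be checked independently of the package by scoring a labeling against the ground truth with the adjusted Rand index; a sketch using scikit-learn only:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

np.random.seed(232123123)
n, p, snr = 10, 100, 0.3
# same heteroscedastic two-group design as demo.py: one tight group, one noisy group
X = np.zeros((n, p))
X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2, p))
X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2, p))
truth = np.repeat([0, 1], n // 2)

pred = KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_
print(adjusted_rand_score(truth, pred))  # 1.0 would mean perfect recovery
```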
From fa18d214e9973669db5f3e6e026dd21bcc821c0e Mon Sep 17 00:00:00 2001
From: martinroyer-datashape
Date: Wed, 10 Jul 2019 12:39:56 +0200
Subject: [PATCH 3/9] quick refac

---
 pecok/admm.py       | 30 +++++++++++++++---------------
 pecok/clustering.py | 14 +++++++-------
 pecok/gamma.py      | 26 +++++++++++++-------------
 tests/test.py       | 21 +++------------------
 4 files changed, 38 insertions(+), 53 deletions(-)

diff --git a/pecok/admm.py b/pecok/admm.py
index 137dc71..85e5b57 100644
--- a/pecok/admm.py
+++ b/pecok/admm.py
@@ -4,46 +4,46 @@
 # License: MIT
 
 import numpy as np
-from scipy import linalg as la
+from scipy import linalg
 
 
-def operator_lstarllstarinv_sym(u, v):
+def _operator_lstarllstarinv_sym(u, v):
     """Operator \widetilde{L}^*_{sym} on (u,v) in R^{p+1} -> R^{p*p}"""
     temp = u.repeat(u.size).reshape((u.size, u.size))
     return (temp + temp.T)/2 + np.diag(np.repeat(v, u.size))
 
 
-def proj_lin_Hsymmetric(Y, n_struct):
+def _proj_lin_Hsymmetric(Y, n_struct):
     """Projection onto \Pi_{\mathcal{A}sym}(Y)"""
     n_samples,_ = Y.shape
     x = np.sum(Y, 1) - 1
     y = np.trace(Y) - n_struct
     invx = (x-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
     invy = (y-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
-    Y = Y - operator_lstarllstarinv_sym(invx, invy)
+    Y = Y - _operator_lstarllstarinv_sym(invx, invy)
     return Y
 
 
-def proj_positive(x, thresh=0):
+def _proj_positive(x, thresh=0):
     """Project onto component-positive matrix"""
     x[x < thresh] = 0
     return x
 
 
-def proj_Snp_imp(Y):
+def _proj_Snp_imp(Y):
     """Improved projection onto semi-definite positive matrix"""
     n_samples,_ = Y.shape
-    eig_vals = la.eigh(Y, eigvals_only=True)
+    eig_vals = linalg.eigh(Y, eigvals_only=True)
     n_val_neg = np.sum(eig_vals<0)
     if n_val_neg == 0:
         return Y
     if n_val_neg == n_samples:
         return np.zeros((n_samples,n_samples))
     if n_val_neg < n_samples-n_val_neg:
-        eig_vals, v = la.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
         Y = Y + v.dot(np.diag(eig_vals)).dot(v.T)
     else:
-        eig_vals, v = la.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
         Y = v.dot(np.diag(eig_vals)).dot(v.T)
     return Y
@@ -71,9 +71,9 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
         n_iter = n_iter + 1
         oldXbar = Xbar
 
-        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
-        Y = proj_positive(Xbar - V)
-        Z = proj_Snp_imp(Xbar - W)
+        X = _proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
+        Y = _proj_positive(Xbar - V)
+        Z = _proj_Snp_imp(Xbar - W)
         Xbar = (X + Y + Z)/3
 
         U = U + X - Xbar
@@ -82,7 +82,7 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
         res_dual = rho * np.linalg.norm(Xbar-oldXbar)
         res_primal = np.linalg.norm((X-Xbar, Z-Xbar, Y-Xbar))
-        if not (is_primal_high(eps_residual, res_primal, X, Y, Z) or is_dual_high(eps_residual, res_dual, Y, Z)):
+        if not (_is_primal_high(eps_residual, res_primal, X, Y, Z) or _is_dual_high(eps_residual, res_dual, Y, Z)):
             break
     if verbose:
         print("ADMM ends -- n_iter=%i, rho=%2.2f" % (n_iter, rho))
@@ -90,9 +90,9 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None, v
     return Z
 
 
-def is_primal_high(eps_residual, res_primal, X, Y, Z):
+def _is_primal_high(eps_residual, res_primal, X, Y, Z):
     return res_primal > eps_residual * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))
 
 
-def is_dual_high(eps_residual, res_dual, Y, Z):
+def _is_dual_high(eps_residual, res_dual, Y, Z):
     return res_dual > eps_residual * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))
diff --git a/pecok/clustering.py b/pecok/clustering.py
index c60bd8b..b6ca2d7 100644
--- a/pecok/clustering.py
+++ b/pecok/clustering.py
@@ -1,6 +1,6 @@
 """PECOK clustering"""
 
-# author: Martin Royer
+# author: Martin Royer
 # License: MIT
 
 import numpy as np
@@ -21,14 +21,14 @@ def _corrected_relational(obs, corr):
     return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
 
-def kmeanz(X, n_clusters, corr):
+def _kmeanz(X, n_clusters, corr):
     gram_corrected = _corrected_relational(X, corr=corr)
     U, s, _ = lin_svd(gram_corrected, compute_uv=True)
     approx = U.dot(np.diag(np.sqrt(s)))
     return approx, AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(approx)
 
 
-def pecok_clustering(obs, n_clusters, corr=4, **kwargs):
+def _pecok_clustering(obs, n_clusters, corr=4, **kwargs):
     gram_corrected = _corrected_relational(obs, corr=corr)
     U, _, V = spa_svd(gram_corrected, k=n_clusters)
     Bhat = pecok_admm(gram_corrected, n_clusters=n_clusters, mat_init=U.dot(V), **kwargs)
@@ -37,7 +37,7 @@
 class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
     """PeCoK clustering
-    Read more in [my thesis]
+    Read more in [my thesis: http://www.theses.fr/2018SACLS442]
     Parameters
     ----------
     n_clusters : int, optional, default: 8
@@ -116,7 +116,7 @@ def fit(self, X, y=None, sample_weight=None):
 
         hc_ = \
-            pecok_clustering(
+            _pecok_clustering(
                 X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
                 # init=self.init, n_init=self.n_init,
                 # max_iter=self.max_iter, verbose=self.verbose,
@@ -152,7 +152,7 @@
 class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
     """K-MeanZ clustering
-    Read more in [my thesis]
+    Read more in [my thesis: http://www.theses.fr/2018SACLS442]
     Parameters
     ----------
     n_clusters : int, optional, default: 8
@@ -231,7 +231,7 @@ def fit(self, X, y=None, sample_weight=None):
 
         self.corrected_points_, hc_ = \
-            kmeanz(
+            _kmeanz(
                 X, n_clusters=self.n_clusters, corr=self.corr)
                 # init=self.init, n_init=self.n_init,
diff --git a/pecok/gamma.py b/pecok/gamma.py
index 6616fc9..fe9f925 100644
--- a/pecok/gamma.py
+++ b/pecok/gamma.py
@@ -8,7 +8,7 @@
 
 
 
-def gamma_hat2(X):
+def _gamma_hat2(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -21,7 +21,7 @@ def gamma_hat2(X):
     return gamma
 
 
-def gamma_hat2_robust(X):
+def _gamma_hat2_robust(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -34,7 +34,7 @@ def gamma_hat2_robust(X):
     return gamma
 
 
-def gamma_hat3(X):
+def _gamma_hat3(X):
     """Gamma_hat3 estimator from PECOK supplement, in O(n_samples^3 * n_features)
 
     Parameters
@@ -53,7 +53,7 @@ def gamma_hat3(X):
     return gamma
 
 
-def gamma_hat4(X):
+def _gamma_hat4(X):
     """Gamma_hat4 estimator from PECOK, in O(n_samples^4 * n_features)
 
     Parameters
@@ -81,21 +81,21 @@ def gamma_hat4(X):
     return np.asarray(gamma)
 
 
-def no_correction(X):
+def _no_correction(X):
     return np.zeros(X.shape[0])
 
 
-def cross_diag(X):
+def _cross_diag(X):
     return np.diag(X.dot(X.T))
 
 
 def gamma_hat(X, corr):
     ghat = {
-        0: no_correction,
-        1: gamma_hat2_robust,
-        2: gamma_hat2,
-        3: gamma_hat3,
-        4: gamma_hat4,
-        8: cross_diag,
-    }.get(corr, no_correction)(X)
+        0: _no_correction,
+        1: _gamma_hat2_robust,
+        2: _gamma_hat2,
+        3: _gamma_hat3,
+        4: _gamma_hat4,
+        8: _cross_diag,
+    }.get(corr, _no_correction)(X)
     return np.diag(ghat)
diff --git a/tests/test.py b/tests/test.py
index 8a2808d..6d39ca1 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -6,13 +6,12 @@
 from functools import wraps
 import timeit
 
-# import pyper
 import numpy as np
 
 from pecok import KMeanz, Pecok
 from sklearn.cluster import KMeans
 
 
-def timethis(f):
+def _timethis(f):
     @wraps(f)
     def wrap(*args, **kw):
         ts = timeit.default_timer()
@@ -22,20 +21,6 @@ def wrap(*args, **kw):
     return wrap
 
 
-def hdclassif(obs, n_struct):
-    n,_ = obs.shape
-    myR = pyper.R()
-    myR.run('library(HDclassif)')
-    myR.assign('obs', obs)
-    myR.assign('n_struct', n_struct)
-    try:
-        myR.run('res <- hddc(obs, K=n_struct, model=c(1,2,7,9))')
-        result = np.array(myR['res$class'])
-    except:
-        print("fail hdclassif")
-        result = np.zeros(n)
-    return result
-
 seed = 432
 np.random.seed(seed)
 print("seed is %i" % seed)
@@ -47,7 +32,7 @@ print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(mat_data.T, 2)
+    job_result, job_time = _timethis(method)(mat_data.T, 2)
     print(method_name.ljust(15), job_result)
     print("job_time: %.2f (s)".ljust(15) % job_time)
@@ -66,7 +51,7 @@ print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(X, 2)
+    job_result, job_time = _timethis(method)(X, 2)
     print(method_name.ljust(15), job_result)
     print("job_time: %.2f (s)".ljust(15) % job_time)
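After this refactor the public surface of `pecok.gamma` is the single dispatcher `gamma_hat`; the underscore-prefixed estimators are implementation details. A sketch of the dispatch behavior, assuming the module stays importable at `pecok.gamma` (it is imported that way by `clustering.py` above):

```python
import numpy as np
from pecok.gamma import gamma_hat

rng = np.random.RandomState(0)
X = rng.normal(size=(6, 40))  # 6 samples, 40 features

# corr selects the bias-correction estimator; unknown keys fall back to no correction
for corr in (0, 2, 4, 99):
    Gamma = gamma_hat(X, corr=corr)
    off_diag = np.count_nonzero(Gamma - np.diag(np.diag(Gamma)))
    print(corr, Gamma.shape, off_diag)  # always a diagonal (6, 6) matrix, so 0
```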
From c721e278f6b6c9e24455ecf5e9ab7f1558390ada Mon Sep 17 00:00:00 2001
From: martinroyer-datashape
Date: Wed, 10 Jul 2019 14:44:10 +0200
Subject: [PATCH 4/9] minimal, basic .fit(X) sklearn framework

---
 pecok/clustering.py | 106 +++++++-------------------------------------
 tests/test.py       |  39 +++++++---------
 2 files changed, 31 insertions(+), 114 deletions(-)

diff --git a/pecok/clustering.py b/pecok/clustering.py
index b6ca2d7..d14d036 100644
--- a/pecok/clustering.py
+++ b/pecok/clustering.py
@@ -14,9 +14,6 @@ from .admm import pecok_admm
 
 
-###############################################################################
-# shared correction
-
 def _corrected_relational(obs, corr):
     return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
@@ -59,30 +56,18 @@ class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
 
     Attributes
     ----------
-    corrected_points_: array, [n_points, n_features]
-        Points representation in Z-space
-    cluster_centers_ : array, [n_clusters, n_features]
-        Coordinates of cluster centers in Z-space
     labels_ :
         Labels of each point
-    inertia_ : float
-        Sum of squared distances (in Z-space) of samples to their closest cluster center.
 
     Examples
     --------
-    TODO
-    >>> from sklearn.cluster import KMeans
-    >>> import numpy as np
-    >>> X = np.array([[1, 2], [1, 4], [1, 0],
-    ...               [4, 2], [4, 4], [4, 0]])
-    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
-    >>> kmeans.labels_
-    array([0, 0, 0, 1, 1, 1], dtype=int32)
-    >>> kmeans.predict([[0, 0], [4, 4]])
-    array([0, 1], dtype=int32)
-    >>> kmeans.cluster_centers_
-    array([[1., 2.],
-           [4., 2.]])
+    >>> from pecok import Pecok
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> pecok = Pecok(n_clusters=2).fit(X)
+    >>> pecok.labels_
+    array([1, 1, 1, 0, 0, 0], dtype=int32)
     """
 
     def __init__(self, n_clusters=8, corr=0, init='notyet',
@@ -98,8 +83,6 @@ def fit(self, X, y=None, sample_weight=None):
             are assigned equal weight (default: None)
         """
 
-        # random_state = check_random_state(self.random_state)
-
         hc_ = \
             _pecok_clustering(
                 X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
@@ -127,28 +110,6 @@ def fit(self, X, y=None, sample_weight=None):
         self.labels_ = hc_.labels_
         return self
 
-    # def predict(self, X, sample_weight=None):
-    #     """Predict the closest cluster each sample in X belongs to.
-    #     In the vector quantization literature, `cluster_centers_` is called
-    #     the code book and each value returned by `predict` is the index of
-    #     the closest code in the code book.
-    #     Parameters
-    #     ----------
-    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
-    #         New data to predict.
-    #     sample_weight : array-like, shape (n_samples,), optional
-    #         The weights for each observation in X. If None, all observations
-    #         are assigned equal weight (default: None)
-    #     Returns
-    #     -------
-    #     labels : array, shape [n_samples,]
-    #         Index of the cluster each sample belongs to.
-    #     """
-    #     check_is_fitted(self, 'cluster_centers_')
-    #
-    #     X = self._check_test_data(X)
-    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
-
 
 class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
     """K-MeanZ clustering
@@ -176,28 +137,18 @@ class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
 
     Attributes
     ----------
     corrected_points_: array, [n_points, n_features]
         Points representation in Z-space
-    cluster_centers_ : array, [n_clusters, n_features]
-        Coordinates of cluster centers in Z-space
     labels_ :
         Labels of each point
-    inertia_ : float
-        Sum of squared distances (in Z-space) of samples to their closest cluster center.
 
     Examples
     --------
-    TODO
-    >>> from sklearn.cluster import KMeans
-    >>> import numpy as np
-    >>> X = np.array([[1, 2], [1, 4], [1, 0],
-    ...               [4, 2], [4, 4], [4, 0]])
-    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
-    >>> kmeans.labels_
-    array([0, 0, 0, 1, 1, 1], dtype=int32)
-    >>> kmeans.predict([[0, 0], [4, 4]])
-    array([0, 1], dtype=int32)
-    >>> kmeans.cluster_centers_
-    array([[1., 2.],
-           [4., 2.]])
+    >>> from pecok import KMeanz
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> kmeanz = KMeanz(n_clusters=2).fit(X)
+    >>> kmeanz.labels_
+    array([1, 1, 1, 0, 0, 0], dtype=int32)
     """
 
     def __init__(self, n_clusters=8, corr=0, init='notyet',
@@ -215,8 +166,6 @@ def fit(self, X, y=None, sample_weight=None):
             are assigned equal weight (default: None)
         """
 
-        # random_state = check_random_state(self.random_state)
-
         self.corrected_points_, hc_ = \
             _kmeanz(
                 X, n_clusters=self.n_clusters, corr=self.corr)
@@ -244,26 +190,3 @@ def fit(self, X, y=None, sample_weight=None):
         self.labels_ = hc_.labels_
         return self
 
-    # def predict(self, X, sample_weight=None):
-    #     """Predict the closest cluster each sample in X belongs to.
-    #     In the vector quantization literature, `cluster_centers_` is called
-    #     the code book and each value returned by `predict` is the index of
-    #     the closest code in the code book.
-    #     Parameters
-    #     ----------
-    #     X : {array-like, sparse matrix}, shape = [n_samples, n_features]
-    #         New data to predict.
-    #     sample_weight : array-like, shape (n_samples,), optional
-    #         The weights for each observation in X. If None, all observations
-    #         are assigned equal weight (default: None)
-    #     Returns
-    #     -------
-    #     labels : array, shape [n_samples,]
-    #         Index of the cluster each sample belongs to.
-    #     """
-    #     check_is_fitted(self, 'cluster_centers_')
-    #
-    #     X = self._check_test_data(X)
-    #     return self._labels_inertia_minibatch(X, sample_weight)[0]
-
diff --git a/tests/test.py b/tests/test.py
index 6d39ca1..2afe9ad 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -3,22 +3,12 @@
 # author: Martin Royer
 # License: MIT
 
-from functools import wraps
+from functools import partial
 import timeit
 
 import numpy as np
 
 from pecok import KMeanz, Pecok
-from sklearn.cluster import KMeans
-
-
-def _timethis(f):
-    @wraps(f)
-    def wrap(*args, **kw):
-        ts = timeit.default_timer()
-        result = f(*args, **kw)
-        te = timeit.default_timer()
-        return result, te-ts
-    return wrap
+from sklearn.cluster import KMeans, AgglomerativeClustering
 
 
 seed = 432
@@ -26,12 +16,10 @@ np.random.seed(seed)
 print("seed is %i" % seed)
 
 methods = [
-    [lambda X, K: KMeans(n_clusters=K, init='k-means++', n_init=100).fit(X).labels_, 'kmeans++'],
-    [lambda X, K: KMeanz(n_clusters=K, corr=2).fit(X).labels_, 'KMeanz'],
-    [lambda X, K: Pecok(n_clusters=K, corr=2).fit(X).labels_, 'Pecok'],
-    # [hdclassif, 'HDDC'],
-#    [pecok_clustering, 'pecok'],
-#    [kmeanz_clustering, 'kmeanz']
+    [partial(KMeans, init="k-means++", n_init=100), "k-means++"],
+    [partial(AgglomerativeClustering, linkage='ward'), "Hierarchical"],
+    [partial(KMeanz, corr=2), "KMeanz"],
+    [partial(Pecok, corr=2), "Pecok"],
 ]
 
 print("\nVAR CLUSTERING\n\n")
@@ -39,9 +27,9 @@
 n_obs = 100
 
 truth = np.asmatrix(np.concatenate((np.repeat(0, n_var//2), np.repeat(1, n_var//2))))
-membership = truth.T.dot(np.matrix([1, 0])) + (1-truth).T.dot(np.matrix([0, 1]))
+membership = truth.T.dot(np.array([1, 0])[np.newaxis,:]) + (1-truth).T.dot(np.array([0, 1])[np.newaxis,:])
 stds = np.ones(n_var)
 stds[:(n_var//2)] = 0.1
 sigma = membership.dot(0.1*np.identity(2)).dot(membership.T) + np.diag(stds)
@@ -47,9 +35,11 @@
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = _timethis(method)(mat_data.T, 2)
+    ts = timeit.default_timer()
+    job_result = method(n_clusters=2).fit(mat_data.T).labels_
+    te = timeit.default_timer()
     print(method_name.ljust(15), job_result)
-    print("job_time: %.2f (s)".ljust(15) % job_time)
+    print("job_time: %.2f (s)".ljust(15) % (te-ts))
@@ -66,8 +54,9 @@
 
 print("\nPOINT CLUSTERING\n\n")
 
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = _timethis(method)(X, 2)
+    ts = timeit.default_timer()
+    job_result = method(n_clusters=2).fit(X).labels_
+    te = timeit.default_timer()
     print(method_name.ljust(15), job_result)
-    print("job_time: %.2f (s)".ljust(15) % job_time)
-# print("pecok:".ljust(15), pecok_clustering(X, n_struct=2, rho=100, n_iter_max=3000, verbose=True).labels_)
+    print("job_time: %.2f (s)".ljust(15) % (te-ts))
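With a minimal `fit` in place and `ClusterMixin` among the bases, the estimators also gain `fit_predict` for free, and `KMeanz` exposes its Z-space embedding through `corrected_points_`. A sketch on illustrative data (the embedding is square because it lives in sample space, one row per observation):

```python
import numpy as np
from pecok import KMeanz

rng = np.random.RandomState(1)
# ten 30-dimensional points in two shifted groups
X = np.repeat([[0.], [2.]], 5, axis=0) + rng.normal(scale=.1, size=(10, 30))

est = KMeanz(n_clusters=2, corr=4)
labels = est.fit_predict(X)          # ClusterMixin: calls fit(X), returns labels_
print(labels)
print(est.corrected_points_.shape)   # (10, 10)
```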
From 947b862fe6542297522e3709bc8dedb7b18b7dd3 Mon Sep 17 00:00:00 2001
From: martinroyer-datashape
Date: Wed, 10 Jul 2019 14:53:25 +0200
Subject: [PATCH 5/9] fix test & demos

---
 tests/demo.py                     | 11 ++++++-----
 tests/illustration_comparaison.py | 13 +++++--------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/tests/demo.py b/tests/demo.py
index 1ce540f..8ccdc52 100644
--- a/tests/demo.py
+++ b/tests/demo.py
@@ -1,5 +1,6 @@
 import numpy as np
-np.seed(232123123)
+
+np.random.seed(232123123)
 n = 10
 p = 100
 X = np.zeros((n, p))
@@ -8,7 +9,7 @@
 X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
 
 from sklearn.cluster import KMeans
-KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_
-from pecok import PeCoK
-PeCoK(n_clusters=2, corr=0).fit(X).labels_
-PeCoK(n_clusters=2, corr=4).fit(X).labels_
+print(KMeans(n_clusters=2, init='k-means++', n_init=100).fit(X).labels_)
+from pecok import Pecok
+print(Pecok(n_clusters=2, corr=0).fit(X).labels_)
+print(Pecok(n_clusters=2, corr=4).fit(X).labels_)
diff --git a/tests/illustration_comparaison.py b/tests/illustration_comparaison.py
index 04318c3..57f4a0a 100644
--- a/tests/illustration_comparaison.py
+++ b/tests/illustration_comparaison.py
@@ -1,5 +1,3 @@
-print(__doc__)
-
 import time
 import warnings
 
@@ -10,8 +8,7 @@
 from sklearn.preprocessing import StandardScaler
 from itertools import cycle, islice
 
-from pecok import KMeanZ
-
+from pecok import KMeanz
 
 np.random.seed(0)
 
@@ -27,8 +24,8 @@
 # aniso = (X_aniso, y)
 
 # blobs with varied variances
-varied = datasets.make_blobs(n_samples=n_samples, centers=np.matrix([[0,0],[1,0]]),
-                             cluster_std=[.1, .5],
+varied = datasets.make_blobs(n_samples=n_samples, centers=np.array([[0,0],[1,0]]),
+                             cluster_std=[.05, .5],
                              random_state=random_state)
 
 # ============
@@ -65,7 +62,7 @@
     spectral = cluster.SpectralClustering(
         n_clusters=params['n_clusters'], eigen_solver='arpack',
         affinity="nearest_neighbors")
-    kmeanz = KMeanZ(
+    kmeanz = KMeanz(
         n_clusters=params['n_clusters'], corr=2
     )
    gmm = mixture.GaussianMixture(
@@ -74,7 +71,7 @@
     clustering_algorithms = (
         ('MiniBatchKMeans', two_means),
         ('SpectralClustering', spectral),
-        ('KMeanZ', kmeanz)
+        ('KMeanz', kmeanz)
         # ('GaussianMixture', gmm)
     )
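With the class name repaired, the comparison script's varied-variance setting can also be exercised without matplotlib; a sketch with the same generator settings, a smaller sample to keep the n-by-n SVD cheap, and a label-swap-invariant agreement score:

```python
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from pecok import KMeanz

X, y = datasets.make_blobs(n_samples=100, centers=np.array([[0, 0], [1, 0]]),
                           cluster_std=[.05, .5], random_state=170)
X = StandardScaler().fit_transform(X)

labels = KMeanz(n_clusters=2, corr=2).fit(X).labels_
print(max(np.mean(labels == y), np.mean(labels != y)))  # agreement up to label swap
```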
From 0051724fe2c02f18cadbdfb355683ad9f5e47d91 Mon Sep 17 00:00:00 2001
From: martinroyer <16647869+martinroyer@users.noreply.github.com>
Date: Wed, 10 Jul 2019 15:05:00 +0200
Subject: [PATCH 6/9] minimal examples in readme

---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 1b634d2..d757c86 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,24 @@ You can find it on PyPI and install it with:
 pip install pecok
 ```
 
+# Minimal example
+
+```python
+
+import numpy as np
+np.random.seed(232123123)
+n = 10
+p = 100
+X = np.zeros((n, p))
+snr = 0.3
+X[n//2:, :] = -np.ones(p) * snr + np.random.normal(scale=.1, size=(n//2,p))
+X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
+
+from pecok import Pecok, KMeanz
+Pecok(n_clusters=2, corr=4).fit(X).labels_
+KMeans(n_clusters=2, corr=2).fit(X).labels_
+```
+
 
 ### Clustering algorithms

From 53b268ca69b86dcd4a512c1c3cd37cd549c35207 Mon Sep 17 00:00:00 2001
From: martinroyer <16647869+martinroyer@users.noreply.github.com>
Date: Wed, 10 Jul 2019 15:11:59 +0200
Subject: [PATCH 7/9] improved readme

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d757c86..9c64cef 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 
 # Description
 
-pecok is a python package containing a few clustering algorithms (unsupervised learning) for a given number of clusters. It is based on the PECOK paper (https://arxiv.org/abs/1606.05100)
+pecok is a python package containing a few clustering algorithms (unsupervised learning) for a given number of clusters. It is based on the PECOK paper https://arxiv.org/abs/1606.05100 for variable clustering that later got extended here: https://arxiv.org/abs/1508.01939 (to appear in Annals of Statistics). See also https://papers.nips.cc/paper/6776-adaptive-clustering-through-semidefinite-programming.
 
 # Installing pecok
 
@@ -35,15 +35,15 @@ X[:n//2, :] = np.ones(p) * snr + np.random.normal(scale=1, size=(n//2,p))
 
 from pecok import Pecok, KMeanz
 Pecok(n_clusters=2, corr=4).fit(X).labels_
-KMeans(n_clusters=2, corr=2).fit(X).labels_
+KMeanz(n_clusters=2, corr=2).fit(X).labels_
 ```
 
 
-### Clustering algorithms
+## Clustering algorithms
 
-Currently available algorithms:
+Currently available algorithms (with sklearn object framework implementing `fit` routine) are:
 
- * **pecok_clustering**: the main clustering algorithm described in [Bunea, Giraud, Royer, Verzelen ('17)]
+ * **Pecok**: the main clustering algorithm described in [Bunea, Giraud, Royer, Verzelen ('17)]
 
   Parameters:
@@ -56,7 +56,7 @@ Currently available algorithms:
  |**n_iter_max** = -1| if positive, sets the stop condition for maximum number of iteration of the ADMM algorithm used to approximate the solution of the SDP.|
  |**verbose** = False| yields print for time and residuals value at ADMM stop time.|
 
- * **kmeanz_clustering**: efficient variant for main clustering algorithm, see my PhD thesis
+ * **KMeanz**: efficient variant for main clustering algorithm, introduced in my PhD thesis: http://www.theses.fr/2018SACLS442
 
   Parameters:

From f53e16f6d5ed2d53228da8d00d877aa894dbeb58 Mon Sep 17 00:00:00 2001
From: martinroyer
Date: Wed, 10 Jul 2019 15:44:29 +0200
Subject: [PATCH 8/9] version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c984f0b..5c86af3 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
     install_requires=[
         'numpy', 'scipy', 'scikit-learn'
     ],
-    version = '1.0.2',
+    version = '1.0.3',
     description = 'Implementation of Pecok',
     author='Martin Royer',
     author_email='martinpierreroyer@gmail.com',
From ba83845b8d566fd98fb9a5dc2bfc088aaeae54f3 Mon Sep 17 00:00:00 2001
From: martinroyer <16647869+martinroyer@users.noreply.github.com>
Date: Mon, 29 Jul 2019 08:55:55 +0200
Subject: [PATCH 9/9] Update README.md

update readme
---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 9c64cef..0ebc8ba 100644
--- a/README.md
+++ b/README.md
@@ -49,9 +49,9 @@ Currently available algorithms (with sklearn object framework implementing `fit`
 
  | **name** | **description** |
  | --- | --- |
- |**obs** | data matrix (n \times p), clustering is applied to the lines of **obs**.|
- |**n_struct** | number of structures to separate the data into.|
- |**int_corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxy for the correction, we only recommend 2 and 3. |
+ |**X** | data matrix (n \times p), clustering is applied to the lines of **X**.|
+ |**n_clusters** | number of clusters to separate the data into.|
+ |**corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxies for the correction; we only recommend 2 and 3. |
  |**rho** = 5| bias-variance tradeoff parameter in ADMM.|
  |**n_iter_max** = -1| if positive, sets the stop condition for maximum number of iteration of the ADMM algorithm used to approximate the solution of the SDP.|
  |**verbose** = False| yields print for time and residuals value at ADMM stop time.|
@@ -62,7 +62,7 @@ Currently available algorithms (with sklearn object framework implementing `fit`
 
  | **name** | **description** |
  | --- | --- |
- |**obs** | data matrix (n \times p), clustering is applied to the lines of **obs**.|
- |**n_struct** | number of structures to separate the data into.|
- |**int_corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxy for the correction, we only recommend 2 and 3. |
+ |**X** | data matrix (n \times p), clustering is applied to the lines of **X**.|
+ |**n_clusters** | number of clusters to separate the data into.|
+ |**corr** = 4| correction to be used, between 0 and 4. 0 means no correction, 4 is the correction from [Bunea, Giraud, Royer, Verzelen ('17)]. 1, 2 and 3 are more efficient proxies for the correction; we only recommend 2 and 3. |
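One caveat on the tables above: the `rho`, `n_iter_max` and `verbose` rows describe knobs of the underlying ADMM solver (`pecok_admm`), but after patch 4 the public `Pecok` class forwards only `n_clusters`, `corr` and `verbose`. Tuning the other two currently means calling the private helper, sketched here under the assumption that using this underscore-prefixed, unstable interface is acceptable:

```python
import numpy as np
from pecok.clustering import _pecok_clustering  # private helper, may change

rng = np.random.RandomState(7)
X = rng.normal(size=(8, 50))
X[:4] += 1.5  # shift half the rows to create two clusters

# corr is consumed here; rho and n_iter_max reach pecok_admm through **kwargs
hc = _pecok_clustering(X, n_clusters=2, corr=2, rho=5, n_iter_max=1000, verbose=True)
print(hc.labels_)
```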