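Quick refactor: mark internal helpers as private with a leading underscore, drop the `la` alias for scipy.linalg, and remove the pyper-based hdclassif test helper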

martinroyer · martinroyer · commit fa18d214e997 · 2019-07-10T12:39:56.000+02:00
diff --git a/pecok/admm.py b/pecok/admm.py
@@ -4,46 +4,46 @@
 # License: MIT
 
 import numpy as np
-from scipy import linalg as la
+from scipy import linalg
 
 
-def operator_lstarllstarinv_sym(u, v):
+def _operator_lstarllstarinv_sym(u, v):
     """Operator \widetildetilde{L}^*_{sym} on (u,v) in R^{p+1} -> R^{p*p}"""
     temp = u.repeat(u.size).reshape((u.size, u.size))
     return (temp + temp.T)/2 + np.diag(np.repeat(v, u.size))
 
 
-def proj_lin_Hsymmetric(Y, n_struct):
+def _proj_lin_Hsymmetric(Y, n_struct):
     """Projection onto \Pi_{\mathcal{A}sym}(Y)"""
     n_samples,_ = Y.shape
     x = np.sum(Y, 1) - 1
     y = np.trace(Y) - n_struct
     invx = (x-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
     invy = (y-(np.sum(x)+y)/(2*n_samples))/(n_samples-1)
-    Y = Y - operator_lstarllstarinv_sym(invx, invy)
+    Y = Y - _operator_lstarllstarinv_sym(invx, invy)
     return Y
 
 
-def proj_positive(x, thresh=0):
+def _proj_positive(x, thresh=0):
     """Project onto component-positive matrix"""
     x[x < thresh] = 0
     return x
 
 
-def proj_Snp_imp(Y):
+def _proj_Snp_imp(Y):
     """Improved projection onto semi-definite positive matrix"""
     n_samples,_ = Y.shape
-    eig_vals = la.eigh(Y, eigvals_only=True)
+    eig_vals = linalg.eigh(Y, eigvals_only=True)
     n_val_neg = np.sum(eig_vals<0)
     if n_val_neg == 0:
         return Y
     if n_val_neg == n_samples:
         return np.zeros((n_samples,n_samples))
     if n_val_neg < n_samples-n_val_neg:
-        eig_vals, v = la.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(-Y, eigvals=(n_samples - n_val_neg, n_samples - 1))
         Y = Y + v.dot(np.diag(eig_vals)).dot(v.T)
     else:
-        eig_vals, v = la.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
+        eig_vals, v = linalg.eigh(Y, eigvals=(n_val_neg, n_samples - 1))
         Y = v.dot(np.diag(eig_vals)).dot(v.T)
     return Y
 
@@ -71,9 +71,9 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None,
         n_iter = n_iter + 1
 
         oldXbar = Xbar
-        X = proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
-        Y = proj_positive(Xbar - V)
-        Z = proj_Snp_imp(Xbar - W)
+        X = _proj_lin_Hsymmetric(Xbar - U + relational_data / rho, n_clusters)
+        Y = _proj_positive(Xbar - V)
+        Z = _proj_Snp_imp(Xbar - W)
         Xbar = (X + Y + Z)/3
 
         U = U + X - Xbar
@@ -82,17 +82,17 @@ def pecok_admm(relational_data, n_clusters, n_iter_max=-1, rho=5, mat_init=None,
 
         res_dual = rho * np.linalg.norm(Xbar-oldXbar)
         res_primal = np.linalg.norm((X-Xbar, Z-Xbar, Y-Xbar))
-        if not (is_primal_high(eps_residual, res_primal, X, Y, Z) or is_dual_high(eps_residual, res_dual, Y, Z)):
+        if not (_is_primal_high(eps_residual, res_primal, X, Y, Z) or _is_dual_high(eps_residual, res_dual, Y, Z)):
             break
     if verbose:
         print("ADMM ends -- n_iter=%i, rho=%2.2f" % (n_iter, rho))
         print("          -- res_primal=%.3e, res_dual=%.3e" % (res_primal, res_dual))
     return Z
 
 
-def is_primal_high(eps_residual, res_primal, X, Y, Z):
+def _is_primal_high(eps_residual, res_primal, X, Y, Z):
     return res_primal > eps_residual * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))
 
 
-def is_dual_high(eps_residual, res_dual, Y, Z):
+def _is_dual_high(eps_residual, res_dual, Y, Z):
     return res_dual > eps_residual * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))
diff --git a/pecok/clustering.py b/pecok/clustering.py
@@ -1,6 +1,6 @@
 """PECOK clustering"""
 
-# author: Martin Royer <martin.royer@m4x.org>
+# author: Martin Royer <martin.royer@math.u-psud.fr>
 # License: MIT
 
 import numpy as np
@@ -21,14 +21,14 @@ def _corrected_relational(obs, corr):
     return (obs.dot(obs.T) - gamma_hat(obs, corr=corr)) / obs.shape[1]
 
 
-def kmeanz(X, n_clusters, corr):
+def _kmeanz(X, n_clusters, corr):
     gram_corrected = _corrected_relational(X, corr=corr)
     U, s, _ = lin_svd(gram_corrected, compute_uv=True)
     approx = U.dot(np.diag(np.sqrt(s)))
     return approx, AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit(approx)
 
 
-def pecok_clustering(obs, n_clusters, corr=4, **kwargs):
+def _pecok_clustering(obs, n_clusters, corr=4, **kwargs):
     gram_corrected = _corrected_relational(obs, corr=corr)
     U, _, V = spa_svd(gram_corrected, k=n_clusters)
     Bhat = pecok_admm(gram_corrected, n_clusters=n_clusters, mat_init=U.dot(V), **kwargs)
@@ -37,7 +37,7 @@ def pecok_clustering(obs, n_clusters, corr=4, **kwargs):
 
 class Pecok(BaseEstimator, ClusterMixin, TransformerMixin):
     """PeCoK clustering
-        Read more in [my thesis]
+        Read more in [my thesis: http://www.theses.fr/2018SACLS442]
         Parameters
         ----------
         n_clusters : int, optional, default: 8
@@ -116,7 +116,7 @@ def fit(self, X, y=None, sample_weight=None):
         # random_state = check_random_state(self.random_state)
 
         hc_ = \
-            pecok_clustering(
+            _pecok_clustering(
                 X, n_clusters=self.n_clusters, corr=self.corr, verbose=self.verbose)
                 # init=self.init, n_init=self.n_init,
                 # max_iter=self.max_iter, verbose=self.verbose,
@@ -152,7 +152,7 @@ def fit(self, X, y=None, sample_weight=None):
 
 class KMeanz(BaseEstimator, ClusterMixin, TransformerMixin):
     """K-MeanZ clustering
-        Read more in [my thesis]
+        Read more in [my thesis: http://www.theses.fr/2018SACLS442]
         Parameters
         ----------
         n_clusters : int, optional, default: 8
@@ -231,7 +231,7 @@ def fit(self, X, y=None, sample_weight=None):
         # random_state = check_random_state(self.random_state)
 
         self.corrected_points_, hc_ = \
-            kmeanz(
+            _kmeanz(
                 X, n_clusters=self.n_clusters, corr=self.corr)
                 # init=self.init, n_init=self.n_init,
                 # max_iter=self.max_iter, verbose=self.verbose,
diff --git a/pecok/gamma.py b/pecok/gamma.py
@@ -8,7 +8,7 @@
 
 
 
-def gamma_hat2(X):
+def _gamma_hat2(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -21,7 +21,7 @@ def gamma_hat2(X):
     return gamma
 
 
-def gamma_hat2_robust(X):
+def _gamma_hat2_robust(X):
     n_samples,_ = X.shape
     X2 = X / (np.linalg.norm(X, axis=1, keepdims=True)+1e-8)
     XaX2 = X.dot(X2.T)
@@ -34,7 +34,7 @@ def gamma_hat2_robust(X):
     return gamma
 
 
-def gamma_hat3(X):
+def _gamma_hat3(X):
     """Gamma_hat3 estimator from PECOK supplement, in O(n_samples^3 * n_features)
 
     Parameters
@@ -53,7 +53,7 @@ def gamma_hat3(X):
     return gamma
 
 
-def gamma_hat4(X):
+def _gamma_hat4(X):
     """Gamma_hat4 estimator from PECOK, in O(n_samples^4 * n_features)
 
     Parameters
@@ -81,21 +81,21 @@ def gamma_hat4(X):
     return np.asarray(gamma)
 
 
-def no_correction(X):
+def _no_correction(X):
     return np.zeros(X.shape[0])
 
 
-def cross_diag(X):
+def _cross_diag(X):
     return np.diag(X.dot(X.T))
 
 
 def gamma_hat(X, corr):
     ghat = {
-        0: no_correction,
-        1: gamma_hat2_robust,
-        2: gamma_hat2,
-        3: gamma_hat3,
-        4: gamma_hat4,
-        8: cross_diag,
-    }.get(corr, no_correction)(X)
+        0: _no_correction,
+        1: _gamma_hat2_robust,
+        2: _gamma_hat2,
+        3: _gamma_hat3,
+        4: _gamma_hat4,
+        8: _cross_diag,
+    }.get(corr, _no_correction)(X)
     return np.diag(ghat)
diff --git a/tests/test.py b/tests/test.py
@@ -6,13 +6,12 @@
 from functools import wraps
 import timeit
 
-# import pyper
 import numpy as np
 from pecok import KMeanz, Pecok
 from sklearn.cluster import KMeans
 
 
-def timethis(f):
+def _timethis(f):
     @wraps(f)
     def wrap(*args, **kw):
         ts = timeit.default_timer()
@@ -22,20 +21,6 @@ def wrap(*args, **kw):
     return wrap
 
 
-def hdclassif(obs, n_struct):
-    n,_ = obs.shape
-    myR = pyper.R()
-    myR.run('library(HDclassif)')
-    myR.assign('obs', obs)
-    myR.assign('n_struct', n_struct)
-    try:
-        myR.run('res <- hddc(obs, K=n_struct, model=c(1,2,7,9))')
-        result = np.array(myR['res$class'])
-    except:
-        print("fail hdclassif")
-        result = np.zeros(n)
-    return result
-
 seed = 432
 np.random.seed(seed)
 print("seed is %i" % seed)
@@ -62,7 +47,7 @@ def hdclassif(obs, n_struct):
 
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(mat_data.T, 2)
+    job_result, job_time = _timethis(method)(mat_data.T, 2)
     print(method_name.ljust(15), job_result)
     print("job_time: %.2f (s)".ljust(15) % job_time)
 
@@ -79,7 +64,7 @@ def hdclassif(obs, n_struct):
 
 print("truth:".ljust(15), truth)
 for method, method_name in methods:
-    job_result, job_time = timethis(method)(X, 2)
+    job_result, job_time = _timethis(method)(X, 2)
     print(method_name.ljust(15), job_result)
     print("job_time: %.2f (s)".ljust(15) % job_time)
 # print("pecok:".ljust(15), pecok_clustering(X, n_struct=2, rho=100, n_iter_max=3000, verbose=True).labels_)