From cab9ce751f860322b4d34d2254967737187ace7d Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 13 Feb 2020 15:28:39 +0100 Subject: [PATCH 01/70] scml first commit --- metric_learn/constraints.py | 55 +++++++ metric_learn/scml.py | 319 ++++++++++++++++++++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 metric_learn/scml.py diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py index 752ca6e0..a85d551a 100644 --- a/metric_learn/constraints.py +++ b/metric_learn/constraints.py @@ -6,6 +6,8 @@ import warnings from six.moves import xrange from sklearn.utils import check_random_state +from sklearn.neighbors import NearestNeighbors +from numpy.matlib import repmat __all__ = ['Constraints'] @@ -100,3 +102,56 @@ def wrap_pairs(X, constraints): y = np.concatenate([np.ones_like(a), -np.ones_like(c)]) pairs = X[constraints] return pairs, y + + +def _comb(A, B, C, sizeB, sizeC): + # generate an array will all combinations of choosing + # an element from A, B and C + return np.vstack((repmat(A, sizeB*sizeC, 1).ravel(order='F'), + repmat(np.hstack(B), sizeC, 1).ravel(order='F'), + repmat(C, 1, sizeB).ravel())).T + + +def generate_knntriplets(X, y, k_genuine, k_impostor): + + labels = np.unique(y) + L = len(labels) + len_input = np.size(y, 0) + triplets = np.empty((len_input*k_genuine*k_impostor, 3), dtype=np.intp) + + start = 0 + finish = 0 + neigh = NearestNeighbors() + + for i in range(L): + + # generate mask for current label + gen_mask = y == labels[i] + gen_indx = np.where(gen_mask) + + # get k_genuine genuine neighbours + neigh.fit(X=X[gen_indx]) + gen_neigh = np.take(gen_indx, neigh.kneighbors(n_neighbors=k_genuine, + return_distance=False)) + + # generate mask for impostors of current label + imp_indx = np.where(np.invert(gen_mask)) + + # get k_impostor impostor neighbours + neigh.fit(X=X[imp_indx]) + imp_neigh = np.take(imp_indx, neigh.kneighbors( + n_neighbors=k_impostor, + X=X[gen_mask], + return_distance=False)) + + # lenght = len_label*k_genuine*k_impostor + finish += np.sum(gen_mask)*k_genuine*k_impostor + + triplets[start:finish, :] = _comb(gen_indx, gen_neigh, + imp_neigh, k_genuine, + k_impostor) + start = finish + + # TODO: deal with too litle elements for k neighbors to be yielded + + return triplets diff --git a/metric_learn/scml.py b/metric_learn/scml.py new file mode 100644 index 00000000..ebdacd4a --- /dev/null +++ b/metric_learn/scml.py @@ -0,0 +1,319 @@ +""" +Sparse Compositional Metric Learning (SCML) +""" + +from __future__ import print_function, absolute_import, division +import numpy as np +from .base_metric import _TripletsClassifierMixin, MahalanobisMixin +from sklearn.base import TransformerMixin +from .constraints import generate_knntriplets +from sklearn.preprocessing import normalize +from sklearn.neighbors import NearestNeighbors +from sklearn.cluster import KMeans +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis + + +class _BaseSCML_global(MahalanobisMixin): + + _tuple_size = 3 # constraints are triplets + + def __init__(self, beta=1e-5, B=None, + max_iter=100000, verbose=False, + preprocessor=None, random_state=None): + self.beta = beta + self.max_iter = max_iter + self.verbose = verbose + self.preprocessor = preprocessor + self.random_state = random_state + super(_BaseSCML_global, self).__init__(preprocessor) + + def _fit(self, triplets, B, n_basis): + + # TODO: manage B + # if B is None + # error + # if B is array + # pass + # if option + # do something + + triplets = self._prepare_inputs(triplets, 
type_of_inputs='tuples') + + triplets, X = self._to_index_points(triplets) + + # TODO: should be given access to gamma? + gamma = 5e-3 + dist_diff = self._compute_dist_diff(triplets, X, B) + + n_basis = B.shape[0] + sizeT = triplets.shape[0] + + w = np.zeros((1, n_basis)) + avg_grad_w = np.zeros((1, n_basis)) + + output_iter = 5000 # output every output_iter iterations + + best_w = np.empty((1, n_basis)) + obj = np.empty((self.max_iter, 1)) + nImp = np.empty((self.max_iter, 1), dtype=int) + + best_obj = np.inf + + for iter in range(self.max_iter): + if (iter % output_iter == 0): + + obj1 = np.sum(w)*self.beta + + obj2 = 0.0 + count = 0 + + for i in range(sizeT): + slack_val = 1 + dist_diff[i, :].dot(w.T) + + if (slack_val > 0): + count += 1 + obj2 += slack_val + + obj2 = obj2/sizeT + + obj[iter] = obj1 + obj2 + nImp[iter] = count + + if(self.verbose): + print("[Global] iter %d\t obj %.6f\t num_imp %d" % (iter, + obj[iter], nImp[iter])) + + # update the best + if (obj[iter] < best_obj): + best_obj = obj[iter] + best_w = w + + idx = np.random.randint(low=0, high=sizeT) + + slack_val = 1 + dist_diff[idx, :].dot(w.T) + + if (slack_val > 0): + avg_grad_w = (iter * avg_grad_w + dist_diff[idx, :]) / (iter+1) + else: + avg_grad_w = iter * avg_grad_w / (iter+1) + + scale_f = -np.sqrt(iter+1) / gamma + + # TODO: maybe there is a better way to do this? + w.fill(0) + pos_mask = avg_grad_w > self.beta + w[pos_mask] = scale_f * (avg_grad_w[pos_mask] + self.beta) + neg_mask = avg_grad_w < - self.beta + w[neg_mask] = scale_f * (avg_grad_w[neg_mask] - self.beta) + + w[w < 0] = 0 + + if(self.verbose): + print("max iteration reached.") + + self.components_ = self._get_components(best_w, B) + + return self + +# should this go to utils? + def _compute_dist_diff(self, T, X, B): + XB = np.matmul(X, B.T) + T = np.vstack(T) + lenT = len(T) + # all positive and negative pairs with lowest index first + # np.array (2*lenT,2) + T_pairs_sorted = np.sort(np.vstack((T[:, [0, 1]], T[:, [0, 2]])), + kind='stable') + # calculate all unique pairs + uniqPairs, indeces = np.unique(T_pairs_sorted, return_inverse=True, + axis=0) + # calculate L2 distance acording to bases only for unique pairs + dist = np.square(XB[uniqPairs[:, 0], :]-XB[uniqPairs[:, 1], :]) + + # return the diference of distances between all positive and negative + # pairs + return dist[indeces[:lenT]]-dist[indeces[lenT:]] + + def _get_components(self, w, B): + """ + get components matrix (L) from computed mahalanobis matrix + """ + + # get rid of inactive bases + active_idx = w > 0 + w = w[active_idx] + B = B[np.squeeze(active_idx), :] + + K, d = B.shape + + if(K < d): # if metric is low-rank + return B*np.sqrt(w)[..., None] + + else: # if metric is full rank + return np.dot(B.T * np.sqrt(w), B) + + def _to_index_points(self, triplets): + shape = triplets.shape + X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) + triplets = triplets.reshape(shape[:2]) + return triplets, X + + +class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): + + def fit(self, triplets, B, n_basis): + """Learn the SCML model. + + Parameters + ---------- + triplets : array-like, shape=(n_constraints, 3, n_features) or \ + (n_constraints, 3) + 3D array-like of triplets of points or 2D array of triplets of + indicators. In order to supervise the algorithm in the right way, we + should have the three samples ordered in a way such that: + d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 1], triplets[i, 3]) + for all 0 <= i < n_constraints. 
+ B : (n_basis, n_features) array of floats that form the basis set from + which the metric will be constructed. + + Returns + ------- + self : object + Returns the instance. + """ + return _BaseSCML_global._fit(triplets, B, n_basis) + + +class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): + + def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, B=None, + n_basis=None, max_iter=100000, verbose=False, + preprocessor=None, random_state=None): + self.k_genuine = k_genuine + self.k_impostor = k_impostor + _BaseSCML_global.__init__(self, beta=beta, max_iter=max_iter, + verbose=verbose, preprocessor=preprocessor, + random_state=random_state) + + def fit(self, X, y, B, n_basis, random_state=None): + """Create constraints from labels and learn the SCML model. + + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + + y : (n) array-like + Data labels. + + B : string or (n_basis x d) array, through this the basis construction + can be selected or directly given by an array. + + Returns + ------- + self : object + Returns the instance. + """ + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) + self.preprocessor = X + + if(B == "LDA"): + B = self._generate_bases_LDA(X, y, n_basis, random_state) + # this should set super's B + + triplets = generate_knntriplets(X, y, self.k_genuine, + self.k_impostor) + + return self._fit(triplets, B, n_basis) + + def _generate_bases_LDA(self, X, y, n_basis, random_state=None): + + labels, class_count = np.unique(y, return_counts=True) + n_class = len(labels) + + # n_basis must be greater or equal to n_class + if(n_basis < n_class): + ValueError("number of basis should be greater than the number of " + "classes") + + dim = np.size(X, 1) + num_eig = min(n_class-1, dim) + n_clusters = int(np.ceil(n_basis/(2 * num_eig))) + + # TODO: maybe give acces to Kmeans jobs for faster computation? 
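+    # (editor's sketch for the TODO above, not part of the original patch:
+    # scikit-learn's KMeans of this era accepts an `n_jobs` parameter, so
+    # e.g. KMeans(n_clusters=n_clusters, n_jobs=-1) would parallelize the
+    # clustering; exposing that option here is only a suggestion.)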
+ kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, + algorithm='elkan').fit(X) + cX = kmeans.cluster_centers_ + + # TODO: find a better way to choose neighbourhood size + if dim > 50: + nK = 50 + else: + nK = 10 + + nK_class = [min(c, nK) for c in class_count] + + idx_set = np.zeros((n_clusters, sum(nK_class)), dtype=np.int) + + start = 0 + finish = 0 + neigh = NearestNeighbors() + + for c in range(n_class): + sel_c = y == labels[c] + nk = nK_class[c] + # get nK_class genuine neighbours + neigh.fit(X=X[sel_c]) + + finish += nk + idx_set[:, start:finish] = np.take(np.where(sel_c), + neigh.kneighbors(X=cX, + n_neighbors=nk, + return_distance=False)) + start = finish + + B = np.zeros((n_basis, dim)) + for i in range(n_clusters): + lda = LinearDiscriminantAnalysis() + lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) + B[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) + + nK = 20 + + nK_class = [min(c, nK) for c in class_count] + + idx_set = np.zeros((n_clusters, sum(nK_class)), dtype=np.int) + + start = 0 + finish = 0 + + for c in range(n_class): + sel_c = np.where(y == labels[c]) + nk = nK_class[c] + # get nK_class genuine neighbours + neigh.fit(X=X[sel_c]) + + finish += nk + idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, + n_neighbors=nk, + return_distance=False)) + start = finish + + finish = num_eig * n_clusters + + for i in range(n_clusters): + lda = LinearDiscriminantAnalysis() + lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) + start = finish + finish += num_eig + # TODO: maybe handle tail more elegantly by + # limiting lda n_components + if(start == n_basis): + pass + elif(finish <= n_basis): + B[start:finish, :] = normalize(lda.scalings_.T) + else: + B[start:, :] = normalize(lda.scalings_.T[:n_basis-start]) + break + + return B From 41e2cefd4e7545601affb905a748822d91c1862d Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 13 Feb 2020 18:01:47 +0100 Subject: [PATCH 02/70] add scml to __init__.py --- metric_learn/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index b036ccfa..5210c157 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -11,10 +11,12 @@ from .rca import RCA, RCA_Supervised from .mlkr import MLKR from .mmc import MMC, MMC_Supervised +from .scml import SCML_global, SCML_global_Supervised from ._version import __version__ __all__ = ['Constraints', 'Covariance', 'ITML', 'ITML_Supervised', 'LMNN', 'LSML', 'LSML_Supervised', 'SDML', 'SDML_Supervised', 'NCA', 'LFDA', 'RCA', 'RCA_Supervised', - 'MLKR', 'MMC', 'MMC_Supervised', '__version__'] + 'MLKR', 'MMC', 'MMC_Supervised', 'SCML_global', + 'SCML_global_Supervised', '__version__'] From 8ee9a87e65b6c4d18ed0fc19de67b7929aa8c9c3 Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 14 Feb 2020 15:26:00 +0100 Subject: [PATCH 03/70] fix in components calculation --- metric_learn/scml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index ebdacd4a..d636ec39 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -150,7 +150,7 @@ def _get_components(self, w, B): return B*np.sqrt(w)[..., None] else: # if metric is full rank - return np.dot(B.T * np.sqrt(w), B) + return np.linalg.cholesky(np.dot(B.T * w, B)).T def _to_index_points(self, triplets): shape = triplets.shape From f201f9f459ba4840531211d1def9089da884eab3 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 18 Feb 2020 13:45:24 +0100 Subject: [PATCH 04/70] remove triplet 
generator, added in triplets PR --- metric_learn/constraints.py | 55 ------------------------------------- 1 file changed, 55 deletions(-) diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py index a85d551a..752ca6e0 100644 --- a/metric_learn/constraints.py +++ b/metric_learn/constraints.py @@ -6,8 +6,6 @@ import warnings from six.moves import xrange from sklearn.utils import check_random_state -from sklearn.neighbors import NearestNeighbors -from numpy.matlib import repmat __all__ = ['Constraints'] @@ -102,56 +100,3 @@ def wrap_pairs(X, constraints): y = np.concatenate([np.ones_like(a), -np.ones_like(c)]) pairs = X[constraints] return pairs, y - - -def _comb(A, B, C, sizeB, sizeC): - # generate an array will all combinations of choosing - # an element from A, B and C - return np.vstack((repmat(A, sizeB*sizeC, 1).ravel(order='F'), - repmat(np.hstack(B), sizeC, 1).ravel(order='F'), - repmat(C, 1, sizeB).ravel())).T - - -def generate_knntriplets(X, y, k_genuine, k_impostor): - - labels = np.unique(y) - L = len(labels) - len_input = np.size(y, 0) - triplets = np.empty((len_input*k_genuine*k_impostor, 3), dtype=np.intp) - - start = 0 - finish = 0 - neigh = NearestNeighbors() - - for i in range(L): - - # generate mask for current label - gen_mask = y == labels[i] - gen_indx = np.where(gen_mask) - - # get k_genuine genuine neighbours - neigh.fit(X=X[gen_indx]) - gen_neigh = np.take(gen_indx, neigh.kneighbors(n_neighbors=k_genuine, - return_distance=False)) - - # generate mask for impostors of current label - imp_indx = np.where(np.invert(gen_mask)) - - # get k_impostor impostor neighbours - neigh.fit(X=X[imp_indx]) - imp_neigh = np.take(imp_indx, neigh.kneighbors( - n_neighbors=k_impostor, - X=X[gen_mask], - return_distance=False)) - - # lenght = len_label*k_genuine*k_impostor - finish += np.sum(gen_mask)*k_genuine*k_impostor - - triplets[start:finish, :] = _comb(gen_indx, gen_neigh, - imp_neigh, k_genuine, - k_impostor) - start = finish - - # TODO: deal with too litle elements for k neighbors to be yielded - - return triplets From 87c3da02e4b2ceb37feb2ed3052d355847212686 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 19 Feb 2020 11:33:39 +0100 Subject: [PATCH 05/70] change init&fit interface, faster compute & others --- metric_learn/scml.py | 199 +++++++++++++++++++++++-------------------- 1 file changed, 107 insertions(+), 92 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index d636ec39..a923b6d4 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -11,52 +11,54 @@ from sklearn.neighbors import NearestNeighbors from sklearn.cluster import KMeans from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.utils import check_array class _BaseSCML_global(MahalanobisMixin): _tuple_size = 3 # constraints are triplets - def __init__(self, beta=1e-5, B=None, - max_iter=100000, verbose=False, - preprocessor=None, random_state=None): + def __init__(self, beta=1e-5, basis=None, n_basis=None, + max_iter=100000, verbose=False, preprocessor=None, + random_state=None): self.beta = beta + self.basis = basis + self.n_basis = n_basis self.max_iter = max_iter self.verbose = verbose self.preprocessor = preprocessor self.random_state = random_state super(_BaseSCML_global, self).__init__(preprocessor) - def _fit(self, triplets, B, n_basis): + def _fit(self, triplets): + + if self.preprocessor is not None: + n_features = self.preprocessor.shape[1] + else: + n_features = self.triplets.shape[1] - # TODO: manage B - # if B is None - # error 
- # if B is array - # pass - # if option - # do something + self._initialize_basis(n_features) triplets = self._prepare_inputs(triplets, type_of_inputs='tuples') + # TODO: + # This algorithm is build to work with indeces, but in order to be + # compliant with the current handling of inputs it is converted + # back to indices by the following function. This should be improved + # in the future. triplets, X = self._to_index_points(triplets) # TODO: should be given access to gamma? gamma = 5e-3 - dist_diff = self._compute_dist_diff(triplets, X, B) + dist_diff = self._compute_dist_diff(triplets, X) - n_basis = B.shape[0] sizeT = triplets.shape[0] - w = np.zeros((1, n_basis)) - avg_grad_w = np.zeros((1, n_basis)) + w = np.zeros((1, self.n_basis)) + avg_grad_w = np.zeros((1, self.n_basis)) output_iter = 5000 # output every output_iter iterations - best_w = np.empty((1, n_basis)) - obj = np.empty((self.max_iter, 1)) - nImp = np.empty((self.max_iter, 1), dtype=int) - best_obj = np.inf for iter in range(self.max_iter): @@ -64,30 +66,27 @@ def _fit(self, triplets, B, n_basis): obj1 = np.sum(w)*self.beta - obj2 = 0.0 - count = 0 - - for i in range(sizeT): - slack_val = 1 + dist_diff[i, :].dot(w.T) - - if (slack_val > 0): - count += 1 - obj2 += slack_val - - obj2 = obj2/sizeT - - obj[iter] = obj1 + obj2 - nImp[iter] = count + # Every triplet distance difference in the space given by L + # plus a slack of one + slack_val = 1 + np.matmul(dist_diff, w.T, order='F') + # Mask of places with positive slack + slack_mask = slack_val > 0 + obj2 = np.sum(slack_val[slack_mask])/sizeT + obj = obj1 + obj2 if(self.verbose): + count = np.sum(slack_mask) print("[Global] iter %d\t obj %.6f\t num_imp %d" % (iter, - obj[iter], nImp[iter])) + obj, count)) # update the best - if (obj[iter] < best_obj): - best_obj = obj[iter] + if (obj < best_obj): + best_obj = obj best_w = w + # TODO: + # Maybe allow the usage of mini-batch opt? + idx = np.random.randint(low=0, high=sizeT) slack_val = 1 + dist_diff[idx, :].dot(w.T) @@ -99,25 +98,19 @@ def _fit(self, triplets, B, n_basis): scale_f = -np.sqrt(iter+1) / gamma - # TODO: maybe there is a better way to do this? - w.fill(0) - pos_mask = avg_grad_w > self.beta - w[pos_mask] = scale_f * (avg_grad_w[pos_mask] + self.beta) - neg_mask = avg_grad_w < - self.beta - w[neg_mask] = scale_f * (avg_grad_w[neg_mask] - self.beta) - - w[w < 0] = 0 + # proximal operator and negative trimming equivalent + w = scale_f * np.minimum(avg_grad_w + self.beta, 0) if(self.verbose): print("max iteration reached.") - self.components_ = self._get_components(best_w, B) + self.components_ = self._get_components(best_w) return self # should this go to utils? 
- def _compute_dist_diff(self, T, X, B): - XB = np.matmul(X, B.T) + def _compute_dist_diff(self, T, X): + XB = np.matmul(X, self.basis.T) T = np.vstack(T) lenT = len(T) # all positive and negative pairs with lowest index first @@ -134,7 +127,7 @@ def _compute_dist_diff(self, T, X, B): # pairs return dist[indeces[:lenT]]-dist[indeces[lenT:]] - def _get_components(self, w, B): + def _get_components(self, w): """ get components matrix (L) from computed mahalanobis matrix """ @@ -142,15 +135,15 @@ def _get_components(self, w, B): # get rid of inactive bases active_idx = w > 0 w = w[active_idx] - B = B[np.squeeze(active_idx), :] + basis = self.basis[np.squeeze(active_idx), :] - K, d = B.shape + K, d = basis.shape if(K < d): # if metric is low-rank - return B*np.sqrt(w)[..., None] + return basis*np.sqrt(w)[..., None] else: # if metric is full rank - return np.linalg.cholesky(np.dot(B.T * w, B)).T + return np.linalg.cholesky(np.matmul(basis.T * w, basis, order='F')).T def _to_index_points(self, triplets): shape = triplets.shape @@ -158,10 +151,31 @@ def _to_index_points(self, triplets): triplets = triplets.reshape(shape[:2]) return triplets, X + def _initialize_basis(self, n_features): + authorized_basis = [] + if isinstance(self.basis, np.ndarray): + self.basis = check_array(self.basis) + self.n_basis = self.basis.shape[0] + if self.basis.shape[1] != n_features: + raise ValueError('The input dimensionality ({}) of the given ' + 'linear transformation `init` must match the ' + 'dimensionality of the given inputs `X` ({}).' + .format(self.basis.shape[1], n_features)) + elif self.basis not in authorized_basis: + raise ValueError( + "`basis` must be '{}' " + "or a numpy array of shape (n_basis, n_features)." + .format("', '".join(authorized_basis))) + + # TODO: + # Add other options passed as string + elif type(self.basis) is str: + ValueError("No option for basis currently supported") + class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): - def fit(self, triplets, B, n_basis): + def fit(self, triplets): """Learn the SCML model. Parameters @@ -173,29 +187,28 @@ def fit(self, triplets, B, n_basis): should have the three samples ordered in a way such that: d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 1], triplets[i, 3]) for all 0 <= i < n_constraints. - B : (n_basis, n_features) array of floats that form the basis set from - which the metric will be constructed. Returns ------- self : object Returns the instance. """ - return _BaseSCML_global._fit(triplets, B, n_basis) + return _BaseSCML_global._fit(triplets) class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): - def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, B=None, + def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis=None, n_basis=None, max_iter=100000, verbose=False, preprocessor=None, random_state=None): self.k_genuine = k_genuine self.k_impostor = k_impostor - _BaseSCML_global.__init__(self, beta=beta, max_iter=max_iter, - verbose=verbose, preprocessor=preprocessor, + _BaseSCML_global.__init__(self, beta=beta, basis=basis, n_basis=n_basis, + max_iter=max_iter, verbose=verbose, + preprocessor=preprocessor, random_state=random_state) - def fit(self, X, y, B, n_basis, random_state=None): + def fit(self, X, y, random_state=None): """Create constraints from labels and learn the SCML model. Parameters @@ -206,9 +219,6 @@ def fit(self, X, y, B, n_basis, random_state=None): y : (n) array-like Data labels. 
- B : string or (n_basis x d) array, through this the basis construction - can be selected or directly given by an array. - Returns ------- self : object @@ -217,28 +227,35 @@ def fit(self, X, y, B, n_basis, random_state=None): X, y = self._prepare_inputs(X, y, ensure_min_samples=2) self.preprocessor = X - if(B == "LDA"): - B = self._generate_bases_LDA(X, y, n_basis, random_state) - # this should set super's B + # TODO: + # it can be a problem if fit is called more than once, + # should that case be handled? + + if(self.basis == "LDA"): + self._generate_bases_LDA(X, y, random_state) triplets = generate_knntriplets(X, y, self.k_genuine, self.k_impostor) - return self._fit(triplets, B, n_basis) + return self._fit(triplets) - def _generate_bases_LDA(self, X, y, n_basis, random_state=None): + def _generate_bases_LDA(self, X, y, random_state=None): labels, class_count = np.unique(y, return_counts=True) n_class = len(labels) + # TODO: maybe a default value for this case? + if(self.n_basis is None): + raise ValueError('The number of basis given by n_basis must be set') + # n_basis must be greater or equal to n_class - if(n_basis < n_class): - ValueError("number of basis should be greater than the number of " + if(self.n_basis < n_class): + ValueError("The number of basis should be greater than the number of " "classes") dim = np.size(X, 1) num_eig = min(n_class-1, dim) - n_clusters = int(np.ceil(n_basis/(2 * num_eig))) + n_clusters = int(np.ceil(self.n_basis/(2 * num_eig))) # TODO: maybe give acces to Kmeans jobs for faster computation? kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, @@ -251,7 +268,7 @@ def _generate_bases_LDA(self, X, y, n_basis, random_state=None): else: nK = 10 - nK_class = [min(c, nK) for c in class_count] + nK_class = np.minimum(class_count, nK) idx_set = np.zeros((n_clusters, sum(nK_class)), dtype=np.int) @@ -260,27 +277,26 @@ def _generate_bases_LDA(self, X, y, n_basis, random_state=None): neigh = NearestNeighbors() for c in range(n_class): - sel_c = y == labels[c] + sel_c = np.where(y == labels[c]) nk = nK_class[c] - # get nK_class genuine neighbours + # get nK_class same class neighbours neigh.fit(X=X[sel_c]) finish += nk - idx_set[:, start:finish] = np.take(np.where(sel_c), - neigh.kneighbors(X=cX, + idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, n_neighbors=nk, return_distance=False)) start = finish - B = np.zeros((n_basis, dim)) + self.basis = np.zeros((self.n_basis, dim)) for i in range(n_clusters): lda = LinearDiscriminantAnalysis() lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - B[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) + self.basis[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) nK = 20 - nK_class = [min(c, nK) for c in class_count] + nK_class = np.minimum(class_count, nK) idx_set = np.zeros((n_clusters, sum(nK_class)), dtype=np.int) @@ -300,20 +316,19 @@ def _generate_bases_LDA(self, X, y, n_basis, random_state=None): start = finish finish = num_eig * n_clusters + n_components = None for i in range(n_clusters): - lda = LinearDiscriminantAnalysis() - lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) start = finish finish += num_eig - # TODO: maybe handle tail more elegantly by - # limiting lda n_components - if(start == n_basis): - pass - elif(finish <= n_basis): - B[start:finish, :] = normalize(lda.scalings_.T) - else: - B[start:, :] = normalize(lda.scalings_.T[:n_basis-start]) - break - - return B + # handle tail, as n_basis != n_clusters*2*n_eig + if (finish > self.n_basis): + finish = self.n_basis + 
n_components = finish-start + + lda = LinearDiscriminantAnalysis() + lda.fit(X[idx_set[i, :]], y[idx_set[i, :]], n_components=n_components) + + self.basis[start:finish, :] = normalize(lda.scalings_.T) + + return From 21a6fc03a4670d8f9065d61b4963494a4fa69d0c Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 19 Feb 2020 17:09:29 +0100 Subject: [PATCH 06/70] added coments & docstrings, small code changes --- metric_learn/scml.py | 219 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 186 insertions(+), 33 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index a923b6d4..c0c60500 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -6,7 +6,7 @@ import numpy as np from .base_metric import _TripletsClassifierMixin, MahalanobisMixin from sklearn.base import TransformerMixin -from .constraints import generate_knntriplets +from .constraints import Constraints from sklearn.preprocessing import normalize from sklearn.neighbors import NearestNeighbors from sklearn.cluster import KMeans @@ -31,6 +31,11 @@ def __init__(self, beta=1e-5, basis=None, n_basis=None, super(_BaseSCML_global, self).__init__(preprocessor) def _fit(self, triplets): + """ + Optimization procedure to find a sparse vector of weights to + construct the metric from the basis set. This is based on the + dual averaging method. + """ if self.preprocessor is not None: n_features = self.preprocessor.shape[1] @@ -63,7 +68,7 @@ def _fit(self, triplets): for iter in range(self.max_iter): if (iter % output_iter == 0): - + # regularization part of obj function obj1 = np.sum(w)*self.beta # Every triplet distance difference in the space given by L @@ -72,7 +77,9 @@ def _fit(self, triplets): # Mask of places with positive slack slack_mask = slack_val > 0 + # loss function of learning task part of obj function obj2 = np.sum(slack_val[slack_mask])/sizeT + obj = obj1 + obj2 if(self.verbose): count = np.sum(slack_mask) @@ -98,26 +105,31 @@ def _fit(self, triplets): scale_f = -np.sqrt(iter+1) / gamma - # proximal operator and negative trimming equivalent + # proximal operator with negative trimming equivalent w = scale_f * np.minimum(avg_grad_w + self.beta, 0) if(self.verbose): print("max iteration reached.") + # return L matrix yielded from best weights self.components_ = self._get_components(best_w) return self -# should this go to utils? def _compute_dist_diff(self, T, X): + """ + Helper function to compute the distance difference of every triplet in the + space yielded by the basis set. + """ + # Transformation of data by the basis set XB = np.matmul(X, self.basis.T) - T = np.vstack(T) + lenT = len(T) - # all positive and negative pairs with lowest index first + # get all positive and negative pairs with lowest index first # np.array (2*lenT,2) T_pairs_sorted = np.sort(np.vstack((T[:, [0, 1]], T[:, [0, 2]])), kind='stable') - # calculate all unique pairs + # calculate all unique pairs and their indeces uniqPairs, indeces = np.unique(T_pairs_sorted, return_inverse=True, axis=0) # calculate L2 distance acording to bases only for unique pairs @@ -174,6 +186,65 @@ def _initialize_basis(self, n_features): class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): + """Sparse Compositional Metric Learning (SCML) + + `SCML` builds a metric as the sparse positive combination of a set of locally + discriminative rank-one PSD basis. This allows an optimization scheme with + only `K` parameters, that can be yielded with an efficient stochastic + composite optimization over a set of triplets constraints. 
Each triplet is
  constructed as a relative distance comparison with respect to the first
  element so that the second element is closer than the last.
  Read more in the :ref:`User Guide `.
  Parameters
  ----------
  basis : None, string or numpy array, optional (default=None)
      Prior to set for the metric. Possible options are
      '', and a numpy array of shape (n_basis, n_features). If
      None an error will be raised as the basis set is essential
      to SCML.
      numpy array
          A matrix of shape (n_basis, n_features), that will be used as
          the basis set for the metric construction.
  n_basis : int, optional
      Number of basis to be yielded. In case it is not set it will be set based
      on the basis numpy array. If a string option is passed to basis an error
      will be raised as this value will be needed.
  max_iter : int, optional
  verbose : bool, optional
    if True, prints information while learning
  preprocessor : array-like, shape=(n_samples, n_features) or callable
    The preprocessor to call to get triplets from indices. If array-like,
    triplets will be formed like this: X[indices].
  random_state : int or numpy.RandomState or None, optional (default=None)
    A pseudo random number generator object or a seed for it if int.
  Attributes
  ----------
  components_ : `numpy.ndarray`, shape=(n_features, n_features)
    The linear transformation ``L`` deduced from the learned Mahalanobis
    metric (See function `components_from_metric`.)
  Examples
  --------
  >>> from metric_learn import SCML_global
  >>> triplets = np.array([[[1.2, 3.2], [2.3, 5.5], [2.1, 0.6]],
  >>>                      [[4.5, 2.3], [2.1, 2.3], [7.3, 3.4]]])
  >>> scml = SCML_global(random_state=42)
  >>> scml.fit(triplets)
  SCML_global(beta=1e-5, basis=None, max_iter=100000, verbose=False,
      preprocessor=None, random_state=None)
  References
  ----------
  .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning.
         `_. \
         (AAAI), 2014.
  .. [2] Adapted from original \
         `Matlab implementation.`_.
  See Also
  --------
  metric_learn.SCML_global_Supervised : The supervised version of this
    algorithm, which constructs the triplets from the labels.

  :ref:`supervised_version` : The section of the project documentation
    that describes the supervised version of weakly supervised estimators.
  """

  def fit(self, triplets):
    """Learn the SCML model.

    Parameters
    ----------
    triplets : array-like, shape=(n_constraints, 3, n_features) or \
          (n_constraints, 3)
      3D array-like of triplets of points or 2D array of triplets of
      indicators. In order to supervise the algorithm in the right way, we
      should have the three samples ordered in a way such that:
      d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 1], triplets[i, 3])
      for all 0 <= i < n_constraints.

    Returns
    -------
    self : object
      Returns the instance.
    """
    return _BaseSCML_global._fit(triplets)


class SCML_global_Supervised(_BaseSCML_global, TransformerMixin):
  """Supervised version of Sparse Compositional Metric Learning (SCML)

  `SCML_global_Supervised` creates triplets by taking `k_genuine` neighbours
  of the same class and `k_impostor` neighbours from different classes for each
  point and then runs the SCML algorithm on these triplets.
  Read more in the :ref:`User Guide `.
  Parameters
  ----------
  basis : None, string or numpy array, optional (default=None)
      Prior to set for the metric. Possible options are
      'LDA', and a numpy array of shape (n_basis, n_features). If
      None an error will be raised as the basis set is essential
      to SCML.
      'LDA'
          The `n_basis` basis set is constructed from the LDA of significant
          local regions in the feature space via clustering, for each region
          center k-nearest neighbors are used to obtain the LDA scalings,
          which correspond to the locally discriminative basis.
      numpy array
          A matrix of shape (n_basis, n_features), that will be used as
          the basis set for the metric construction.
  n_basis : int, optional
      Number of basis to be yielded. In case it is not set it will be set based
      on the basis numpy array. If a string option is passed to basis an error
      will be raised as this value will be needed.
  max_iter : int, optional
  verbose : bool, optional
    if True, prints information while learning
  preprocessor : array-like, shape=(n_samples, n_features) or callable
    The preprocessor to call to get triplets from indices. If array-like,
    triplets will be formed like this: X[indices].
  random_state : int or numpy.RandomState or None, optional (default=None)
    A pseudo random number generator object or a seed for it if int.
  Attributes
  ----------
  components_ : `numpy.ndarray`, shape=(n_features, n_features)
    The linear transformation ``L`` deduced from the learned Mahalanobis
    metric (See function `components_from_metric`.)
  Examples
  --------
  >>> from metric_learn import SCML_global_Supervised
  >>> from sklearn.datasets import load_iris
  >>> iris_data = load_iris()
  >>> X = iris_data['data']
  >>> Y = iris_data['target']
  >>> scml = SCML_global_Supervised(basis='LDA', n_basis=400)
  >>> scml.fit(X, Y)
  References
  ----------
  .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning.
         `_. \
         (AAAI), 2014.
  .. [2] Adapted from original \
         `Matlab implementation.`_.
  See Also
  --------
  metric_learn.SCML_global : The original weakly-supervised algorithm.

  :ref:`supervised_version` : The section of the project documentation
    that describes the supervised version of weakly supervised estimators.
  """

  def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis=None,
               n_basis=None, max_iter=100000, verbose=False,
               preprocessor=None, random_state=None):
    self.k_genuine = k_genuine
    self.k_impostor = k_impostor
    _BaseSCML_global.__init__(self, beta=beta, basis=basis, n_basis=n_basis,
                              max_iter=max_iter, verbose=verbose,
                              preprocessor=preprocessor,
                              random_state=random_state)

  def fit(self, X, y):
    """Create constraints from labels and learn the SCML model.

    Parameters
    ----------
    X : (n x d) matrix
        Input data, where each row corresponds to a single instance.

    y : (n) array-like
        Data labels.

    Returns
    -------
    self : object
      Returns the instance.
    """
    X, y = self._prepare_inputs(X, y, ensure_min_samples=2)
    self.preprocessor = X

    if(self.basis == "LDA"):
      self._generate_bases_LDA(X, y)

    constraints = Constraints(y)
    triplets = constraints.generate_knntriplets(X, y, self.k_genuine,
                                                self.k_impostor)

    return self._fit(triplets)

  def _generate_bases_LDA(self, X, y):
    """
    Helper function that computes the n_basis basis set constructed from the
    LDA of significant local regions in the feature space via clustering, for
    each region center k-nearest neighbors are used to obtain the LDA scalings,
    which correspond to the locally discriminative basis. Currently this is
    done at two scales `k={10,20}` if `n_features < 50` or else `k={20,50}`.
    """

    labels, class_count = np.unique(y, return_counts=True)
    n_class = len(labels)

    # TODO: maybe a default value for this case?
    if(self.n_basis is None):
      raise ValueError('The number of basis given by n_basis must be set')

    # n_basis must be greater or equal to n_class
    if(self.n_basis < n_class):
      raise ValueError("The number of basis should be greater than the "
                       "number of classes")

    n_features = np.size(X, 1)
    # Number of basis yielded from each LDA
    num_eig = min(n_class-1, n_features)
    # Number of clusters needed for 2 scales given the number of basis
    # yielded by every LDA
    n_clusters = int(np.ceil(self.n_basis/(2 * num_eig)))

    # TODO: maybe give acces to Kmeans jobs for faster computation?
- kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, + kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state, algorithm='elkan').fit(X) cX = kmeans.cluster_centers_ # TODO: find a better way to choose neighbourhood size - if dim > 50: - nK = 50 + if n_features > 50: + k = 50 else: - nK = 10 + k = 10 - nK_class = np.minimum(class_count, nK) + # In case some class has less elements than k + k_class = np.minimum(class_count, k) - idx_set = np.zeros((n_clusters, sum(nK_class)), dtype=np.int) + # Construct index set with neighbors of every element of every class + + idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) start = 0 finish = 0 @@ -278,49 +424,56 @@ def _generate_bases_LDA(self, X, y, random_state=None): for c in range(n_class): sel_c = np.where(y == labels[c]) - nk = nK_class[c] - # get nK_class same class neighbours + kc = k_class[c] + # get k_class same class neighbours neigh.fit(X=X[sel_c]) - finish += nk + finish += kc idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, - n_neighbors=nk, + n_neighbors=kc, return_distance=False)) start = finish - self.basis = np.zeros((self.n_basis, dim)) + # Compute basis for every cluster in first scale + self.basis = np.zeros((self.n_basis, n_features)) for i in range(n_clusters): lda = LinearDiscriminantAnalysis() lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) self.basis[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) - nK = 20 + # second scale + k = 20 + + # In case some class has less elements than k + k_class = np.minimum(class_count, k) - nK_class = np.minimum(class_count, nK) + # Construct index set with neighbors of every element of every class - idx_set = np.zeros((n_clusters, sum(nK_class)), dtype=np.int) + idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) start = 0 finish = 0 for c in range(n_class): sel_c = np.where(y == labels[c]) - nk = nK_class[c] - # get nK_class genuine neighbours - neigh.fit(X=X[sel_c]) + kc = k_class[c] - finish += nk + # get k_class genuine neighbours + neigh.fit(X=X[sel_c]) + finish += kc idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, - n_neighbors=nk, + n_neighbors=kc, return_distance=False)) start = finish + # Compute basis for every cluster in first scale finish = num_eig * n_clusters n_components = None for i in range(n_clusters): start = finish finish += num_eig + # handle tail, as n_basis != n_clusters*2*n_eig if (finish > self.n_basis): finish = self.n_basis From 5453c753b5e08fa67d60ba8af0e4fcdcbb969aee Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 19 Feb 2020 17:41:42 +0100 Subject: [PATCH 07/70] typos and added choice of gamma & output_iter --- metric_learn/scml.py | 48 ++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index c0c60500..5610d747 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -18,13 +18,15 @@ class _BaseSCML_global(MahalanobisMixin): _tuple_size = 3 # constraints are triplets - def __init__(self, beta=1e-5, basis=None, n_basis=None, - max_iter=100000, verbose=False, preprocessor=None, - random_state=None): + def __init__(self, beta=1e-5, basis=None, n_basis=None, gamma=5e-3, + max_iter=100000, output_iter=5000, verbose=False, + preprocessor=None, random_state=None): self.beta = beta self.basis = basis self.n_basis = n_basis + self.gamma = gamma self.max_iter = max_iter + self.output_iter = output_iter self.verbose = verbose self.preprocessor = preprocessor self.random_state = 
random_state @@ -47,14 +49,12 @@ def _fit(self, triplets): triplets = self._prepare_inputs(triplets, type_of_inputs='tuples') # TODO: - # This algorithm is build to work with indeces, but in order to be + # This algorithm is built to work with indices, but in order to be # compliant with the current handling of inputs it is converted # back to indices by the following function. This should be improved # in the future. triplets, X = self._to_index_points(triplets) - # TODO: should be given access to gamma? - gamma = 5e-3 dist_diff = self._compute_dist_diff(triplets, X) sizeT = triplets.shape[0] @@ -62,12 +62,10 @@ def _fit(self, triplets): w = np.zeros((1, self.n_basis)) avg_grad_w = np.zeros((1, self.n_basis)) - output_iter = 5000 # output every output_iter iterations - best_obj = np.inf for iter in range(self.max_iter): - if (iter % output_iter == 0): + if (iter % self.output_iter == 0): # regularization part of obj function obj1 = np.sum(w)*self.beta @@ -103,7 +101,7 @@ def _fit(self, triplets): else: avg_grad_w = iter * avg_grad_w / (iter+1) - scale_f = -np.sqrt(iter+1) / gamma + scale_f = -np.sqrt(iter+1) / self.gamma # proximal operator with negative trimming equivalent w = scale_f * np.minimum(avg_grad_w + self.beta, 0) @@ -129,15 +127,15 @@ def _compute_dist_diff(self, T, X): # np.array (2*lenT,2) T_pairs_sorted = np.sort(np.vstack((T[:, [0, 1]], T[:, [0, 2]])), kind='stable') - # calculate all unique pairs and their indeces - uniqPairs, indeces = np.unique(T_pairs_sorted, return_inverse=True, + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(T_pairs_sorted, return_inverse=True, axis=0) # calculate L2 distance acording to bases only for unique pairs dist = np.square(XB[uniqPairs[:, 0], :]-XB[uniqPairs[:, 1], :]) # return the diference of distances between all positive and negative # pairs - return dist[indeces[:lenT]]-dist[indeces[lenT:]] + return dist[indices[:lenT]]-dist[indices[lenT:]] def _get_components(self, w): """ @@ -197,6 +195,8 @@ class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): Read more in the :ref:`User Guide `. Parameters ---------- + beta: float (default=1e-5) + L1 regularization parameter. basis : None, string or numpy array, optional (default=None) Prior to set for the metric. Possible options are '', and a numpy array of shape (n_basis, n_features). If @@ -209,7 +209,13 @@ class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): Number of basis to be yielded. In case it is not set it will be set based on the basis numpy array. If an string option is pased to basis an error wild be raised as this value will be needed. - max_iter : int, optional + gamma: float (default = 5e-3) + Learning rate + max_iter : int (default = 100000) + Number of iterations for the algorithm + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. verbose : bool, optional if True, prints information while learning preprocessor : array-like, shape=(n_samples, n_features) or callable @@ -276,6 +282,8 @@ class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): Read more in the :ref:`User Guide `. Parameters ---------- + beta: float (default=1e-5) + L1 regularization parameter. basis : None, string or numpy array, optional (default=None) Prior to set for the metric. Possible options are 'LDA', and a numpy array of shape (n_basis, n_features). 
If @@ -293,7 +301,13 @@ class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): Number of basis to be yielded. In case it is not set it will be set based on the basis numpy array. If an string option is pased to basis an error wild be raised as this value will be needed. - max_iter : int, optional + gamma: float (default = 5e-3) + Learning rate + max_iter : int (default = 100000) + Number of iterations for the algorithm + output_iter : int (default = 5000) + Number of iterations to check current weights performance and output this + information in case verbose is True. verbose : bool, optional if True, prints information while learning preprocessor : array-like, shape=(n_samples, n_features) or callable @@ -331,8 +345,8 @@ class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): """ def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis=None, - n_basis=None, max_iter=100000, verbose=False, - preprocessor=None, random_state=None): + n_basis=None, gamma=5e-3, max_iter=100000, output_iter=5000, + verbose=False, preprocessor=None, random_state=None): self.k_genuine = k_genuine self.k_impostor = k_impostor _BaseSCML_global.__init__(self, beta=beta, basis=basis, n_basis=n_basis, From 5f8d88535a6f9be16040c6fc22e294869e791fea Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 20 Feb 2020 10:49:40 +0100 Subject: [PATCH 08/70] some small improvements --- metric_learn/scml.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 5610d747..377b4eb4 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -76,7 +76,7 @@ def _fit(self, triplets): slack_mask = slack_val > 0 # loss function of learning task part of obj function - obj2 = np.sum(slack_val[slack_mask])/sizeT + obj2 = np.sum(slack_val, where=slack_mask)/sizeT obj = obj1 + obj2 if(self.verbose): @@ -162,6 +162,8 @@ def _to_index_points(self, triplets): return triplets, X def _initialize_basis(self, n_features): + # TODO: + # Add other options passed as string authorized_basis = [] if isinstance(self.basis, np.ndarray): self.basis = check_array(self.basis) @@ -177,11 +179,6 @@ def _initialize_basis(self, n_features): "or a numpy array of shape (n_basis, n_features)." 
.format("', '".join(authorized_basis))) - # TODO: - # Add other options passed as string - elif type(self.basis) is str: - ValueError("No option for basis currently supported") - class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): """Sparse Compositional Metric Learning (SCML) @@ -381,7 +378,7 @@ def fit(self, X, y): self._generate_bases_LDA(X, y) constraints = Constraints(y) - triplets = constraints.generate_knntriplets(X, y, self.k_genuine, + triplets = constraints.generate_knntriplets(X, self.k_genuine, self.k_impostor) return self._fit(triplets) @@ -419,7 +416,7 @@ def _generate_bases_LDA(self, X, y): algorithm='elkan').fit(X) cX = kmeans.cluster_centers_ - # TODO: find a better way to choose neighbourhood size + # TODO: find a better way to choose neighborhood size if n_features > 50: k = 50 else: @@ -428,7 +425,7 @@ def _generate_bases_LDA(self, X, y): # In case some class has less elements than k k_class = np.minimum(class_count, k) - # Construct index set with neighbors of every element of every class + # Construct index set with neighbors for every element of every class idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) @@ -450,8 +447,8 @@ def _generate_bases_LDA(self, X, y): # Compute basis for every cluster in first scale self.basis = np.zeros((self.n_basis, n_features)) + lda = LinearDiscriminantAnalysis() for i in range(n_clusters): - lda = LinearDiscriminantAnalysis() lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) self.basis[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) @@ -461,7 +458,7 @@ def _generate_bases_LDA(self, X, y): # In case some class has less elements than k k_class = np.minimum(class_count, k) - # Construct index set with neighbors of every element of every class + # Construct index set with neighbors for every element of every class idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) @@ -480,10 +477,10 @@ def _generate_bases_LDA(self, X, y): return_distance=False)) start = finish - # Compute basis for every cluster in first scale + # Compute basis for every cluster in second scale finish = num_eig * n_clusters n_components = None - + lda = LinearDiscriminantAnalysis() for i in range(n_clusters): start = finish finish += num_eig @@ -493,7 +490,6 @@ def _generate_bases_LDA(self, X, y): finish = self.n_basis n_components = finish-start - lda = LinearDiscriminantAnalysis() lda.fit(X[idx_set[i, :]], y[idx_set[i, :]], n_components=n_components) self.basis[start:finish, :] = normalize(lda.scalings_.T) From 1083f57a7b064550c80627cd351232cfd8ba46d6 Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 20 Feb 2020 13:37:45 +0100 Subject: [PATCH 09/70] lda tail handling rollback --- metric_learn/scml.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 377b4eb4..9776ac0c 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -479,7 +479,6 @@ def _generate_bases_LDA(self, X, y): # Compute basis for every cluster in second scale finish = num_eig * n_clusters - n_components = None lda = LinearDiscriminantAnalysis() for i in range(n_clusters): start = finish @@ -488,10 +487,9 @@ def _generate_bases_LDA(self, X, y): # handle tail, as n_basis != n_clusters*2*n_eig if (finish > self.n_basis): finish = self.n_basis - n_components = finish-start - lda.fit(X[idx_set[i, :]], y[idx_set[i, :]], n_components=n_components) + lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - self.basis[start:finish, :] = normalize(lda.scalings_.T) + self.basis[start:finish, :] = 
normalize(lda.scalings_.T[:finish-start]) return From 78b965800d749b04e1e91b9cee06eeafe2a4f91c Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 20 Feb 2020 14:28:33 +0100 Subject: [PATCH 10/70] performance improvement by precomputing rand_ints --- metric_learn/scml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 9776ac0c..620fbe42 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -64,6 +64,7 @@ def _fit(self, triplets): best_obj = np.inf + rand_int = np.random.randint(low=0, high=sizeT, size=self.max_iter) for iter in range(self.max_iter): if (iter % self.output_iter == 0): # regularization part of obj function @@ -92,9 +93,9 @@ def _fit(self, triplets): # TODO: # Maybe allow the usage of mini-batch opt? - idx = np.random.randint(low=0, high=sizeT) + idx = rand_int[iter] - slack_val = 1 + dist_diff[idx, :].dot(w.T) + slack_val = 1 + np.matmul(dist_diff[idx, :], w.T) if (slack_val > 0): avg_grad_w = (iter * avg_grad_w + dist_diff[idx, :]) / (iter+1) From bc203f578210de3bdd2f3634e7c0fc3549541bdf Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 5 Mar 2020 10:34:16 +0100 Subject: [PATCH 11/70] small fix in components computation --- metric_learn/scml.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 620fbe42..26a2aa6e 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -144,17 +144,17 @@ def _get_components(self, w): """ # get rid of inactive bases - active_idx = w > 0 - w = w[active_idx] - basis = self.basis[np.squeeze(active_idx), :] + active_idx, = w > 0 + w = w[..., active_idx] + basis = self.basis[active_idx, :] K, d = basis.shape if(K < d): # if metric is low-rank - return basis*np.sqrt(w)[..., None] + return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(B) else: # if metric is full rank - return np.linalg.cholesky(np.matmul(basis.T * w, basis, order='F')).T + return np.linalg.cholesky(np.matmul(basis.T, w.T*basis, order='F')).T def _to_index_points(self, triplets): shape = triplets.shape From ecdb74d2cdb884c9563d2d1a09f8abecf4600d4a Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 5 Mar 2020 10:54:04 +0100 Subject: [PATCH 12/70] flake8 fix --- metric_learn/scml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 26a2aa6e..d59040c7 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -151,7 +151,7 @@ def _get_components(self, w): K, d = basis.shape if(K < d): # if metric is low-rank - return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(B) + return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(B) else: # if metric is full rank return np.linalg.cholesky(np.matmul(basis.T, w.T*basis, order='F')).T From f82f3b3b714b5767c1ed572cd5dd5a0d0460e642 Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 5 Mar 2020 17:35:28 +0100 Subject: [PATCH 13/70] SCML_global fit fix & other small changes --- metric_learn/__init__.py | 2 +- metric_learn/scml.py | 6 +----- test/test_utils.py | 24 +++++------------------- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index 5210c157..6e7cc30c 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -18,5 +18,5 @@ __all__ = ['Constraints', 'Covariance', 'ITML', 'ITML_Supervised', 'LMNN', 'LSML', 'LSML_Supervised', 'SDML', 'SDML_Supervised', 'NCA', 'LFDA', 'RCA', 'RCA_Supervised', - 
'MLKR', 'MMC', 'MMC_Supervised', 'SCML_global', + 'MLKR', 'MMC', 'MMC_Supervised', 'SCML_global', 'SCML_global_Supervised', '__version__'] diff --git a/metric_learn/scml.py b/metric_learn/scml.py index d59040c7..943870e2 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -268,7 +268,7 @@ def fit(self, triplets): self : object Returns the instance. """ - return _BaseSCML_global._fit(triplets) + return self._fit(triplets) class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): @@ -371,10 +371,6 @@ def fit(self, X, y): X, y = self._prepare_inputs(X, y, ensure_min_samples=2) self.preprocessor = X - # TODO: - # it can be a problem if fit is called more than once, - # should that case be handled? - if(self.basis == "LDA"): self._generate_bases_LDA(X, y) diff --git a/test/test_utils.py b/test/test_utils.py index a4cf86f4..d6e58ab6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -16,14 +16,13 @@ from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised, - Constraints) + SCML_global, SCML_global_Supervised, Constraints) from metric_learn.base_metric import (ArrayIndexer, MahalanobisMixin, _PairsClassifierMixin, _TripletsClassifierMixin, _QuadrupletsClassifierMixin) from metric_learn.exceptions import PreprocessorError, NonPSDError from sklearn.datasets import make_regression, make_blobs, load_iris -from metric_learn.lsml import _BaseLSML SEED = 42 @@ -98,21 +97,6 @@ def build_triplets(with_preprocessor=False): return X[triplets], None -class mock_triplet_LSML(_BaseLSML, _TripletsClassifierMixin): - # Mock Triplet learner from LSML which is a quadruplets learner - # in order to test TripletClassifierMixin basic methods - - _tuple_size = 4 - - def fit(self, triplets, weights=None): - quadruplets = triplets[:, [0, 1, 0, 2]] - return self._fit(quadruplets, weights=weights) - - def decision_function(self, triplets): - self._tuple_size = 3 - return _TripletsClassifierMixin.decision_function(self, triplets) - - def build_quadruplets(with_preprocessor=False): # builds a toy quadruplets problem X, indices = build_data() @@ -133,7 +117,7 @@ def build_quadruplets(with_preprocessor=False): [learner for (learner, _) in quadruplets_learners])) -triplets_learners = [(mock_triplet_LSML(), build_triplets)] +triplets_learners = [(SCML_global(), build_triplets)] ids_triplets_learners = list(map(lambda x: x.__class__.__name__, [learner for (learner, _) in triplets_learners])) @@ -155,7 +139,8 @@ def build_quadruplets(with_preprocessor=False): (MMC_Supervised(max_iter=5), build_classification), (RCA_Supervised(num_chunks=5), build_classification), (SDML_Supervised(prior='identity', balance_param=1e-5), - build_classification)] + build_classification), (SCML_global_Supervised(), + build_classification)] ids_classifiers = list(map(lambda x: x.__class__.__name__, [learner for (learner, _) in classifiers])) @@ -165,6 +150,7 @@ def build_quadruplets(with_preprocessor=False): [learner for (learner, _) in regressors])) WeaklySupervisedClasses = (_PairsClassifierMixin, + _TripletsClassifierMixin, _QuadrupletsClassifierMixin) tuples_learners = pairs_learners + quadruplets_learners From 2018d09131db58b8908c560ef69f354ea7d1942e Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 15:18:18 +0100 Subject: [PATCH 14/70] Proper use of init vars and unsup bases generation --- metric_learn/scml.py | 187 +++++++++++++++++++++++++++++++------------ 1 file changed, 136 insertions(+), 
51 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 943870e2..f233ea45 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -11,15 +11,16 @@ from sklearn.neighbors import NearestNeighbors from sklearn.cluster import KMeans from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from sklearn.utils import check_array +from sklearn.utils import check_array, check_random_state +import warnings class _BaseSCML_global(MahalanobisMixin): _tuple_size = 3 # constraints are triplets - def __init__(self, beta=1e-5, basis=None, n_basis=None, gamma=5e-3, - max_iter=100000, output_iter=5000, verbose=False, + def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, + gamma=5e-3, max_iter=100000, output_iter=5000, verbose=False, preprocessor=None, random_state=None): self.beta = beta self.basis = basis @@ -32,20 +33,17 @@ def __init__(self, beta=1e-5, basis=None, n_basis=None, gamma=5e-3, self.random_state = random_state super(_BaseSCML_global, self).__init__(preprocessor) - def _fit(self, triplets): + def _fit(self, triplets, X=None, basis=None, n_basis=None): """ Optimization procedure to find a sparse vector of weights to construct the metric from the basis set. This is based on the dual averaging method. """ - if self.preprocessor is not None: - n_features = self.preprocessor.shape[1] - else: - n_features = self.triplets.shape[1] - - self._initialize_basis(n_features) + if X is not None: + triplets = X[triplets] + # Currently prepare_inputs makes triplets contain points and not indices triplets = self._prepare_inputs(triplets, type_of_inputs='tuples') # TODO: @@ -53,18 +51,22 @@ def _fit(self, triplets): # compliant with the current handling of inputs it is converted # back to indices by the following function. This should be improved # in the future. - triplets, X = self._to_index_points(triplets) + triplets, X = self._to_index_points(triplets, X) + + if basis is None: + basis, n_basis = self._initialize_basis(triplets, X) - dist_diff = self._compute_dist_diff(triplets, X) + dist_diff = self._compute_dist_diff(triplets, X, basis) sizeT = triplets.shape[0] - w = np.zeros((1, self.n_basis)) - avg_grad_w = np.zeros((1, self.n_basis)) + w = np.zeros((1, n_basis)) + avg_grad_w = np.zeros((1, n_basis)) best_obj = np.inf - rand_int = np.random.randint(low=0, high=sizeT, size=self.max_iter) + rng = check_random_state(self.random_state) + rand_int = rng.randint(low=0, high=sizeT, size=self.max_iter) for iter in range(self.max_iter): if (iter % self.output_iter == 0): # regularization part of obj function @@ -111,17 +113,17 @@ def _fit(self, triplets): print("max iteration reached.") # return L matrix yielded from best weights - self.components_ = self._get_components(best_w) + self.components_ = self._get_components(best_w, basis) return self - def _compute_dist_diff(self, T, X): + def _compute_dist_diff(self, T, X, basis): """ Helper function to compute the distance difference of every triplet in the space yielded by the basis set. 
""" # Transformation of data by the basis set - XB = np.matmul(X, self.basis.T) + XB = np.matmul(X, basis.T) lenT = len(T) # get all positive and negative pairs with lowest index first @@ -138,7 +140,7 @@ def _compute_dist_diff(self, T, X): # pairs return dist[indices[:lenT]]-dist[indices[lenT:]] - def _get_components(self, w): + def _get_components(self, w, basis): """ get components matrix (L) from computed mahalanobis matrix """ @@ -146,7 +148,7 @@ def _get_components(self, w): # get rid of inactive bases active_idx, = w > 0 w = w[..., active_idx] - basis = self.basis[active_idx, :] + basis = basis[active_idx, :] K, d = basis.shape @@ -156,29 +158,93 @@ def _get_components(self, w): else: # if metric is full rank return np.linalg.cholesky(np.matmul(basis.T, w.T*basis, order='F')).T - def _to_index_points(self, triplets): + def _to_index_points(self, triplets, X=None): shape = triplets.shape - X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) - triplets = triplets.reshape(shape[:2]) - return triplets, X + if len(shape) == 3: + X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) + triplets = triplets.reshape(shape[:2]) + return triplets, X + elif(len(shape) == 2 and X is not None): + return triplets, X + elif(self.preprocessor is not None): + return triplets, self.preprocessor + else: + raise ValueError('A preprocessor is needed when triplets are indices') + + def _initialize_basis(self, triplets, X): + """ TODO: complete function description + """ + + if self.preprocessor is not None: + n_features = self.preprocessor.shape[1] + else: + n_features = triplets.shape[1] - def _initialize_basis(self, n_features): # TODO: # Add other options passed as string - authorized_basis = [] + authorized_basis = ['triplet_diffs'] if isinstance(self.basis, np.ndarray): - self.basis = check_array(self.basis) - self.n_basis = self.basis.shape[0] - if self.basis.shape[1] != n_features: + # TODO: should copy? + basis = check_array(self.basis, copy=True) + n_basis = basis.shape[0] + if basis.shape[1] != n_features: raise ValueError('The input dimensionality ({}) of the given ' 'linear transformation `init` must match the ' 'dimensionality of the given inputs `X` ({}).' - .format(self.basis.shape[1], n_features)) + .format(basis.shape[1], n_features)) elif self.basis not in authorized_basis: raise ValueError( "`basis` must be '{}' " "or a numpy array of shape (n_basis, n_features)." .format("', '".join(authorized_basis))) + if self.basis is authorized_basis[0]: + basis, n_basis = self._generate_bases_dist_diff(triplets, X) + + return basis, n_basis + + def _generate_bases_dist_diff(self, triplets, X): + """ Bases are generated from triplets as differences of positive or + negative pairs + TODO: complete function description + """ + + # TODO: Have a proportion of drawn pos and neg pairs? 
+ + # get all positive and negative pairs with lowest index first + # np.array (2*lenT,2) + T_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs = np.unique(T_pairs_sorted, axis=0) + + if self.n_basis is None: + n_basis = uniqPairs.shape[0] + + elif self.n_basis > uniqPairs.shape[0]: + print("n_basis too big") + n_basis = uniqPairs.shape[0] + + else: + n_basis = self.n_basis + + if len(triplets.shape) == 3: + pass + elif X is not None: + uniqPairs = X[uniqPairs] + else: + raise ValueError('The processor must be set if indices are used for the' + 'triplets construction') + + rng = check_random_state(self.random_state) + + # Select n_basis + selected_pairs = uniqPairs[rng.choice(uniqPairs.shape[0], + size=n_basis, replace=False), :, :] + + basis = selected_pairs[:, 0]-selected_pairs[:, 1] + + return basis, n_basis class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): @@ -268,6 +334,7 @@ def fit(self, triplets): self : object Returns the instance. """ + return self._fit(triplets) @@ -342,7 +409,7 @@ class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): that describes the supervised version of weakly supervised estimators. """ - def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis=None, + def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='LDA', n_basis=None, gamma=5e-3, max_iter=100000, output_iter=5000, verbose=False, preprocessor=None, random_state=None): self.k_genuine = k_genuine @@ -369,16 +436,17 @@ def fit(self, X, y): Returns the instance. """ X, y = self._prepare_inputs(X, y, ensure_min_samples=2) - self.preprocessor = X + # TODO: this should be replaced by a _initialize_bases_supervised + # for future adding of other approaches of basis generation if(self.basis == "LDA"): - self._generate_bases_LDA(X, y) + basis, n_basis = self._generate_bases_LDA(X, y) constraints = Constraints(y) triplets = constraints.generate_knntriplets(X, self.k_genuine, self.k_impostor) - return self._fit(triplets) + return self._fit(triplets, X, basis, n_basis) def _generate_bases_LDA(self, X, y): """ @@ -392,21 +460,33 @@ def _generate_bases_LDA(self, X, y): labels, class_count = np.unique(y, return_counts=True) n_class = len(labels) - # TODO: maybe a default value for this case? 
- if(self.n_basis is None): - raise ValueError('The number of basis given by n_basis must be set') + n_features = X.shape[1] + # Number of basis yielded from each LDA + num_eig = min(n_class-1, n_features) + + if self.n_basis is None: + n_basis = min(20*n_features, X.shape[0]*2*num_eig) + warnings.warn('The number of basis will be set to n_basis= %d' % n_basis) # n_basis must be greater or equal to n_class - if(self.n_basis < n_class): - ValueError("The number of basis should be greater than the number of " - "classes") + elif self.n_basis < n_class: + raise ValueError("The number of basis should be greater than the" + " number of classes") + elif np.issubdtype(self.n_basis, np.integer): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) - n_features = np.size(X, 1) - # Number of basis yielded from each LDA - num_eig = min(n_class-1, n_features) # Number of clusters needed for 2 scales given the number of basis # yielded by every LDA - n_clusters = int(np.ceil(self.n_basis/(2 * num_eig))) + n_clusters = int(np.ceil(n_basis/(2 * num_eig))) + + if(n_clusters > X.shape[0]): + raise ValueError("There are not enough samples to yield the required" + " amount of clusters for the selected number of basis," + " the current maximum is n_basis = %d" % + X.shape[0]*2*num_eig) # TODO: maybe give acces to Kmeans jobs for faster computation? kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state, @@ -426,8 +506,11 @@ def _generate_bases_LDA(self, X, y): idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) + # TODO: It may be better to precompute this similarly to how it is done + # with the triplets generator start = 0 finish = 0 + neigh = NearestNeighbors() for c in range(n_class): @@ -443,11 +526,11 @@ def _generate_bases_LDA(self, X, y): start = finish # Compute basis for every cluster in first scale - self.basis = np.zeros((self.n_basis, n_features)) + basis = np.zeros((n_basis, n_features)) lda = LinearDiscriminantAnalysis() for i in range(n_clusters): lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - self.basis[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) + basis[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) # second scale k = 20 @@ -459,6 +542,8 @@ def _generate_bases_LDA(self, X, y): idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) + # TODO: It may be better to precompute this similarly to how it is done + # with the triplets generator start = 0 finish = 0 @@ -476,17 +561,17 @@ def _generate_bases_LDA(self, X, y): # Compute basis for every cluster in second scale finish = num_eig * n_clusters - lda = LinearDiscriminantAnalysis() + for i in range(n_clusters): start = finish finish += num_eig # handle tail, as n_basis != n_clusters*2*n_eig - if (finish > self.n_basis): - finish = self.n_basis + if (finish > n_basis): + finish = n_basis lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - self.basis[start:finish, :] = normalize(lda.scalings_.T[:finish-start]) + basis[start:finish, :] = normalize(lda.scalings_.T[:finish-start]) - return + return basis, n_basis From e9e654c9bb7ac9a94241f39050884e7373d67ea2 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 15:23:37 +0100 Subject: [PATCH 15/70] triplet dataset format & remove_y for triplets --- test/test_utils.py | 52 +++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index d6e58ab6..bf287216 100644 --- 
a/test/test_utils.py +++ b/test/test_utils.py @@ -91,10 +91,10 @@ def build_triplets(with_preprocessor=False): triplets = constraints.generate_knntriplets(X, k_genuine=3, k_impostor=4) if with_preprocessor: # if preprocessor, we build a 2D array of triplets of indices - return triplets, X + return Dataset(triplets, np.ones(len(triplets)), X, np.arange(len(X))) else: # if not, we build a 3D array of triplets of samples - return X[triplets], None + return Dataset(X[triplets], np.ones(len(triplets)), None, X) def build_quadruplets(with_preprocessor=False): @@ -153,8 +153,9 @@ def build_quadruplets(with_preprocessor=False): _TripletsClassifierMixin, _QuadrupletsClassifierMixin) -tuples_learners = pairs_learners + quadruplets_learners -ids_tuples_learners = ids_pairs_learners + ids_quadruplets_learners +tuples_learners = pairs_learners + triplets_learners + quadruplets_learners +ids_tuples_learners = ids_pairs_learners + ids_triplets_learners \ + + ids_quadruplets_learners supervised_learners = classifiers + regressors ids_supervised_learners = ids_classifiers + ids_regressors @@ -163,13 +164,13 @@ def build_quadruplets(with_preprocessor=False): ids_metric_learners = ids_tuples_learners + ids_supervised_learners -def remove_y_quadruplets(estimator, X, y): - """Quadruplets learners have no y in fit, but to write test for all - estimators, it is convenient to have this function, that will return X and y - if the estimator needs a y to fit on, and just X otherwise.""" +def remove_y(estimator, X, y): + """Quadruplets and triplets learners have no y in fit, but to write test for + all estimators, it is convenient to have this function, that will return X + and y if the estimator needs a y to fit on, and just X otherwise.""" + no_y_fit = quadruplets_learners + triplets_learners if estimator.__class__.__name__ in [e.__class__.__name__ - for (e, _) in - quadruplets_learners]: + for (e, _) in no_y_fit]: return (X,) else: return (X, y) @@ -817,13 +818,12 @@ def test_error_message_tuple_size(estimator, _): per tuple, it throws an error message""" estimator = clone(estimator) set_random_state(estimator) - invalid_pairs = np.array([[[1.3, 6.3], [3., 6.8], [6.5, 4.4]], - [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) + invalid_pairs = np.ones((2, 5, 2)) y = [1, 1] with pytest.raises(ValueError) as raised_err: - estimator.fit(*remove_y_quadruplets(estimator, invalid_pairs, y)) - expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 3 " - "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" + estimator.fit(*remove_y(estimator, invalid_pairs, y)) + expected_msg = ("Tuples of {} element(s) expected{}. 
Got tuples of 5 " + "element(s) instead (shape=(2, 5, 2)):\ninput={}.\n" .format(estimator._tuple_size, make_context(estimator), invalid_pairs)) assert str(raised_err.value) == expected_msg @@ -897,35 +897,21 @@ def test_same_with_or_without_preprocessor(estimator, build_dataset): dataset_formed.data, random_state=SEED) - def make_random_state(estimator): - rs = {} - if estimator.__class__.__name__[-11:] == '_Supervised': - rs['random_state'] = check_random_state(SEED) - return rs - estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) estimator_with_preprocessor.set_params(preprocessor=X) - estimator_with_preprocessor.fit(*remove_y_quadruplets(estimator, - indices_train, - y_train), - **make_random_state(estimator)) + estimator_with_preprocessor.fit(*remove_y(estimator, indices_train, y_train)) estimator_without_preprocessor = clone(estimator) set_random_state(estimator_without_preprocessor) estimator_without_preprocessor.set_params(preprocessor=None) - estimator_without_preprocessor.fit(*remove_y_quadruplets(estimator, - formed_train, - y_train), - **make_random_state(estimator)) + estimator_without_preprocessor.fit(*remove_y(estimator, formed_train, + y_train)) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=X) - estimator_with_prep_formed.fit(*remove_y_quadruplets(estimator, - indices_train, - y_train), - **make_random_state(estimator)) + estimator_with_prep_formed.fit(*remove_y(estimator, indices_train, y_train)) # test prediction methods for method in ["predict", "decision_function"]: From 686b7ebb32499509b3d9c48281ef119ff29f4887 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 15:24:25 +0100 Subject: [PATCH 16/70] adaptation with dataset format --- test/test_triplets_classifiers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/test_triplets_classifiers.py b/test/test_triplets_classifiers.py index 8cedd8cc..10393919 100644 --- a/test/test_triplets_classifiers.py +++ b/test/test_triplets_classifiers.py @@ -14,7 +14,7 @@ def test_predict_only_one_or_minus_one(estimator, build_dataset, with_preprocessor): """Test that all predicted values are either +1 or -1""" - input_data, preprocessor = build_dataset(with_preprocessor) + input_data, _, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -33,7 +33,7 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, with_preprocessor): """Test that a NotFittedError is raised if someone tries to predict and the metric learner has not been fitted.""" - input_data, preprocessor = build_dataset(with_preprocessor) + input_data, _, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -46,8 +46,7 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, def test_accuracy_toy_example(estimator, build_dataset): """Test that the default scoring for triplets (accuracy) works on some toy example""" - triplets, X = build_dataset(with_preprocessor=True) - triplets = X[triplets] + triplets, _, _, X = build_dataset(with_preprocessor=False) estimator = clone(estimator) set_random_state(estimator) estimator.fit(triplets) From 4ff5f4cb5e6445620db0e76cf129e3340d541c4a Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 15:24:52 +0100 
Subject: [PATCH 17/70] remove labels for triplets and quadruplets --- test/test_base_metric.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_base_metric.py b/test/test_base_metric.py index b2b1d339..b1be4e84 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -5,7 +5,7 @@ import numpy as np from sklearn import clone from sklearn.utils.testing import set_random_state -from test.test_utils import ids_metric_learners, metric_learners +from test.test_utils import ids_metric_learners, metric_learners, remove_y def remove_spaces(s): @@ -135,12 +135,12 @@ def test_get_metric_is_independent_from_metric_learner(estimator, # we fit the metric learner on it and then we compute the metric on some # points - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) metric = model.get_metric() score = metric(X[0], X[1]) # then we refit the estimator on another dataset - model.fit(np.sin(input_data), labels) + model.fit(*remove_y(model, np.sin(input_data), labels)) # we recompute the distance between the two points: it should be the same score_bis = metric(X[0], X[1]) @@ -155,7 +155,7 @@ def test_get_metric_raises_error(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) metric = model.get_metric() list_test_get_metric_raises = [(X[0].tolist() + [5.2], X[1]), # vectors with @@ -178,7 +178,7 @@ def test_get_metric_works_does_not_raise(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) metric = model.get_metric() list_test_get_metric_doesnt_raise = [(X[0], X[1]), @@ -210,20 +210,20 @@ def test_n_components(estimator, build_dataset): if hasattr(model, 'n_components'): set_random_state(model) model.set_params(n_components=None) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert model.components_.shape == (X.shape[1], X.shape[1]) model = clone(estimator) set_random_state(model) model.set_params(n_components=X.shape[1] - 1) - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert model.components_.shape == (X.shape[1] - 1, X.shape[1]) model = clone(estimator) set_random_state(model) model.set_params(n_components=X.shape[1] + 1) with pytest.raises(ValueError) as expected_err: - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert (str(expected_err.value) == 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) @@ -231,7 +231,7 @@ def test_n_components(estimator, build_dataset): set_random_state(model) model.set_params(n_components=0) with pytest.raises(ValueError) as expected_err: - model.fit(input_data, labels) + model.fit(*remove_y(model, input_data, labels)) assert (str(expected_err.value) == 'Invalid n_components, must be in [1, {}]'.format(X.shape[1])) From dc50dc734c397962a3d37bc15cb71500f6872694 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 15:26:38 +0100 Subject: [PATCH 18/70] remove labels --- test/test_mahalanobis_mixin.py | 45 ++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 91fb435f..bf288e51 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py 
@@ -15,11 +15,12 @@ from metric_learn._util import make_context, _initialize_metric_mahalanobis from metric_learn.base_metric import (_QuadrupletsClassifierMixin, + _TripletsClassifierMixin, _PairsClassifierMixin) from metric_learn.exceptions import NonPSDError from test.test_utils import (ids_metric_learners, metric_learners, - remove_y_quadruplets, ids_classifiers) + remove_y, ids_classifiers) RNG = check_random_state(0) @@ -33,7 +34,7 @@ def test_score_pairs_pairwise(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) pairwise = model.score_pairs(np.array(list(product(X, X))))\ .reshape(n_samples, n_samples) @@ -57,7 +58,7 @@ def test_score_pairs_toy_example(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) pairs = np.stack([X[:10], X[10:20]], axis=1) embedded_pairs = pairs.dot(model.components_.T) distances = np.sqrt(np.sum((embedded_pairs[:, 1] - @@ -73,7 +74,7 @@ def test_score_pairs_finite(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) pairs = np.array(list(product(X, X))) assert np.isfinite(model.score_pairs(pairs)).all() @@ -87,7 +88,7 @@ def test_score_pairs_dim(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) tuples = np.array(list(product(X, X))) assert model.score_pairs(tuples).shape == (tuples.shape[0],) context = make_context(estimator) @@ -118,7 +119,7 @@ def test_embed_toy_example(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) embedded_points = X.dot(model.components_.T) assert_array_almost_equal(model.transform(X), embedded_points) @@ -130,7 +131,7 @@ def test_embed_dim(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert model.transform(X).shape == X.shape # assert that ValueError is thrown if input shape is 1D @@ -144,7 +145,7 @@ def test_embed_dim(estimator, build_dataset): # we test that the shape is also OK when doing dimensionality reduction if hasattr(model, 'n_components'): model.set_params(n_components=2) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert model.transform(X).shape == (X.shape[0], 2) # assert that ValueError is thrown if input shape is 1D with pytest.raises(ValueError) as raised_error: @@ -159,7 +160,7 @@ def test_embed_finite(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert np.isfinite(model.transform(X)).all() @@ -170,7 +171,7 @@ 
def test_embed_is_linear(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert_array_almost_equal(model.transform(X[:10] + X[10:20]), model.transform(X[:10]) + model.transform(X[10:20])) @@ -189,7 +190,7 @@ def test_get_metric_equivalent_to_explicit_mahalanobis(estimator, input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] a, b = (rng.randn(n_features), rng.randn(n_features)) @@ -208,7 +209,7 @@ def test_get_metric_is_pseudo_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -234,7 +235,7 @@ def test_metric_raises_deprecation_warning(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) with pytest.warns(DeprecationWarning) as raised_warning: model.metric() @@ -251,7 +252,7 @@ def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) clustering = DBSCAN(metric=model.get_metric()) clustering.fit(X) @@ -264,7 +265,7 @@ def test_get_squared_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -284,7 +285,7 @@ def test_components_is_2D(estimator, build_dataset): model = clone(estimator) set_random_state(model) # test that it works for X.shape[1] features - model.fit(*remove_y_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y(estimator, input_data, labels)) assert model.components_.shape == (X.shape[1], X.shape[1]) # test that it works for 1 feature @@ -298,12 +299,18 @@ def test_components_is_2D(estimator, build_dataset): to_keep = np.where(np.abs(diffs.ravel()) > 1e-9) trunc_data = trunc_data[to_keep] labels = labels[to_keep] + if isinstance(estimator, _TripletsClassifierMixin): + for slice_idx in [[0, 1], [0, 2]]: + pairs = trunc_data[:, slice_idx, :] + diffs = pairs[:, 1, :] - pairs[:, 0, :] + to_keep = np.abs(diffs.ravel()) > 1e-9 + trunc_data = trunc_data[to_keep] elif isinstance(estimator, _PairsClassifierMixin): diffs = trunc_data[:, 1, :] - trunc_data[:, 0, :] to_keep = np.where(np.abs(diffs.ravel()) > 1e-9) trunc_data = trunc_data[to_keep] labels = labels[to_keep] - model.fit(*remove_y_quadruplets(estimator, trunc_data, labels)) + model.fit(*remove_y(estimator, trunc_data, labels)) assert model.components_.shape == (1, 1) # the components must be 2D @@ -735,9 +742,9 @@ def test_deterministic_initialization(estimator, build_dataset): model.set_params(prior='random') model1 = 
clone(model) set_random_state(model1, 42) - model1 = model1.fit(input_data, labels) + model1 = model1.fit(*remove_y(model, input_data, labels)) model2 = clone(model) set_random_state(model2, 42) - model2 = model2.fit(input_data, labels) + model2 = model2.fit(*remove_y(model, input_data, labels)) np.testing.assert_allclose(model1.get_mahalanobis_matrix(), model2.get_mahalanobis_matrix()) From 10d1d0416f2773d046048cd263eb3c3a3861a00d Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 15:28:19 +0100 Subject: [PATCH 19/70] remove labels & old fit random_state asignation --- test/test_sklearn_compat.py | 63 +++++++++++++++---------------------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index b2056c09..539cb1ee 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -20,8 +20,8 @@ from test.test_utils import (metric_learners, ids_metric_learners, mock_preprocessor, tuples_learners, ids_tuples_learners, pairs_learners, - ids_pairs_learners, remove_y_quadruplets, - quadruplets_learners) + ids_pairs_learners, remove_y, + triplets_learners, quadruplets_learners) class Stable_RCA_Supervised(RCA_Supervised): @@ -125,8 +125,7 @@ def test_array_like_inputs(estimator, build_dataset, with_preprocessor): input_variants, label_variants = generate_array_like(input_data, labels) for input_variant in input_variants: for label_variant in label_variants: - estimator.fit(*remove_y_quadruplets(estimator, input_variant, - label_variant)) + estimator.fit(*remove_y(estimator, input_variant, label_variant)) if hasattr(estimator, "predict"): estimator.predict(input_variant) if hasattr(estimator, "predict_proba"): @@ -137,8 +136,7 @@ def test_array_like_inputs(estimator, build_dataset, with_preprocessor): estimator.decision_function(input_variant) if hasattr(estimator, "score"): for label_variant in label_variants: - estimator.score(*remove_y_quadruplets(estimator, input_variant, - label_variant)) + estimator.score(*remove_y(estimator, input_variant, label_variant)) X_variants, _ = generate_array_like(X) for X_variant in X_variants: @@ -199,13 +197,10 @@ def test_cross_validation_is_finite(estimator, build_dataset): estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) assert np.isfinite(cross_val_score(estimator, - *remove_y_quadruplets(estimator, - input_data, - labels))).all() + *remove_y(estimator, input_data, labels) + )).all() assert np.isfinite(cross_val_predict(estimator, - *remove_y_quadruplets(estimator, - input_data, - labels) + *remove_y(estimator, input_data, labels) )).all() @@ -237,28 +232,26 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, train_mask = np.ones(input_data.shape[0], bool) train_mask[test_slice] = False y_train, y_test = labels[train_mask], labels[test_slice] - estimator.fit(*remove_y_quadruplets(estimator, - input_data[train_mask], - y_train)) + estimator.fit(*remove_y(estimator, input_data[train_mask], y_train)) if hasattr(estimator, "score"): - scores.append(estimator.score(*remove_y_quadruplets( + scores.append(estimator.score(*remove_y( estimator, input_data[test_slice], y_test))) if hasattr(estimator, "predict"): predictions[test_slice] = estimator.predict(input_data[test_slice]) if hasattr(estimator, "score"): assert all(scores == cross_val_score( - estimator, *remove_y_quadruplets(estimator, input_data, labels), + estimator, *remove_y(estimator, input_data, labels), cv=kfold)) if hasattr(estimator, "predict"): assert 
all(predictions == cross_val_predict( estimator, - *remove_y_quadruplets(estimator, input_data, labels), + *remove_y(estimator, input_data, labels), cv=kfold)) def check_score(estimator, tuples, y): if hasattr(estimator, "score"): - score = estimator.score(*remove_y_quadruplets(estimator, tuples, y)) + score = estimator.score(*remove_y(estimator, tuples, y)) assert np.isfinite(score) @@ -282,7 +275,7 @@ def test_simple_estimator(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) - estimator.fit(*remove_y_quadruplets(estimator, tuples_train, y_train)) + estimator.fit(*remove_y(estimator, tuples_train, y_train)) check_score(estimator, tuples_test, y_test) check_predict(estimator, tuples_test) @@ -329,9 +322,7 @@ def test_estimators_fit_returns_self(estimator, build_dataset, input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) - assert estimator.fit(*remove_y_quadruplets(estimator, - input_data, - labels)) is estimator + assert estimator.fit(*remove_y(estimator, input_data, labels)) is estimator @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -343,9 +334,10 @@ def test_pipeline_consistency(estimator, build_dataset, # check that make_pipeline(est) gives same score as est # we do this test on all except quadruplets (since they don't have a y # in fit): + no_label_learners = quadruplets_learners + triplets_learners if estimator.__class__.__name__ not in [e.__class__.__name__ for (e, _) in - quadruplets_learners]: + no_label_learners]: input_data, y, preprocessor, _ = build_dataset(with_preprocessor) def make_random_state(estimator, in_pipeline): @@ -359,20 +351,17 @@ def make_random_state(estimator, in_pipeline): return rs estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) + estimator.set_params(preprocessor=preprocessor, + **make_random_state(estimator, False)) pipeline = make_pipeline(estimator) - estimator.fit(*remove_y_quadruplets(estimator, input_data, y), - **make_random_state(estimator, False)) - pipeline.fit(*remove_y_quadruplets(estimator, input_data, y), - **make_random_state(estimator, True)) + estimator.fit(input_data, y) + estimator.set_params(preprocessor=preprocessor) + pipeline.set_params(**make_random_state(estimator, True)) + pipeline.fit(input_data, y) if hasattr(estimator, 'score'): - result = estimator.score(*remove_y_quadruplets(estimator, - input_data, - y)) - result_pipe = pipeline.score(*remove_y_quadruplets(estimator, - input_data, - y)) + result = estimator.score(input_data, y) + result_pipe = pipeline.score(input_data, y) assert_allclose_dense_sparse(result, result_pipe) if hasattr(estimator, 'predict'): @@ -398,7 +387,7 @@ def test_dict_unchanged(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "n_components"): estimator.n_components = 1 - estimator.fit(*remove_y_quadruplets(estimator, input_data, labels)) + estimator.fit(*remove_y(estimator, input_data, labels)) def check_dict(): assert estimator.__dict__ == dict_before, ( @@ -429,7 +418,7 @@ def test_dont_overwrite_parameters(estimator, build_dataset, estimator.n_components = 1 dict_before_fit = estimator.__dict__.copy() - estimator.fit(*remove_y_quadruplets(estimator, input_data, labels)) + estimator.fit(*remove_y(estimator, input_data, labels)) dict_after_fit = estimator.__dict__ public_keys_after_fit = [key for key in dict_after_fit.keys() 
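The next patch works around NumPy releases that predate the `where=` keyword of `np.sum`. The intended pattern, sketched standalone here for clarity (assuming only NumPy; `sum_where` mirrors the helper the patch defines), probes the keyword once at import time and falls back to boolean-mask indexing when it is unsupported:

    import numpy as np

    try:
        # probe once at import time: old NumPy raises TypeError on `where=`
        np.sum([[0, 1], [1, 1]], where=[False, True], axis=1)
    except TypeError:
        def sum_where(X, where):
            # fallback for old NumPy: apply the boolean mask, then sum
            return np.sum(X[where])
    else:
        def sum_where(X, where):
            # recent NumPy: let np.sum skip masked-out entries itself
            return np.sum(X, where=where)

As first committed the two branch bodies end up swapped (and one definition is misspelled `sum_were`); the follow-up patch 21, "small typo and fix order", corrects exactly that, arriving at the form above.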
From 28146628b4e15ff99f29ca89de6ac6b74d1f9236 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 15:59:04 +0100 Subject: [PATCH 20/70] compliant with older numpy versions --- metric_learn/scml.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index f233ea45..164da4bd 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -14,6 +14,16 @@ from sklearn.utils import check_array, check_random_state import warnings +# hack around lack of where in older numpy versions +try: + np.sum([[0, 1], [1, 1]], where=[False, True], axis=1) +except TypeError: + def sum_were(X, where): + return np.sum(X, where=where) +else: + def sum_where(X, where): + return np.sum(X[where]) + class _BaseSCML_global(MahalanobisMixin): @@ -74,12 +84,12 @@ def _fit(self, triplets, X=None, basis=None, n_basis=None): # Every triplet distance difference in the space given by L # plus a slack of one - slack_val = 1 + np.matmul(dist_diff, w.T, order='F') + slack_val = 1 + np.matmul(dist_diff, w.T) # Mask of places with positive slack slack_mask = slack_val > 0 # loss function of learning task part of obj function - obj2 = np.sum(slack_val, where=slack_mask)/sizeT + obj2 = sum_where(slack_val, slack_mask)/sizeT obj = obj1 + obj2 if(self.verbose): @@ -156,7 +166,7 @@ def _get_components(self, w, basis): return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(B) else: # if metric is full rank - return np.linalg.cholesky(np.matmul(basis.T, w.T*basis, order='F')).T + return np.linalg.cholesky(np.matmul(basis.T, w.T*basis)).T def _to_index_points(self, triplets, X=None): shape = triplets.shape From e9f436283efc66da7400abe713a6bcf3983f8bb8 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 16:21:27 +0100 Subject: [PATCH 21/70] small typo and fix order --- metric_learn/scml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 164da4bd..1e078d32 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -18,11 +18,11 @@ try: np.sum([[0, 1], [1, 1]], where=[False, True], axis=1) except TypeError: - def sum_were(X, where): - return np.sum(X, where=where) -else: def sum_where(X, where): return np.sum(X[where]) +else: + def sum_where(X, where): + return np.sum(X, where=where) class _BaseSCML_global(MahalanobisMixin): From a9d1a025e82362209addb8ecaf37d72be7b9481f Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 11 Mar 2020 16:37:05 +0100 Subject: [PATCH 22/70] fix n_basis check --- metric_learn/scml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 1e078d32..01eb83a6 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -482,7 +482,7 @@ def _generate_bases_LDA(self, X, y): elif self.n_basis < n_class: raise ValueError("The number of basis should be greater than the" " number of classes") - elif np.issubdtype(self.n_basis, np.integer): + elif isinstance(self.n_basis, int): n_basis = self.n_basis else: raise ValueError("n_basis should be an integer, instead it is of type %s" From b1c01fd9dd718abfef03611a8be4f8ff250d8792 Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 12 Mar 2020 13:17:46 +0100 Subject: [PATCH 23/70] initialize_basis_supervised and some refactoring --- metric_learn/scml.py | 54 ++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 01eb83a6..6f1f9e54 
100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -170,16 +170,9 @@ def _get_components(self, w, basis): def _to_index_points(self, triplets, X=None): shape = triplets.shape - if len(shape) == 3: - X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) - triplets = triplets.reshape(shape[:2]) - return triplets, X - elif(len(shape) == 2 and X is not None): - return triplets, X - elif(self.preprocessor is not None): - return triplets, self.preprocessor - else: - raise ValueError('A preprocessor is needed when triplets are indices') + X, triplets = np.unique(np.vstack(triplets), return_inverse=True, axis=0) + triplets = triplets.reshape(shape[:2]) + return triplets, X def _initialize_basis(self, triplets, X): """ TODO: complete function description @@ -232,19 +225,15 @@ def _generate_bases_dist_diff(self, triplets, X): n_basis = uniqPairs.shape[0] elif self.n_basis > uniqPairs.shape[0]: - print("n_basis too big") n_basis = uniqPairs.shape[0] + warnings.warn("The selected number of basis is greater than the number " + "of points, only n_basis = %d will be generated" % + n_basis) else: n_basis = self.n_basis - if len(triplets.shape) == 3: - pass - elif X is not None: - uniqPairs = X[uniqPairs] - else: - raise ValueError('The processor must be set if indices are used for the' - 'triplets construction') + uniqPairs = X[uniqPairs] rng = check_random_state(self.random_state) @@ -447,10 +436,7 @@ def fit(self, X, y): """ X, y = self._prepare_inputs(X, y, ensure_min_samples=2) - # TODO: this should be replaced by a _initialize_bases_supervised - # for future adding of other approaches of basis generation - if(self.basis == "LDA"): - basis, n_basis = self._generate_bases_LDA(X, y) + basis, n_basis = self._initialize_basis_supervised(X, y) constraints = Constraints(y) triplets = constraints.generate_knntriplets(X, self.k_genuine, @@ -458,6 +444,30 @@ def fit(self, X, y): return self._fit(triplets, X, basis, n_basis) + def _initialize_basis_supervised(self, X, y): + """ TODO: complete function description + """ + + # TODO: + # Add other options passed as string + authorized_basis = ['triplet_diffs'] + supervised_basis = ['LDA'] + authorized_basis += supervised_basis + + if not(isinstance(self.basis, np.ndarray)) \ + and self.basis not in authorized_basis: + raise ValueError( + "`basis` must be '{}' " + "or a numpy array of shape (n_basis, n_features)." 
+ .format("', '".join(authorized_basis))) + + if self.basis is supervised_basis[0]: + basis, n_basis = self._generate_bases_LDA(X, y) + else: + basis, n_basis = None, None + + return basis, n_basis + def _generate_bases_LDA(self, X, y): """ Helper function that computes the n_basis basis set constructed from the From f4217c8b8d7e030582a536fafbe7652a703c466b Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 12 Mar 2020 15:25:14 +0100 Subject: [PATCH 24/70] proper n_basis handling --- metric_learn/scml.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 6f1f9e54..e38f66ca 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -222,9 +222,17 @@ def _generate_bases_dist_diff(self, triplets, X): uniqPairs = np.unique(T_pairs_sorted, axis=0) if self.n_basis is None: + # TODO: Get a good default n_basis directive n_basis = uniqPairs.shape[0] + warnings.warn('The number of basis will be set to n_basis= %d' % n_basis) + + elif isinstance(self.n_basis, int): + n_basis = self.n_basis + else: + raise ValueError("n_basis should be an integer, instead it is of type %s" + % type(self.n_basis)) - elif self.n_basis > uniqPairs.shape[0]: + if n_basis > uniqPairs.shape[0]: n_basis = uniqPairs.shape[0] warnings.warn("The selected number of basis is greater than the number " "of points, only n_basis = %d will be generated" % @@ -452,7 +460,7 @@ def _initialize_basis_supervised(self, X, y): # Add other options passed as string authorized_basis = ['triplet_diffs'] supervised_basis = ['LDA'] - authorized_basis += supervised_basis + authorized_basis = supervised_basis + authorized_basis if not(isinstance(self.basis, np.ndarray)) \ and self.basis not in authorized_basis: @@ -485,19 +493,23 @@ def _generate_bases_LDA(self, X, y): num_eig = min(n_class-1, n_features) if self.n_basis is None: + # TODO: Get a good default n_basis directive n_basis = min(20*n_features, X.shape[0]*2*num_eig) warnings.warn('The number of basis will be set to n_basis= %d' % n_basis) - # n_basis must be greater or equal to n_class - elif self.n_basis < n_class: - raise ValueError("The number of basis should be greater than the" - " number of classes") elif isinstance(self.n_basis, int): n_basis = self.n_basis else: raise ValueError("n_basis should be an integer, instead it is of type %s" % type(self.n_basis)) + if n_basis <= n_class: + raise ValueError("The number of basis should be greater than the" + " number of classes") + elif n_basis >= X.shape[0]*2*num_eig: + raise ValueError("The selected number of basis needs a greater number of" + " clusters than the number of available samples") + # Number of clusters needed for 2 scales given the number of basis # yielded by every LDA n_clusters = int(np.ceil(n_basis/(2 * num_eig))) From 8dd0fbe644ac129e159311844dbc4db8cde8f1a4 Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 12 Mar 2020 15:25:57 +0100 Subject: [PATCH 25/70] scml specific tests --- test/metric_learn_test.py | 109 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 3 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 5a271890..fd32f6c0 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -21,9 +21,10 @@ else: HAS_SKGGM = True from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, - LSML_Supervised, ITML_Supervised, SDML_Supervised, - RCA_Supervised, MMC_Supervised, SDML, RCA, ITML, - LSML) + SCML_global_Supervised, LSML_Supervised, + 
ITML_Supervised, SDML_Supervised, RCA_Supervised, + MMC_Supervised, SDML, RCA, ITML, LSML, SCML_global, + Constraints) # Import this specially for testing. from metric_learn.constraints import wrap_pairs from metric_learn.lmnn import _sum_outer_products @@ -76,6 +77,108 @@ def test_singular_returns_pseudo_inverse(self): pseudo_inverse) +class TestSCML(MetricTestCase): + def test_iris(self): + scml = SCML_global_Supervised() + scml.fit(self.iris_points, self.iris_labels) + + csep = class_separation(scml.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.3) + + def test_bad_basis(self): + scml = SCML_global(basis='bad_basis') + triplets = np.ones((3, 3, 3)) + authorized_basis = ['triplet_diffs'] + msg = ("`basis` must be '{}' or a numpy array of shape (n_basis, " + "n_features).".format("', '".join(authorized_basis))) + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + def test_big_n_basis(self): + scml = SCML_global(n_basis=4) + triplets = np.ones((3, 3, 3)) + n_basis = 1 + msg = ("The selected number of basis is greater than the number of points" + ", only n_basis = %d will be generated" % n_basis) + with pytest.warns(UserWarning) as raised_warning: + scml.fit(triplets) + assert msg == raised_warning[0].message.args[0] + + def test_n_basis_wrong_type(self): + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + + constraints = Constraints(y) + triplets = constraints.generate_knntriplets(X, k_genuine=1, k_impostor=1) + triplets = X[triplets] + + n_basis = 4.0 + + scml = SCML_global(n_basis=n_basis) + msg = ("n_basis should be an integer, instead it is of type %s" + % type(n_basis)) + with pytest.raises(ValueError) as raised_error: + scml.fit(triplets) + assert msg == raised_error.value.args[0] + + def test_bad_basis_supervised(self): + scml = SCML_global_Supervised(basis='bad_basis') + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + authorized_basis = ['triplet_diffs'] + supervised_basis = ['LDA'] + authorized_basis = supervised_basis + authorized_basis + msg = ("`basis` must be '{}' or a numpy array of shape (n_basis, " + "n_features).".format("', '".join(authorized_basis))) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + def test_small_n_basis_supervised(self): + X = np.array([[0, 0], [1, 1], [3, 3]]) + y = np.array([1, 2, 3]) + + labels, class_count = np.unique(y, return_counts=True) + n_class = len(labels) + + scml = SCML_global_Supervised(n_basis=n_class) + msg = ("The number of basis should be greater than the number of classes") + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + def test_big_n_basis_supervised(self): + X = np.array([[0, 0], [1, 1], [3, 3]]) + y = np.array([1, 2, 3]) + + labels, class_count = np.unique(y, return_counts=True) + n_class = len(labels) + num_eig = min(n_class-1, X.shape[1]) + + n_basis = X.shape[0]*2*num_eig + + scml = SCML_global_Supervised(n_basis=n_basis) + msg = ("The selected number of basis needs a greater number of clusters" + " than the number of available samples") + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + def test_n_basis_wrong_type_supervised(self): + X = np.array([[0, 0], [1, 1], [3, 3]]) + y = np.array([1, 2, 3]) + + n_basis = 4.0 + + scml = SCML_global_Supervised(n_basis=n_basis) + msg = ("n_basis should 
be an integer, instead it is of type %s" + % type(n_basis)) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + class TestLSML(MetricTestCase): def test_iris(self): lsml = LSML_Supervised(num_constraints=200) From 8c6567e609db6496d404b7dac22e5ffd55873d4b Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 12 Mar 2020 16:00:28 +0100 Subject: [PATCH 26/70] remove small mistake --- metric_learn/scml.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index e38f66ca..a60914a7 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -238,9 +238,6 @@ def _generate_bases_dist_diff(self, triplets, X): "of points, only n_basis = %d will be generated" % n_basis) - else: - n_basis = self.n_basis - uniqPairs = X[uniqPairs] rng = check_random_state(self.random_state) From b8bc94eec2939be44e5202a6d2823e865c529e36 Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 12 Mar 2020 16:46:33 +0100 Subject: [PATCH 27/70] test user input basis --- metric_learn/scml.py | 12 +----------- test/metric_learn_test.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index a60914a7..ee754a54 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -177,11 +177,7 @@ def _to_index_points(self, triplets, X=None): def _initialize_basis(self, triplets, X): """ TODO: complete function description """ - - if self.preprocessor is not None: - n_features = self.preprocessor.shape[1] - else: - n_features = triplets.shape[1] + n_features = X.shape[1] # TODO: # Add other options passed as string @@ -511,12 +507,6 @@ def _generate_bases_LDA(self, X, y): # yielded by every LDA n_clusters = int(np.ceil(n_basis/(2 * num_eig))) - if(n_clusters > X.shape[0]): - raise ValueError("There are not enough samples to yield the required" - " amount of clusters for the selected number of basis," - " the current maximum is n_basis = %d" % - X.shape[0]*2*num_eig) - # TODO: maybe give acces to Kmeans jobs for faster computation? kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state, algorithm='elkan').fit(X) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index fd32f6c0..b272b7bf 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -178,6 +178,25 @@ def test_n_basis_wrong_type_supervised(self): scml.fit(X, y) assert msg == raised_error.value.args[0] + def test_array_basis_supervised(self): + """ Test that the proper error is raised when the shape of the input basis + array is not consistent with the input + """ + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + + basis = np.eye(3) + + scml = SCML_global_Supervised(n_basis=3, basis=basis, k_genuine=1, + k_impostor=1) + + msg = ('The input dimensionality ({}) of the given linear transformation ' + '`init` must match the dimensionality of the given inputs `X` ({}).' 
+ .format(basis.shape[1], X.shape[1])) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + class TestLSML(MetricTestCase): def test_iris(self): From cfad0b969f4cb46906d6c1bfdbfb0f62c61e38dd Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 17 Mar 2020 12:49:08 +0100 Subject: [PATCH 28/70] Changed names and messages and some refactoring --- metric_learn/__init__.py | 6 +- metric_learn/scml.py | 198 ++++++++++++++++++++++---------------- test/metric_learn_test.py | 35 ++++--- test/test_utils.py | 6 +- 4 files changed, 138 insertions(+), 107 deletions(-) diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index 6e7cc30c..38aa2f7e 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -11,12 +11,12 @@ from .rca import RCA, RCA_Supervised from .mlkr import MLKR from .mmc import MMC, MMC_Supervised -from .scml import SCML_global, SCML_global_Supervised +from .scml import SCML, SCML_Supervised from ._version import __version__ __all__ = ['Constraints', 'Covariance', 'ITML', 'ITML_Supervised', 'LMNN', 'LSML', 'LSML_Supervised', 'SDML', 'SDML_Supervised', 'NCA', 'LFDA', 'RCA', 'RCA_Supervised', - 'MLKR', 'MMC', 'MMC_Supervised', 'SCML_global', - 'SCML_global_Supervised', '__version__'] + 'MLKR', 'MMC', 'MMC_Supervised', 'SCML', + 'SCML_Supervised', '__version__'] diff --git a/metric_learn/scml.py b/metric_learn/scml.py index ee754a54..a5d29e8c 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -25,7 +25,7 @@ def sum_where(X, where): return np.sum(X, where=where) -class _BaseSCML_global(MahalanobisMixin): +class _BaseSCML(MahalanobisMixin): _tuple_size = 3 # constraints are triplets @@ -41,7 +41,7 @@ def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, self.verbose = verbose self.preprocessor = preprocessor self.random_state = random_state - super(_BaseSCML_global, self).__init__(preprocessor) + super(_BaseSCML, self).__init__(preprocessor) def _fit(self, triplets, X=None, basis=None, n_basis=None): """ @@ -61,14 +61,14 @@ def _fit(self, triplets, X=None, basis=None, n_basis=None): # compliant with the current handling of inputs it is converted # back to indices by the following function. This should be improved # in the future. 
- triplets, X = self._to_index_points(triplets, X) + triplets, X = self._to_index_points(triplets) if basis is None: basis, n_basis = self._initialize_basis(triplets, X) dist_diff = self._compute_dist_diff(triplets, X, basis) - sizeT = triplets.shape[0] + n_triplets = triplets.shape[0] w = np.zeros((1, n_basis)) avg_grad_w = np.zeros((1, n_basis)) @@ -76,9 +76,9 @@ def _fit(self, triplets, X=None, basis=None, n_basis=None): best_obj = np.inf rng = check_random_state(self.random_state) - rand_int = rng.randint(low=0, high=sizeT, size=self.max_iter) + rand_int = rng.randint(low=0, high=n_triplets, size=self.max_iter) for iter in range(self.max_iter): - if (iter % self.output_iter == 0): + if iter % self.output_iter == 0: # regularization part of obj function obj1 = np.sum(w)*self.beta @@ -89,16 +89,16 @@ def _fit(self, triplets, X=None, basis=None, n_basis=None): slack_mask = slack_val > 0 # loss function of learning task part of obj function - obj2 = sum_where(slack_val, slack_mask)/sizeT + obj2 = sum_where(slack_val, slack_mask)/n_triplets obj = obj1 + obj2 - if(self.verbose): + if self.verbose: count = np.sum(slack_mask) print("[Global] iter %d\t obj %.6f\t num_imp %d" % (iter, obj, count)) # update the best - if (obj < best_obj): + if obj < best_obj: best_obj = obj best_w = w @@ -109,7 +109,7 @@ def _fit(self, triplets, X=None, basis=None, n_basis=None): slack_val = 1 + np.matmul(dist_diff[idx, :], w.T) - if (slack_val > 0): + if slack_val > 0: avg_grad_w = (iter * avg_grad_w + dist_diff[idx, :]) / (iter+1) else: avg_grad_w = iter * avg_grad_w / (iter+1) @@ -119,11 +119,11 @@ def _fit(self, triplets, X=None, basis=None, n_basis=None): # proximal operator with negative trimming equivalent w = scale_f * np.minimum(avg_grad_w + self.beta, 0) - if(self.verbose): + if self.verbose: print("max iteration reached.") # return L matrix yielded from best weights - self.components_ = self._get_components(best_w, basis) + self.components_ = self._components_from_basis_weights(basis, best_w) return self @@ -135,7 +135,7 @@ def _fit(self, triplets, X=None, basis=None, n_basis=None): # Transformation of data by the basis set XB = np.matmul(X, basis.T) - lenT = len(T) + lenT = T.shape[0] # get all positive and negative pairs with lowest index first # np.array (2*lenT,2) T_pairs_sorted = np.sort(np.vstack((T[:, [0, 1]], T[:, [0, 2]])), kind='stable') # calculate all unique pairs and their indices uniqPairs, indices = np.unique(T_pairs_sorted, return_inverse=True, axis=0) # calculate L2 distance according to bases only for unique pairs - dist = np.square(XB[uniqPairs[:, 0], :]-XB[uniqPairs[:, 1], :]) + dist = np.square(XB[uniqPairs[:, 0], :] - XB[uniqPairs[:, 1], :]) # return the difference of distances between all positive and negative # pairs - return dist[indices[:lenT]]-dist[indices[lenT:]] + return dist[indices[:lenT]] - dist[indices[lenT:]] - def _get_components(self, w, basis): + def _components_from_basis_weights(self, basis, w): """ get components matrix (L) from computed mahalanobis matrix """ @@ -160,15 +160,18 @@ def _get_components(self, w, basis): w = w[..., active_idx] basis = basis[active_idx, :] - K, d = basis.shape + n_basis, n_features = basis.shape - if(K < d): # if metric is low-rank - return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(B) + if n_basis < n_features: # if metric is low-rank + warnings.warn("The number of effective bases is less than the number of" " features of the input, in consequence the learned " "transformation reduces the dimension to %d." 
% n_basis) + return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(basis) else: # if metric is full rank return np.linalg.cholesky(np.matmul(basis.T, w.T*basis)).T -class SCML_global(_BaseSCML_global, _TripletsClassifierMixin): +class SCML(_BaseSCML, _TripletsClassifierMixin): """Sparse Compositional Metric Learning (SCML) - `SCML` builds a metric as the sparse positive combination of a set of locally - discriminative rank-one PSD basis. This allows an optimization scheme with - only `K` parameters, that can be yielded with an efficient stochastic - composite optimization over a set of triplets constraints. Each triplet is - constructed as a relative distance comparison with respect to the first - element so that the second element is closer than the last. + `SCML` learns a metric from triplet constraints by optimizing sparse + positive weights assigned to a set of `K` locally discriminative rank-one + PSD bases. This can be formulated as an optimization problem with only `K` + parameters, which can be solved with an efficient stochastic composite scheme. + Read more in the :ref:`User Guide <scml>`. + Parameters ---------- beta: float (default=1e-5) L1 regularization parameter. + + basis : string or array-like, optional (default='triplet_diffs') + Set of bases to construct the metric. Possible options are + 'triplet_diffs', and an array-like of shape (n_basis, n_features). + + 'triplet_diffs' + The basis set is constructed from the differences between points of + `n_basis` positive or negative pairs taken from the triplets + constraints. + + array-like A matrix of shape (n_basis, n_features), that will be used as the basis set for the metric construction. + n_basis : int, optional Number of basis to be yielded. In case it is not set it will be set based - on the basis numpy array. If an string option is pased to basis an error - wild be raised as this value will be needed. + on `basis`. 
If no value is selected a default will be computed based on + the input. + gamma: float (default = 5e-3) - Learning rate + Learning rate for the optimization algorithm. + max_iter : int (default = 100000) - Number of iterations for the algorithm + Number of iterations for the algorithm. + output_iter : int (default = 5000) Number of iterations to check current weights performance and output this information in case verbose is True. + verbose : bool, optional - if True, prints information while learning + If True, prints information while learning. + preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get triplets from indices. If array-like, triplets will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) A pseudo random number generator object or a seed for it if int. + Attributes ---------- components_ : `numpy.ndarray`, shape=(n_features, n_features) The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + metric (See function `_components_from_basis_weights`.) + Examples -------- - >>> from metric_learn import SCLM_global_Supervised - >>> from sklearn.datasets import load_iris - >>> iris_data = load_iris() - >>> X = iris_data['data'] - >>> Y = iris_data['target'] - >>> scml = SCML_global_Supervised(basis='LDA', n_basis=400) - >>> scml.fit(X, Y) + >>> from metric_learn import SCML + >>> triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + >>> [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + >>> [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + >>> [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + >>> scml = SCML() + >>> scml.fit(triplets) + References ---------- .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. `_. \ (AAAI), 2014. + .. [2] Adapted from original \ `Matlab implementation.`_. + See Also -------- - metric_learn.SCML_global : The original weakly-supervised algorithm. + metric_learn.SCML_Supervised : The supervised version of the algorithm. :ref:`supervised_version` : The section of the project documentation that describes the supervised version of weakly supervised estimators. @@ -324,10 +342,8 @@ def fit(self, triplets): triplets : array-like, shape=(n_constraints, 3, n_features) or \ (n_constraints, 3) 3D array-like of triplets of points or 2D array of triplets of - indicators. In order to supervise the algorithm in the right way, we - should have the three samples ordered in a way such that: - d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 1], triplets[i, 3]) - for all 0 <= i < n_constraints. + indicators. Triplets are assumed to be ordered such that: + d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 0], triplets[i, 2]). Returns ------- @@ -338,53 +354,65 @@ def fit(self, triplets): return self._fit(triplets) -class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): +class SCML_Supervised(_BaseSCML, TransformerMixin): """Supervised version of Sparse Compositional Metric Learning (SCML) - `SCML_global_Supervised` creates triplets by taking `k_genuine` neighbours - of the same class and `k_impostor` neighbours from diferent classes for each + `SCML_Supervised` creates triplets by taking `k_genuine` neighbours + of the same class and `k_impostor` neighbours from different classes for each point and then runs the SCML algorithm on these triplets. + Read more in the :ref:`User Guide `. + Parameters ---------- beta: float (default=1e-5) L1 regularization parameter. 
- basis : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'LDA', and a numpy array of shape (n_basis, n_features). If - None an error will be raised as the basis set is esential - to SCML. + + basis : string or an array-like, optional (default='LDA') + Set of bases to construct the metric. Possible options are + 'LDA', and an array-like of shape (n_basis, n_features). + 'LDA' The `n_basis` basis set is constructed from the LDA of significant local regions in the feature space via clustering, for each region center k-nearest neighbors are used to obtain the LDA scalings, which correspond to the locally discriminative basis. - numpy array + + array-like A matrix of shape (n_basis, n_features), that will be used as the basis set for the metric construction. + n_basis : int, optional Number of basis to be yielded. In case it is not set it will be set based - on the basis numpy array. If an string option is pased to basis an error - wild be raised as this value will be needed. + on `basis`. If no value is selected a default will be computed based on + the input. + gamma: float (default = 5e-3) - Learning rate + Learning rate for the optimization algorithm. + max_iter : int (default = 100000) - Number of iterations for the algorithm + Number of iterations for the algorithm. + output_iter : int (default = 5000) Number of iterations to check current weights performance and output this information in case verbose is True. + verbose : bool, optional - if True, prints information while learning + If True, prints information while learning. + preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get triplets from indices. If array-like, triplets will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) A pseudo random number generator object or a seed for it if int. + Attributes ---------- components_ : `numpy.ndarray`, shape=(n_features, n_features) The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + metric (See function `_components_from_basis_weights`.) + Examples -------- >>> from metric_learn import SCML @@ -394,17 +422,21 @@ class SCML_global_Supervised(_BaseSCML_global, TransformerMixin): >>> scml.fit(triplets) SCML(beta=1e-5, B=None, max_iter=100000, verbose=False, preprocessor=None, random_state=None) + References ---------- .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. `_. \ (AAAI), 2014. + .. [2] Adapted from original \ `Matlab implementation.`_. + See Also -------- - metric_learn.SCML_global_Supervised : The supervised version of this + metric_learn.SCML_Supervised : The supervised version of this algorithm, which construct the triplets from the labels. + :ref:`supervised_version` : The section of the project documentation that describes the supervised version of weakly supervised estimators. 
""" @@ -414,10 +446,10 @@ def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='LDA', verbose=False, preprocessor=None, random_state=None): self.k_genuine = k_genuine self.k_impostor = k_impostor - _BaseSCML_global.__init__(self, beta=beta, basis=basis, n_basis=n_basis, - max_iter=max_iter, verbose=verbose, - preprocessor=preprocessor, - random_state=random_state) + _BaseSCML.__init__(self, beta=beta, basis=basis, n_basis=n_basis, + max_iter=max_iter, verbose=verbose, + preprocessor=preprocessor, + random_state=random_state) def fit(self, X, y): """Create constraints from labels and learn the SCML model. @@ -458,11 +490,11 @@ def _initialize_basis_supervised(self, X, y): if not(isinstance(self.basis, np.ndarray)) \ and self.basis not in authorized_basis: raise ValueError( - "`basis` must be '{}' " - "or a numpy array of shape (n_basis, n_features)." + "`basis` must be one of the options '{}' " + "or an array of shape (n_basis, n_features)." .format("', '".join(authorized_basis))) - if self.basis is supervised_basis[0]: + if self.basis == 'LDA': basis, n_basis = self._generate_bases_LDA(X, y) else: basis, n_basis = None, None @@ -586,7 +618,7 @@ def _generate_bases_LDA(self, X, y): finish += num_eig # handle tail, as n_basis != n_clusters*2*n_eig - if (finish > n_basis): + if finish > n_basis: finish = n_basis lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index b272b7bf..4fed07d3 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -21,9 +21,9 @@ else: HAS_SKGGM = True from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, - SCML_global_Supervised, LSML_Supervised, + SCML_Supervised, LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, - MMC_Supervised, SDML, RCA, ITML, LSML, SCML_global, + MMC_Supervised, SDML, RCA, ITML, LSML, SCML, Constraints) # Import this specially for testing. 
from metric_learn.constraints import wrap_pairs @@ -79,24 +79,24 @@ def test_singular_returns_pseudo_inverse(self): class TestSCML(MetricTestCase): def test_iris(self): - scml = SCML_global_Supervised() + scml = SCML_Supervised() scml.fit(self.iris_points, self.iris_labels) csep = class_separation(scml.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.3) def test_bad_basis(self): - scml = SCML_global(basis='bad_basis') + scml = SCML(basis='bad_basis') triplets = np.ones((3, 3, 3)) authorized_basis = ['triplet_diffs'] - msg = ("`basis` must be '{}' or a numpy array of shape (n_basis, " - "n_features).".format("', '".join(authorized_basis))) + msg = ("`basis` must be one of the options '{}' or an array of shape " + "(n_basis, n_features).".format("', '".join(authorized_basis))) with pytest.raises(ValueError) as raised_error: scml.fit(triplets) assert msg == raised_error.value.args[0] def test_big_n_basis(self): - scml = SCML_global(n_basis=4) + scml = SCML(n_basis=4) triplets = np.ones((3, 3, 3)) n_basis = 1 msg = ("The selected number of basis is greater than the number of points" @@ -115,7 +115,7 @@ def test_n_basis_wrong_type(self): n_basis = 4.0 - scml = SCML_global(n_basis=n_basis) + scml = SCML(n_basis=n_basis) msg = ("n_basis should be an integer, instead it is of type %s" % type(n_basis)) with pytest.raises(ValueError) as raised_error: @@ -123,14 +123,14 @@ def test_n_basis_wrong_type(self): assert msg == raised_error.value.args[0] def test_bad_basis_supervised(self): - scml = SCML_global_Supervised(basis='bad_basis') + scml = SCML_Supervised(basis='bad_basis') X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) y = np.array([1, 0, 1, 0]) authorized_basis = ['triplet_diffs'] supervised_basis = ['LDA'] authorized_basis = supervised_basis + authorized_basis - msg = ("`basis` must be '{}' or a numpy array of shape (n_basis, " - "n_features).".format("', '".join(authorized_basis))) + msg = ("`basis` must be one of the options '{}' or an array of shape " + "(n_basis, n_features).".format("', '".join(authorized_basis))) with pytest.raises(ValueError) as raised_error: scml.fit(X, y) assert msg == raised_error.value.args[0] @@ -142,7 +142,7 @@ def test_small_n_basis_supervised(self): labels, class_count = np.unique(y, return_counts=True) n_class = len(labels) - scml = SCML_global_Supervised(n_basis=n_class) + scml = SCML_Supervised(n_basis=n_class) msg = ("The number of basis should be greater than the number of classes") with pytest.raises(ValueError) as raised_error: scml.fit(X, y) @@ -158,7 +158,7 @@ def test_big_n_basis_supervised(self): n_basis = X.shape[0]*2*num_eig - scml = SCML_global_Supervised(n_basis=n_basis) + scml = SCML_Supervised(n_basis=n_basis) msg = ("The selected number of basis needs a greater number of clusters" " than the number of available samples") with pytest.raises(ValueError) as raised_error: @@ -171,7 +171,7 @@ def test_n_basis_wrong_type_supervised(self): n_basis = 4.0 - scml = SCML_global_Supervised(n_basis=n_basis) + scml = SCML_Supervised(n_basis=n_basis) msg = ("n_basis should be an integer, instead it is of type %s" % type(n_basis)) with pytest.raises(ValueError) as raised_error: @@ -187,11 +187,10 @@ def test_array_basis_supervised(self): basis = np.eye(3) - scml = SCML_global_Supervised(n_basis=3, basis=basis, k_genuine=1, - k_impostor=1) + scml = SCML_Supervised(n_basis=3, basis=basis, k_genuine=1, k_impostor=1) - msg = ('The input dimensionality ({}) of the given linear transformation ' - '`init` must match the dimensionality of the 
given inputs `X` ({}).' + msg = ('The dimensionality ({}) of the provided bases must match the ' + 'dimensionality of the given inputs `X` ({}).' .format(basis.shape[1], X.shape[1])) with pytest.raises(ValueError) as raised_error: scml.fit(X, y) diff --git a/test/test_utils.py b/test/test_utils.py index bf287216..6dfa22df 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -16,7 +16,7 @@ from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised, - SCML_global, SCML_global_Supervised, Constraints) + SCML, SCML_Supervised, Constraints) from metric_learn.base_metric import (ArrayIndexer, MahalanobisMixin, _PairsClassifierMixin, _TripletsClassifierMixin, @@ -117,7 +117,7 @@ def build_quadruplets(with_preprocessor=False): [learner for (learner, _) in quadruplets_learners])) -triplets_learners = [(SCML_global(), build_triplets)] +triplets_learners = [(SCML(), build_triplets)] ids_triplets_learners = list(map(lambda x: x.__class__.__name__, [learner for (learner, _) in triplets_learners])) @@ -139,7 +139,7 @@ def build_quadruplets(with_preprocessor=False): (MMC_Supervised(max_iter=5), build_classification), (RCA_Supervised(num_chunks=5), build_classification), (SDML_Supervised(prior='identity', balance_param=1e-5), - build_classification), (SCML_global_Supervised(), + build_classification), (SCML_Supervised(), build_classification)] ids_classifiers = list(map(lambda x: x.__class__.__name__, [learner for (learner, _) in From 04c8433a8191d97d9c7f5599bdd889f10e310da9 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 17 Mar 2020 17:21:32 +0100 Subject: [PATCH 29/70] triplets in features form passed to _fit --- metric_learn/scml.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index a5d29e8c..3fe0d57d 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -43,16 +43,13 @@ def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, self.random_state = random_state super(_BaseSCML, self).__init__(preprocessor) - def _fit(self, triplets, X=None, basis=None, n_basis=None): + def _fit(self, triplets, basis=None, n_basis=None): """ Optimization procedure to find a sparse vector of weights to construct the metric from the basis set. This is based on the dual averaging method. 
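 
     At each iteration a random triplet is sampled and the subgradient of its
     hinge loss (the corresponding row of dist_diff, or zero when the triplet
     is not violated) is folded into a running average, from which the weights
     are recovered through an l1 proximal step. A sketch of the update, in the
     notation of the code below:
 
       avg_grad_w = (iter * avg_grad_w + grad) / (iter + 1)
       w = -np.sqrt(iter + 1) / gamma * np.minimum(avg_grad_w + beta, 0)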
""" - if X is not None: - triplets = X[triplets] - # Currently prepare_inputs makes triplets contain points and not indices triplets = self._prepare_inputs(triplets, type_of_inputs='tuples') @@ -475,7 +472,9 @@ def fit(self, X, y): triplets = constraints.generate_knntriplets(X, self.k_genuine, self.k_impostor) - return self._fit(triplets, X, basis, n_basis) + triplets = X[triplets] + + return self._fit(triplets, basis, n_basis) def _initialize_basis_supervised(self, X, y): """ TODO: complete function description From e67ff82762a40705ebdd916fbe1702027d64bab1 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 18 Mar 2020 12:25:52 +0100 Subject: [PATCH 30/70] change indeces handlig and edge case fix --- metric_learn/scml.py | 50 ++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 3fe0d57d..aefa373c 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -556,24 +556,21 @@ def _generate_bases_LDA(self, X, y): idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) - # TODO: It may be better to precompute this similarly to how it is done - # with the triplets generator - start = 0 - finish = 0 + start_finish_indices = np.hstack((0, k_class)).cumsum() neigh = NearestNeighbors() for c in range(n_class): sel_c = np.where(y == labels[c]) kc = k_class[c] + # get k_class same class neighbours neigh.fit(X=X[sel_c]) - finish += kc + start, finish = start_finish_indices[c:c+2] idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, n_neighbors=kc, return_distance=False)) - start = finish # Compute basis for every cluster in first scale basis = np.zeros((n_basis, n_features)) @@ -592,36 +589,35 @@ def _generate_bases_LDA(self, X, y): idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) - # TODO: It may be better to precompute this similarly to how it is done - # with the triplets generator - start = 0 - finish = 0 + start_finish_indices = np.hstack((0, k_class)).cumsum() for c in range(n_class): - sel_c = np.where(y == labels[c]) - kc = k_class[c] + sel_c = np.where(y == labels[c]) + kc = k_class[c] - # get k_class genuine neighbours - neigh.fit(X=X[sel_c]) - finish += kc - idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, - n_neighbors=kc, - return_distance=False)) - start = finish + # get k_class genuine neighbours + neigh.fit(X=X[sel_c]) + + start, finish = start_finish_indices[c:c+2] + idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, + n_neighbors=kc, + return_distance=False)) # Compute basis for every cluster in second scale finish = num_eig * n_clusters - for i in range(n_clusters): - start = finish - finish += num_eig + start_finish_indices = np.arange(num_eig * n_clusters, n_basis, num_eig) + start_finish_indices = np.append(start_finish_indices, n_basis) - # handle tail, as n_basis != n_clusters*2*n_eig - if finish > n_basis: - finish = n_basis + for i in range(n_clusters): + try: + start, finish = start_finish_indices[i:i+2] + except ValueError: + # No more clusters to be yielded + break - lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) + lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - basis[start:finish, :] = normalize(lda.scalings_.T[:finish-start]) + basis[start:finish, :] = normalize(lda.scalings_.T[:finish-start]) return basis, n_basis From 10efc46b700047e26b5b94f09eab432bfa8b7339 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 18 Mar 2020 12:45:22 +0100 Subject: [PATCH 31/70] name change and typos --- metric_learn/scml.py | 19 
++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index aefa373c..12d5b47d 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -124,7 +124,7 @@ def _fit(self, triplets, basis=None, n_basis=None): return self - def _compute_dist_diff(self, T, X, basis): + def _compute_dist_diff(self, triplets, X, basis): """ Helper function to compute the distance difference of every triplet in the space yielded by the basis set. @@ -132,24 +132,25 @@ def _compute_dist_diff(self, T, X, basis): # Transformation of data by the basis set XB = np.matmul(X, basis.T) - lenT = T.shape[0] + n_triplets = triplets.shape[0] # get all positive and negative pairs with lowest index first - # np.array (2*lenT,2) - T_pairs_sorted = np.sort(np.vstack((T[:, [0, 1]], T[:, [0, 2]])), - kind='stable') + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') # calculate all unique pairs and their indices - uniqPairs, indices = np.unique(T_pairs_sorted, return_inverse=True, + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, axis=0) # calculate L2 distance acording to bases only for unique pairs dist = np.square(XB[uniqPairs[:, 0], :] - XB[uniqPairs[:, 1], :]) # return the diference of distances between all positive and negative # pairs - return dist[indices[:lenT]] - dist[indices[lenT:]] + return dist[indices[:n_triplets]] - dist[indices[n_triplets:]] def _components_from_basis_weights(self, basis, w): """ - get components matrix (L) from computed mahalanobis matrix + Get components matrix (L) from computed mahalanobis matrix. """ # get rid of inactive bases @@ -160,7 +161,7 @@ def _components_from_basis_weights(self, basis, w): n_basis, n_features = basis.shape if n_basis < n_features: # if metric is low-rank - warnings.warn("The number of effective basis is less than the numbert of" + warnings.warn("The number of effective basis is less than the number of" " features of the input, in consequence the learned " "transformation reduces the dimension to %d." % n_basis) return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(basis) From ed6d42b4c62d20eb36a4ac7637bc5a281a91c260 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 18 Mar 2020 17:38:14 +0100 Subject: [PATCH 32/70] improve test_components_is_2D --- test/test_mahalanobis_mixin.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index bf288e51..2e3c3ef4 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -292,24 +292,23 @@ def test_components_is_2D(estimator, build_dataset): trunc_data = input_data[..., :1] # we drop duplicates that might have been formed, i.e. of the form # aabc or abcc or aabb for quadruplets, and aa for pairs. 
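     # (e.g. with a single feature kept, a triplet like [[.4], [.4], [.7]]
     # contains a duplicated pair and would be dropped)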
+
     if isinstance(estimator, _QuadrupletsClassifierMixin):
-      for slice_idx in [slice(0, 2), slice(2, 4)]:
-        pairs = trunc_data[:, slice_idx, :]
-        diffs = pairs[:, 1, :] - pairs[:, 0, :]
-        to_keep = np.where(np.abs(diffs.ravel()) > 1e-9)
-        trunc_data = trunc_data[to_keep]
-        labels = labels[to_keep]
-    if isinstance(estimator, _TripletsClassifierMixin):
-      for slice_idx in [[0, 1], [0, 2]]:
-        pairs = trunc_data[:, slice_idx, :]
-        diffs = pairs[:, 1, :] - pairs[:, 0, :]
-        to_keep = np.abs(diffs.ravel()) > 1e-9
-        trunc_data = trunc_data[to_keep]
+      pairs_idx = [[0, 1], [2, 3]]
+    elif isinstance(estimator, _TripletsClassifierMixin):
+      pairs_idx = [[0, 1], [0, 2]]
     elif isinstance(estimator, _PairsClassifierMixin):
-      diffs = trunc_data[:, 1, :] - trunc_data[:, 0, :]
-      to_keep = np.where(np.abs(diffs.ravel()) > 1e-9)
+      pairs_idx = [[0, 1]]
+    else:
+      pairs_idx = []
+
+    for pair_idx in pairs_idx:
+      pairs = trunc_data[:, pair_idx, :]
+      diffs = pairs[:, 1, :] - pairs[:, 0, :]
+      to_keep = np.abs(diffs.ravel()) > 1e-9
       trunc_data = trunc_data[to_keep]
       labels = labels[to_keep]
+
     model.fit(*remove_y(estimator, trunc_data, labels))
 
     assert model.components_.shape == (1, 1)  # the components must be 2D

From 932ff3f5dc9e709901309353c61df5a2c1a82848 Mon Sep 17 00:00:00 2001
From: grudloff
Date: Fri, 20 Mar 2020 12:04:19 +0100
Subject: [PATCH 33/70] Replace triplet_diffs option by better approach

---
 metric_learn/scml.py | 84 ++++++++++++++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 23 deletions(-)

diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index 12d5b47d..6d6a8619 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -201,47 +201,85 @@ def _initialize_basis(self, triplets, X):
     return basis, n_basis
 
   def _generate_bases_dist_diff(self, triplets, X):
-    """ Bases are generated from triplets as differences of positive or
-    negative pairs
-    TODO: complete function description
-    """
-
-    # TODO: Have a proportion of drawn pos and neg pairs?
+    """ Constructs the basis set from the differences of positive and negative
+    pairs from the triplets constraints.
 
-    # get all positive and negative pairs with lowest index first
-    # np.array (2*lenT,2)
-    T_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]],
-                                        triplets[:, [0, 2]])),
-                             kind='stable')
-    # calculate all unique pairs
-    uniqPairs = np.unique(T_pairs_sorted, axis=0)
+    The basis set is constructed iteratively by taking n_features positive and
+    n_features negative pairs differences, then adding and substracting
+    respectively all the outerproducts, and finally adding the eigenvectors
+    of this matrix with positive eigenvalue.
+ """ + n_features = X.shape[1] + n_triplets = triplets.shape[0] if self.n_basis is None: # TODO: Get a good default n_basis directive - n_basis = uniqPairs.shape[0] - warnings.warn('The number of basis will be set to n_basis= %d' % n_basis) - + n_basis = int(n_triplets/10) + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) elif isinstance(self.n_basis, int): n_basis = self.n_basis else: raise ValueError("n_basis should be an integer, instead it is of type %s" % type(self.n_basis)) - if n_basis > uniqPairs.shape[0]: - n_basis = uniqPairs.shape[0] + if n_basis > n_triplets: + n_basis = n_triplets warnings.warn("The selected number of basis is greater than the number " "of points, only n_basis = %d will be generated" % n_basis) - uniqPairs = X[uniqPairs] + basis = np.zeros((n_basis, n_features)) + + # get all positive and negative pairs with lowest index first + # np.array (2*n_triplets,2) + triplets_pairs_sorted = np.sort(np.vstack((triplets[:, [0, 1]], + triplets[:, [0, 2]])), + kind='stable') + # calculate all unique pairs and their indices + uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, + axis=0) + # calculate L2 distance according to bases only for unique pairs + diff = X[uniqPairs[:, 0], :] - X[uniqPairs[:, 1], :] + + diff_pos = diff[indices[:n_triplets], :] + diff_neg = diff[indices[n_triplets:], :] rng = check_random_state(self.random_state) - # Select n_basis - selected_pairs = uniqPairs[rng.choice(uniqPairs.shape[0], - size=n_basis, replace=False), :, :] + start = 0 + finish = 0 + + while(finish != n_basis): + + # select n_features positive differences + d_pos = diff_pos[rng.choice(n_triplets, + size=n_features, replace=False),:] + + # select n_features negative differences + d_neg = diff_neg[rng.choice(n_triplets, + size=n_features, replace=False),:] + + # Yield matrix + diff_sum = d_pos.T.dot(d_pos) - d_neg.T.dot(d_neg) + + # Calculate eigenvalue and eigenvectors + w, v = np.linalg.eigh(diff_sum.T.dot(diff_sum)) + + # Add eigenvectors with positive eigenvalue to basis set + pos_eig_mask = w > 0 + start = finish + finish += pos_eig_mask.sum() + + try: + basis[start:finish, :] = v[pos_eig_mask] + except ValueError: + # if finish is greater than n_basis + basis[start:, :] = v[pos_eig_mask][:n_basis-start] + break - basis = selected_pairs[:, 0]-selected_pairs[:, 1] + # TODO: maybe add a warning in case there are no added bases, this could + # be caused by a bad triplet set. This would cause an infinite loop return basis, n_basis From 534cd3f5c49249856460f0a4e58d8df8c944c226 Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 20 Mar 2020 12:06:18 +0100 Subject: [PATCH 34/70] some comments, docstring and refactoring --- metric_learn/scml.py | 65 +++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 6d6a8619..d11a25a0 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -28,6 +28,7 @@ def sum_where(X, where): class _BaseSCML(MahalanobisMixin): _tuple_size = 3 # constraints are triplets + _authorized_basis = ['triplet_diffs'] def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, gamma=5e-3, max_iter=100000, output_iter=5000, verbose=False, @@ -154,6 +155,7 @@ def _components_from_basis_weights(self, basis, w): """ # get rid of inactive bases + # TODO: Maybe have a tolerance over zero? 
active_idx, = w > 0 w = w[..., active_idx] basis = basis[active_idx, :] @@ -176,13 +178,11 @@ def _to_index_points(self, triplets): return triplets, X def _initialize_basis(self, triplets, X): - """ TODO: complete function description + """ Checks if the basis array is well constructed or constructs it based + on one of the available options. """ n_features = X.shape[1] - # TODO: - # Add other options passed as string - authorized_basis = ['triplet_diffs'] if isinstance(self.basis, np.ndarray): # TODO: should copy? basis = check_array(self.basis, copy=True) @@ -190,11 +190,11 @@ def _initialize_basis(self, triplets, X): raise ValueError('The dimensionality ({}) of the provided bases must' ' match the dimensionality of the given inputs `X` ' '({}).'.format(basis.shape[1], n_features)) - elif self.basis not in authorized_basis: + elif self.basis not in self._authorized_basis: raise ValueError( "`basis` must be one of the options '{}' " "or an array of shape (n_basis, n_features)." - .format("', '".join(authorized_basis))) + .format("', '".join(self._authorized_basis))) if self.basis == 'triplet_diffs': basis, n_basis = self._generate_bases_dist_diff(triplets, X) @@ -476,6 +476,8 @@ class SCML_Supervised(_BaseSCML, TransformerMixin): :ref:`supervised_version` : The section of the project documentation that describes the supervised version of weakly supervised estimators. """ + # Add supervised authorized basis construction options + _authorized_basis = _BaseSCML._authorized_basis + ['LDA'] def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='LDA', n_basis=None, gamma=5e-3, max_iter=100000, output_iter=5000, @@ -516,22 +518,10 @@ def fit(self, X, y): return self._fit(triplets, basis, n_basis) def _initialize_basis_supervised(self, X, y): - """ TODO: complete function description + """ Constructs the basis set following one of the supervised options in + case one is selected. """ - # TODO: - # Add other options passed as string - authorized_basis = ['triplet_diffs'] - supervised_basis = ['LDA'] - authorized_basis = supervised_basis + authorized_basis - - if not(isinstance(self.basis, np.ndarray)) \ - and self.basis not in authorized_basis: - raise ValueError( - "`basis` must be one of the options '{}' " - "or an array of shape (n_basis, n_features)." - .format("', '".join(authorized_basis))) - if self.basis == 'LDA': basis, n_basis = self._generate_bases_LDA(X, y) else: @@ -540,9 +530,10 @@ def _initialize_basis_supervised(self, X, y): return basis, n_basis def _generate_bases_LDA(self, X, y): - """ - Helper function that computes the n_basis basis set constructed from the - LDA of significant local regions in the feature space via clustering, for + """ Generates bases for the 'LDA' option. + + The basis set is constructed using Linear Discriminant Analysis of + significant local regions in the feature space via clustering, for each region center k-nearest neighbors are used to obtain the LDA scalings, which correspond to the locally discriminative basis. Currently this is done at two scales `k={10,20}` if `n_feature < 50` or else `k={20,50}`. 
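
A minimal self-contained sketch of the 'LDA' basis construction summarized in
the docstring above (helper name and defaults are illustrative assumptions,
not part of the patch):

  import numpy as np
  from sklearn.cluster import KMeans
  from sklearn.neighbors import NearestNeighbors
  from sklearn.preprocessing import normalize
  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

  def lda_bases_sketch(X, y, n_clusters=4, k=10):
    # find significant local regions via clustering
    centers = KMeans(n_clusters=n_clusters).fit(X).cluster_centers_
    # take the k nearest samples around every region center
    idx = NearestNeighbors(n_neighbors=k).fit(X).kneighbors(
        centers, return_distance=False)
    bases = []
    for region in idx:
      # assumes every region contains samples from at least two classes
      lda = LinearDiscriminantAnalysis().fit(X[region], y[region])
      # the normalized LDA scalings act as locally discriminative bases
      bases.append(normalize(lda.scalings_.T))
    return np.vstack(bases)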
@@ -558,7 +549,8 @@ def _generate_bases_LDA(self, X, y): if self.n_basis is None: # TODO: Get a good default n_basis directive n_basis = min(20*n_features, X.shape[0]*2*num_eig) - warnings.warn('The number of basis will be set to n_basis= %d' % n_basis) + warnings.warn('As no value for `n_basis` was selected, the number of ' + 'basis will be set to n_basis= %d' % n_basis) elif isinstance(self.n_basis, int): n_basis = self.n_basis @@ -614,9 +606,9 @@ def _generate_bases_LDA(self, X, y): # Compute basis for every cluster in first scale basis = np.zeros((n_basis, n_features)) lda = LinearDiscriminantAnalysis() - for i in range(n_clusters): + for i, start in enumerate(range(0, num_eig * n_clusters, num_eig)): lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - basis[num_eig*i: num_eig*(i+1), :] = normalize(lda.scalings_.T) + basis[start: start + num_eig, :] = normalize(lda.scalings_.T) # second scale k = 20 @@ -634,7 +626,7 @@ def _generate_bases_LDA(self, X, y): sel_c = np.where(y == labels[c]) kc = k_class[c] - # get k_class genuine neighbours + # get k_class genuine neighbourss neigh.fit(X=X[sel_c]) start, finish = start_finish_indices[c:c+2] @@ -642,21 +634,14 @@ def _generate_bases_LDA(self, X, y): n_neighbors=kc, return_distance=False)) - # Compute basis for every cluster in second scale - finish = num_eig * n_clusters - - start_finish_indices = np.arange(num_eig * n_clusters, n_basis, num_eig) - start_finish_indices = np.append(start_finish_indices, n_basis) - - for i in range(n_clusters): + for i, start in enumerate(range(num_eig * n_clusters, + 2*num_eig * n_clusters, num_eig)): + lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) try: - start, finish = start_finish_indices[i:i+2] + basis[start: start + num_eig, :] = normalize(lda.scalings_.T) except ValueError: - # No more clusters to be yielded + # handle tail + basis[start:, :] = normalize(lda.scalings_.T[:n_basis-start]) break - lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - - basis[start:finish, :] = normalize(lda.scalings_.T[:finish-start]) - return basis, n_basis From 576fbcbf441352cdb90c4b91c87fe8d8eeef34df Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 20 Mar 2020 12:44:03 +0100 Subject: [PATCH 35/70] fix bad triplet set --- test/metric_learn_test.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 4fed07d3..9253b4f8 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -97,8 +97,8 @@ def test_bad_basis(self): def test_big_n_basis(self): scml = SCML(n_basis=4) - triplets = np.ones((3, 3, 3)) - n_basis = 1 + triplets = np.random.rand(3, 3, 3) + n_basis = 3 msg = ("The selected number of basis is greater than the number of points" ", only n_basis = %d will be generated" % n_basis) with pytest.warns(UserWarning) as raised_warning: @@ -126,9 +126,7 @@ def test_bad_basis_supervised(self): scml = SCML_Supervised(basis='bad_basis') X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) y = np.array([1, 0, 1, 0]) - authorized_basis = ['triplet_diffs'] - supervised_basis = ['LDA'] - authorized_basis = supervised_basis + authorized_basis + authorized_basis = ['triplet_diffs', 'LDA'] msg = ("`basis` must be one of the options '{}' or an array of shape " "(n_basis, n_features).".format("', '".join(authorized_basis))) with pytest.raises(ValueError) as raised_error: From 2bee8cc472922ced92370fda7ee212f6a2e11b1d Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 20 Mar 2020 12:44:12 +0100 Subject: [PATCH 36/70] flake8 fix --- metric_learn/scml.py | 4 ++-- 
1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index d11a25a0..ff78c773 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -254,11 +254,11 @@ def _generate_bases_dist_diff(self, triplets, X): # select n_features positive differences d_pos = diff_pos[rng.choice(n_triplets, - size=n_features, replace=False),:] + size=n_features, replace=False), :] # select n_features negative differences d_neg = diff_neg[rng.choice(n_triplets, - size=n_features, replace=False),:] + size=n_features, replace=False), :] # Yield matrix diff_sum = d_pos.T.dot(d_pos) - d_neg.T.dot(d_neg) From 895f28bbc6243abafdf49e30ce5552283cfb3601 Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 20 Mar 2020 16:56:35 +0100 Subject: [PATCH 37/70] SCML doc first draft --- doc/metric_learn.rst | 2 ++ doc/weakly_supervised.rst | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/doc/metric_learn.rst b/doc/metric_learn.rst index 76c91f48..8f91d91c 100644 --- a/doc/metric_learn.rst +++ b/doc/metric_learn.rst @@ -33,6 +33,7 @@ Supervised Learning Algorithms metric_learn.MMC_Supervised metric_learn.SDML_Supervised metric_learn.RCA_Supervised + metric_learn.SCML_Supervised Weakly Supervised Learning Algorithms ------------------------------------- @@ -45,6 +46,7 @@ Weakly Supervised Learning Algorithms metric_learn.LSML metric_learn.MMC metric_learn.SDML + metric_learn.SCML Unsupervised Learning Algorithms -------------------------------- diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index 174d1a8b..f850b79d 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -700,6 +700,60 @@ of triplets that have the right predicted ordering. Algorithms ---------- +.. _scml: + +:py:class:`SCML ` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sparse Compositional Metric Learning +(:py:class:`SCML `) + +`SCML` learns an squared mahalanobis distance from triplet constraints by +optimizing sparse positive weights assigned to a set of :math:`K` locally discriminative +rank-one PSD bases. This can be formulated as an optimization problem with only :math:`K` +parameters, that can be solved with an efficient stochastic composite scheme. + +The Mahalanobis Matrix :math:`M` is built from a basis set :math:`B = \{b_i\}_{i=\{1,...,K\}}` +weighted by a :math:`K` dimensional vector :math:`w = \{w_i\}_{i=\{1,...,K\}}` as: + +.. math:: + + M = \sum_{i=1}^K w_i b_i b_i^T = B \cdot diag(w) \cdot B^T \quad w_i \geq 0 + +Learning :math:`M` in this form makes it PSD by design, as it is a nonnegative sum of PSD matrices. +The optimization problem of :math:`w` over the triplets constraints :math:`C` is formulated as a +classic margin-based hinge loss function over the relative constrains, a regularization :math:`\ell_1` +is added to yield an sparse representation. The formulation is the following: + +.. math:: + + \min_{w} \sum_{(x_a,x_b,x_c)\in C} [1 + d_w(x_a,x_b)-d_w(x_a,x_c)]_+ + \beta||w||_1 + +Where :math:`[\cdot]_+` is the hinge loss. + +.. topic:: Example Code: + +:: + + from metric_learn import SCML + + triplets = [[[1.2, 7.5], [1.3, 1.5], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6], [5.4, 5.4]], + [[3.2, 7.5], [3.3, 1.5], [8.2, 9.7]], + [[3.3, 4.5], [5.2, 4.6], [7.4, 5.4]]] + + scml = SCML() + scml.fit(triplets) + +.. topic:: References: + + .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. + `_. \ + (AAAI), 2014. + + .. [2] Adapted from original \ + `Matlab implementation.`_. + .. 
_learning_on_quadruplets: From 8c4ef22486575b505f4580b08571fb5c4683e897 Mon Sep 17 00:00:00 2001 From: grudloff Date: Mon, 23 Mar 2020 16:53:25 +0100 Subject: [PATCH 38/70] find neighbors for every class only once --- metric_learn/scml.py | 78 ++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index ff78c773..adbe55a2 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -569,79 +569,57 @@ def _generate_bases_LDA(self, X, y): # yielded by every LDA n_clusters = int(np.ceil(n_basis/(2 * num_eig))) - # TODO: maybe give acces to Kmeans jobs for faster computation? kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state, algorithm='elkan').fit(X) cX = kmeans.cluster_centers_ - # TODO: find a better way to choose neighborhood size + n_scales = 2 if n_features > 50: - k = 50 + scales = [20, 50] else: - k = 10 + scales = [10, 20] - # In case some class has less elements than k - k_class = np.minimum(class_count, k) + k_class = np.vstack((np.minimum(class_count, scales[0]), + np.minimum(class_count, scales[1]))) - # Construct index set with neighbors for every element of every class + idx_set = [np.zeros((n_clusters, sum(k_class[0, :])), dtype=np.int), + np.zeros((n_clusters, sum(k_class[1, :])), dtype=np.int)] - idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) - - start_finish_indices = np.hstack((0, k_class)).cumsum() + start_finish_indices = np.hstack((np.zeros((2, 1), np.int), + k_class)).cumsum(axis=1) neigh = NearestNeighbors() for c in range(n_class): sel_c = np.where(y == labels[c]) - kc = k_class[c] - # get k_class same class neighbours + # get k_class same class neighbors neigh.fit(X=X[sel_c]) + neighbors = neigh.kneighbors(X=cX, n_neighbors=k_class[1, c], + return_distance=False) - start, finish = start_finish_indices[c:c+2] - idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, - n_neighbors=kc, - return_distance=False)) + # add index set of neighbors for every cluster center for both scales + for s, k in enumerate(k_class[:, c]): + start, finish = start_finish_indices[s, c:c+2] + idx_set[s][:, start:finish] = np.take(sel_c, neighbors[:, :k]) - # Compute basis for every cluster in first scale + # Compute basis for every cluster in both scales basis = np.zeros((n_basis, n_features)) lda = LinearDiscriminantAnalysis() - for i, start in enumerate(range(0, num_eig * n_clusters, num_eig)): - lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) - basis[start: start + num_eig, :] = normalize(lda.scalings_.T) - - # second scale - k = 20 - - # In case some class has less elements than k - k_class = np.minimum(class_count, k) - - # Construct index set with neighbors for every element of every class - - idx_set = np.zeros((n_clusters, sum(k_class)), dtype=np.int) - - start_finish_indices = np.hstack((0, k_class)).cumsum() - - for c in range(n_class): - sel_c = np.where(y == labels[c]) - kc = k_class[c] - - # get k_class genuine neighbourss - neigh.fit(X=X[sel_c]) - - start, finish = start_finish_indices[c:c+2] - idx_set[:, start:finish] = np.take(sel_c, neigh.kneighbors(X=cX, - n_neighbors=kc, - return_distance=False)) - - for i, start in enumerate(range(num_eig * n_clusters, - 2*num_eig * n_clusters, num_eig)): - lda.fit(X[idx_set[i, :]], y[idx_set[i, :]]) + start_finish_indices = np.hstack((np.vstack((0, n_clusters * num_eig)), + np.full((2, n_clusters), + num_eig))).cumsum(axis=1) + + for s in range(n_scales): + for c in range(n_clusters): + 
lda.fit(X[idx_set[s][c, :]], y[idx_set[s][c, :]]) + start, finish = start_finish_indices[s, c:c+2] + normalized_scalings = normalize(lda.scalings_.T) try: - basis[start: start + num_eig, :] = normalize(lda.scalings_.T) + basis[start: finish, :] = normalized_scalings except ValueError: # handle tail - basis[start:, :] = normalize(lda.scalings_.T[:n_basis-start]) + basis[start:, :] = normalized_scalings[:n_basis-start] break return basis, n_basis From 26da826b6188099fa5ec821920e8aa24d573ad1e Mon Sep 17 00:00:00 2001 From: grudloff Date: Mon, 23 Mar 2020 17:00:55 +0100 Subject: [PATCH 39/70] improve some docstring and warnings --- metric_learn/scml.py | 159 ++++++++++++++++++-------------------- test/metric_learn_test.py | 12 +-- 2 files changed, 84 insertions(+), 87 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index adbe55a2..2455518c 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -163,9 +163,10 @@ def _components_from_basis_weights(self, basis, w): n_basis, n_features = basis.shape if n_basis < n_features: # if metric is low-rank - warnings.warn("The number of effective basis is less than the number of" - " features of the input, in consequence the learned " - "transformation reduces the dimension to %d." % n_basis) + warnings.warn("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to %d." + % n_basis) return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(basis) else: # if metric is full rank @@ -188,7 +189,7 @@ def _initialize_basis(self, triplets, X): basis = check_array(self.basis, copy=True) if basis.shape[1] != n_features: raise ValueError('The dimensionality ({}) of the provided bases must' - ' match the dimensionality of the given inputs `X` ' + ' match the dimensionality of the data ' '({}).'.format(basis.shape[1], n_features)) elif self.basis not in self._authorized_basis: raise ValueError( @@ -204,17 +205,18 @@ def _generate_bases_dist_diff(self, triplets, X): """ Constructs the basis set from the differences of positive and negative pairs from the triplets constraints. - The basis set is constructed iteratively by taking n_features positive and - n_features negative pairs differences, then adding and substracting - respectively all the outerproducts, and finally adding the eigenvectors - of this matrix with positive eigenvalue. + The basis set is constructed iteratively by taking n_features triplets, + then adding and substracting respectively all the outerproducts of the + positive and negative pairs, and finally selecting the eigenvectors + of this matrix with positive eigenvalue. This is done until n_basis are + selected. 
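+
+    A sketch of a single pass, with d_pos and d_neg holding the selected
+    positive and negative pair differences (an illustrative simplification
+    of the code below)::
+
+      diff_sum = d_pos.T.dot(d_pos) - d_neg.T.dot(d_neg)
+      eigvals, eigvecs = np.linalg.eigh(diff_sum)
+      new_bases = eigvecs[:, eigvals > 0].T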
""" n_features = X.shape[1] n_triplets = triplets.shape[0] if self.n_basis is None: # TODO: Get a good default n_basis directive - n_basis = int(n_triplets/10) + n_basis = n_features*80 warnings.warn('As no value for `n_basis` was selected, the number of ' 'basis will be set to n_basis= %d' % n_basis) elif isinstance(self.n_basis, int): @@ -223,12 +225,6 @@ def _generate_bases_dist_diff(self, triplets, X): raise ValueError("n_basis should be an integer, instead it is of type %s" % type(self.n_basis)) - if n_basis > n_triplets: - n_basis = n_triplets - warnings.warn("The selected number of basis is greater than the number " - "of points, only n_basis = %d will be generated" % - n_basis) - basis = np.zeros((n_basis, n_features)) # get all positive and negative pairs with lowest index first @@ -239,7 +235,7 @@ def _generate_bases_dist_diff(self, triplets, X): # calculate all unique pairs and their indices uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True, axis=0) - # calculate L2 distance according to bases only for unique pairs + # calculate diferences only for unique pairs diff = X[uniqPairs[:, 0], :] - X[uniqPairs[:, 1], :] diff_pos = diff[indices[:n_triplets], :] @@ -252,13 +248,15 @@ def _generate_bases_dist_diff(self, triplets, X): while(finish != n_basis): + # Select triplets to yield diff + + select_triplet = rng.choice(n_triplets, size=n_features, replace=False) + # select n_features positive differences - d_pos = diff_pos[rng.choice(n_triplets, - size=n_features, replace=False), :] + d_pos = diff_pos[select_triplet, :] # select n_features negative differences - d_neg = diff_neg[rng.choice(n_triplets, - size=n_features, replace=False), :] + d_neg = diff_neg[select_triplet, :] # Yield matrix diff_sum = d_pos.T.dot(d_pos) - d_neg.T.dot(d_neg) @@ -297,51 +295,51 @@ class SCML(_BaseSCML, _TripletsClassifierMixin): Parameters ---------- beta: float (default=1e-5) - L1 regularization parameter. + L1 regularization parameter. basis : string or array-like, optional (default='triplet_diffs') - Set of bases to construct the metric. Possible options are - 'triplet_diffs', and an array-like of shape (n_basis, n_features). + Set of bases to construct the metric. Possible options are + 'triplet_diffs', and an array-like of shape (n_basis, n_features). - 'triplet_diffs' - The basis set is constructed from the differences between points of - `n_basis` positive or negative pairs taken from the triplets - constrains. + 'triplet_diffs' + The basis set is constructed from the differences between points of + `n_basis` positive or negative pairs taken from the triplets + constrains. - array-like - A matrix of shape (n_basis, n_features), that will be used as - the basis set for the metric construction. + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. n_basis : int, optional - Number of basis to be yielded. In case it is not set it will be set based - on `basis`. If no value is selected a default will be computed based on - the input. + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. gamma: float (default = 5e-3) - Learning rate for the optimization algorithm. + Learning rate for the optimization algorithm. max_iter : int (default = 100000) - Number of iterations for the algorithm. + Number of iterations for the algorithm. 
output_iter : int (default = 5000) - Number of iterations to check current weights performance and output this - information in case verbose is True. + Number of iterations to check current weights performance and output this + information in case verbose is True. verbose : bool, optional - If True, prints information while learning. + If True, prints information while learning. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get triplets from indices. If array-like, - triplets will be formed like this: X[indices]. + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. + A pseudo random number generator object or a seed for it if int. Attributes ---------- components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `_components_from_basis_weights`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) Examples -------- @@ -376,7 +374,7 @@ def fit(self, triplets): Parameters ---------- triplets : array-like, shape=(n_constraints, 3, n_features) or \ - (n_constraints, 3) + (n_constraints, 3) 3D array-like of triplets of points or 2D array of triplets of indicators. Triplets are assumed to be ordered such that: d(triplets[i, 0],triplets[i, 1]) < d(triplets[i, 0], triplets[i, 2]). @@ -402,52 +400,52 @@ class SCML_Supervised(_BaseSCML, TransformerMixin): Parameters ---------- beta: float (default=1e-5) - L1 regularization parameter. + L1 regularization parameter. - basis : string or an array-like, optional (default='LDA') - Set of bases to construct the metric. Possible options are - 'LDA', and an array-like of shape (n_basis, n_features). + basis : string or an array-like, optional (default='lda') + Set of bases to construct the metric. Possible options are + 'lda', and an array-like of shape (n_basis, n_features). - 'LDA' - The `n_basis` basis set is constructed from the LDA of significant - local regions in the feature space via clustering, for each region - center k-nearest neighbors are used to obtain the LDA scalings, - which correspond to the locally discriminative basis. + 'lda' + The `n_basis` basis set is constructed from the LDA of significant + local regions in the feature space via clustering, for each region + center k-nearest neighbors are used to obtain the LDA scalings, + which correspond to the locally discriminative basis. - array-like - A matrix of shape (n_basis, n_features), that will be used as - the basis set for the metric construction. + array-like + A matrix of shape (n_basis, n_features), that will be used as + the basis set for the metric construction. n_basis : int, optional - Number of basis to be yielded. In case it is not set it will be set based - on `basis`. If no value is selected a default will be computed based on - the input. + Number of basis to be yielded. In case it is not set it will be set based + on `basis`. If no value is selected a default will be computed based on + the input. gamma: float (default = 5e-3) - Learning rate for the optimization algorithm. + Learning rate for the optimization algorithm. max_iter : int (default = 100000) - Number of iterations for the algorithm. + Number of iterations for the algorithm. 
output_iter : int (default = 5000) - Number of iterations to check current weights performance and output this - information in case verbose is True. + Number of iterations to check current weights performance and output this + information in case verbose is True. verbose : bool, optional - If True, prints information while learning. + If True, prints information while learning. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get triplets from indices. If array-like, - triplets will be formed like this: X[indices]. + The preprocessor to call to get triplets from indices. If array-like, + triplets will be formed like this: X[indices]. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. + A pseudo random number generator object or a seed for it if int. Attributes ---------- components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `_components_from_basis_weights`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `_components_from_basis_weights`.) Examples -------- @@ -470,16 +468,13 @@ class SCML_Supervised(_BaseSCML, TransformerMixin): See Also -------- - metric_learn.SCML_Supervised : The supervised version of this - algorithm, which construct the triplets from the labels. - - :ref:`supervised_version` : The section of the project documentation - that describes the supervised version of weakly supervised estimators. + metric_learn.SCML : The weakly supervised version of this + algorithm. """ # Add supervised authorized basis construction options - _authorized_basis = _BaseSCML._authorized_basis + ['LDA'] + _authorized_basis = _BaseSCML._authorized_basis + ['lda'] - def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='LDA', + def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='lda', n_basis=None, gamma=5e-3, max_iter=100000, output_iter=5000, verbose=False, preprocessor=None, random_state=None): self.k_genuine = k_genuine @@ -522,7 +517,7 @@ def _initialize_basis_supervised(self, X, y): case one is selected. """ - if self.basis == 'LDA': + if self.basis == 'lda': basis, n_basis = self._generate_bases_LDA(X, y) else: basis, n_basis = None, None @@ -530,7 +525,7 @@ def _initialize_basis_supervised(self, X, y): return basis, n_basis def _generate_bases_LDA(self, X, y): - """ Generates bases for the 'LDA' option. + """ Generates bases for the 'lda' option. 
The basis set is constructed using Linear Discriminant Analysis of significant local regions in the feature space via clustering, for @@ -615,11 +610,11 @@ def _generate_bases_LDA(self, X, y): lda.fit(X[idx_set[s][c, :]], y[idx_set[s][c, :]]) start, finish = start_finish_indices[s, c:c+2] normalized_scalings = normalize(lda.scalings_.T) - try: + try: basis[start: finish, :] = normalized_scalings - except ValueError: - # handle tail + except ValueError: + # handle tail basis[start:, :] = normalized_scalings[:n_basis-start] - break + break return basis, n_basis diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 9253b4f8..c903eb91 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -98,9 +98,11 @@ def test_bad_basis(self): def test_big_n_basis(self): scml = SCML(n_basis=4) triplets = np.random.rand(3, 3, 3) - n_basis = 3 - msg = ("The selected number of basis is greater than the number of points" - ", only n_basis = %d will be generated" % n_basis) + n_basis = 1 + msg = ("The number of bases with nonzero weight is less than the " + "number of features of the input, in consequence the " + "learned transformation reduces the dimension to %d." + % n_basis) with pytest.warns(UserWarning) as raised_warning: scml.fit(triplets) assert msg == raised_warning[0].message.args[0] @@ -126,7 +128,7 @@ def test_bad_basis_supervised(self): scml = SCML_Supervised(basis='bad_basis') X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) y = np.array([1, 0, 1, 0]) - authorized_basis = ['triplet_diffs', 'LDA'] + authorized_basis = ['triplet_diffs', 'lda'] msg = ("`basis` must be one of the options '{}' or an array of shape " "(n_basis, n_features).".format("', '".join(authorized_basis))) with pytest.raises(ValueError) as raised_error: @@ -188,7 +190,7 @@ def test_array_basis_supervised(self): scml = SCML_Supervised(n_basis=3, basis=basis, k_genuine=1, k_impostor=1) msg = ('The dimensionality ({}) of the provided bases must match the ' - 'dimensionality of the given inputs `X` ({}).' + 'dimensionality of the data ({}).' 
           .format(basis.shape[1], X.shape[1]))
     with pytest.raises(ValueError) as raised_error:
       scml.fit(X, y)

From 54525d73dfc09f63643e8b9f976cc18bd736d4a2 Mon Sep 17 00:00:00 2001
From: grudloff
Date: Mon, 23 Mar 2020 17:28:21 +0100
Subject: [PATCH 40/70] add sklearn compat test

---
 metric_learn/scml.py        | 2 +-
 test/test_sklearn_compat.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index 2455518c..2d19cd9c 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -543,7 +543,7 @@ def _generate_bases_LDA(self, X, y):
 
     if self.n_basis is None:
       # TODO: Get a good default n_basis directive
-      n_basis = min(20*n_features, X.shape[0]*2*num_eig)
+      n_basis = min(20*n_features, X.shape[0]*2*num_eig - 1)
       warnings.warn('As no value for `n_basis` was selected, the number of '
                     'basis will be set to n_basis= %d' % n_basis)
 
diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py
index 539cb1ee..c561ae36 100644
--- a/test/test_sklearn_compat.py
+++ b/test/test_sklearn_compat.py
@@ -10,7 +10,8 @@
 from metric_learn import (Covariance, LFDA, LMNN, MLKR, NCA,
                           ITML_Supervised, LSML_Supervised,
-                          MMC_Supervised, RCA_Supervised, SDML_Supervised)
+                          MMC_Supervised, RCA_Supervised, SDML_Supervised,
+                          SCML_Supervised)
 from sklearn import clone
 import numpy as np
 from sklearn.model_selection import (cross_val_score, cross_val_predict,
@@ -79,6 +80,9 @@ def test_sdml(self):
   def test_rca(self):
     check_estimator(Stable_RCA_Supervised)
 
+  def test_scml(self):
+    check_estimator(SCML_Supervised)
+
 
 RNG = check_random_state(0)
 
From 4140585c355dadbb479ffcbc43c87735d87b8c41 Mon Sep 17 00:00:00 2001
From: grudloff
Date: Mon, 23 Mar 2020 20:02:04 +0100
Subject: [PATCH 41/70] changes to doc

---
 doc/weakly_supervised.rst | 41 +++++++++++++++++++++------------------
 metric_learn/scml.py      | 11 ++++++-----
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst
index f850b79d..95f9320a 100644
--- a/doc/weakly_supervised.rst
+++ b/doc/weakly_supervised.rst
@@ -708,28 +708,31 @@ Algorithms
 Sparse Compositional Metric Learning
 (:py:class:`SCML `)
 
-`SCML` learns an squared mahalanobis distance from triplet constraints by
-optimizing sparse positive weights assigned to a set of :math:`K` locally discriminative
-rank-one PSD bases. This can be formulated as an optimization problem with only :math:`K`
-parameters, that can be solved with an efficient stochastic composite scheme.
+`SCML` learns a squared Mahalanobis distance from triplet constraints by
+optimizing sparse positive weights assigned to a set of :math:`K` rank-one
+PSD bases. This can be formulated as an optimization problem with only
+:math:`K` parameters, that can be solved with an efficient stochastic
+composite scheme.
 
-The Mahalanobis Matrix :math:`M` is built from a basis set :math:`B = \{b_i\}_{i=\{1,...,K\}}`
+The Mahalanobis matrix :math:`M` is built from a basis set :math:`B = \{b_i\}_{i=\{1,...,K\}}`
 weighted by a :math:`K` dimensional vector :math:`w = \{w_i\}_{i=\{1,...,K\}}` as:
 
 .. math::
 
     M = \sum_{i=1}^K w_i b_i b_i^T = B \cdot diag(w) \cdot B^T \quad w_i \geq 0
 
-Learning :math:`M` in this form makes it PSD by design, as it is a nonnegative sum of PSD matrices.
-The optimization problem of :math:`w` over the triplets constraints :math:`C` is formulated as a
-classic margin-based hinge loss function over the relative constrains, a regularization :math:`\ell_1`
-is added to yield an sparse representation. 
The formulation is the following:
+Learning :math:`M` in this form makes it PSD by design, as it is a
+nonnegative sum of PSD matrices. The basis set :math:`B` is fixed in
+advance and can be constructed from the data. The optimization problem
+over :math:`w` is formulated as a classic margin-based hinge loss over
+the set :math:`C` of triplets, with an :math:`\ell_1` regularization
+added to yield a sparse combination. The formulation is the following:

 .. math::

-      \min_{w} \sum_{(x_a,x_b,x_c)\in C} [1 + d_w(x_a,x_b)-d_w(x_a,x_c)]_+ + \beta||w||_1
+      \min_{w\geq 0} \sum_{(x_i,x_j,x_k)\in C} [1 + d_w(x_i,x_j)-d_w(x_i,x_k)]_+ + \beta||w||_1

-Where :math:`[\cdot]_+` is the hinge loss.
+where :math:`[\cdot]_+` is the hinge loss.

 .. topic:: Example Code:

@@ -883,13 +886,13 @@ extension leads to more stable estimation when the dimension is high and
 only a small amount of constraints is given.

 The loss function of each constraint
-:math:`d(\mathbf{x}_a, \mathbf{x}_b) < d(\mathbf{x}_c, \mathbf{x}_d)` is
+:math:`d(\mathbf{x}_i, \mathbf{x}_j) < d(\mathbf{x}_k, \mathbf{x}_l)` is
 denoted as:

 .. math::

-    H(d_\mathbf{M}(\mathbf{x}_a, \mathbf{x}_b)
-     - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_d))
+    H(d_\mathbf{M}(\mathbf{x}_i, \mathbf{x}_j)
+     - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l))

 where :math:`H(\cdot)` is the squared Hinge loss function defined as:

@@ -899,8 +902,8 @@ where :math:`H(\cdot)` is the squared Hinge loss function defined as:
 	\,\,x^2 \qquad x>0\end{aligned}\right.\\

 The summed loss function :math:`L(C)` is the simple sum over all constraints
-:math:`C = \{(\mathbf{x}_a , \mathbf{x}_b , \mathbf{x}_c , \mathbf{x}_d)
-: d(\mathbf{x}_a , \mathbf{x}_b) < d(\mathbf{x}_c , \mathbf{x}_d)\}`. The
+:math:`C = \{(\mathbf{x}_i , \mathbf{x}_j , \mathbf{x}_k , \mathbf{x}_l)
+: d(\mathbf{x}_i , \mathbf{x}_j) < d(\mathbf{x}_k , \mathbf{x}_l)\}`. The
 original paper suggested here should be a weighted sum since the confidence
 or probability of each constraint might differ. However, for the sake of
 simplicity and assumption of no extra knowledge provided, we just deploy
@@ -912,9 +915,9 @@ knowledge:

 ..
math::

-  \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_a,
-  \mathbf{x}_b, \mathbf{x}_c, \mathbf{x}_d)\in C}H(d_\mathbf{M}(
-  \mathbf{x}_a, \mathbf{x}_b) - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_c))\\
+  \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_i,
+  \mathbf{x}_j, \mathbf{x}_k, \mathbf{x}_l)\in C}H(d_\mathbf{M}(
+  \mathbf{x}_i, \mathbf{x}_j) - d_\mathbf{M}(\mathbf{x}_k, \mathbf{x}_l))\\

 where :math:`\mathbf{M}_0` is the prior metric matrix, set as identity by
 default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence:

diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index 2d19cd9c..704005a8 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -235,7 +235,7 @@ def _generate_bases_dist_diff(self, triplets, X):
     # calculate all unique pairs and their indices
     uniqPairs, indices = np.unique(triplets_pairs_sorted, return_inverse=True,
                                    axis=0)
-    # calculate diferences only for unique pairs
+    # calculate differences only for unique pairs
     diff = X[uniqPairs[:, 0], :] - X[uniqPairs[:, 1], :]

     diff_pos = diff[indices[:n_triplets], :]
@@ -285,10 +285,11 @@ def _generate_bases_dist_diff(self, triplets, X):
 class SCML(_BaseSCML, _TripletsClassifierMixin):
   """Sparse Compositional Metric Learning (SCML)

-  `SCML` learns a metric from triplet constraints by optimizing sparse
-  positive weights assigned to a set of `K` locally discriminative rank-one
-  PSD bases. This can be formulated as an optimization problem with only `K`
-  parameters, that can be solved with an efficient stochastic composite scheme.
+  `SCML` learns a squared Mahalanobis distance from triplet constraints by
+  optimizing sparse positive weights assigned to a set of :math:`K` rank-one
+  PSD bases. This can be formulated as an optimization problem with only
+  :math:`K` parameters that can be solved with an efficient stochastic
+  composite scheme.

   Read more in the :ref:`User Guide `.

From e54a741c1325fe9b2c4bb3e946ce14127af44109 Mon Sep 17 00:00:00 2001
From: grudloff
Date: Mon, 23 Mar 2020 20:02:42 +0100
Subject: [PATCH 42/70] fix and improve tests

---
 test/metric_learn_test.py | 77 ++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 29 deletions(-)

diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index c903eb91..5e9d3afc 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -23,8 +23,7 @@
 from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC,
                           SCML_Supervised, LSML_Supervised,
                           ITML_Supervised, SDML_Supervised, RCA_Supervised,
-                          MMC_Supervised, SDML, RCA, ITML, LSML, SCML,
-                          Constraints)
+                          MMC_Supervised, SDML, RCA, ITML, LSML, SCML)
 # Import this specially for testing.
from metric_learn.constraints import wrap_pairs from metric_learn.lmnn import _sum_outer_products @@ -76,7 +75,6 @@ def test_singular_returns_pseudo_inverse(self): assert_allclose(pseudo_inverse.dot(cov_matrix).dot(pseudo_inverse), pseudo_inverse) - class TestSCML(MetricTestCase): def test_iris(self): scml = SCML_Supervised() @@ -85,6 +83,9 @@ def test_iris(self): csep = class_separation(scml.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.3) + + # TODO: merge into one by the use of parametrize + def test_bad_basis(self): scml = SCML(basis='bad_basis') triplets = np.ones((3, 3, 3)) @@ -95,9 +96,23 @@ def test_bad_basis(self): scml.fit(triplets) assert msg == raised_error.value.args[0] - def test_big_n_basis(self): - scml = SCML(n_basis=4) - triplets = np.random.rand(3, 3, 3) + def test_bad_basis_supervised(self): + scml = SCML_Supervised(basis='bad_basis') + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + authorized_basis = ['triplet_diffs', 'lda'] + msg = ("`basis` must be one of the options '{}' or an array of shape " + "(n_basis, n_features).".format("', '".join(authorized_basis))) + with pytest.raises(ValueError) as raised_error: + scml.fit(X, y) + assert msg == raised_error.value.args[0] + + def test_dimension_reduction_msg(self): + scml = SCML(n_basis=2) + triplets = np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]) n_basis = 1 msg = ("The number of bases with nonzero weight is less than the " "number of features of the input, in consequence the " @@ -108,12 +123,10 @@ def test_big_n_basis(self): assert msg == raised_warning[0].message.args[0] def test_n_basis_wrong_type(self): - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - - constraints = Constraints(y) - triplets = constraints.generate_knntriplets(X, k_genuine=1, k_impostor=1) - triplets = X[triplets] + triplets = np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]) n_basis = 4.0 @@ -124,13 +137,15 @@ def test_n_basis_wrong_type(self): scml.fit(triplets) assert msg == raised_error.value.args[0] - def test_bad_basis_supervised(self): - scml = SCML_Supervised(basis='bad_basis') - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - authorized_basis = ['triplet_diffs', 'lda'] - msg = ("`basis` must be one of the options '{}' or an array of shape " - "(n_basis, n_features).".format("', '".join(authorized_basis))) + def test_n_basis_wrong_type_supervised(self): + X = np.array([[0, 0], [1, 1], [3, 3]]) + y = np.array([1, 2, 3]) + + n_basis = 4.0 + + scml = SCML_Supervised(n_basis=n_basis) + msg = ("n_basis should be an integer, instead it is of type %s" + % type(n_basis)) with pytest.raises(ValueError) as raised_error: scml.fit(X, y) assert msg == raised_error.value.args[0] @@ -140,7 +155,7 @@ def test_small_n_basis_supervised(self): y = np.array([1, 2, 3]) labels, class_count = np.unique(y, return_counts=True) - n_class = len(labels) + n_class = 3 scml = SCML_Supervised(n_basis=n_class) msg = ("The number of basis should be greater than the number of classes") @@ -153,7 +168,7 @@ def test_big_n_basis_supervised(self): y = np.array([1, 2, 3]) labels, class_count = np.unique(y, return_counts=True) - n_class = len(labels) + n_class = 3 num_eig = min(n_class-1, X.shape[1]) n_basis = X.shape[0]*2*num_eig @@ -165,17 +180,21 @@ def test_big_n_basis_supervised(self): scml.fit(X, y) assert msg == 
raised_error.value.args[0] - def test_n_basis_wrong_type_supervised(self): - X = np.array([[0, 0], [1, 1], [3, 3]]) - y = np.array([1, 2, 3]) + def test_array_basis(self): + """ Test that the proper error is raised when the shape of the input basis + array is not consistent with the input + """ + triplets = np.random.rand(3, 3, 2) - n_basis = 4.0 + basis = np.eye(3) - scml = SCML_Supervised(n_basis=n_basis) - msg = ("n_basis should be an integer, instead it is of type %s" - % type(n_basis)) + scml = SCML(n_basis=3, basis=basis) + + msg = ('The dimensionality ({}) of the provided bases must match the ' + 'dimensionality of the data ({}).' + .format(basis.shape[1], triplets.shape[2])) with pytest.raises(ValueError) as raised_error: - scml.fit(X, y) + scml.fit(triplets) assert msg == raised_error.value.args[0] def test_array_basis_supervised(self): From b84f8b1eebeee7e344fbacc9f740a586899e0908 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 24 Mar 2020 13:03:50 +0100 Subject: [PATCH 43/70] use components_from_metric --- metric_learn/scml.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 704005a8..ad2ee8d9 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -5,6 +5,7 @@ from __future__ import print_function, absolute_import, division import numpy as np from .base_metric import _TripletsClassifierMixin, MahalanobisMixin +from ._util import components_from_metric from sklearn.base import TransformerMixin from .constraints import Constraints from sklearn.preprocessing import normalize @@ -121,6 +122,7 @@ def _fit(self, triplets, basis=None, n_basis=None): print("max iteration reached.") # return L matrix yielded from best weights + self.n_iter_ = iter self.components_ = self._components_from_basis_weights(basis, best_w) return self @@ -170,7 +172,7 @@ def _components_from_basis_weights(self, basis, w): return np.sqrt(w.T)*basis # equivalent to np.diag(np.sqrt(w)).dot(basis) else: # if metric is full rank - return np.linalg.cholesky(np.matmul(basis.T, w.T*basis)).T + return components_from_metric(np.matmul(basis.T, w.T*basis)) def _to_index_points(self, triplets): shape = triplets.shape From 4af49dadb70d99899233233176b037c8ec783b23 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 24 Mar 2020 13:25:10 +0100 Subject: [PATCH 44/70] change TestSCML to object and parametrize tests --- metric_learn/scml.py | 3 +- test/metric_learn_test.py | 121 ++++++++++++++------------------------ 2 files changed, 45 insertions(+), 79 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index ad2ee8d9..f4992647 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -593,7 +593,8 @@ def _generate_bases_LDA(self, X, y): # get k_class same class neighbors neigh.fit(X=X[sel_c]) - neighbors = neigh.kneighbors(X=cX, n_neighbors=k_class[1, c], + # Only take the neighbors once for the bigest scale + neighbors = neigh.kneighbors(X=cX, n_neighbors=k_class[-1, c], return_distance=False) # add index set of neighbors for every cluster center for both scales diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 5e9d3afc..05b639b5 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -75,44 +75,36 @@ def test_singular_returns_pseudo_inverse(self): assert_allclose(pseudo_inverse.dot(cov_matrix).dot(pseudo_inverse), pseudo_inverse) -class TestSCML(MetricTestCase): - def test_iris(self): - scml = SCML_Supervised() - scml.fit(self.iris_points, self.iris_labels) - - csep = 
class_separation(scml.transform(self.iris_points), self.iris_labels) - self.assertLess(csep, 0.3) - - # TODO: merge into one by the use of parametrize - - def test_bad_basis(self): - scml = SCML(basis='bad_basis') - triplets = np.ones((3, 3, 3)) - authorized_basis = ['triplet_diffs'] - msg = ("`basis` must be one of the options '{}' or an array of shape " - "(n_basis, n_features).".format("', '".join(authorized_basis))) - with pytest.raises(ValueError) as raised_error: - scml.fit(triplets) - assert msg == raised_error.value.args[0] - - def test_bad_basis_supervised(self): - scml = SCML_Supervised(basis='bad_basis') - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - authorized_basis = ['triplet_diffs', 'lda'] +class TestSCML(): + def test_iris(self): + X, y = load_iris(return_X_y=True) + scml = SCML_Supervised(n_basis=80, k_genuine=8, k_impostor=11, + random_state=42) + scml.fit(X, y) + csep = class_separation(scml.transform(X), y) + assert csep == 0.24555420119538296 + + @pytest.mark.parametrize(('estimator', 'data', 'authorized_basis'), + [(SCML, (np.ones((3, 3, 3)),), ['triplet_diffs']), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])), + ['triplet_diffs', 'lda'])]) + def test_bad_basis(self, estimator, data, authorized_basis): + model = estimator(basis='bad_basis') msg = ("`basis` must be one of the options '{}' or an array of shape " "(n_basis, n_features).".format("', '".join(authorized_basis))) with pytest.raises(ValueError) as raised_error: - scml.fit(X, y) + model.fit(*data) assert msg == raised_error.value.args[0] def test_dimension_reduction_msg(self): scml = SCML(n_basis=2) triplets = np.array([[[0, 1], [2, 1], [0, 0]], - [[2, 1], [0, 1], [2, 0]], - [[0, 0], [2, 0], [0, 1]], - [[2, 0], [0, 0], [2, 1]]]) + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]) n_basis = 1 msg = ("The number of bases with nonzero weight is less than the " "number of features of the input, in consequence the " @@ -122,39 +114,28 @@ def test_dimension_reduction_msg(self): scml.fit(triplets) assert msg == raised_warning[0].message.args[0] - def test_n_basis_wrong_type(self): - triplets = np.array([[[0, 1], [2, 1], [0, 0]], - [[2, 1], [0, 1], [2, 0]], - [[0, 0], [2, 0], [0, 1]], - [[2, 0], [0, 0], [2, 1]]]) - - n_basis = 4.0 - - scml = SCML(n_basis=n_basis) - msg = ("n_basis should be an integer, instead it is of type %s" - % type(n_basis)) - with pytest.raises(ValueError) as raised_error: - scml.fit(triplets) - assert msg == raised_error.value.args[0] - - def test_n_basis_wrong_type_supervised(self): - X = np.array([[0, 0], [1, 1], [3, 3]]) - y = np.array([1, 2, 3]) + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[[0, 1], [2, 1], [0, 0]], + [[2, 1], [0, 1], [2, 0]], + [[0, 0], [2, 0], [0, 1]], + [[2, 0], [0, 0], [2, 1]]]),)), + (SCML_Supervised, (np.array([[0, 0], [1, 1], + [3, 3]]), + np.array([1, 2, 3])))]) + def test_n_basis_wrong_type(self, estimator, data): n_basis = 4.0 - - scml = SCML_Supervised(n_basis=n_basis) + model = estimator(n_basis=n_basis) msg = ("n_basis should be an integer, instead it is of type %s" % type(n_basis)) with pytest.raises(ValueError) as raised_error: - scml.fit(X, y) + model.fit(*data) assert msg == raised_error.value.args[0] - def test_small_n_basis_supervised(self): + def test_small_n_basis_lda(self): X = np.array([[0, 0], [1, 1], [3, 3]]) y = np.array([1, 2, 3]) - labels, class_count = np.unique(y, return_counts=True) n_class = 3 scml = 
SCML_Supervised(n_basis=n_class) @@ -163,11 +144,10 @@ def test_small_n_basis_supervised(self): scml.fit(X, y) assert msg == raised_error.value.args[0] - def test_big_n_basis_supervised(self): + def test_big_n_basis_lda(self): X = np.array([[0, 0], [1, 1], [3, 3]]) y = np.array([1, 2, 3]) - labels, class_count = np.unique(y, return_counts=True) n_class = 3 num_eig = min(n_class-1, X.shape[1]) @@ -180,39 +160,24 @@ def test_big_n_basis_supervised(self): scml.fit(X, y) assert msg == raised_error.value.args[0] - def test_array_basis(self): - """ Test that the proper error is raised when the shape of the input basis - array is not consistent with the input - """ - triplets = np.random.rand(3, 3, 2) - - basis = np.eye(3) - - scml = SCML(n_basis=3, basis=basis) - - msg = ('The dimensionality ({}) of the provided bases must match the ' - 'dimensionality of the data ({}).' - .format(basis.shape[1], triplets.shape[2])) - with pytest.raises(ValueError) as raised_error: - scml.fit(triplets) - assert msg == raised_error.value.args[0] - - def test_array_basis_supervised(self): + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.random.rand(3, 3, 2),)), + (SCML_Supervised, (np.array([[0, 0], [0, 1], + [2, 0], [2, 1]]), + np.array([1, 0, 1, 0])))]) + def test_array_basis(self, estimator, data): """ Test that the proper error is raised when the shape of the input basis array is not consistent with the input """ - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - basis = np.eye(3) - scml = SCML_Supervised(n_basis=3, basis=basis, k_genuine=1, k_impostor=1) + scml = estimator(n_basis=3, basis=basis) msg = ('The dimensionality ({}) of the provided bases must match the ' 'dimensionality of the data ({}).' - .format(basis.shape[1], X.shape[1])) + .format(basis.shape[1], data[0].shape[-1])) with pytest.raises(ValueError) as raised_error: - scml.fit(X, y) + scml.fit(*data) assert msg == raised_error.value.args[0] From 13f7088333cc9461a9aa94985f2043640fa50411 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 24 Mar 2020 15:41:53 +0100 Subject: [PATCH 45/70] fix test_iris --- test/metric_learn_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 05b639b5..c4a12fc6 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -79,11 +79,11 @@ def test_singular_returns_pseudo_inverse(self): class TestSCML(): def test_iris(self): X, y = load_iris(return_X_y=True) - scml = SCML_Supervised(n_basis=80, k_genuine=8, k_impostor=11, + scml = SCML_Supervised(n_basis=80, k_genuine=7, k_impostor=5, random_state=42) scml.fit(X, y) csep = class_separation(scml.transform(X), y) - assert csep == 0.24555420119538296 + assert csep < 0.23 @pytest.mark.parametrize(('estimator', 'data', 'authorized_basis'), [(SCML, (np.ones((3, 3, 3)),), ['triplet_diffs']), From 78e5084472a30f6ba7b51985dcafa125dd47a384 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 24 Mar 2020 15:52:38 +0100 Subject: [PATCH 46/70] use model._authorized_basis and other fixes --- metric_learn/scml.py | 2 +- test/metric_learn_test.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index f4992647..c7760a4b 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -593,7 +593,7 @@ def _generate_bases_LDA(self, X, y): # get k_class same class neighbors neigh.fit(X=X[sel_c]) - # Only take the neighbors once for the bigest scale + # Only take the 
neighbors once for the biggest scale neighbors = neigh.kneighbors(X=cX, n_neighbors=k_class[-1, c], return_distance=False) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index c4a12fc6..c443e5f1 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -76,7 +76,7 @@ def test_singular_returns_pseudo_inverse(self): pseudo_inverse) -class TestSCML(): +class TestSCML(object): def test_iris(self): X, y = load_iris(return_X_y=True) scml = SCML_Supervised(n_basis=80, k_genuine=7, k_impostor=5, @@ -85,16 +85,16 @@ def test_iris(self): csep = class_separation(scml.transform(X), y) assert csep < 0.23 - @pytest.mark.parametrize(('estimator', 'data', 'authorized_basis'), - [(SCML, (np.ones((3, 3, 3)),), ['triplet_diffs']), + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.ones((3, 3, 3)),)), (SCML_Supervised, (np.array([[0, 0], [0, 1], [2, 0], [2, 1]]), - np.array([1, 0, 1, 0])), - ['triplet_diffs', 'lda'])]) - def test_bad_basis(self, estimator, data, authorized_basis): + np.array([1, 0, 1, 0])))]) + def test_bad_basis(self, estimator, data): model = estimator(basis='bad_basis') msg = ("`basis` must be one of the options '{}' or an array of shape " - "(n_basis, n_features).".format("', '".join(authorized_basis))) + "(n_basis, n_features)." + .format("', '".join(model._authorized_basis))) with pytest.raises(ValueError) as raised_error: model.fit(*data) assert msg == raised_error.value.args[0] From daaf5b0fd926556765c8b63dfd479e1e542aab11 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 24 Mar 2020 16:52:49 +0100 Subject: [PATCH 47/70] verbose test --- test/metric_learn_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index c443e5f1..e59708bf 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -180,6 +180,22 @@ def test_array_basis(self, estimator, data): scml.fit(*data) assert msg == raised_error.value.args[0] + @pytest.mark.parametrize(('estimator', 'data'), + [(SCML, (np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], + [1, 0, 3], [2, 3, 1], [2, 3, 0], + [3, 2, 1], [3, 2, 0]]),)), + (SCML_Supervised, (np.array([0, 1, 2, 3]), + np.array([0, 0, 1, 1])))]) + def test_verbose(self, estimator, data, capsys): + # assert there is proper output when verbose = True + model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]), + max_iter=1, verbose=True) + model.fit(*data) + out, _ = capsys.readouterr() + expected_out = ('[Global] iter 0\t obj 1.000000\t num_imp 8\n' + 'max iteration reached.\n') + assert out == expected_out + class TestLSML(MetricTestCase): def test_iris(self): From 9338a7d088e1ecb523cdf5559a78da28bf123691 Mon Sep 17 00:00:00 2001 From: grudloff Date: Tue, 24 Mar 2020 18:12:55 +0100 Subject: [PATCH 48/70] revert sum_where --- metric_learn/scml.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index c7760a4b..299113b2 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -15,16 +15,6 @@ from sklearn.utils import check_array, check_random_state import warnings -# hack around lack of where in older numpy versions -try: - np.sum([[0, 1], [1, 1]], where=[False, True], axis=1) -except TypeError: - def sum_where(X, where): - return np.sum(X[where]) -else: - def sum_where(X, where): - return np.sum(X, where=where) - class _BaseSCML(MahalanobisMixin): @@ -88,7 +78,7 @@ def _fit(self, triplets, basis=None, n_basis=None): slack_mask = slack_val > 0 # loss 
function of learning task part of obj function - obj2 = sum_where(slack_val, slack_mask)/n_triplets + obj2 = np.sum(slack_val[slack_mask])/n_triplets obj = obj1 + obj2 if self.verbose: From 33dae2503aa8c7fdd0acfdb6e639220a1853e3aa Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 25 Mar 2020 11:31:05 +0100 Subject: [PATCH 49/70] small n_basis warning instead of error --- metric_learn/scml.py | 7 ++++--- test/metric_learn_test.py | 16 +++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 299113b2..25364edc 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -546,9 +546,10 @@ def _generate_bases_LDA(self, X, y): raise ValueError("n_basis should be an integer, instead it is of type %s" % type(self.n_basis)) - if n_basis <= n_class: - raise ValueError("The number of basis should be greater than the" - " number of classes") + if n_basis < n_class: + warnings.warn("The number of basis is less than the number of classes," + " this will lead to less basis than the amount yielded by" + " LDA") elif n_basis >= X.shape[0]*2*num_eig: raise ValueError("The selected number of basis needs a greater number of" " clusters than the number of available samples") diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index e59708bf..f8c64c5e 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -133,16 +133,18 @@ def test_n_basis_wrong_type(self, estimator, data): assert msg == raised_error.value.args[0] def test_small_n_basis_lda(self): - X = np.array([[0, 0], [1, 1], [3, 3]]) - y = np.array([1, 2, 3]) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) - n_class = 3 + n_class = 2 - scml = SCML_Supervised(n_basis=n_class) - msg = ("The number of basis should be greater than the number of classes") - with pytest.raises(ValueError) as raised_error: + scml = SCML_Supervised(n_basis=n_class-1) + msg = ("The number of basis is less than the number of classes," + " this will lead to less basis than the amount yielded by" + " LDA") + with pytest.warns(UserWarning) as raised_warning: scml.fit(X, y) - assert msg == raised_error.value.args[0] + assert msg == raised_warning[0].message.args[0] def test_big_n_basis_lda(self): X = np.array([[0, 0], [1, 1], [3, 3]]) From dbcf138b41501d63be731d444e6bfa4f9ebb2d79 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 25 Mar 2020 12:38:02 +0100 Subject: [PATCH 50/70] add test iris on triplet_diffs --- test/metric_learn_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index f8c64c5e..e8e3fcae 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -77,13 +77,14 @@ def test_singular_returns_pseudo_inverse(self): class TestSCML(object): - def test_iris(self): + @pytest.mark.parametrize('basis', ('lda', 'triplet_diffs')) + def test_iris(self, basis): X, y = load_iris(return_X_y=True) - scml = SCML_Supervised(n_basis=80, k_genuine=7, k_impostor=5, + scml = SCML_Supervised(basis=basis, n_basis=85, k_genuine=7, k_impostor=5, random_state=42) scml.fit(X, y) csep = class_separation(scml.transform(X), y) - assert csep < 0.23 + assert csep < 0.24 @pytest.mark.parametrize(('estimator', 'data'), [(SCML, (np.ones((3, 3, 3)),)), From 34917d3d4ef19f5ab6ee0555be348bb96983539e Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 25 Mar 2020 16:59:20 +0100 Subject: [PATCH 51/70] test lda & triplet_diffs --- test/metric_learn_test.py | 21 
+++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index e8e3fcae..8541e9d6 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -199,6 +199,27 @@ def test_verbose(self, estimator, data, capsys): 'max iteration reached.\n') assert out == expected_out + def test_triplet_diffs(self): + expected_n_basis = 10 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + triplets = np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], [1, 0, 3], + [2, 3, 1], [2, 3, 0], [3, 2, 1], [3, 2, 0]]) + basis, n_basis = model._generate_bases_dist_diff(triplets, X) + expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(basis, expected_basis) + + def test_lda(self): + expected_n_basis = 7 + model = SCML_Supervised(n_basis=expected_n_basis) + X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]]) + y = np.array([0, 0, 1, 1]) + basis, n_basis = model._generate_bases_LDA(X, y) + expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2) + assert n_basis == expected_n_basis + np.testing.assert_allclose(np.abs(basis), expected_basis) + class TestLSML(MetricTestCase): def test_iris(self): From f6f848d8c686ecccf3ea77a2a46c3bb3037a3b04 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 25 Mar 2020 17:41:50 +0100 Subject: [PATCH 52/70] improved messages --- metric_learn/scml.py | 16 ++++++++-------- test/metric_learn_test.py | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 25364edc..99f13113 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -546,18 +546,18 @@ def _generate_bases_LDA(self, X, y): raise ValueError("n_basis should be an integer, instead it is of type %s" % type(self.n_basis)) - if n_basis < n_class: - warnings.warn("The number of basis is less than the number of classes," - " this will lead to less basis than the amount yielded by" - " LDA") - elif n_basis >= X.shape[0]*2*num_eig: - raise ValueError("The selected number of basis needs a greater number of" - " clusters than the number of available samples") - # Number of clusters needed for 2 scales given the number of basis # yielded by every LDA n_clusters = int(np.ceil(n_basis/(2 * num_eig))) + if n_basis < n_class: + warnings.warn("The number of basis is less than the number of classes, " + "which may lead to poor discriminative performance.") + elif n_basis >= X.shape[0]*2*num_eig: + raise ValueError("The needed number of clusters to generate the selected" + "number of basis is unfeasible to achieve as it is " + "greater than the number of available samples") + kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state, algorithm='elkan').fit(X) cX = kmeans.cluster_centers_ diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 8541e9d6..0073b83b 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -140,9 +140,8 @@ def test_small_n_basis_lda(self): n_class = 2 scml = SCML_Supervised(n_basis=n_class-1) - msg = ("The number of basis is less than the number of classes," - " this will lead to less basis than the amount yielded by" - " LDA") + msg = ("The number of basis is less than the number of classes, which may" + " lead to poor discriminative performance.") with pytest.warns(UserWarning) as raised_warning: scml.fit(X, y) assert msg == raised_warning[0].message.args[0] @@ -157,8 +156,9 @@ def test_big_n_basis_lda(self): 
n_basis = X.shape[0]*2*num_eig scml = SCML_Supervised(n_basis=n_basis) - msg = ("The selected number of basis needs a greater number of clusters" - " than the number of available samples") + msg = ("The needed number of clusters to generate the selected" + "number of basis is unfeasible to achieve as it is " + "greater than the number of available samples") with pytest.raises(ValueError) as raised_error: scml.fit(X, y) assert msg == raised_error.value.args[0] From 38fd80beab8ac49059680547a1fe00f2d03e6c8e Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 25 Mar 2020 17:42:26 +0100 Subject: [PATCH 53/70] remove quadruplets and triplets from pipeline test --- test/test_sklearn_compat.py | 79 ++++++++++++++++++------------------- test/test_utils.py | 7 +++- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index c561ae36..d53d0c0f 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -22,7 +22,9 @@ mock_preprocessor, tuples_learners, ids_tuples_learners, pairs_learners, ids_pairs_learners, remove_y, - triplets_learners, quadruplets_learners) + triplets_learners, quadruplets_learners, + metric_learners_pipeline, + ids_metric_learners_pipeline) class Stable_RCA_Supervised(RCA_Supervised): @@ -330,55 +332,50 @@ def test_estimators_fit_returns_self(estimator, build_dataset, @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', metric_learners, - ids=ids_metric_learners) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners_pipeline, + ids=ids_metric_learners_pipeline) def test_pipeline_consistency(estimator, build_dataset, with_preprocessor): # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - # we do this test on all except quadruplets (since they don't have a y - # in fit): - no_label_learners = quadruplets_learners + triplets_learners - if estimator.__class__.__name__ not in [e.__class__.__name__ - for (e, _) in - no_label_learners]: - input_data, y, preprocessor, _ = build_dataset(with_preprocessor) - - def make_random_state(estimator, in_pipeline): - rs = {} - name_estimator = estimator.__class__.__name__ - if name_estimator[-11:] == '_Supervised': - name_param = 'random_state' - if in_pipeline: - name_param = name_estimator.lower() + '__' + name_param - rs[name_param] = check_random_state(0) - return rs - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor, - **make_random_state(estimator, False)) - pipeline = make_pipeline(estimator) - estimator.fit(input_data, y) - estimator.set_params(preprocessor=preprocessor) - pipeline.set_params(**make_random_state(estimator, True)) - pipeline.fit(input_data, y) + input_data, y, preprocessor, _ = build_dataset(with_preprocessor) - if hasattr(estimator, 'score'): - result = estimator.score(input_data, y) - result_pipe = pipeline.score(input_data, y) - assert_allclose_dense_sparse(result, result_pipe) + def make_random_state(estimator, in_pipeline): + rs = {} + name_estimator = estimator.__class__.__name__ + if name_estimator[-11:] == '_Supervised': + name_param = 'random_state' + if in_pipeline: + name_param = name_estimator.lower() + '__' + name_param + rs[name_param] = check_random_state(0) + return rs - if hasattr(estimator, 'predict'): - result = estimator.predict(input_data) - result_pipe = pipeline.predict(input_data) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor, + 
**make_random_state(estimator, False)) + pipeline = make_pipeline(estimator) + estimator.fit(input_data, y) + estimator.set_params(preprocessor=preprocessor) + pipeline.set_params(**make_random_state(estimator, True)) + pipeline.fit(input_data, y) + + if hasattr(estimator, 'score'): + result = estimator.score(input_data, y) + result_pipe = pipeline.score(input_data, y) + assert_allclose_dense_sparse(result, result_pipe) + + if hasattr(estimator, 'predict'): + result = estimator.predict(input_data) + result_pipe = pipeline.predict(input_data) + assert_allclose_dense_sparse(result, result_pipe) + + if issubclass(estimator.__class__, TransformerMixin): + if hasattr(estimator, 'transform'): + result = estimator.transform(input_data) + result_pipe = pipeline.transform(input_data) assert_allclose_dense_sparse(result, result_pipe) - if issubclass(estimator.__class__, TransformerMixin): - if hasattr(estimator, 'transform'): - result = estimator.transform(input_data) - result_pipe = pipeline.transform(input_data) - assert_allclose_dense_sparse(result, result_pipe) - @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', metric_learners, diff --git a/test/test_utils.py b/test/test_utils.py index 6dfa22df..35a39bac 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -139,8 +139,8 @@ def build_quadruplets(with_preprocessor=False): (MMC_Supervised(max_iter=5), build_classification), (RCA_Supervised(num_chunks=5), build_classification), (SDML_Supervised(prior='identity', balance_param=1e-5), - build_classification), (SCML_Supervised(), - build_classification)] + build_classification), + (SCML_Supervised(), build_classification)] ids_classifiers = list(map(lambda x: x.__class__.__name__, [learner for (learner, _) in classifiers])) @@ -163,6 +163,9 @@ def build_quadruplets(with_preprocessor=False): metric_learners = tuples_learners + supervised_learners ids_metric_learners = ids_tuples_learners + ids_supervised_learners +metric_learners_pipeline = pairs_learners + supervised_learners +ids_metric_learners_pipeline = ids_pairs_learners + ids_supervised_learners + def remove_y(estimator, X, y): """Quadruplets and triplets learners have no y in fit, but to write test for From ad47f7f390f414d1ab5e3e1078fbed6b6b2a9f28 Mon Sep 17 00:00:00 2001 From: grudloff Date: Wed, 25 Mar 2020 18:57:24 +0100 Subject: [PATCH 54/70] test big n_features --- test/metric_learn_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 0073b83b..8fd29580 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -13,6 +13,7 @@ from sklearn.utils.testing import assert_warns_message from sklearn.exceptions import ConvergenceWarning, ChangedBehaviorWarning from sklearn.utils.validation import check_X_y +from sklearn.preprocessing import StandardScaler try: from inverse_covariance import quic assert(quic) @@ -86,6 +87,16 @@ def test_iris(self, basis): csep = class_separation(scml.transform(X), y) assert csep < 0.24 + def test_big_n_features(self): + X, y = make_classification(n_samples=100, n_classes=3, n_features=60, + n_informative=60, n_redundant=0, n_repeated=0, + random_state=42) + X = StandardScaler().fit_transform(X) + scml = SCML_Supervised(random_state=42) + scml.fit(X, y) + csep = class_separation(scml.transform(X), y) + assert csep < 0.7 + @pytest.mark.parametrize(('estimator', 'data'), [(SCML, (np.ones((3, 3, 3)),)), (SCML_Supervised, (np.array([[0, 0], [0, 1], From 
9541b75e23b82c9a83e0ba632d117149b6371279 Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 26 Mar 2020 11:28:12 +0100 Subject: [PATCH 55/70] Correct output iters --- metric_learn/scml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 99f13113..e7544b44 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -67,7 +67,7 @@ def _fit(self, triplets, basis=None, n_basis=None): rng = check_random_state(self.random_state) rand_int = rng.randint(low=0, high=n_triplets, size=self.max_iter) for iter in range(self.max_iter): - if iter % self.output_iter == 0: + if (iter + 1) % self.output_iter == 0: # regularization part of obj function obj1 = np.sum(w)*self.beta @@ -83,7 +83,7 @@ def _fit(self, triplets, basis=None, n_basis=None): obj = obj1 + obj2 if self.verbose: count = np.sum(slack_mask) - print("[Global] iter %d\t obj %.6f\t num_imp %d" % (iter, + print("[Global] iter %d\t obj %.6f\t num_imp %d" % (iter+1, obj, count)) # update the best From ca6c69d706d8c6f6acc02e99c5f6b1ae7d6b9e47 Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 26 Mar 2020 12:03:12 +0100 Subject: [PATCH 56/70] output_iter on supervised and improved verbose --- metric_learn/scml.py | 8 ++++---- test/metric_learn_test.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index e7544b44..1998d625 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -83,8 +83,8 @@ def _fit(self, triplets, basis=None, n_basis=None): obj = obj1 + obj2 if self.verbose: count = np.sum(slack_mask) - print("[Global] iter %d\t obj %.6f\t num_imp %d" % (iter+1, - obj, count)) + print("[%s] iter %d\t obj %.6f\t num_imp %d" % + (self.__class__.__name__, iter+1, obj, count)) # update the best if obj < best_obj: @@ -473,8 +473,8 @@ def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='lda', self.k_genuine = k_genuine self.k_impostor = k_impostor _BaseSCML.__init__(self, beta=beta, basis=basis, n_basis=n_basis, - max_iter=max_iter, verbose=verbose, - preprocessor=preprocessor, + max_iter=max_iter, output_iter=output_iter, + verbose=verbose, preprocessor=preprocessor, random_state=random_state) def fit(self, X, y): diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 8fd29580..91ac26c7 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -203,11 +203,11 @@ def test_array_basis(self, estimator, data): def test_verbose(self, estimator, data, capsys): # assert there is proper output when verbose = True model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]), - max_iter=1, verbose=True) + max_iter=1, output_iter=1, verbose=True) model.fit(*data) out, _ = capsys.readouterr() - expected_out = ('[Global] iter 0\t obj 1.000000\t num_imp 8\n' - 'max iteration reached.\n') + expected_out = ('[%s] iter 1\t obj 1.000000\t num_imp 8\n' + 'max iteration reached.\n' % estimator.__name__) assert out == expected_out def test_triplet_diffs(self): From 9902dfee2d76bff2f455e91f070114b5781a0c0d Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 26 Mar 2020 12:05:41 +0100 Subject: [PATCH 57/70] flake8 fix --- test/test_sklearn_compat.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index d53d0c0f..7f7d7037 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -22,7 +22,6 @@ mock_preprocessor, tuples_learners, ids_tuples_learners, pairs_learners, 
ids_pairs_learners, remove_y,
                          metric_learners_pipeline,
                          ids_metric_learners_pipeline)
@@ -353,7 +352,7 @@ def make_random_state(estimator, in_pipeline):

   estimator = clone(estimator)
   estimator.set_params(preprocessor=preprocessor,
-                      **make_random_state(estimator, False))
+                       **make_random_state(estimator, False))
   pipeline = make_pipeline(estimator)
   estimator.fit(input_data, y)
   estimator.set_params(preprocessor=preprocessor)

From fdf3067723279d44fedc5b4480b89a11cc1d924f Mon Sep 17 00:00:00 2001
From: grudloff
Date: Thu, 26 Mar 2020 13:37:30 +0100
Subject: [PATCH 58/70] bases generation test comments

---
 test/metric_learn_test.py | 5 +++++
 test/test_utils.py        | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index 91ac26c7..be2d00ed 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -217,6 +217,8 @@ def test_triplet_diffs(self):
     triplets = np.array([[0, 1, 2], [0, 1, 3], [1, 0, 2], [1, 0, 3],
                          [2, 3, 1], [2, 3, 0], [3, 2, 1], [3, 2, 0]])
     basis, n_basis = model._generate_bases_dist_diff(triplets, X)
+    # All points are along the same line, so the only possible basis will be
+    # the vector along that line normalized.
     expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2)
     assert n_basis == expected_n_basis
     np.testing.assert_allclose(basis, expected_basis)
@@ -227,6 +229,9 @@ def test_lda(self):
     X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]])
     y = np.array([0, 0, 1, 1])
     basis, n_basis = model._generate_bases_LDA(X, y)
+    # All points are along the same line, so the only possible basis will be
+    # the vector along that line normalized. In this case it is possible to
+    # obtain it with positive or negative orientations.
     expected_basis = np.ones((expected_n_basis, 2))/np.sqrt(2)
     assert n_basis == expected_n_basis
     np.testing.assert_allclose(np.abs(basis), expected_basis)

diff --git a/test/test_utils.py b/test/test_utils.py
index 35a39bac..fdcb864a 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -139,7 +139,7 @@ def build_quadruplets(with_preprocessor=False):
   (MMC_Supervised(max_iter=5), build_classification),
   (RCA_Supervised(num_chunks=5), build_classification),
   (SDML_Supervised(prior='identity', balance_param=1e-5),
-   build_classification),
+    build_classification),
   (SCML_Supervised(), build_classification)]
 ids_classifiers = list(map(lambda x: x.__class__.__name__,
                            [learner for (learner, _) in

From 517cce4fe9b8e7a6810daf68341111ff4d213d6a Mon Sep 17 00:00:00 2001
From: grudloff
Date: Thu, 26 Mar 2020 13:45:39 +0100
Subject: [PATCH 59/70] change big_n_basis_lda error msg

---
 metric_learn/scml.py      | 6 +++---
 test/metric_learn_test.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index 1998d625..478e79a5 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -554,9 +554,9 @@ def _generate_bases_LDA(self, X, y):
       warnings.warn("The number of basis is less than the number of classes, "
                     "which may lead to poor discriminative performance.")
     elif n_basis >= X.shape[0]*2*num_eig:
-      raise ValueError("The needed number of clusters to generate the selected"
-                       "number of basis is unfeasible to achieve as it is "
-                       "greater than the number of available samples")
+      raise ValueError("Not enough samples to generate %d LDA bases, n_basis "
+                       "should be smaller than %d" %
+                       (n_basis, X.shape[0]*2*num_eig))

     kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state,
algorithm='elkan').fit(X)

diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index be2d00ed..9240ffa6 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -167,9 +167,9 @@ def test_big_n_basis_lda(self):
     n_basis = X.shape[0]*2*num_eig

     scml = SCML_Supervised(n_basis=n_basis)
-    msg = ("The needed number of clusters to generate the selected"
-           "number of basis is unfeasible to achieve as it is "
-           "greater than the number of available samples")
+    msg = ("Not enough samples to generate %d LDA bases, n_basis "
+           "should be smaller than %d" %
+           (n_basis, n_basis))
     with pytest.raises(ValueError) as raised_error:
       scml.fit(X, y)
     assert msg == raised_error.value.args[0]

From 06d92f2e86dde2cada014ef571621c33497f64ce Mon Sep 17 00:00:00 2001
From: grudloff
Date: Thu, 26 Mar 2020 16:18:22 +0100
Subject: [PATCH 60/70] test generated n_basis and basis shape

---
 test/metric_learn_test.py | 46 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index 9240ffa6..62dcc0f5 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -26,7 +26,7 @@
 from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC,
                           SCML_Supervised, LSML_Supervised,
                           ITML_Supervised, SDML_Supervised, RCA_Supervised,
                           MMC_Supervised, SDML, RCA, ITML, LSML, SCML)
 # Import this specially for testing.
-from metric_learn.constraints import wrap_pairs
+from metric_learn.constraints import wrap_pairs, Constraints
 from metric_learn.lmnn import _sum_outer_products

@@ -210,7 +210,7 @@ def test_verbose(self, estimator, data, capsys):
         'max iteration reached.\n' % estimator.__name__)
     assert out == expected_out

-  def test_triplet_diffs(self):
+  def test_triplet_diffs_toy(self):
     expected_n_basis = 10
     model = SCML_Supervised(n_basis=expected_n_basis)
     X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]])
@@ -223,7 +223,7 @@ def test_triplet_diffs(self):
     assert n_basis == expected_n_basis
     np.testing.assert_allclose(basis, expected_basis)

-  def test_lda(self):
+  def test_lda_toy(self):
     expected_n_basis = 7
     model = SCML_Supervised(n_basis=expected_n_basis)
     X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]])
@@ -236,6 +236,46 @@ def test_lda_toy(self):
     assert n_basis == expected_n_basis
     np.testing.assert_allclose(np.abs(basis), expected_basis)

+  @pytest.mark.parametrize('n_samples', [100, 500, 1000])
+  @pytest.mark.parametrize('n_features', [10, 50, 100])
+  @pytest.mark.parametrize('n_classes', [5, 10, 15])
+  def test_triplet_diffs(self, n_samples, n_features, n_classes):
+    X, y = make_classification(n_samples=n_samples, n_classes=n_classes,
+                               n_features=n_features, n_informative=n_features,
+                               n_redundant=0, n_repeated=0)
+    X = StandardScaler().fit_transform(X)
+
+    model = SCML_Supervised()
+    constraints = Constraints(y)
+    triplets = constraints.generate_knntriplets(X, model.k_genuine,
+                                                model.k_impostor)
+    basis, n_basis = model._generate_bases_dist_diff(triplets, X)
+
+    expected_n_basis = n_features*80
+    expected_shape = (expected_n_basis, n_features)
+
+    assert n_basis == expected_n_basis
+    assert basis.shape == expected_shape
+
+  @pytest.mark.parametrize('n_samples', [100, 500, 1000])
+  @pytest.mark.parametrize('n_features', [10, 50, 100])
+  @pytest.mark.parametrize('n_classes', [5, 10, 15])
+  def test_lda(self, n_samples, n_features, n_classes):
+    X, y = make_classification(n_samples=n_samples, n_classes=n_classes,
+                               n_features=n_features, n_informative=n_features,
+                               n_redundant=0, n_repeated=0)
+    X = StandardScaler().fit_transform(X)
+
+    model = SCML_Supervised()
+    basis, n_basis = model._generate_bases_LDA(X,
y) + + num_eig = min(n_classes-1, n_features) + expected_n_basis = min(20*n_features, n_samples*2*num_eig - 1) + expected_shape = (expected_n_basis, n_features) + + assert n_basis == expected_n_basis + assert basis.shape == expected_shape + class TestLSML(MetricTestCase): def test_iris(self): From 5fd80e9906a1f30974ba5772c26e17dc0f5e98fc Mon Sep 17 00:00:00 2001 From: grudloff Date: Thu, 26 Mar 2020 18:08:22 +0100 Subject: [PATCH 61/70] add mini batch optimization --- metric_learn/scml.py | 36 +++++++++++++++++++----------------- test/metric_learn_test.py | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 478e79a5..783b4015 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -22,14 +22,15 @@ class _BaseSCML(MahalanobisMixin): _authorized_basis = ['triplet_diffs'] def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, - gamma=5e-3, max_iter=100000, output_iter=5000, verbose=False, - preprocessor=None, random_state=None): + gamma=5e-3, max_iter=100000, output_iter=5000, batch_size=10, + verbose=False, preprocessor=None, random_state=None): self.beta = beta self.basis = basis self.n_basis = n_basis self.gamma = gamma self.max_iter = max_iter self.output_iter = output_iter + self.batch_size = batch_size self.verbose = verbose self.preprocessor = preprocessor self.random_state = random_state @@ -65,9 +66,12 @@ def _fit(self, triplets, basis=None, n_basis=None): best_obj = np.inf rng = check_random_state(self.random_state) - rand_int = rng.randint(low=0, high=n_triplets, size=self.max_iter) - for iter in range(self.max_iter): - if (iter + 1) % self.output_iter == 0: + max_iter = int(self.max_iter/self.batch_size) + output_iter = int(self.output_iter/self.batch_size) + rand_int = rng.randint(low=0, high=n_triplets, + size=(max_iter, self.batch_size)) + for iter in range(max_iter): + if (iter + 1) % output_iter == 0: # regularization part of obj function obj1 = np.sum(w)*self.beta @@ -84,26 +88,23 @@ def _fit(self, triplets, basis=None, n_basis=None): if self.verbose: count = np.sum(slack_mask) print("[%s] iter %d\t obj %.6f\t num_imp %d" % - (self.__class__.__name__, iter+1, obj, count)) + (self.__class__.__name__, (iter+1)*self.batch_size, obj, count)) # update the best if obj < best_obj: best_obj = obj best_w = w - # TODO: - # Maybe allow the usage of mini-batch opt? 
- idx = rand_int[iter] slack_val = 1 + np.matmul(dist_diff[idx, :], w.T) - if slack_val > 0: - avg_grad_w = (iter * avg_grad_w + dist_diff[idx, :]) / (iter+1) - else: - avg_grad_w = iter * avg_grad_w / (iter+1) + slack_mask = np.squeeze(slack_val > 0, axis=1) + avg_grad_w = ((iter * avg_grad_w + np.sum(dist_diff[idx[slack_mask], :], + axis=0, keepdims=True)) + / (iter+1)) - scale_f = -np.sqrt(iter+1) / self.gamma + scale_f = -np.sqrt(iter+1) / (self.gamma*self.batch_size) # proximal operator with negative trimming equivalent w = scale_f * np.minimum(avg_grad_w + self.beta, 0) @@ -469,13 +470,14 @@ class SCML_Supervised(_BaseSCML, TransformerMixin): def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='lda', n_basis=None, gamma=5e-3, max_iter=100000, output_iter=5000, - verbose=False, preprocessor=None, random_state=None): + batch_size=10, verbose=False, preprocessor=None, + random_state=None): self.k_genuine = k_genuine self.k_impostor = k_impostor _BaseSCML.__init__(self, beta=beta, basis=basis, n_basis=n_basis, max_iter=max_iter, output_iter=output_iter, - verbose=verbose, preprocessor=preprocessor, - random_state=random_state) + batch_size=batch_size, verbose=verbose, + preprocessor=preprocessor, random_state=random_state) def fit(self, X, y): """Create constraints from labels and learn the SCML model. diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 62dcc0f5..c15ca8dc 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -203,7 +203,7 @@ def test_array_basis(self, estimator, data): def test_verbose(self, estimator, data, capsys): # assert there is proper output when verbose = True model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]), - max_iter=1, output_iter=1, verbose=True) + max_iter=1, output_iter=1, batch_size=1, verbose=True) model.fit(*data) out, _ = capsys.readouterr() expected_out = ('[%s] iter 1\t obj 1.000000\t num_imp 8\n' From 18289e0f74f37e678c58a974a086dccaea6ea901 Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 27 Mar 2020 10:11:22 +0100 Subject: [PATCH 62/70] correct iter convention --- metric_learn/scml.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 783b4015..56de18bf 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -22,7 +22,7 @@ class _BaseSCML(MahalanobisMixin): _authorized_basis = ['triplet_diffs'] def __init__(self, beta=1e-5, basis='triplet_diffs', n_basis=None, - gamma=5e-3, max_iter=100000, output_iter=5000, batch_size=10, + gamma=5e-3, max_iter=10000, output_iter=500, batch_size=10, verbose=False, preprocessor=None, random_state=None): self.beta = beta self.basis = basis @@ -66,12 +66,10 @@ def _fit(self, triplets, basis=None, n_basis=None): best_obj = np.inf rng = check_random_state(self.random_state) - max_iter = int(self.max_iter/self.batch_size) - output_iter = int(self.output_iter/self.batch_size) rand_int = rng.randint(low=0, high=n_triplets, - size=(max_iter, self.batch_size)) - for iter in range(max_iter): - if (iter + 1) % output_iter == 0: + size=(self.max_iter, self.batch_size)) + for iter in range(self.max_iter): + if (iter + 1) % self.output_iter == 0: # regularization part of obj function obj1 = np.sum(w)*self.beta @@ -88,7 +86,7 @@ def _fit(self, triplets, basis=None, n_basis=None): if self.verbose: count = np.sum(slack_mask) print("[%s] iter %d\t obj %.6f\t num_imp %d" % - (self.__class__.__name__, (iter+1)*self.batch_size, obj, count)) + (self.__class__.__name__, 
(iter+1), obj, count)) # update the best if obj < best_obj: @@ -469,7 +467,7 @@ class SCML_Supervised(_BaseSCML, TransformerMixin): _authorized_basis = _BaseSCML._authorized_basis + ['lda'] def __init__(self, k_genuine=3, k_impostor=10, beta=1e-5, basis='lda', - n_basis=None, gamma=5e-3, max_iter=100000, output_iter=5000, + n_basis=None, gamma=5e-3, max_iter=10000, output_iter=500, batch_size=10, verbose=False, preprocessor=None, random_state=None): self.k_genuine = k_genuine From bdf981e338577f5d4fae243158215e4aef7d0318 Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 27 Mar 2020 10:11:49 +0100 Subject: [PATCH 63/70] eliminate n_samples = 1000 --- test/metric_learn_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index c15ca8dc..6912fb49 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -236,7 +236,7 @@ def test_lda_toy(self): assert n_basis == expected_n_basis np.testing.assert_allclose(np.abs(basis), expected_basis) - @pytest.mark.parametrize('n_samples', [100, 500, 1000]) + @pytest.mark.parametrize('n_samples', [100, 500]) @pytest.mark.parametrize('n_features', [10, 50, 100]) @pytest.mark.parametrize('n_classes', [5, 10, 15]) def test_triplet_diffs(self, n_samples, n_features, n_classes): @@ -257,7 +257,7 @@ def test_triplet_diffs(self, n_samples, n_features, n_classes): assert n_basis == expected_n_basis assert basis.shape == expected_shape - @pytest.mark.parametrize('n_samples', [100, 500, 1000]) + @pytest.mark.parametrize('n_samples', [100, 500]) @pytest.mark.parametrize('n_features', [10, 50, 100]) @pytest.mark.parametrize('n_classes', [5, 10, 15]) def test_lda(self, n_samples, n_features, n_classes): From c551344768b1b5b8fa5d4d12af669dbaf65fc4be Mon Sep 17 00:00:00 2001 From: grudloff Date: Fri, 27 Mar 2020 15:19:32 +0100 Subject: [PATCH 64/70] batch grad refactored --- metric_learn/scml.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/metric_learn/scml.py b/metric_learn/scml.py index 56de18bf..2a806988 100644 --- a/metric_learn/scml.py +++ b/metric_learn/scml.py @@ -73,10 +73,10 @@ def _fit(self, triplets, basis=None, n_basis=None): # regularization part of obj function obj1 = np.sum(w)*self.beta - # Every triplet distance difference in the space given by L - # plus a slack of one + # Every triplet distance difference in the space given by L + # plus a slack of one slack_val = 1 + np.matmul(dist_diff, w.T) - # Mask of places with positive slack + # Mask of places with positive slack slack_mask = slack_val > 0 # loss function of learning task part of obj function @@ -96,13 +96,13 @@ def _fit(self, triplets, basis=None, n_basis=None): idx = rand_int[iter] slack_val = 1 + np.matmul(dist_diff[idx, :], w.T) - slack_mask = np.squeeze(slack_val > 0, axis=1) - avg_grad_w = ((iter * avg_grad_w + np.sum(dist_diff[idx[slack_mask], :], - axis=0, keepdims=True)) - / (iter+1)) - scale_f = -np.sqrt(iter+1) / (self.gamma*self.batch_size) + grad_w = np.sum(dist_diff[idx[slack_mask], :], + axis=0, keepdims=True)/self.batch_size + avg_grad_w = (iter * avg_grad_w + grad_w) / (iter+1) + + scale_f = -np.sqrt(iter+1) / self.gamma # proximal operator with negative trimming equivalent w = scale_f * np.minimum(avg_grad_w + self.beta, 0) From c02e6e5b50882a3de07c2aa495e4085c706f9288 Mon Sep 17 00:00:00 2001 From: grudloff Date: Mon, 30 Mar 2020 15:43:42 +0200 Subject: [PATCH 65/70] adagrad adaptive learning --- metric_learn/scml.py | 11 ++++++++++- 1 file 
From c02e6e5b50882a3de07c2aa495e4085c706f9288 Mon Sep 17 00:00:00 2001
From: grudloff
Date: Mon, 30 Mar 2020 15:43:42 +0200
Subject: [PATCH 65/70] adagrad adaptive learning

---
 metric_learn/scml.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index 2a806988..b2e3007d 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -60,9 +60,16 @@ def _fit(self, triplets, basis=None, n_basis=None):
 
     n_triplets = triplets.shape[0]
 
+    # weight vector
     w = np.zeros((1, n_basis))
+    # average obj gradient wrt weights
     avg_grad_w = np.zeros((1, n_basis))
 
+    # l2 norm in time of all obj gradients wrt weights
+    ada_grad_w = np.zeros((1, n_basis))
+    # slack for not dividing by zero
+    delta = 0.001
+
     best_obj = np.inf
 
     rng = check_random_state(self.random_state)
@@ -102,7 +109,9 @@ def _fit(self, triplets, basis=None, n_basis=None):
                       axis=0, keepdims=True)/self.batch_size
       avg_grad_w = (iter * avg_grad_w + grad_w) / (iter+1)
 
-      scale_f = -np.sqrt(iter+1) / self.gamma
+      ada_grad_w = np.sqrt(np.square(ada_grad_w) + np.square(grad_w))
+
+      scale_f = -(iter+1) / self.gamma / (delta + ada_grad_w)
 
       # proximal operator with negative trimming equivalent
       w = scale_f * np.minimum(avg_grad_w + self.beta, 0)
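The patch above swaps the global 1/sqrt(t) step size for an AdaGrad-style per-coordinate one: ada_grad_w accumulates the l2 norm of all past gradients, so coordinates that have seen large gradients take smaller steps, with delta guarding the division. A compact, self-contained sketch of the resulting loop (illustrative data again; the single-division form of scale_f anticipates PATCH 68 below, which is algebraically identical):

    import numpy as np

    rng = np.random.RandomState(0)
    dist_diff = rng.randn(200, 40)
    w = np.zeros((1, 40))
    avg_grad_w = np.zeros((1, 40))
    ada_grad_w = np.zeros((1, 40))   # accumulated l2 norm of past gradients
    beta, gamma, batch_size, delta = 1e-5, 5e-3, 10, 0.001

    for it in range(1000):
        idx = rng.randint(0, dist_diff.shape[0], size=batch_size)
        slack_mask = np.squeeze(1 + np.matmul(dist_diff[idx, :], w.T) > 0,
                                axis=1)
        grad_w = np.sum(dist_diff[idx[slack_mask], :], axis=0,
                        keepdims=True) / batch_size
        avg_grad_w = (it * avg_grad_w + grad_w) / (it + 1)
        ada_grad_w = np.sqrt(np.square(ada_grad_w) + np.square(grad_w))
        # coordinates with a large gradient history move more cautiously
        scale_f = -(it + 1) / (gamma * (delta + ada_grad_w))
        w = scale_f * np.minimum(avg_grad_w + beta, 0)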
From 9fd186cefbf783b0f32c138d86963022c9cdffe9 Mon Sep 17 00:00:00 2001
From: grudloff
Date: Mon, 30 Mar 2020 21:44:03 +0200
Subject: [PATCH 66/70] int input checks and tests

---
 metric_learn/scml.py      | 54 +++++++++++++++++++++++++++------------
 test/metric_learn_test.py | 51 ++++++++++++++++++++++++++++++++++--
 2 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index b2e3007d..08809b7b 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -43,6 +43,20 @@ def _fit(self, triplets, basis=None, n_basis=None):
     dual averaging method.
     """
 
+    if not isinstance(self.max_iter, int):
+      raise ValueError("max_iter should be an integer, instead it is of type"
+                       " %s" % type(self.max_iter))
+    if not isinstance(self.output_iter, int):
+      raise ValueError("output_iter should be an integer, instead it is of "
+                       "type %s" % type(self.output_iter))
+    if not isinstance(self.batch_size, int):
+      raise ValueError("batch_size should be an integer, instead it is of type"
+                       " %s" % type(self.batch_size))
+
+    if(self.output_iter > self.max_iter):
+      raise ValueError("The value of output_iter must be equal or smaller than"
+                       " max_iter.")
+
     # Currently prepare_inputs makes triplets contain points and not indices
     triplets = self._prepare_inputs(triplets, type_of_inputs='tuples')
@@ -76,6 +90,23 @@ def _fit(self, triplets, basis=None, n_basis=None):
     rand_int = rng.randint(low=0, high=n_triplets,
                            size=(self.max_iter, self.batch_size))
     for iter in range(self.max_iter):
+
+      idx = rand_int[iter]
+
+      slack_val = 1 + np.matmul(dist_diff[idx, :], w.T)
+      slack_mask = np.squeeze(slack_val > 0, axis=1)
+
+      grad_w = np.sum(dist_diff[idx[slack_mask], :],
+                      axis=0, keepdims=True)/self.batch_size
+      avg_grad_w = (iter * avg_grad_w + grad_w) / (iter+1)
+
+      ada_grad_w = np.sqrt(np.square(ada_grad_w) + np.square(grad_w))
+
+      scale_f = -(iter+1) / self.gamma / (delta + ada_grad_w)
+
+      # proximal operator with negative trimming equivalent
+      w = scale_f * np.minimum(avg_grad_w + self.beta, 0)
+
       if (iter + 1) % self.output_iter == 0:
         # regularization part of obj function
         obj1 = np.sum(w)*self.beta
@@ -100,22 +131,6 @@ def _fit(self, triplets, basis=None, n_basis=None):
           best_obj = obj
           best_w = w
 
-      idx = rand_int[iter]
-
-      slack_val = 1 + np.matmul(dist_diff[idx, :], w.T)
-      slack_mask = np.squeeze(slack_val > 0, axis=1)
-
-      grad_w = np.sum(dist_diff[idx[slack_mask], :],
-                      axis=0, keepdims=True)/self.batch_size
-      avg_grad_w = (iter * avg_grad_w + grad_w) / (iter+1)
-
-      ada_grad_w = np.sqrt(np.square(ada_grad_w) + np.square(grad_w))
-
-      scale_f = -(iter+1) / self.gamma / (delta + ada_grad_w)
-
-      # proximal operator with negative trimming equivalent
-      w = scale_f * np.minimum(avg_grad_w + self.beta, 0)
-
     if self.verbose:
       print("max iteration reached.")
@@ -506,6 +521,13 @@ def fit(self, X, y):
 
     basis, n_basis = self._initialize_basis_supervised(X, y)
 
+    if not isinstance(self.k_genuine, int):
+      raise ValueError("k_genuine should be an integer, instead it is of type"
+                       " %s" % type(self.k_genuine))
+    if not isinstance(self.k_impostor, int):
+      raise ValueError("k_impostor should be an integer, instead it is of "
+                       "type %s" % type(self.k_impostor))
+
     constraints = Constraints(y)
     triplets = constraints.generate_knntriplets(X, self.k_genuine,
                                                 self.k_impostor)
diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index 6912fb49..f6cce305 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -203,10 +203,11 @@ def test_array_basis(self, estimator, data):
   def test_verbose(self, estimator, data, capsys):
     # assert there is proper output when verbose = True
     model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]),
-                      max_iter=1, output_iter=1, batch_size=1, verbose=True)
+                      max_iter=1, output_iter=1, batch_size=1, basis='triplet_diffs',
+                      random_state=42, verbose=True)
     model.fit(*data)
     out, _ = capsys.readouterr()
-    expected_out = ('[%s] iter 1\t obj 1.000000\t num_imp 8\n'
+    expected_out = ('[%s] iter 1\t obj 0.569946\t num_imp 2\n'
                     'max iteration reached.\n' % estimator.__name__)
     assert out == expected_out
@@ -276,6 +277,52 @@ def test_lda(self, n_samples, n_features, n_classes):
     assert n_basis == expected_n_basis
     assert basis.shape == expected_shape
 
+  @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size',
+                                    'n_basis'])
+  def test_int_inputs(self, name):
+    value = 1.0
+    d = {name: value}
+    scml = SCML(**d)
+    triplets = np.array([[[0, 1], [2, 1], [0, 0]],
+                         [[2, 1], [0, 1], [2, 0]],
+                         [[0, 0], [2, 0], [0, 1]],
+                         [[2, 0], [0, 0], [2, 1]]])
+
+    msg = name
+    msg += (" should be an integer, instead it is of type"
+            " %s" % type(value))
+    with pytest.raises(ValueError) as raised_error:
+      scml.fit(triplets)
+    assert msg == raised_error.value.args[0]
+
+  @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size',
+                                    'k_genuine', 'k_impostor', 'n_basis'])
+  def test_int_inputs_supervised(self, name):
+    value = 1.0
+    d = {name: value}
+    scml = SCML_Supervised(**d)
+    X = np.array([[0, 0], [1, 1], [3, 3], [4, 4]])
+    y = np.array([1, 1, 0, 0])
+    msg = name
+    msg += (" should be an integer, instead it is of type"
+            " %s" % type(value))
+    with pytest.raises(ValueError) as raised_error:
+      scml.fit(X, y)
+    assert msg == raised_error.value.args[0]
+
+  def test_large_output_iter(self):
+    scml = SCML(max_iter=1, output_iter=2)
+    triplets = np.array([[[0, 1], [2, 1], [0, 0]],
+                         [[2, 1], [0, 1], [2, 0]],
+                         [[0, 0], [2, 0], [0, 1]],
+                         [[2, 0], [0, 0], [2, 1]]])
+    msg = ("The value of output_iter must be equal or smaller than"
+           " max_iter.")
+
+    with pytest.raises(ValueError) as raised_error:
+      scml.fit(triplets)
+    assert msg == raised_error.value.args[0]
+
 
 class TestLSML(MetricTestCase):
   def test_iris(self):
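The checks added above reject float-valued count parameters outright instead of silently truncating them, and they interpolate the offending type into the exact message the new tests assert on. The pattern in isolation (check_int is our name for illustration, not a helper in the library):

    def check_int(name, value):
        # same ValueError text as the patch, so tests can match it exactly
        if not isinstance(value, int):
            raise ValueError("%s should be an integer, instead it is of type"
                             " %s" % (name, type(value)))

    check_int("max_iter", 10000)   # fine
    # check_int("max_iter", 1e4)   # would raise: 1e4 is a float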
From 8dbaad9bfd44cbc801c96e31cf0e94cc58d1cf97 Mon Sep 17 00:00:00 2001
From: grudloff
Date: Tue, 31 Mar 2020 14:11:22 +0200
Subject: [PATCH 67/70] flake8 fix

---
 test/metric_learn_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index f6cce305..699707b0 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -203,8 +203,8 @@ def test_array_basis(self, estimator, data):
   def test_verbose(self, estimator, data, capsys):
     # assert there is proper output when verbose = True
     model = estimator(preprocessor=np.array([[0, 0], [1, 1], [2, 2], [3, 3]]),
-                      max_iter=1, output_iter=1, batch_size=1, basis='triplet_diffs',
-                      random_state=42, verbose=True)
+                      max_iter=1, output_iter=1, batch_size=1,
+                      basis='triplet_diffs', random_state=42, verbose=True)
     model.fit(*data)
     out, _ = capsys.readouterr()
     expected_out = ('[%s] iter 1\t obj 0.569946\t num_imp 2\n'

From 95e5fe86b3744922cf87388d530a025e5291143a Mon Sep 17 00:00:00 2001
From: grudloff
Date: Wed, 1 Apr 2020 17:15:42 +0200
Subject: [PATCH 68/70] no double division and smaller triplets arrays

---
 metric_learn/scml.py      |  2 +-
 test/metric_learn_test.py | 10 ++--------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index 08809b7b..7bbd101a 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -102,7 +102,7 @@ def _fit(self, triplets, basis=None, n_basis=None):
 
       ada_grad_w = np.sqrt(np.square(ada_grad_w) + np.square(grad_w))
 
-      scale_f = -(iter+1) / self.gamma / (delta + ada_grad_w)
+      scale_f = -(iter+1) / (self.gamma * (delta + ada_grad_w))
 
       # proximal operator with negative trimming equivalent
       w = scale_f * np.minimum(avg_grad_w + self.beta, 0)
diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index 699707b0..1499e899 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -283,10 +283,7 @@ def test_int_inputs(self, name):
     value = 1.0
     d = {name: value}
     scml = SCML(**d)
-    triplets = np.array([[[0, 1], [2, 1], [0, 0]],
-                         [[2, 1], [0, 1], [2, 0]],
-                         [[0, 0], [2, 0], [0, 1]],
-                         [[2, 0], [0, 0], [2, 1]]])
+    triplets = np.array([[[0, 1], [2, 1], [0, 0]]])
 
     msg = name
@@ -312,10 +309,7 @@ def test_int_inputs_supervised(self, name):
 
   def test_large_output_iter(self):
     scml = SCML(max_iter=1, output_iter=2)
-    triplets = np.array([[[0, 1], [2, 1], [0, 0]],
-                         [[2, 1], [0, 1], [2, 0]],
-                         [[0, 0], [2, 0], [0, 1]],
-                         [[2, 0], [0, 0], [2, 1]]])
+    triplets = np.array([[[0, 1], [2, 1], [0, 0]]])
     msg = ("The value of output_iter must be equal or smaller than"
            " max_iter.")
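The scml.py hunk in the patch above is purely algebraic: for nonzero divisors, a / b / c equals a / (b * c) up to floating-point rounding, so collapsing the chained division saves one division per iteration without changing the result. A one-line check with illustrative values:

    a, b, c = -101.0, 5e-3, 0.201
    assert abs(a / b / c - a / (b * c)) < 1e-9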
From 2aed606c16aa8a160d179c0c538c518f9446f735 Mon Sep 17 00:00:00 2001
From: CJ Carey
Date: Wed, 17 Jun 2020 11:25:16 -0400
Subject: [PATCH 69/70] minor grammar fixes

---
 doc/weakly_supervised.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst
index 95f9320a..82793b5b 100644
--- a/doc/weakly_supervised.rst
+++ b/doc/weakly_supervised.rst
@@ -708,7 +708,7 @@ Algorithms
 
 Sparse Compositional Metric Learning (:py:class:`SCML <metric_learn.SCML>`)
 
-`SCML` learns an squared Mahalanobis distance from triplet constraints by
+`SCML` learns a squared Mahalanobis distance from triplet constraints by
 optimizing sparse positive weights assigned to a set of :math:`K` rank-one
 PSD bases. This can be formulated as an optimization problem with only
 :math:`K` parameters, that can be solved with an efficient stochastic
@@ -722,7 +722,7 @@ weighted by a :math:`K` dimensional vector :math:`w = \{w_i\}_{i=\{1,...,K\}}` as:
 
 .. math::
 
     M = \sum_{i=1}^K w_i b_i b_i^T = B \cdot diag(w) \cdot B^T \quad w_i \geq 0
 
 Learning :math:`M` in this form makes it PSD by design, as it is a
-nonnegative sum of PSD matrices. The basis set :math:`B` is fixed on advance
+nonnegative sum of PSD matrices. The basis set :math:`B` is fixed in advance
 and it is possible to construct it from the data. The optimization problem
 over :math:`w` is formulated as a classic margin-based hinge loss function
 involving the set :math:`C` of triplets. A regularization :math:`\ell_1`
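The documentation hunk above describes the composition M = sum_i w_i b_i b_i^T = B diag(w) B^T, which is PSD whenever the weights are nonnegative, being a nonnegative sum of rank-one PSD terms. That claim is easy to verify numerically; in this sketch B stores the K basis vectors as rows, so the product reads B.T @ diag(w) @ B:

    import numpy as np

    rng = np.random.RandomState(0)
    K, d = 8, 3
    B = rng.randn(K, d)         # K basis directions b_i as rows
    w = np.abs(rng.randn(K))    # nonnegative weights
    M = B.T @ np.diag(w) @ B    # equals sum_i w_i * outer(b_i, b_i)
    assert np.all(np.linalg.eigvalsh(M) >= -1e-10)   # PSD up to round-off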
From cba1cf62fd43fff04c37d2c86be516e90733ec28 Mon Sep 17 00:00:00 2001
From: CJ Carey
Date: Wed, 17 Jun 2020 11:32:54 -0400
Subject: [PATCH 70/70] minor formatting tweaks

---
 test/metric_learn_test.py | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py
index 1499e899..318b200e 100644
--- a/test/metric_learn_test.py
+++ b/test/metric_learn_test.py
@@ -117,11 +117,9 @@ def test_dimension_reduction_msg(self):
                          [[2, 1], [0, 1], [2, 0]],
                          [[0, 0], [2, 0], [0, 1]],
                          [[2, 0], [0, 0], [2, 1]]])
-    n_basis = 1
     msg = ("The number of bases with nonzero weight is less than the "
            "number of features of the input, in consequence the "
-           "learned transformation reduces the dimension to %d."
-           % n_basis)
+           "learned transformation reduces the dimension to 1.")
     with pytest.warns(UserWarning) as raised_warning:
       scml.fit(triplets)
     assert msg == raised_warning[0].message.args[0]
@@ -135,7 +133,6 @@ def test_dimension_reduction_msg(self):
                                                [3, 3]]),
                                      np.array([1, 2, 3])))])
   def test_n_basis_wrong_type(self, estimator, data):
-
     n_basis = 4.0
     model = estimator(n_basis=n_basis)
     msg = ("n_basis should be an integer, instead it is of type %s"
@@ -149,7 +146,6 @@ def test_n_basis_wrong_type(self, estimator, data):
   def test_small_n_basis_lda(self):
     X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]])
     y = np.array([0, 0, 1, 1])
     n_class = 2
-
     scml = SCML_Supervised(n_basis=n_class-1)
     msg = ("The number of basis is less than the number of classes, which may"
            " lead to poor discriminative performance.")
@@ -162,9 +158,8 @@ def test_big_n_basis_lda(self):
     y = np.array([1, 2, 3])
     n_class = 3
-    num_eig = min(n_class-1, X.shape[1])
-
-    n_basis = X.shape[0]*2*num_eig
+    num_eig = min(n_class - 1, X.shape[1])
+    n_basis = X.shape[0] * 2 * num_eig
     scml = SCML_Supervised(n_basis=n_basis)
     msg = ("Not enough samples to generate %d LDA bases, n_basis"
@@ -184,7 +179,6 @@ def test_array_basis(self, estimator, data):
        array is not consistent with the input
     """
     basis = np.eye(3)
-
     scml = estimator(n_basis=3, basis=basis)
 
     msg = ('The dimensionality ({}) of the provided bases must match the '
@@ -252,11 +246,9 @@ def test_triplet_diffs(self, n_samples, n_features, n_classes):
                                               model.k_impostor)
     basis, n_basis = model._generate_bases_dist_diff(triplets, X)
 
-    expected_n_basis = n_features*80
-    expected_shape = (expected_n_basis, n_features)
-
+    expected_n_basis = n_features * 80
     assert n_basis == expected_n_basis
-    assert basis.shape == expected_shape
+    assert basis.shape == (expected_n_basis, n_features)
@@ -270,12 +262,10 @@ def test_lda(self, n_samples, n_features, n_classes):
     model = SCML_Supervised()
     basis, n_basis = model._generate_bases_LDA(X, y)
 
-    num_eig = min(n_classes-1, n_features)
-    expected_n_basis = min(20*n_features, n_samples*2*num_eig - 1)
-    expected_shape = (expected_n_basis, n_features)
-
+    num_eig = min(n_classes - 1, n_features)
+    expected_n_basis = min(20 * n_features, n_samples * 2 * num_eig - 1)
     assert n_basis == expected_n_basis
-    assert basis.shape == expected_shape
+    assert basis.shape == (expected_n_basis, n_features)
 
   @pytest.mark.parametrize('name', ['max_iter', 'output_iter', 'batch_size',
                                     'n_basis'])
@@ -285,9 +275,8 @@ def test_int_inputs(self, name):
     scml = SCML(**d)
     triplets = np.array([[[0, 1], [2, 1], [0, 0]]])
 
-    msg = name
-    msg += (" should be an integer, instead it is of type"
-            " %s" % type(value))
+    msg = ("%s should be an integer, instead it is of type"
+           " %s" % (name, type(value)))
     with pytest.raises(ValueError) as raised_error:
       scml.fit(triplets)
     assert msg == raised_error.value.args[0]
@@ -300,9 +289,8 @@ def test_int_inputs_supervised(self, name):
     scml = SCML_Supervised(**d)
     X = np.array([[0, 0], [1, 1], [3, 3], [4, 4]])
     y = np.array([1, 1, 0, 0])
-    msg = name
-    msg += (" should be an integer, instead it is of type"
-            " %s" % type(value))
+    msg = ("%s should be an integer, instead it is of type"
+           " %s" % (name, type(value)))
     with pytest.raises(ValueError) as raised_error:
       scml.fit(X, y)
     assert msg == raised_error.value.args[0]
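To close the series, a hedged end-to-end sketch of how the finished estimator is meant to be used, on synthetic data. The parameter values are illustrative, and this assumes the metric-learn API as it stands at the end of these patches, with SCML_Supervised exposing fit and transform:

    import numpy as np
    from metric_learn import SCML_Supervised

    rng = np.random.RandomState(42)
    X = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 5])  # two separated blobs
    y = np.array([0] * 20 + [1] * 20)

    scml = SCML_Supervised(k_genuine=3, k_impostor=5, random_state=42)
    scml.fit(X, y)                  # builds kNN triplets, learns sparse weights
    X_embedded = scml.transform(X)  # data mapped by the learned components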