From bdaf924a8385d47d4164e78649f4139f0f93896a Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Thu, 8 Jun 2017 14:01:29 +0200
Subject: [PATCH 01/55] ENH improve memory usage of t-sne

---
 sklearn/manifold/_barnes_hut_tsne.pyx | 168 +++++++++-------------
 sklearn/manifold/_utils.pyx           |  56 +++-----
 sklearn/manifold/setup.py             |   1 +
 sklearn/manifold/t_sne.py             | 188 +++++++++++++------------
 sklearn/manifold/tests/test_t_sne.py  |  88 ++++++++----
 sklearn/mixture/base.py               |   6 +-
 6 files changed, 248 insertions(+), 259 deletions(-)

diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx
index 62cb036f7ab7d..7b478b455a572 100644
--- a/sklearn/manifold/_barnes_hut_tsne.pyx
+++ b/sklearn/manifold/_barnes_hut_tsne.pyx
@@ -11,8 +11,8 @@
 from libc.stdlib cimport malloc, free
 from libc.stdio cimport printf
 from libc.math cimport sqrt, log
 
-cimport numpy as np
 import numpy as np
+cimport numpy as np
 
 cdef char* EMPTY_STRING = ""
@@ -219,7 +219,7 @@ cdef inline int offset2index(int* offset, int n_dimensions) nogil:
     for dim in range(n_dimensions):
         index += (2 ** dim) * offset[n_dimensions - dim - 1]
     if DEBUGFLAG:
-        printf("o2i index %i dim %i offset", index, dim)
+        printf("o2i index %i dim %i offset", index, dim)
         for j in range(n_dimensions):
             printf(" %i", offset[j])
         printf(" n_dimensions %i\n", n_dimensions)
@@ -250,8 +250,8 @@ cdef int insert(Node *root, float pos[3], long point_index, long depth, long
     cdef int not_identical = 1
     cdef int n_dimensions = root.tree.n_dimensions
     if DEBUGFLAG:
-        printf("[t-SNE] [d=%i] Inserting pos %i [%f, %f] duplicate_count=%i "
-               "into child %p\n", depth, point_index, pos[0], pos[1],
+        printf("[t-SNE] [d=%li] Inserting pos %li [%f, %f] duplicate_count=%li"
+               " into child %p\n", depth, point_index, pos[0], pos[1],
                duplicate_count, root)
     # Increment the total number points including this
     # node and below it
@@ -286,7 +286,7 @@ cdef int insert(Node *root, float pos[3], long point_index, long depth, long
     if (root.size == 0) & root.is_leaf:
         # Root node is empty and a leaf
         if DEBUGFLAG:
-            printf("[t-SNE] [d=%i] Inserting [%f, %f] into blank cell\n", depth,
+            printf("[t-SNE] [d=%li] Inserting [%f, %f] into blank cell\n", depth,
                    pos[0], pos[1])
         for ax in range(n_dimensions):
             root.leaf_point_position[ax] = pos[ax]
@@ -296,9 +296,9 @@ cdef int insert(Node *root, float pos[3], long point_index, long depth, long
     else:
         # Root node is occupied or not a leaf
         if DEBUGFLAG:
-            printf("[t-SNE] [d=%i] Node %p is occupied or is a leaf.\n", depth,
+            printf("[t-SNE] [d=%li] Node %p is occupied or is a leaf.\n", depth,
                    root)
-            printf("[t-SNE] [d=%i] Node %p leaf = %i. Size %i\n", depth, root,
+            printf("[t-SNE] [d=%li] Node %p leaf = %i. Size %li\n", depth, root,
                    root.is_leaf, root.size)
         if root.is_leaf & (root.size > 0):
             # is a leaf node and is occupied
@@ -308,15 +308,15 @@ cdef int insert(Node *root, float pos[3], long point_index, long depth, long
             if not_identical == 1:
                 root.size += duplicate_count
                 if DEBUGFLAG:
-                    printf("[t-SNE] Warning: [d=%i] Detected identical "
-                           "points. Returning. Leaf now has size %i\n",
+                    printf("[t-SNE] Warning: [d=%li] Detected identical "
+                           "points. Returning. Leaf now has size %li\n",
                            depth, root.size)
                 return 0
     # If necessary, subdivide this node before
    # descending
     if root.is_leaf:
         if DEBUGFLAG:
-            printf("[t-SNE] [d=%i] Subdividing this leaf node %p\n", depth,
+            printf("[t-SNE] [d=%li] Subdividing this leaf node %p\n", depth,
                    root)
         subdivide(root)
     # We have two points to relocate: the one previously
@@ -325,12 +325,12 @@ cdef int insert(Node *root, float pos[3], long point_index, long depth, long
     if root.size > 0:
         child = select_child(root, root.leaf_point_position, root.point_index)
         if DEBUGFLAG:
-            printf("[t-SNE] [d=%i] Relocating old point to node %p\n",
+            printf("[t-SNE] [d=%li] Relocating old point to node %p\n",
                    depth, child)
         insert(child, root.leaf_point_position, root.point_index, depth + 1,
                root.size)
     # Insert the new point
     if DEBUGFLAG:
-        printf("[t-SNE] [d=%i] Selecting node for new point\n", depth)
+        printf("[t-SNE] [d=%li] Selecting node for new point\n", depth)
     child = select_child(root, pos, point_index)
     if root.size > 0:
         # Remove the point from this node
@@ -351,7 +351,7 @@ cdef int insert_many(Tree* tree, float[:,:] pos_array) nogil:
         for ax in range(tree.n_dimensions):
             row[ax] = pos_array[i, ax]
         if DEBUGFLAG:
-            printf("[t-SNE] inserting point %i: [%f, %f]\n", i, row[0], row[1])
+            printf("[t-SNE] inserting point %li: [%f, %f]\n", i, row[0], row[1])
         err = insert(tree.root_node, row, i, 0, 1)
         if err != 0:
             printf("[t-SNE] ERROR\n%s", EMPTY_STRING)
@@ -416,7 +416,7 @@ cdef long count_points(Node* root, long count) nogil:
         count += root.size
         if DEBUGFLAG :
             printf("[t-SNE] %p is a leaf node, no children\n", root)
-            printf("[t-SNE] %i points in node %p\n", count, root)
+            printf("[t-SNE] %li points in node %p\n", count, root)
         return count
     # Otherwise, get the children
     for idx in range(root.tree.n_cell_per_node):
@@ -425,7 +425,7 @@
             printf("[t-SNE] Counting points for child %p\n", child)
         if child.is_leaf and child.size > 0:
             if DEBUGFLAG:
-                printf("[t-SNE] Child has size %d\n", child.size)
+                printf("[t-SNE] Child has size %ld\n", child.size)
             count += child.size
         elif not child.is_leaf:
             if DEBUGFLAG:
@@ -436,13 +436,14 @@
             # one point, and then the other neighboring cells
             # don't get filled in
     if DEBUGFLAG:
-        printf("[t-SNE] %i points in this node\n", count)
+        printf("[t-SNE] %li points in this node\n", count)
     return count
 
 
-cdef float compute_gradient(float[:,:] val_P,
+cdef float compute_gradient(float[:] val_P,
                             float[:,:] pos_reference,
-                            np.int64_t[:,:] neighbors,
+                            np.int64_t[:] neighbors,
+                            np.int64_t[:] indptr,
                             float[:,:] tot_force,
                             Node* root_node,
                             float theta,
@@ -453,46 +454,46 @@
     # in two components, the positive and negative forces
     cdef long i, coord
     cdef int ax
-    cdef long n = pos_reference.shape[0]
+    cdef long n_samples = pos_reference.shape[0]
     cdef int n_dimensions = root_node.tree.n_dimensions
     if root_node.tree.verbose > 11:
-        printf("[t-SNE] Allocating %i elements in force arrays\n",
-               n * n_dimensions * 2)
+        printf("[t-SNE] Allocating %li elements in force arrays\n",
+               n_samples * n_dimensions * 2)
     cdef float* sum_Q = malloc(sizeof(float))
-    cdef float* neg_f = malloc(sizeof(float) * n * n_dimensions)
-    cdef float* neg_f_fast = malloc(sizeof(float) * n * n_dimensions)
-    cdef float* pos_f = malloc(sizeof(float) * n * n_dimensions)
+    cdef float* neg_f = malloc(sizeof(float) * n_samples * n_dimensions)
+    cdef float* pos_f = malloc(sizeof(float) * n_samples * n_dimensions)
     cdef clock_t t1, t2
     cdef float sQ, error
     sum_Q[0] = 0.0
     t1 = clock()
-    compute_gradient_negative(val_P, pos_reference, neg_f, root_node, sum_Q,
+    compute_gradient_negative(pos_reference, neg_f, root_node, sum_Q,
                               dof, theta, start, stop)
     t2 = clock()
     if root_node.tree.verbose > 15:
         printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1)))
     sQ = sum_Q[0]
     t1 = clock()
-    error = compute_gradient_positive(val_P, pos_reference, neighbors, pos_f,
-                                      n_dimensions, dof, sQ, start, root_node.tree.verbose)
+    error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr,
+                                      pos_f, n_dimensions, dof, sQ, start,
+                                      root_node.tree.verbose)
     t2 = clock()
     if root_node.tree.verbose > 15:
         printf("[t-SNE] Computing positive gradient: %e ticks\n", ((float) (t2 - t1)))
-    for i in range(start, n):
+    for i in range(start, n_samples):
         for ax in range(n_dimensions):
             coord = i * n_dimensions + ax
             tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sum_Q[0])
     free(sum_Q)
     free(neg_f)
-    free(neg_f_fast)
     free(pos_f)
-    return sQ
+    return error
 
 
-cdef float compute_gradient_positive(float[:,:] val_P,
+cdef float compute_gradient_positive(float[:] val_P,
                                      float[:,:] pos_reference,
-                                     np.int64_t[:,:] neighbors,
+                                     np.int64_t[:] neighbors,
+                                     np.int64_t[:] indptr,
                                      float* pos_f,
                                      int n_dimensions,
                                      float dof,
@@ -507,43 +508,41 @@
     cdef:
         int ax
         long i, j, k
-        long K = neighbors.shape[1]
-        long n = val_P.shape[0]
-        float[3] buff
-        float D, Q, pij
+        long n_samples = indptr.shape[0] - 1
+        float dij, qij, pij
         float C = 0.0
         float exponent = (dof + 1.0) / -2.0
     cdef clock_t t1, t2
+    cdef float* buff = malloc(sizeof(float) * n_dimensions)
     t1 = clock()
-    for i in range(start, n):
+    for i in range(start, n_samples):
         for ax in range(n_dimensions):
             pos_f[i * n_dimensions + ax] = 0.0
-        for k in range(K):
-            j = neighbors[i, k]
+        for k in range(indptr[i], indptr[i+1]):
+            j = neighbors[k]
             # we don't need to exclude the i==j case since we've
             # already thrown it out from the list of neighbors
-            D = 0.0
-            Q = 0.0
-            pij = val_P[i, j]
+            dij = 0.0
+            pij = val_P[k]
             for ax in range(n_dimensions):
                 buff[ax] = pos_reference[i, ax] - pos_reference[j, ax]
-                D += buff[ax] ** 2.0
-            Q = (((1.0 + D) / dof) ** exponent)
-            D = pij * Q
-            Q /= sum_Q
-            C += pij * log((pij + EPSILON) / (Q + EPSILON))
+                dij += buff[ax] * buff[ax]
+            qij = (((1.0 + dij) / dof) ** exponent)
+            dij = pij * qij
+            qij /= sum_Q
+            C += pij * log((pij + EPSILON) / (qij + EPSILON))
             for ax in range(n_dimensions):
-                pos_f[i * n_dimensions + ax] += D * buff[ax]
+                pos_f[i * n_dimensions + ax] += dij * buff[ax]
     t2 = clock()
     dt = ((float) (t2 - t1))
     if verbose > 10:
         printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
-    return C
+    free(buff)
+    return C
 
 
-cdef void compute_gradient_negative(float[:,:] val_P,
-                                    float[:,:] pos_reference,
+cdef void compute_gradient_negative(float[:,:] pos_reference,
                                     float* neg_f,
                                     Node *root_node,
                                     float* sum_Q,
@@ -613,8 +612,8 @@
         dta += t2 - t1
         dtb += t3 - t2
     if root_node.tree.verbose > 20:
-        printf("[t-SNE] Tree: %i clock ticks | ", dta)
-        printf("Force computation: %i clock ticks\n", dtb)
+        printf("[t-SNE] Tree: %li clock ticks | ", dta)
+        printf("Force computation: %li clock ticks\n", dtb)
     free(iQ)
     free(force)
     free(pos)
@@ -680,37 +679,6 @@ cdef void compute_non_edge_forces(Node* node,
                                   l)
 
 
-cdef float compute_error(float[:, :] val_P,
-                         float[:, :] pos_reference,
-                         np.int64_t[:,:] neighbors,
-                         float sum_Q,
-                         int n_dimensions,
-                         int verbose) nogil:
-    cdef int i, j, ax
-    cdef int I = neighbors.shape[0]
-    cdef int K = neighbors.shape[1]
-    cdef float pij, Q
-    cdef float C = 0.0
-    cdef clock_t t1, t2
-    cdef float dt, delta
-    t1 = clock()
-    for i in range(I):
-        for k in range(K):
-            j = neighbors[i, k]
-            pij = val_P[i, j]
-            Q = 0.0
-            for ax in range(n_dimensions):
-                delta = (pos_reference[i, ax] - pos_reference[j, ax])
-                Q += delta * delta
-            Q = (1.0 / (sum_Q + Q * sum_Q))
-            C += pij * log((pij + EPSILON) / (Q + EPSILON))
-    t2 = clock()
-    dt = ((float) (t2 - t1))
-    if verbose > 10:
-        printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
-    return C
-
-
 def calculate_edge(pos_output):
     # Make the boundaries slightly outside of the data
     # to avoid floating point error near the edge
@@ -724,10 +692,12 @@ def calculate_edge(pos_output):
     right_edge = center + width / 2.0
     return left_edge, right_edge, width
 
-def gradient(float[:,:] pij_input,
-             float[:,:] pos_output,
-             np.int64_t[:,:] neighbors,
-             float[:,:] forces,
+
+def gradient(float[:] val_P,
+             float[:,:] pos_output,
+             np.int64_t[:] neighbors,
+             np.int64_t[:] indptr,
+             float[:,:] forces,
              float theta,
             int n_dimensions,
             int verbose,
@@ -740,24 +710,18 @@ def gradient(float[:,:] pij_input,
     n = pos_output.shape[0]
     left_edge, right_edge, width = calculate_edge(pos_output)
     assert width.itemsize == 4
-    assert pij_input.itemsize == 4
+    assert val_P.itemsize == 4
     assert pos_output.itemsize == 4
     assert forces.itemsize == 4
-    m = "Number of neighbors must be < # of points - 1"
-    assert n - 1 >= neighbors.shape[1], m
-    m = "neighbors array and pos_output shapes are incompatible"
-    assert n == neighbors.shape[0], m
     m = "Forces array and pos_output shapes are incompatible"
     assert n == forces.shape[0], m
     m = "Pij and pos_output shapes are incompatible"
-    assert n == pij_input.shape[0], m
-    m = "Pij and pos_output shapes are incompatible"
-    assert n == pij_input.shape[1], m
+    assert n == indptr.shape[0] - 1, m
     if verbose > 10:
         printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
     cdef Tree* qt = init_tree(left_edge, width, n_dimensions, verbose)
     if verbose > 10:
-        printf("[t-SNE] Inserting %i points\n", pos_output.shape[0])
+        printf("[t-SNE] Inserting %li points\n", pos_output.shape[0])
     err = insert_many(qt, pos_output)
     assert err == 0, "[t-SNE] Insertion failed"
     if verbose > 10:
@@ -765,18 +729,16 @@
         # in the generated C code that triggers error with gcc 4.9
         # and -Werror=format-security
         printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)
-    sum_Q = compute_gradient(pij_input, pos_output, neighbors, forces,
-                             qt.root_node, theta, dof, skip_num_points, -1)
-    C = compute_error(pij_input, pos_output, neighbors, sum_Q, n_dimensions,
-                      verbose)
+    C = compute_gradient(val_P, pos_output, neighbors, indptr, forces,
+                         qt.root_node, theta, dof, skip_num_points, -1)
     if verbose > 10:
         # XXX: format hack to workaround lack of `const char *` type
         # in the generated C code
         # and -Werror=format-security
         printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING)
     cdef long count = count_points(qt.root_node, 0)
-    m = ("Tree consistency failed: unexpected number of points=%i "
-         "at root node=%i" % (count, qt.root_node.cumulative_size))
+    m = ("Tree consistency failed: unexpected number of points=%li "
+         "at root node=%li" % (count, qt.root_node.cumulative_size))
     assert count == qt.root_node.cumulative_size, m
     m = "Tree consistency failed: unexpected number of points on the tree"
     assert count == qt.n_points, m
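[Editor's note, not part of the patch] The rewritten kernels above take the joint probabilities as the three raw arrays of a scipy CSR matrix (val_P, neighbors, indptr) instead of a dense n_samples x n_samples array, which is where the memory saving comes from. Below is a minimal NumPy sketch of how such a layout is walked, mirroring the `for k in range(indptr[i], indptr[i+1])` loop in compute_gradient_positive; the toy matrix is illustrative only.

    import numpy as np
    from scipy.sparse import csr_matrix

    # Toy sparse P: row i holds the p_ij of sample i for its stored neighbors.
    P = csr_matrix(np.array([[0.0, 0.2, 0.1],
                             [0.2, 0.0, 0.3],
                             [0.1, 0.3, 0.0]], dtype=np.float32))
    val_P, neighbors, indptr = P.data, P.indices, P.indptr

    for i in range(P.shape[0]):
        # The neighbors of sample i and the matching p_ij values share one
        # flat index k, exactly as in the Cython loop above.
        for k in range(indptr[i], indptr[i + 1]):
            j = neighbors[k]
            p_ij = val_P[k]
            print(i, j, p_ij)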
diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx
index b85da09dbaefd..452ae38f31220 100644
--- a/sklearn/manifold/_utils.pyx
+++ b/sklearn/manifold/_utils.pyx
@@ -12,18 +12,18 @@ cdef float PERPLEXITY_TOLERANCE = 1e-5
 @cython.boundscheck(False)
 cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity(
-        np.ndarray[np.float32_t, ndim=2] affinities,
-        np.ndarray[np.int64_t, ndim=2] neighbors,
+        np.ndarray[np.float32_t, ndim=2] affinities,
+        np.ndarray[np.int64_t, ndim=2] neighbors,
         float desired_perplexity, int verbose):
-    """Binary search for sigmas of conditional Gaussians.
-
+    """Binary search for sigmas of conditional Gaussians.
+
     This approximation reduces the computational complexity from O(N^2) to
     O(uN). See the exact method '_binary_search_perplexity' for more details.
 
     Parameters
     ----------
-    affinities : array-like, shape (n_samples, n_samples)
+    affinities : array-like, shape (n_samples, K)
         Distances between training samples.
 
     neighbors : array-like, shape (n_samples, K) or None
@@ -46,16 +46,13 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity(
     cdef long n_steps = 100
 
     cdef long n_samples = affinities.shape[0]
-    # This array is later used as a 32bit array. It has multiple intermediate
-    # floating point additions that benefit from the extra precision
-    cdef np.ndarray[np.float64_t, ndim=2] P = np.zeros((n_samples, n_samples),
-                                                       dtype=np.float64)
-    # Precisions of conditional Gaussian distrubutions
+    # Precisions of conditional Gaussian distributions
     cdef float beta
     cdef float beta_min
     cdef float beta_max
     cdef float beta_sum = 0.0
-    # Now we go to log scale
+
+    # Use log scale
     cdef float desired_entropy = math.log(desired_perplexity)
     cdef float entropy_diff
 
@@ -69,6 +66,11 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity(
     if using_neighbors:
         K = neighbors.shape[1]
 
+    # This array is later used as a 32bit array. It has multiple intermediate
+    # floating point additions that benefit from the extra precision
+    cdef np.ndarray[np.float64_t, ndim=2] P = np.zeros((n_samples, K),
+                                                       dtype=np.float64)
+
     for i in range(n_samples):
         beta_min = -NPY_INFINITY
         beta_max = NPY_INFINITY
@@ -79,34 +81,20 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity(
             # Compute current entropy and corresponding probabilities
             # computed just over the nearest neighbors or over all data
             # if we're not using neighbors
-            if using_neighbors:
-                for k in range(K):
-                    j = neighbors[i, k]
-                    P[i, j] = math.exp(-affinities[i, j] * beta)
-            else:
-                for j in range(K):
-                    P[i, j] = math.exp(-affinities[i, j] * beta)
-                P[i, i] = 0.0
             sum_Pi = 0.0
-            if using_neighbors:
-                for k in range(K):
-                    j = neighbors[i, k]
-                    sum_Pi += P[i, j]
-            else:
-                for j in range(K):
+            for j in range(K):
+                if j != i or using_neighbors:
+                    P[i, j] = math.exp(-affinities[i, j] * beta)
                     sum_Pi += P[i, j]
+
             if sum_Pi == 0.0:
                 sum_Pi = EPSILON_DBL
             sum_disti_Pi = 0.0
-            if using_neighbors:
-                for k in range(K):
-                    j = neighbors[i, k]
-                    P[i, j] /= sum_Pi
-                    sum_disti_Pi += affinities[i, j] * P[i, j]
-            else:
-                for j in range(K):
-                    P[i, j] /= sum_Pi
-                    sum_disti_Pi += affinities[i, j] * P[i, j]
+
+            for j in range(K):
+                P[i, j] /= sum_Pi
+                sum_disti_Pi += affinities[i, j] * P[i, j]
+
             entropy = math.log(sum_Pi) + beta * sum_disti_Pi
             entropy_diff = entropy - desired_entropy
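[Editor's note, not part of the patch] The routine above bisects the precision beta of each conditional Gaussian until the entropy of P_{j|i} matches log(perplexity). As a reference for readers, here is an illustrative NumPy re-implementation of the same idea for a single row; it is a sketch of the algorithm, not the code the patch touches, and the helper name and tolerances are chosen here for illustration.

    import numpy as np

    def binary_search_beta(sq_distances, desired_perplexity,
                           n_steps=100, tol=1e-5):
        """Bisect beta so that the entropy of exp(-d * beta) / Z matches
        log(desired_perplexity), as in _binary_search_perplexity."""
        desired_entropy = np.log(desired_perplexity)
        beta, beta_min, beta_max = 1.0, -np.inf, np.inf
        for _ in range(n_steps):
            P = np.exp(-sq_distances * beta)
            sum_P = max(P.sum(), 1e-8)
            P /= sum_P
            # H = log(sum_P) + beta * sum_j d_j * p_j, same identity as the
            # Cython code uses.
            entropy = np.log(sum_P) + beta * np.sum(sq_distances * P)
            diff = entropy - desired_entropy
            if abs(diff) <= tol:
                break
            if diff > 0:
                # Entropy too high: distribution too flat, increase beta.
                beta_min = beta
                beta = beta * 2.0 if beta_max == np.inf else (beta + beta_max) / 2.0
            else:
                beta_max = beta
                beta = beta / 2.0 if beta_min == -np.inf else (beta + beta_min) / 2.0
        return beta, P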
""" # Compute conditional probabilities such that they approximately match # the desired perplexity + n_samples, K = neighbors.shape distances = distances.astype(np.float32, copy=False) neighbors = neighbors.astype(np.int64, copy=False) conditional_P = _utils._binary_search_perplexity( distances, neighbors, desired_perplexity, verbose) - m = "All probabilities should be finite" - assert np.all(np.isfinite(conditional_P)), m - P = conditional_P + conditional_P.T - sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) - P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) - assert np.all(np.abs(P) <= 1.0) + assert np.all(np.isfinite(conditional_P)), \ + "All probabilities should be finite" + + P = csr_matrix((conditional_P.ravel(), neighbors.ravel(), + range(0, n_samples * K + 1, K)), + shape=(n_samples, n_samples)) + + P = P + P.T + sum_P = np.maximum(P.sum(), MACHINE_EPSILON) + P /= sum_P + assert np.all(np.abs(P.data) <= 1.0) return P @@ -140,11 +148,11 @@ def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components, X_embedded = params.reshape(n_samples, n_components) # Q is a heavy-tailed distribution: Student's t-distribution - n = pdist(X_embedded, "sqeuclidean") - n += 1. - n /= degrees_of_freedom - n **= (degrees_of_freedom + 1.0) / -2.0 - Q = np.maximum(n / (2.0 * np.sum(n)), MACHINE_EPSILON) + dist = pdist(X_embedded, "sqeuclidean") + dist += 1. + dist /= degrees_of_freedom + dist **= (degrees_of_freedom + 1.0) / -2.0 + Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON) # Optimization trick below: np.dot(x, y) is faster than # np.sum(x * y) because it calls BLAS @@ -153,11 +161,12 @@ def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components, kl_divergence = 2.0 * np.dot(P, np.log(P / Q)) # Gradient: dC/dY - grad = np.ndarray((n_samples, n_components)) - PQd = squareform((P - Q) * n) + # pdist always returns double precision distances. Thus we need to take + grad = np.ndarray((n_samples, n_components), dtype=params.dtype) + PQd = squareform((P - Q) * dist) for i in range(skip_num_points, n_samples): - np.dot(np.ravel(PQd[i], order='K'), X_embedded[i] - X_embedded, - out=grad[i]) + grad[i] = np.dot(np.ravel(PQd[i], order='K'), + X_embedded[i] - X_embedded) grad = grad.ravel() c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom grad *= c @@ -221,9 +230,8 @@ def _kl_divergence_error(params, P, neighbors, degrees_of_freedom, n_samples, return kl_divergence -def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples, - n_components, angle=0.5, skip_num_points=0, - verbose=False): +def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, + angle=0.5, skip_num_points=0, verbose=False): """t-SNE objective function: KL divergence of p_ijs and q_ijs. Uses Barnes-Hut tree methods to calculate the gradient that @@ -234,13 +242,9 @@ def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples, params : array, shape (n_params,) Unraveled embedding. - P : array, shape (n_samples * (n_samples-1) / 2,) + P : csr sparse matrix, shape (n_samples, n_sample) Condensed joint probability matrix. - neighbors : int64 array, shape (n_samples, K) - Array with element [i, j] giving the index for the jth - closest neighbor to point i. - degrees_of_freedom : float Degrees of freedom of the Student's-t distribution. 
@@ -278,14 +282,13 @@ def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples, """ params = params.astype(np.float32, copy=False) X_embedded = params.reshape(n_samples, n_components) - neighbors = neighbors.astype(np.int64, copy=False) - if len(P.shape) == 1: - sP = squareform(P).astype(np.float32) - else: - sP = P.astype(np.float32) + + val_P = P.data.astype(np.float32, copy=False) + neighbors = P.indices.astype(np.int64, copy=False) + indptr = P.indptr.astype(np.int64, copy=False) grad = np.zeros(X_embedded.shape, dtype=np.float32) - error = _barnes_hut_tsne.gradient(sP, X_embedded, neighbors, + error = _barnes_hut_tsne.gradient(val_P, X_embedded, neighbors, indptr, grad, angle, n_components, verbose, dof=degrees_of_freedom) c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom @@ -629,11 +632,9 @@ class TSNE(BaseEstimator): >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) >>> model = TSNE(n_components=2, random_state=0) >>> np.set_printoptions(suppress=True) - >>> model.fit_transform(X) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE - array([[ 0.00017619, 0.00004014], - [ 0.00010268, 0.00020546], - [ 0.00018298, -0.00008335], - [ 0.00009501, -0.00001388]]) + >>> X_embedded = model.fit_transform(X) + >>> X_embedded.shape + (4, 2) References ---------- @@ -699,6 +700,12 @@ def _fit(self, X, skip_num_points=0): raise ValueError("'method' must be 'barnes_hut' or 'exact'") if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") + if self.metric == "precomputed": + if isinstance(self.init, string_types) and self.init == 'pca': + raise ValueError("The parameter init=\"pca\" cannot be " + "used with metric=\"precomputed\".") + if X.shape[0] != X.shape[1]: + raise ValueError("X should be a square distance matrix") if self.method == 'barnes_hut' and sp.issparse(X): raise TypeError('A sparse matrix was passed, but dense ' 'data is required for method="barnes_hut". Use ' @@ -708,37 +715,33 @@ def _fit(self, X, skip_num_points=0): 'reduction techniques (e.g. 
TruncatedSVD)') else: X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64) + dtype=[np.float32, np.float64]) random_state = check_random_state(self.random_state) if self.early_exaggeration < 1.0: - raise ValueError("early_exaggeration must be at least 1, but is " - "%f" % self.early_exaggeration) + raise ValueError("early_exaggeration must be at least 1, but is {}" + .format(self.early_exaggeration)) if self.n_iter < 200: raise ValueError("n_iter should be at least 200") - if self.metric == "precomputed": - if isinstance(self.init, string_types) and self.init == 'pca': - raise ValueError("The parameter init=\"pca\" cannot be used " - "with metric=\"precomputed\".") - if X.shape[0] != X.shape[1]: - raise ValueError("X should be a square distance matrix") - distances = X - else: - if self.verbose: - print("[t-SNE] Computing pairwise distances...") - - if self.metric == "euclidean": - distances = pairwise_distances(X, metric=self.metric, - squared=True) + if self.method == "exact": + if self.metric == "precomputed": + distances = X else: - distances = pairwise_distances(X, metric=self.metric) + if self.verbose: + print("[t-SNE] Computing pairwise distances...") + + if self.metric == "euclidean": + distances = pairwise_distances(X, metric=self.metric, + squared=True) + else: + distances = pairwise_distances(X, metric=self.metric) - if not np.all(distances >= 0): - raise ValueError("All distances should be positive, either " - "the metric or precomputed distances given " - "as X are not correct") + if not np.all(distances >= 0): + raise ValueError("All distances should be positive, either " + "the metric or precomputed distances given " + "as X are not correct") # Degrees of freedom of the Student's t-distribution. The suggestion # degrees_of_freedom = n_components - 1 comes from @@ -753,34 +756,40 @@ def _fit(self, X, skip_num_points=0): if self.method == 'barnes_hut': if self.verbose: print("[t-SNE] Computing %i nearest neighbors..." 
% k) + + # Find the nearest neighbors for every point + # TODO: argument for class knn_estimator=None + # TODO: assert that the knn metric is euclidean if self.metric == 'precomputed': - # Use the precomputed distances to find - # the k nearest neighbors and their distances - neighbors_nn = np.argsort(distances, axis=1)[:, :k] + knn = NearestNeighbors(metric=self.metric) else: - # Find the nearest neighbors for every point - bt = BallTree(X) - # LvdM uses 3 * perplexity as the number of neighbors - # And we add one to not count the data point itself - # In the event that we have very small # of points - # set the neighbors to n - 1 - distances_nn, neighbors_nn = bt.query(X, k=k + 1) - neighbors_nn = neighbors_nn[:, 1:] - P = _joint_probabilities_nn(distances, neighbors_nn, + knn = NearestNeighbors(algorithm='ball_tree', + metric=self.metric) + knn.fit(X) + # LvdM uses 3 * perplexity as the number of neighbors + # And we add one to not count the data point itself + # In the event that we have very small # of points + # set the neighbors to n - 1 + distances_nn, neighbors_nn = knn.kneighbors( + None, n_neighbors=k) + if self.metric != "precomputed": + distances_nn **= 2 + + P = _joint_probabilities_nn(distances_nn, neighbors_nn, self.perplexity, self.verbose) else: P = _joint_probabilities(distances, self.perplexity, self.verbose) - assert np.all(np.isfinite(P)), "All probabilities should be finite" - assert np.all(P >= 0), "All probabilities should be zero or positive" - assert np.all(P <= 1), ("All probabilities should be less " - "or then equal to one") + assert np.all(np.isfinite(P)), "All probabilities should be finite" + assert np.all(P >= 0), "All probabilities should be non-negative" + assert np.all(P <= 1), ("All probabilities should be less " + "or then equal to one") if isinstance(self.init, np.ndarray): X_embedded = self.init elif self.init == 'pca': pca = PCA(n_components=self.n_components, svd_solver='randomized', random_state=random_state) - X_embedded = pca.fit_transform(X) + X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) elif self.init == 'random': X_embedded = None else: @@ -812,8 +821,8 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, if X_embedded is None: # Initialize embedding randomly - X_embedded = 1e-4 * random_state.randn(n_samples, - self.n_components) + X_embedded = 1e-4 * random_state.randn( + n_samples, self.n_components).astype(np.float32) params = X_embedded.ravel() opt_args = {"n_iter": 50, "momentum": 0.5, "it": 0, @@ -822,20 +831,16 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, "verbose": self.verbose, "n_iter_check": 25, "kwargs": dict(skip_num_points=skip_num_points)} if self.method == 'barnes_hut': - m = "Must provide an array of neighbors to use Barnes-Hut" - assert neighbors is not None, m obj_func = _kl_divergence_bh - objective_error = _kl_divergence_error - sP = squareform(P).astype(np.float32) - neighbors = neighbors.astype(np.int64) - args = [sP, neighbors, degrees_of_freedom, n_samples, + args = [P, degrees_of_freedom, n_samples, self.n_components] + opt_args['args'] = args opt_args['min_grad_norm'] = 1e-3 opt_args['n_iter_without_progress'] = 30 # Don't always calculate the cost since that calculation # can be nearly as expensive as the gradient - opt_args['objective_error'] = objective_error + opt_args['objective_error'] = _kl_divergence_error opt_args['kwargs']['angle'] = self.angle opt_args['kwargs']['verbose'] = self.verbose else: @@ -855,11 +860,10 @@ def _tsne(self, P, 
degrees_of_freedom, n_samples, random_state, opt_args['it'] = it + 1 params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) + if self.verbose: print("[t-SNE] KL divergence after %d iterations with early " "exaggeration: %f" % (it + 1, kl_divergence)) - # Save the final number of iterations - self.n_iter_ = it # Final optimization P /= self.early_exaggeration @@ -867,6 +871,8 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, opt_args['it'] = it + 1 params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) + # Save the final number of iterations + self.n_iter_ = it if self.verbose: print("[t-SNE] Error after %d iterations: %f" diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 52c056a5adadf..639715a708771 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -140,20 +140,26 @@ def test_binary_search_neighbors(): # Test that when we use all the neighbors the results are identical k = n_samples - neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64) - P2 = _binary_search_perplexity(distances, neighbors_nn, + neighbors_nn = np.argsort(distances, axis=1)[:, 1:k].astype(np.int64) + distances_nn = np.array([distances[k, neighbors_nn[k]] + for k in range(n_samples)]) + P2 = _binary_search_perplexity(distances_nn, neighbors_nn, desired_perplexity, verbose=0) - assert_array_almost_equal(P1, P2, decimal=4) + P_nn = np.array([P1[k, neighbors_nn[k]] for k in range(n_samples)]) + assert_array_almost_equal(P_nn, P2, decimal=4) # Test that the highest P_ij are the same when few neighbors are used for k in np.linspace(80, n_samples, 10): k = int(k) topn = k * 10 # check the top 10 *k entries out of k * k entries neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64) - P2k = _binary_search_perplexity(distances, neighbors_nn, + distances_nn = np.array([distances[k, neighbors_nn[k]] + for k in range(n_samples)]) + P2k = _binary_search_perplexity(distances_nn, neighbors_nn, desired_perplexity, verbose=0) idx = np.argsort(P1.ravel())[::-1] P1top = P1.ravel()[idx][:topn] + idx = np.argsort(P2k.ravel())[::-1] P2top = P2k.ravel()[idx][:topn] assert_array_almost_equal(P1top, P2top, decimal=2) @@ -175,6 +181,8 @@ def test_binary_perplexity_stability(): P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(), 3, verbose=0) P1 = _joint_probabilities_nn(distances, neighbors_nn, 3, verbose=0) + # Convert the sparse matrix to a dense one for testing + P1 = P1.toarray() if last_P is None: last_P = P last_P1 = P1 @@ -193,9 +201,9 @@ def test_gradient(): alpha = 1.0 distances = random_state.randn(n_samples, n_features).astype(np.float32) - distances = distances.dot(distances.T) + distances = np.abs(distances.dot(distances.T)) np.fill_diagonal(distances, 0.0) - X_embedded = random_state.randn(n_samples, n_components) + X_embedded = random_state.randn(n_samples, n_components).astype(np.float32) P = _joint_probabilities(distances, desired_perplexity=25.0, verbose=0) @@ -280,13 +288,16 @@ def test_fit_csr_matrix(): def test_preserve_trustworthiness_approximately_with_precomputed_distances(): # Nearest neighbors should be preserved approximately. 
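[Editor's note, not part of the patch] _joint_probabilities_nn above builds the symmetrized joint probability matrix directly in CSR form from the kNN output. A standalone sketch of the same construction on toy data, assuming only numpy/scipy; variable names mirror the patch but the data here is made up:

    import numpy as np
    from scipy.sparse import csr_matrix

    n_samples, K = 5, 2
    rng = np.random.RandomState(0)
    # conditional_P[i] holds p_{j|i} for the K neighbors listed in neighbors[i]
    conditional_P = rng.uniform(size=(n_samples, K)).astype(np.float32)
    neighbors = np.array([[1, 2], [0, 2], [3, 4], [2, 4], [2, 3]],
                         dtype=np.int64)

    # Row i occupies the slice [i * K, (i + 1) * K) of data/indices, hence the
    # regular indptr built with range(0, n_samples * K + 1, K).
    P = csr_matrix((conditional_P.ravel(), neighbors.ravel(),
                    range(0, n_samples * K + 1, K)),
                   shape=(n_samples, n_samples))
    P = P + P.T                                          # p_ij = p_j|i + p_i|j
    P = P / np.maximum(P.sum(), np.finfo(np.double).eps)  # normalize to sum 1

Since only the 3 * perplexity nearest neighbors per sample are stored, the memory cost drops from O(n_samples^2) to O(n_samples * perplexity).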
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index 52c056a5adadf..639715a708771 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -140,20 +140,26 @@ def test_binary_search_neighbors():
 
     # Test that when we use all the neighbors the results are identical
     k = n_samples
-    neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)
-    P2 = _binary_search_perplexity(distances, neighbors_nn,
+    neighbors_nn = np.argsort(distances, axis=1)[:, 1:k].astype(np.int64)
+    distances_nn = np.array([distances[k, neighbors_nn[k]]
+                             for k in range(n_samples)])
+    P2 = _binary_search_perplexity(distances_nn, neighbors_nn,
                                    desired_perplexity, verbose=0)
-    assert_array_almost_equal(P1, P2, decimal=4)
+    P_nn = np.array([P1[k, neighbors_nn[k]] for k in range(n_samples)])
+    assert_array_almost_equal(P_nn, P2, decimal=4)
 
     # Test that the highest P_ij are the same when few neighbors are used
     for k in np.linspace(80, n_samples, 10):
         k = int(k)
         topn = k * 10  # check the top 10 *k entries out of k * k entries
         neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)
-        P2k = _binary_search_perplexity(distances, neighbors_nn,
+        distances_nn = np.array([distances[k, neighbors_nn[k]]
+                                 for k in range(n_samples)])
+        P2k = _binary_search_perplexity(distances_nn, neighbors_nn,
                                         desired_perplexity, verbose=0)
         idx = np.argsort(P1.ravel())[::-1]
         P1top = P1.ravel()[idx][:topn]
+
         idx = np.argsort(P2k.ravel())[::-1]
         P2top = P2k.ravel()[idx][:topn]
         assert_array_almost_equal(P1top, P2top, decimal=2)
@@ -175,6 +181,8 @@ def test_binary_perplexity_stability():
         P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(),
                                       3, verbose=0)
         P1 = _joint_probabilities_nn(distances, neighbors_nn, 3, verbose=0)
+        # Convert the sparse matrix to a dense one for testing
+        P1 = P1.toarray()
         if last_P is None:
             last_P = P
             last_P1 = P1
@@ -193,9 +201,9 @@ def test_gradient():
     alpha = 1.0
 
     distances = random_state.randn(n_samples, n_features).astype(np.float32)
-    distances = distances.dot(distances.T)
+    distances = np.abs(distances.dot(distances.T))
     np.fill_diagonal(distances, 0.0)
-    X_embedded = random_state.randn(n_samples, n_components)
+    X_embedded = random_state.randn(n_samples, n_components).astype(np.float32)
 
     P = _joint_probabilities(distances, desired_perplexity=25.0,
                              verbose=0)
@@ -280,13 +288,16 @@ def test_fit_csr_matrix():
 def test_preserve_trustworthiness_approximately_with_precomputed_distances():
     # Nearest neighbors should be preserved approximately.
     random_state = check_random_state(0)
-    X = random_state.randn(100, 2)
-    D = squareform(pdist(X), "sqeuclidean")
-    tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
-                metric="precomputed", random_state=0, verbose=0)
-    X_embedded = tsne.fit_transform(D)
-    assert_almost_equal(trustworthiness(D, X_embedded, n_neighbors=1,
-                                        precomputed=True), 1.0, decimal=1)
+    for i in range(5):
+        X = random_state.randn(100, 2)
+        D = squareform(pdist(X), "sqeuclidean")
+        tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
+                    early_exaggeration=2.0, metric="precomputed",
+                    random_state=i, verbose=0)
+        X_embedded = tsne.fit_transform(D)
+        t = trustworthiness(D, X_embedded, n_neighbors=1,
+                            precomputed=True)
+        assert t > .95
 
 
 def test_early_exaggeration_too_small():
@@ -332,10 +343,14 @@ def test_init_ndarray_precomputed():
 
 def test_distance_not_available():
     # 'metric' must be valid.
-    tsne = TSNE(metric="not available")
+    tsne = TSNE(metric="not available", method='exact')
     assert_raises_regexp(ValueError, "Unknown metric not available.*",
                          tsne.fit_transform, np.array([[0.0], [1.0]]))
 
+    tsne = TSNE(metric="not available", method='barnes_hut')
+    assert_raises_regexp(ValueError, "Metric 'not available' not valid.*",
+                         tsne.fit_transform, np.array([[0.0], [1.0]]))
+
 
 def test_pca_initialization_not_compatible_with_precomputed_kernel():
     # Precomputed distance matrices must be square matrices.
@@ -418,7 +433,13 @@ def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
     pij_input = squareform(pij_input).astype(np.float32)
     grad_bh = np.zeros(pos_output.shape, dtype=np.float32)
 
-    _barnes_hut_tsne.gradient(pij_input, pos_output, neighbors,
+    from scipy.sparse import csr_matrix
+    P = csr_matrix(pij_input)
+
+    neighbors = P.indices.astype(np.int64)
+    indptr = P.indptr.astype(np.int64)
+
+    _barnes_hut_tsne.gradient(P.data, pos_output, neighbors, indptr,
                               grad_bh, 0.5, 2, 1, skip_num_points=0)
     assert_array_almost_equal(grad_bh, grad_output, decimal=4)
 
@@ -439,7 +460,7 @@ def test_verbose():
     sys.stdout = old_stdout
 
     assert("[t-SNE]" in out)
-    assert("Computing pairwise distances" in out)
+    assert("nearest neighbors..." in out)
     assert("Computed conditional probabilities" in out)
     assert("Mean sigma" in out)
     assert("Finished" in out)
@@ -483,8 +504,13 @@ def test_64bit():
         for dt in [np.float32, np.float64]:
             X = random_state.randn(100, 2).astype(dt)
             tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
-                        random_state=0, method=method)
-            tsne.fit_transform(X)
+                        random_state=0, method=method, verbose=0)
+            X_embedded = tsne.fit_transform(X)
+            effective_type = X_embedded.dtype
+
+            # tsne cython code is only single precision, so the output will
+            # always be single precision, irrespective of the input dtype
+            assert effective_type == np.float32
 
 
 def test_barnes_hut_angle():
@@ -499,10 +525,10 @@ def test_barnes_hut_angle():
     random_state = check_random_state(0)
     distances = random_state.randn(n_samples, n_features)
     distances = distances.astype(np.float32)
-    distances = distances.dot(distances.T)
+    distances = abs(distances.dot(distances.T))
     np.fill_diagonal(distances, 0.0)
     params = random_state.randn(n_samples, n_components)
-    P = _joint_probabilities(distances, perplexity, False)
+    P = _joint_probabilities(distances, perplexity, verbose=0)
     kl, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples,
                                 n_components)
 
@@ -510,12 +536,18 @@ def test_barnes_hut_angle():
     bt = BallTree(distances)
     distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
     neighbors_nn = neighbors_nn[:, 1:]
-    Pbh = _joint_probabilities_nn(distances, neighbors_nn,
-                                  perplexity, False)
-    kl, gradbh = _kl_divergence_bh(params, Pbh, neighbors_nn,
-                                   degrees_of_freedom, n_samples,
-                                   n_components, angle=angle,
-                                   skip_num_points=0, verbose=False)
+    distances_nn = np.array([distances[i, neighbors_nn[i]]
+                             for i in range(n_samples)])
+    assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\
+        abs(distances[0, neighbors_nn[0]] - distances_nn[0])
+    Pbh = _joint_probabilities_nn(distances_nn, neighbors_nn,
+                                  perplexity, verbose=0)
+    kl, gradbh = _kl_divergence_bh(params, Pbh, degrees_of_freedom,
+                                   n_samples, n_components, angle=angle,
+                                   skip_num_points=0, verbose=0)
+
+    P = squareform(P)
+    Pbh = Pbh.toarray()
     assert_array_almost_equal(Pbh, P, decimal=5)
     assert_array_almost_equal(gradex, gradbh, decimal=5)
 
@@ -569,8 +601,8 @@ def test_n_iter_without_progress():
     # Use a dummy negative n_iter_without_progress and check output on stdout
     random_state = check_random_state(0)
     X = random_state.randn(100, 2)
-    tsne = TSNE(n_iter_without_progress=-1, verbose=2,
-                random_state=1, method='exact')
+    tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e7,
+                random_state=1, method='exact', n_iter=200)
     old_stdout = sys.stdout
     sys.stdout = StringIO()
diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py
index e88b00cd325b3..d00ccc9de9765 100644
--- a/sklearn/mixture/base.py
+++ b/sklearn/mixture/base.py
@@ -340,7 +340,7 @@ def predict(self, X):
         return self._estimate_weighted_log_prob(X).argmax(axis=1)
 
     def predict_proba(self, X):
-        """Predict posterior probability of each component given the data.
+        """Predict posterior probability of the data for each component.
 
         Parameters
         ----------
@@ -351,8 +351,8 @@ def predict_proba(self, X):
         Returns
         -------
         resp : array, shape (n_samples, n_components)
-            Returns the probability of each Gaussian (state) in
-            the model given each sample.
+            Returns the probability of the sample for each Gaussian
+            (state) in the model.
         """
         self._check_is_fitted()
         X = _check_X(X, None, self.means_.shape[1])
From 3f527051a28b532fbcd1036629c13c17b22b1723 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Thu, 8 Jun 2017 18:27:56 +0200
Subject: [PATCH 02/55] ENH add a benchmark for t-SNE

---
 benchmarks/.gitignore          |   4 ++
 benchmarks/bench_tsne_mnist.py | 109 +++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 benchmarks/.gitignore
 create mode 100644 benchmarks/bench_tsne_mnist.py

diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
new file mode 100644
index 0000000000000..9a7eaac081c3d
--- /dev/null
+++ b/benchmarks/.gitignore
@@ -0,0 +1,4 @@
+/bhtsne
+*.npy
+*.json
+/mnist_tsne_benchmark_data/
diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py
new file mode 100644
index 0000000000000..f9417dfd4de10
--- /dev/null
+++ b/benchmarks/bench_tsne_mnist.py
@@ -0,0 +1,109 @@
+"""
+=============================
+MNIST dataset T-SNE benchmark
+=============================
+
+"""
+from __future__ import division, print_function
+
+# License: BSD 3 clause
+
+import os
+from time import time
+import numpy as np
+import json
+import argparse
+
+from sklearn.externals.joblib import Memory
+from sklearn.datasets import fetch_mldata
+from sklearn.manifold import TSNE
+from sklearn.utils import check_array
+
+
+try:
+    from memory_profiler import profile
+except ImportError:
+    def profile(f):
+        return f
+
+try:
+    # If you want comparison with the reference implementation, build the
+    # binary from source (https://github.com/lvdmaaten/bhtsne) in the folder
+    # benchmarks/bhtsne and add an empty `__init__.py` file in the folder.
+    #
+    # $ git clone git@github.com:lvdmaaten/bhtsne.git
+    # $ cd bhtsne
+    # $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2
+    # $ touch __init__.py
+    # $ cd ..
+    #
+    from bhtsne.bhtsne import run_bh_tsne
+
+    @profile
+    def bhtsne(data, **kwargs):
+        return run_bh_tsne(data, **kwargs)
+except ImportError:
+    bhtsne = None
+
+
+memory = Memory('mnist_tsne_benchmark_data', mmap_mode='r')
+
+
+@memory.cache
+def load_data(dtype=np.float32, order='C'):
+    """Load the data, then cache and memmap the train/test split"""
+    print("Loading dataset...")
+    data = fetch_mldata('MNIST original')
+    X = check_array(data['data'], dtype=dtype, order=order)
+    y = data["target"]
+
+    # Normalize features
+    X /= 255
+    return X, y
+
+
+@profile
+def tsne_fit_transform(model, data):
+    return model.fit_transform(data)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('Benchmark for t-SNE')
+    parser.add_argument('--order', type=str, default='C',
+                        help='Order of the input data')
+    parser.add_argument('--perplexity', type=float, default=30)
+    parser.add_argument('--log-max-nsamples', type=int, default=5)
+    parser.add_argument('--verbose', type=int, default=2)
+    args = parser.parse_args()
+
+    X, y = load_data(order=args.order)
+
+    results = []
+    basename, _ = os.path.splitext(__file__)
+    log_filename = basename + '.json'
+    for n in [100, 1000, 5000, 10000]:
+        X_train = X[:n]
+        n = X_train.shape[0]
+        print("Fitting TSNE on %d samples..." % n)
+        tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity,
+                    verbose=0)
+        t0 = time()
+        X_embedded = tsne_fit_transform(tsne, X_train)
+        duration = time() - t0
+        print("Fitting T-SNE on %d samples took %0.3fs" % (n, duration))
+        results.append(dict(method="TSNE", duration=duration, n_samples=n))
+        with open(log_filename, 'w', encoding='utf-8') as f:
+            json.dump(results, f)
+        np.save('mnist_tsne_%d.npy' % n, X_embedded)
+
+        if bhtsne is not None:
+            t0 = time()
+            X_embedded = bhtsne(X_train, initial_dims=X_train.shape[1],
+                                perplexity=args.perplexity, verbose=False)
+            duration = time() - t0
+            print("Fitting bhtsne on %d samples took %0.3fs" % (n, duration))
+            results.append(dict(method="bhtsne", duration=duration,
+                                n_samples=n))
+            with open(log_filename, 'w', encoding='utf-8') as f:
+                json.dump(results, f)
+            np.save('mnist_bhtsne_%d.npy' % n, X_embedded)
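[Editor's note, not part of the patch series] The benchmark added above is a plain script; under the patch-02 flags a typical invocation would look something like the following, matching the `$`-prompt convention already used in this file's comments:

    $ python benchmarks/bench_tsne_mnist.py --order C --perplexity 30

It caches MNIST under mnist_tsne_benchmark_data/, appends timing records to bench_tsne_mnist.json, and saves each embedding as a .npy file next to the log.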
From cb258ff0569c80ec0b01c21a0e1db6a1971e903a Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Fri, 9 Jun 2017 12:09:59 +0200
Subject: [PATCH 03/55] ENH improve bench script+add params for TSNE

- Improve benchmark script and add argparse options
- Add n_jobs parameter to TSNE to compute kneighbors in parallel
- Add neighbors_method in TSNE to allow user to parametrize the
  kneighbors computations.
---
 benchmarks/bench_tsne_mnist.py       | 121 ++++++++++++++++-----------
 sklearn/manifold/t_sne.py            |  74 ++++++++++------
 sklearn/manifold/tests/test_t_sne.py |   5 +-
 3 files changed, 125 insertions(+), 75 deletions(-)

diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py
index f9417dfd4de10..2459c1c74063b 100644
--- a/benchmarks/bench_tsne_mnist.py
+++ b/benchmarks/bench_tsne_mnist.py
@@ -20,32 +20,6 @@
 from sklearn.utils import check_array
 
 
-try:
-    from memory_profiler import profile
-except ImportError:
-    def profile(f):
-        return f
-
-try:
-    # If you want comparison with the reference implementation, build the
-    # binary from source (https://github.com/lvdmaaten/bhtsne) in the folder
-    # benchmarks/bhtsne and add an empty `__init__.py` file in the folder.
-    #
-    # $ git clone git@github.com:lvdmaaten/bhtsne.git
-    # $ cd bhtsne
-    # $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2
-    # $ touch __init__.py
-    # $ cd ..
-    #
-    from bhtsne.bhtsne import run_bh_tsne
-
-    @profile
-    def bhtsne(data, **kwargs):
-        return run_bh_tsne(data, **kwargs)
-except ImportError:
-    bhtsne = None
-
-
 memory = Memory('mnist_tsne_benchmark_data', mmap_mode='r')
@@ -62,7 +36,6 @@ def load_data(dtype=np.float32, order='C'):
     return X, y
 
 
-@profile
 def tsne_fit_transform(model, data):
     return model.fit_transform(data)
 
@@ -72,38 +45,88 @@
     parser.add_argument('--order', type=str, default='C',
                         help='Order of the input data')
     parser.add_argument('--perplexity', type=float, default=30)
-    parser.add_argument('--log-max-nsamples', type=int, default=5)
-    parser.add_argument('--verbose', type=int, default=2)
+    parser.add_argument('--bhtsne', action='store_true',
+                        help="if set and the reference bhtsne code is "
+                        "correctly installed, run it in the benchmark.")
+    parser.add_argument('--all', action='store_true',
+                        help="if set, run the benchmark with the whole MNIST."
+                        "dataset. Note that it will take up to 1hour.")
+    parser.add_argument('--profile', action='store_true',
+                        help="if set, run the benchmark with a memory "
+                        "profiler.")
+    parser.add_argument('--verbose', type=int, default=0)
+    parser.add_argument('--n_jobs', type=int, nargs="+", default=1,
+                        help="Number of CPU used to fit sklearn.TSNE")
     args = parser.parse_args()
 
     X, y = load_data(order=args.order)
 
+    methods = []
+
+    # Put TSNE in methods
+    if isinstance(args.n_jobs, int):
+        tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity,
+                    verbose=args.verbose, n_jobs=args.n_jobs)
+        methods += [("sklearn.TSNE", tsne.fit_transform)]
+    elif isinstance(args.n_jobs, list):
+        for n_jobs in args.n_jobs:
+            tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity,
+                        verbose=args.verbose, n_jobs=n_jobs)
+            methods += [("sklearn.TSNE_n_jobs{}".format(n_jobs),
+                         tsne.fit_transform)]
+
+    if args.bhtsne:
+        try:
+            from bhtsne.bhtsne import run_bh_tsne
+        except ImportError:
+            raise ImportError(
+                """
+    If you want comparison with the reference implementation, build the
+    binary from source (https://github.com/lvdmaaten/bhtsne) in the folder
+    benchmarks/bhtsne and add an empty `__init__.py` file in the folder:
+
+    $ git clone git@github.com:lvdmaaten/bhtsne.git
+    $ cd bhtsne
+    $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2
+    $ touch __init__.py
+    $ cd ..
+    """
+            )
+
+        def bhtsne(X):
+            """wrapper for LvdM bhtsne implementation."""
+            return run_bh_tsne(X, initial_dims=X.shape[1],
+                               perplexity=args.perplexity, verbose=False)
+        methods += [("LvdM.bhtsne", bhtsne)]
+
+    if args.profile:
+
+        try:
+            from memory_profiler import profile
+        except ImportError:
+            raise ImportError("To run the benchmark with `--profile`, you "
+                              "need to install `memory_profiler`. Please "
+                              "run `pip install memory_profiler`.")
+        methods = [(n, profile(m)) for n, m in methods]
+
+    data_size = [100, 1000, 5000, 10000]
+    if args.all:
+        data_size += [70000]
+
     results = []
     basename, _ = os.path.splitext(__file__)
     log_filename = basename + '.json'
-    for n in [100, 1000, 5000, 10000]:
+    for n in data_size:
         X_train = X[:n]
         n = X_train.shape[0]
-        print("Fitting TSNE on %d samples..." % n)
-        tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity,
-                    verbose=0)
-        t0 = time()
-        X_embedded = tsne_fit_transform(tsne, X_train)
-        duration = time() - t0
-        print("Fitting T-SNE on %d samples took %0.3fs" % (n, duration))
-        results.append(dict(method="TSNE", duration=duration, n_samples=n))
-        with open(log_filename, 'w', encoding='utf-8') as f:
-            json.dump(results, f)
-        np.save('mnist_tsne_%d.npy' % n, X_embedded)
-
-        if bhtsne is not None:
+        for name, method in methods:
+            print("Fitting {} on {} samples...".format(name, n))
             t0 = time()
-            X_embedded = bhtsne(X_train, initial_dims=X_train.shape[1],
-                                perplexity=args.perplexity, verbose=False)
+            X_embedded = method(X_train)
             duration = time() - t0
-            print("Fitting bhtsne on %d samples took %0.3fs" % (n, duration))
-            results.append(dict(method="bhtsne", duration=duration,
-                                n_samples=n))
+            print("Fitting {} on {} samples took {:.3f}s"
+                  .format(name, n, duration))
+            results.append(dict(method=name, duration=duration, n_samples=n))
             with open(log_filename, 'w', encoding='utf-8') as f:
                 json.dump(results, f)
-            np.save('mnist_bhtsne_%d.npy' % n, X_embedded)
+            np.save('mnist_{}_{}.npy'.format(name, n), X_embedded)
diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index 73e44d48fcb1f..710127e5f9749 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -8,6 +8,7 @@
 #   * Fast Optimization for t-SNE:
 #     http://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf
 
+import warnings
 import numpy as np
 from scipy import linalg
 import scipy.sparse as sp
@@ -15,6 +16,7 @@
 from scipy.spatial.distance import squareform
 from scipy.sparse import csr_matrix
 from ..neighbors import NearestNeighbors
+from ..neighbors.base import NeighborsBase
 from ..base import BaseEstimator
 from ..utils import check_array
 from ..utils import check_random_state
@@ -612,6 +614,18 @@ class TSNE(BaseEstimator):
         in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
         computation time and angle greater 0.8 has quickly increasing error.
 
+    n_jobs : integer (default: 1)
+        Only used if method='barnes_hut'
+        Number of CPU used to compute the nearest neighbors of each point.
+        If ``n_jobs=-1``, use all the CPUs.
+
+    neighbors_method : string or NeighborsBase object (default: 'ball_tree')
+        Only used if method='barnes_hut'
+        Method used to compute k nearest neighbors for Barnes-Hut T-SNE.
+        If it is a string, it should be compatible with the algorithm
+        parameter of `NearestNeighbors`. If it is an object, it should
+        implement a `kneighbors` method returning distances_nn and
+        neighbors_nn.
+
     Attributes
     ----------
@@ -654,7 +668,8 @@ def __init__(self, n_components=2, perplexity=30.0,
                  early_exaggeration=4.0, learning_rate=1000.0, n_iter=1000,
                  n_iter_without_progress=30, min_grad_norm=1e-7,
                  metric="euclidean", init="random", verbose=0,
-                 random_state=None, method='barnes_hut', angle=0.5):
+                 random_state=None, method='barnes_hut', angle=0.5, n_jobs=1,
+                 neighbors_method='ball_tree'):
         if not ((isinstance(init, string_types) and
                 init in ["pca", "random"]) or isinstance(init, np.ndarray)):
@@ -673,6 +688,8 @@ def __init__(self, n_components=2, perplexity=30.0,
         self.random_state = random_state
         self.method = method
         self.angle = angle
+        self.n_jobs = n_jobs
+        self.neighbors_method = neighbors_method
 
     def _fit(self, X, skip_num_points=0):
         """Fit the model using X as training data.
@@ -749,30 +766,43 @@ def _fit(self, X, skip_num_points=0):
         # Laurens van der Maaten, 2009.
         degrees_of_freedom = max(self.n_components - 1.0, 1)
         n_samples = X.shape[0]
-        # the number of nearest neighbors to find
-        k = min(n_samples - 1, int(3. * self.perplexity + 1))
 
         neighbors_nn = None
         if self.method == 'barnes_hut':
+            # Compute the number of nearest neighbors to find.
+            # LvdM uses 3 * perplexity as the number of neighbors.
+            # In the event that we have very small # of points
+            # set the neighbors to n - 1.
+            k = min(n_samples - 1, int(3. * self.perplexity + 1))
+
             if self.verbose:
-                print("[t-SNE] Computing %i nearest neighbors..." % k)
+                print("[t-SNE] Computing {} nearest neighbors...".format(k))
 
             # Find the nearest neighbors for every point
-            # TODO: argument for class knn_estimator=None
-            # TODO: assert that the knn metric is euclidean
-            if self.metric == 'precomputed':
-                knn = NearestNeighbors(metric=self.metric)
+            if isinstance(self.neighbors_method, string_types):
+                if (self.metric == 'precomputed' and
+                        self.neighbors_method == "ball_tree"):
+                    warnings.warn("Cannot use neighbors_method='ball_tree' "
+                                  "with metric='precomputed'. Switching to "
+                                  "neighbors_method='brute'.", RuntimeWarning)
+                    self.neighbors_method = "brute"
+                knn = NearestNeighbors(algorithm=self.neighbors_method,
+                                       n_neighbors=k, metric=self.metric,
+                                       n_jobs=self.n_jobs)
+            elif isinstance(self.neighbors_method, NeighborsBase):
+                knn = self.neighbors_method
             else:
-                knn = NearestNeighbors(algorithm='ball_tree',
-                                       metric=self.metric)
+                raise ValueError("neighbors_method should be either a string "
+                                 "or a subclass of NeighborsBase. {} is not "
+                                 "valid.".format(self.neighbors_method))
             knn.fit(X)
-            # LvdM uses 3 * perplexity as the number of neighbors
-            # And we add one to not count the data point itself
-            # In the event that we have very small # of points
-            # set the neighbors to n - 1
             distances_nn, neighbors_nn = knn.kneighbors(
                 None, n_neighbors=k)
 
             if self.metric != "precomputed":
+                # knn returns the euclidean distance but we need it squared.
+                # TODO: the computations are valid for euclidean distance.
+                # Should we enforce that with an assert?
                 distances_nn **= 2
 
             P = _joint_probabilities_nn(distances_nn, neighbors_nn,
@@ -791,10 +821,11 @@ def _fit(self, X, skip_num_points=0):
                       random_state=random_state)
             X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
         elif self.init == 'random':
-            X_embedded = None
+            X_embedded = 1e-4 * random_state.randn(
+                n_samples, self.n_components).astype(np.float32)
         else:
-            raise ValueError("Unsupported initialization scheme: %s"
-                             % self.init)
+            raise ValueError("Unsupported initialization scheme: {}"
+                             .format(self.init))
 
         return self._tsne(P, degrees_of_freedom, n_samples, random_state,
                           X_embedded=X_embedded,
@@ -807,8 +838,8 @@ def _fit(self, X, skip_num_points=0):
     def n_iter_final(self):
         return self.n_iter_
 
-    def _tsne(self, P, degrees_of_freedom, n_samples, random_state,
-              X_embedded=None, neighbors=None, skip_num_points=0):
+    def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
+              neighbors=None, skip_num_points=0):
         """Runs t-SNE."""
         # t-SNE minimizes the Kullback-Leiber divergence of the Gaussians P
         # and the Student's t-distributions Q. The optimization algorithm that
@@ -818,11 +849,6 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
         # * final optimization with momentum 0.8
         # The embedding is initialized with iid samples from Gaussians with
         # standard deviation 1e-4.
-
-        if X_embedded is None:
-            # Initialize embedding randomly
-            X_embedded = 1e-4 * random_state.randn(
-                n_samples, self.n_components).astype(np.float32)
         params = X_embedded.ravel()
 
         opt_args = {"n_iter": 50, "momentum": 0.5, "it": 0,
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index 639715a708771..5a05b2951e0b8 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -293,7 +293,7 @@ def test_preserve_trustworthiness_approximately_with_precomputed_distances():
         D = squareform(pdist(X), "sqeuclidean")
         tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
                     early_exaggeration=2.0, metric="precomputed",
-                    random_state=i, verbose=0)
+                    random_state=i, verbose=0, neighbors_method='brute')
         X_embedded = tsne.fit_transform(D)
         t = trustworthiness(D, X_embedded, n_neighbors=1,
                             precomputed=True)
@@ -337,7 +337,8 @@ def test_init_ndarray():
 def test_init_ndarray_precomputed():
     # Initialize TSNE with ndarray and metric 'precomputed'
     # Make sure no FutureWarning is thrown from _fit
-    tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed")
+    tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed",
+                neighbors_method='brute')
     tsne.fit(np.zeros((100, 100)))
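[Editor's note, not part of the patch series] A short sketch of the two knobs patch 03 introduces, using the API exactly as defined above; later patches in this 55-patch series may rework it, so treat this as illustrative:

    from sklearn.manifold import TSNE
    from sklearn.neighbors import NearestNeighbors

    # Parallel kNN search with the default ball tree:
    tsne = TSNE(n_jobs=4)

    # Or hand TSNE a pre-configured estimator (any NeighborsBase subclass
    # exposing `kneighbors`):
    knn = NearestNeighbors(algorithm='brute', metric='euclidean', n_jobs=4)
    tsne = TSNE(neighbors_method=knn)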
sklearn.manifold.t_sne import trustworthiness from sklearn.decomposition import PCA from sklearn.utils import check_array @@ -51,21 +52,25 @@ def tsne_fit_transform(model, data): "correctly installed, run it in the benchmark.") parser.add_argument('--all', action='store_true', help="if set, run the benchmark with the whole MNIST." - "dataset. Note that it will take up to 1hour.") + "dataset. Note that it will take up to 1 hour.") parser.add_argument('--profile', action='store_true', help="if set, run the benchmark with a memory " - "profiler.") + "profiler.") parser.add_argument('--verbose', type=int, default=0) parser.add_argument('--n_jobs', type=int, nargs="+", default=2, help="Number of CPU used to fit sklearn.TSNE") parser.add_argument('--pca-components', type=int, default=50, - help="Number of principal components for preprocessing.") + help="Number of principal components for " + "preprocessing.") args = parser.parse_args() X, y = load_data(order=args.order) if args.pca_components > 0: + t0 = time() X = PCA(n_components=args.pca_components).fit_transform(X) + print("PCA preprocessing down to {} dimensions took {:0.3f}s" + .format(args.pca_components, time() - t0)) methods = [] @@ -73,37 +78,36 @@ def tsne_fit_transform(model, data): if isinstance(args.n_jobs, int): tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, verbose=args.verbose, n_jobs=args.n_jobs) - methods += [("sklearn.TSNE", tsne.fit_transform)] + methods += [("sklearn TSNE", tsne.fit_transform)] elif isinstance(args.n_jobs, list): for n_jobs in args.n_jobs: tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, verbose=args.verbose, n_jobs=n_jobs) - methods += [("sklearn.TSNE_n_jobs{}".format(n_jobs), + methods += [("sklearn TSNE (n_jobs={})".format(n_jobs), tsne.fit_transform)] if args.bhtsne: try: from bhtsne.bhtsne import run_bh_tsne except ImportError: - raise ImportError( - """ - If you want comparison with the reference implementation, build the - binary from source (https://github.com/lvdmaaten/bhtsne) in the folder - benchmarks/bhtsne and add an empty `__init__.py` file in the folder: - - $ git clone git@github.com:lvdmaaten/bhtsne.git - $ cd bhtsne - $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2 - $ touch __init__.py - $ cd .. - """ - ) + raise ImportError("""\ +If you want comparison with the reference implementation, build the +binary from source (https://github.com/lvdmaaten/bhtsne) in the folder +benchmarks/bhtsne and add an empty `__init__.py` file in the folder: + +$ git clone git@github.com:lvdmaaten/bhtsne.git +$ cd bhtsne +$ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2 +$ touch __init__.py +$ cd .. 
+""") def bhtsne(X): - """wrapper for LvdM bhtsne implementation.""" - return run_bh_tsne(X, use_pca=False, - perplexity=args.perplexity, verbose=False) - methods += [("LvdM.bhtsne", bhtsne)] + """Wrapper for the reference lvdmaaten/bhtsne implementation.""" + # PCA preprocessing is done elsewhere in the benchmark script + return run_bh_tsne(X, use_pca=False, perplexity=args.perplexity, + verbose=False) + methods += [("lvdmaaten/bhtsne", bhtsne)] if args.profile: @@ -130,9 +134,11 @@ def bhtsne(X): t0 = time() X_embedded = method(X_train) duration = time() - t0 - print("Fitting {} on {} samples took {:.3f}s" - .format(name, n, duration)) + tw = trustworthiness(X_train, X_embedded) + print("Fitting {} on {} samples took {:.3f}s, " + "trustworthiness: {:0.3f}".format(name, n, duration, tw)) results.append(dict(method=name, duration=duration, n_samples=n)) with open(log_filename, 'w', encoding='utf-8') as f: json.dump(results, f) - np.save('mnist_{}_{}.npy'.format(name, n), X_embedded) + np.save('mnist_{}_{}.npy'.format(name.replace("/", '-'), n), + X_embedded) From ed4182ce29048ff570b6b0ea8753ca3f531d2b21 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 10 Jun 2017 18:46:40 +0200 Subject: [PATCH 06/55] WIP on fixing the optimization schedule and default parameters --- benchmarks/bench_tsne_mnist.py | 18 +++++---- sklearn/manifold/t_sne.py | 67 ++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index 829351082d287..5009cd4664d3c 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -39,7 +39,8 @@ def load_data(dtype=np.float32, order='C'): def tsne_fit_transform(model, data): - return model.fit_transform(data) + transformed = model.fit_transform(data) + return transformed, model.n_iter_ if __name__ == "__main__": @@ -78,13 +79,14 @@ def tsne_fit_transform(model, data): if isinstance(args.n_jobs, int): tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, verbose=args.verbose, n_jobs=args.n_jobs) - methods += [("sklearn TSNE", tsne.fit_transform)] + methods += [("sklearn TSNE", + lambda data: tsne_fit_transform(tsne, data))] elif isinstance(args.n_jobs, list): for n_jobs in args.n_jobs: tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, verbose=args.verbose, n_jobs=n_jobs) methods += [("sklearn TSNE (n_jobs={})".format(n_jobs), - tsne.fit_transform)] + lambda data: tsne_fit_transform(tsne, data))] if args.bhtsne: try: @@ -105,8 +107,9 @@ def tsne_fit_transform(model, data): def bhtsne(X): """Wrapper for the reference lvdmaaten/bhtsne implementation.""" # PCA preprocessing is done elsewhere in the benchmark script + n_iter = -1 # TODO find a way to report the number of iterations return run_bh_tsne(X, use_pca=False, perplexity=args.perplexity, - verbose=False) + verbose=False), n_iter methods += [("lvdmaaten/bhtsne", bhtsne)] if args.profile: @@ -132,11 +135,12 @@ def bhtsne(X): for name, method in methods: print("Fitting {} on {} samples...".format(name, n)) t0 = time() - X_embedded = method(X_train) + X_embedded, n_iter = method(X_train) duration = time() - t0 tw = trustworthiness(X_train, X_embedded) - print("Fitting {} on {} samples took {:.3f}s, " - "trustworthiness: {:0.3f}".format(name, n, duration, tw)) + print("Fitting {} on {} samples took {:.3f}s in {:d} iterations, " + "trustworthiness: {:0.3f}".format( + name, n, duration, n_iter, tw)) results.append(dict(method=name, duration=duration, n_samples=n)) 
with open(log_filename, 'w', encoding='utf-8') as f: json.dump(results, f) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 710127e5f9749..bbbfa5872707e 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -302,7 +302,7 @@ def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, def _gradient_descent(objective, p0, it, n_iter, objective_error=None, n_iter_check=1, n_iter_without_progress=50, - momentum=0.5, learning_rate=1000.0, min_gain=0.01, + momentum=0.8, learning_rate=200.0, min_gain=0.01, min_grad_norm=1e-7, min_error_diff=1e-7, verbose=0, args=None, kwargs=None): """Batch gradient descent with momentum and individual gains. @@ -337,13 +337,16 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None, Maximum number of iterations without progress before we abort the optimization. - momentum : float, within (0.0, 1.0), optional (default: 0.5) + momentum : float, within (0.0, 1.0), optional (default: 0.8) The momentum generates a weight for previous gradients that decays exponentially. - learning_rate : float, optional (default: 1000.0) - The learning rate should be extremely high for t-SNE! Values in the - range [100.0, 1000.0] are common. + learning_rate : float, optional (default: 200.0) + The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If + the learning rate is too high, the data may look like a 'ball' with any + point approximately equidistant from its nearest neighbours. If the + learning rate is too low, most points may look compressed in a dense + cloud with few outliers. min_gain : float, optional (default: 0.01) Minimum individual gain for each parameter. @@ -530,7 +533,7 @@ class TSNE(BaseEstimator): between 5 and 50. The choice is not extremely critical since t-SNE is quite insensitive to this parameter. - early_exaggeration : float, optional (default: 4.0) + early_exaggeration : float, optional (default: 12.0) Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. For larger values, the space between natural clusters will be larger @@ -539,16 +542,17 @@ class TSNE(BaseEstimator): optimization, the early exaggeration factor or the learning rate might be too high. - learning_rate : float, optional (default: 1000) - The learning rate can be a critical parameter. It should be - between 100 and 1000. If the cost function increases during initial - optimization, the early exaggeration factor or the learning rate - might be too high. If the cost function gets stuck in a bad local - minimum increasing the learning rate helps sometimes. + learning_rate : float, optional (default: 200.0) + The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If + the learning rate is too high, the data may look like a 'ball' with any + point approximately equidistant from its nearest neighbours. If the + learning rate is too low, most points may look compressed in a dense + cloud with few outliers. If the cost function gets stuck in a bad local + minimum increasing the learning rate may help. n_iter : int, optional (default: 1000) Maximum number of iterations for the optimization. Should be at - least 200. + least 250. 
    n_iter_without_progress : int, optional (default: 30)
        Only used if method='exact'
@@ -665,7 +669,7 @@ class TSNE(BaseEstimator):
     """
 
     def __init__(self, n_components=2, perplexity=30.0,
-                 early_exaggeration=4.0, learning_rate=1000.0, n_iter=1000,
+                 early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
                  n_iter_without_progress=30, min_grad_norm=1e-7,
                  metric="euclidean", init="random", verbose=0,
                  random_state=None, method='barnes_hut', angle=0.5, n_jobs=1,
@@ -843,15 +847,14 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
         """Runs t-SNE."""
         # t-SNE minimizes the Kullback-Leiber divergence of the Gaussians P
         # and the Student's t-distributions Q. The optimization algorithm that
-        # we use is batch gradient descent with three stages:
-        # * early exaggeration with momentum 0.5
-        # * early exaggeration with momentum 0.8
-        # * final optimization with momentum 0.8
+        # we use is batch gradient descent with two stages:
+        # * initial optimization with early exaggeration and momentum at 0.5
+        # * final optimization with momentum at 0.8
         # The embedding is initialized with iid samples from Gaussians with
         # standard deviation 1e-4.
 
         params = X_embedded.ravel()
 
-        opt_args = {"n_iter": 50, "momentum": 0.5, "it": 0,
+        opt_args = {"it": 0,
                     "learning_rate": self.learning_rate,
                     "n_iter_without_progress": self.n_iter_without_progress,
                     "verbose": self.verbose, "n_iter_check": 25,
@@ -876,27 +879,29 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
             opt_args['min_error_diff'] = 0.0
             opt_args['min_grad_norm'] = self.min_grad_norm
 
-        # Early exaggeration
+        # Learning schedule (part 1): do 250 iterations with lower momentum but
+        # higher learning rate controlled via the early exaggeration parameter
+        opt_args['n_iter'] = 250
+        opt_args['momentum'] = 0.5
         P *= self.early_exaggeration
         params, kl_divergence, it = _gradient_descent(obj_func, params,
                                                       **opt_args)
-        opt_args['n_iter'] = 100
-        opt_args['momentum'] = 0.8
-        opt_args['it'] = it + 1
-        params, kl_divergence, it = _gradient_descent(obj_func, params,
-                                                      **opt_args)
         if self.verbose:
             print("[t-SNE] KL divergence after %d iterations with early "
                   "exaggeration: %f" % (it + 1, kl_divergence))
-        # Final optimization
+
+        # Learning schedule (part 2): disable early exaggeration and finish
+        # optimization with a higher momentum at 0.8
         P /= self.early_exaggeration
-        opt_args['n_iter'] = self.n_iter
-        opt_args['it'] = it + 1
-        params, kl_divergence, it = _gradient_descent(obj_func, params,
-                                                      **opt_args)
+        remaining = self.n_iter - 250
+        if remaining > 0:
+            opt_args['n_iter'] = self.n_iter
+            opt_args['it'] = it + 1
+            opt_args['momentum'] = 0.8
+            params, kl_divergence, it = _gradient_descent(obj_func, params,
+                                                          **opt_args)
 
+        # Save the final number of iterations
         self.n_iter_ = it
 
From cf851f0be1d4e129c0ffb84666beb40e28456784 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Sat, 10 Jun 2017 19:04:21 +0200
Subject: [PATCH 07/55] Shuffle the data in the benchmark script

This is to get all the digits represented in the subsamples.
[ci skip] --- benchmarks/bench_tsne_mnist.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index 5009cd4664d3c..a8c60f23bd576 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -20,19 +20,24 @@ from sklearn.manifold.t_sne import trustworthiness from sklearn.decomposition import PCA from sklearn.utils import check_array +from sklearn.utils import shuffle as _shuffle memory = Memory('mnist_tsne_benchmark_data', mmap_mode='r') @memory.cache -def load_data(dtype=np.float32, order='C'): +def load_data(dtype=np.float32, order='C', shuffle=True, seed=0): """Load the data, then cache and memmap the train/test split""" print("Loading dataset...") data = fetch_mldata('MNIST original') + X = check_array(data['data'], dtype=dtype, order=order) y = data["target"] + if shuffle: + X, y = _shuffle(X, y, random_state=seed) + # Normalize features X /= 255 return X, y From a6a28bdbf624dfbe7454922061a960eff159b52b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 13 Jun 2017 11:12:47 +0200 Subject: [PATCH 08/55] WIP more work on opt scheduling and stopping criterion --- benchmarks/bench_tsne_mnist.py | 4 ++-- sklearn/manifold/t_sne.py | 43 ++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index a8c60f23bd576..3f956c26ce4fc 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -114,7 +114,7 @@ def bhtsne(X): # PCA preprocessing is done elsewhere in the benchmark script n_iter = -1 # TODO find a way to report the number of iterations return run_bh_tsne(X, use_pca=False, perplexity=args.perplexity, - verbose=False), n_iter + verbose=args.verbose > 0), n_iter methods += [("lvdmaaten/bhtsne", bhtsne)] if args.profile: @@ -127,7 +127,7 @@ def bhtsne(X): "run `pip install memory_profiler`.") methods = [(n, profile(m)) for n, m in methods] - data_size = [100, 1000, 5000, 10000] + data_size = [100, 500, 1000, 5000, 10000] if args.all: data_size += [70000] diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index bbbfa5872707e..c169df3b44f4e 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -8,6 +8,7 @@ # * Fast Optimization for t-SNE: # http://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf +from time import time import warnings import numpy as np from scipy import linalg @@ -90,6 +91,7 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): P : csr sparse matrix, shape (n_samples, n_samples) Condensed joint probability matrix with only nearest neighbors. 
""" + t0 = time() # Compute conditional probabilities such that they approximately match # the desired perplexity n_samples, K = neighbors.shape @@ -108,6 +110,10 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): sum_P = np.maximum(P.sum(), MACHINE_EPSILON) P /= sum_P assert np.all(np.abs(P.data) <= 1.0) + if verbose >= 2: + duration = time() - t0 + print("[t-SNE] Computed conditional probabilities in {:.3f}s" + .format(duration)) return P @@ -301,7 +307,7 @@ def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, def _gradient_descent(objective, p0, it, n_iter, objective_error=None, - n_iter_check=1, n_iter_without_progress=50, + n_iter_check=1, n_iter_without_progress=51, momentum=0.8, learning_rate=200.0, min_gain=0.01, min_grad_norm=1e-7, min_error_diff=1e-7, verbose=0, args=None, kwargs=None): @@ -333,7 +339,7 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None, Should return a tuple of cost and gradient for a given parameter vector. - n_iter_without_progress : int, optional (default: 30) + n_iter_without_progress : int, optional (default: 51) Maximum number of iterations without progress before we abort the optimization. @@ -389,8 +395,9 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None, gains = np.ones_like(p) error = np.finfo(np.float).max best_error = np.finfo(np.float).max - best_iter = 0 + best_iter = it + tic = time() for i in range(it, n_iter): new_error, grad = objective(p, *args, **kwargs) grad_norm = linalg.norm(grad) @@ -405,14 +412,19 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None, p += update if (i + 1) % n_iter_check == 0: + toc = time() + duration = toc - tic + tic = toc if new_error is None: new_error = objective_error(p, *args) error_diff = np.abs(new_error - error) error = new_error if verbose >= 2: - m = "[t-SNE] Iteration %d: error = %.7f, gradient norm = %.7f" - print(m % (i + 1, error, grad_norm)) + print("[t-SNE] Iteration %d: error = %.7f," + " gradient norm = %.7f" + " (%s iterations in %0.3fs)" + % (i + 1, error, grad_norm, n_iter_check, duration)) if error < best_error: best_error = error @@ -799,9 +811,19 @@ def _fit(self, X, skip_num_points=0): ValueError("neighbors_method should be either a string or " "a subclass of NeighborsBase. {} is not valid." .format(self.neighbors_method)) + t0 = time() knn.fit(X) + duration = time() - t0 + if self.verbose: + print("[t-SNE] Indexed {} samples in {:.3f}s...".format( + n_samples, duration)) + t0 = time() distances_nn, neighbors_nn = knn.kneighbors( None, n_neighbors=k) + duration = time() - t0 + if self.verbose: + print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..." + .format(n_samples, duration)) if self.metric != "precomputed": # knn return the euclidean distance but we need it squared. 
@@ -857,7 +879,7 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, opt_args = {"it": 0, "learning_rate": self.learning_rate, "n_iter_without_progress": self.n_iter_without_progress, - "verbose": self.verbose, "n_iter_check": 25, + "verbose": self.verbose, "n_iter_check": 50, "kwargs": dict(skip_num_points=skip_num_points)} if self.method == 'barnes_hut': obj_func = _kl_divergence_bh @@ -865,7 +887,6 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, self.n_components] opt_args['args'] = args - opt_args['min_grad_norm'] = 1e-3 opt_args['n_iter_without_progress'] = 30 # Don't always calculate the cost since that calculation # can be nearly as expensive as the gradient @@ -881,8 +902,10 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, # Learning schedule (part 1): do 250 iteration with lower momentum but # higher learning rate controlled via the early exageration parameter - opt_args['n_iter'] = 250 + exploration_n_iter = 250 + opt_args['n_iter'] = exploration_n_iter opt_args['momentum'] = 0.5 + opt_args['n_iter_without_progress'] = exploration_n_iter P *= self.early_exaggeration params, kl_divergence, it = _gradient_descent(obj_func, params, @@ -894,8 +917,8 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, # Learning schedule (part 2): disable early exaggeration and finish # optimization with a higher momentum at 0.8 P /= self.early_exaggeration - remaining = self.n_iter - 250 - if remaining > 0: + remaining = self.n_iter - exploration_n_iter + if it < exploration_n_iter or remaining > 0: opt_args['n_iter'] = self.n_iter opt_args['it'] = it + 1 opt_args['momentum'] = 0.8 From e4a0d240d7642526a2d1c9bba96fc48c179f957d Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Sun, 18 Jun 2017 23:27:15 +0200 Subject: [PATCH 09/55] ENH add basic QuadTree in neighbors --- sklearn/neighbors/quad_tree.pxd | 85 ++++ sklearn/neighbors/quad_tree.pyx | 528 ++++++++++++++++++++++ sklearn/neighbors/setup.py | 5 + sklearn/neighbors/tests/test_quad_tree.py | 56 +++ sklearn/tree/_utils.pxd | 4 +- 5 files changed, 677 insertions(+), 1 deletion(-) create mode 100644 sklearn/neighbors/quad_tree.pxd create mode 100644 sklearn/neighbors/quad_tree.pyx create mode 100644 sklearn/neighbors/tests/test_quad_tree.py diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd new file mode 100644 index 0000000000000..002f7784c80fe --- /dev/null +++ b/sklearn/neighbors/quad_tree.pxd @@ -0,0 +1,85 @@ +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True +# Author: Thomas Moreau +# Author: Olivier Grisel + +# See quad_tree.pyx for details. 
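+# The Cell struct and the QuadTree extension type declared here are exposed
+# so that other Cython modules (e.g. the Barnes-Hut t-SNE implementation)
+# can cimport them directly.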
+
+import numpy as np
+cimport numpy as np
+
+ctypedef np.npy_float32 DTYPE_t          # Type of X
+ctypedef np.npy_intp SIZE_t              # Type for indices and counters
+ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
+ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
+
+# This is effectively an ifdef statement in Cython
+# It allows us to write printf debugging lines
+# and remove them at compile time
+cdef enum:
+    DEBUGFLAG = 0
+
+cdef float EPSILON = 1e-6
+
+cdef struct Cell:
+    # Base storage structure for cells in a QuadTree object
+
+    # Tree structure
+    SIZE_t parent              # Parent cell of this cell
+    SIZE_t[8] children         # Array pointing to the children of this cell
+
+    # Cell boundaries
+    DTYPE_t[3] min_bounds      # Inferior boundaries of this cell (inclusive)
+    DTYPE_t[3] max_bounds      # Superior boundaries of this cell (exclusive)
+    DTYPE_t[3] center          # Store the center for quick split of cells
+
+    # Cell description
+    SIZE_t cell_id             # Id of the cell in the cells array in the Tree
+    DTYPE_t max_width          # The value of the maximum width w
+    DTYPE_t[3] barycenter      # Keep track of the center of mass of the cell
+    SIZE_t point_index         # Index of the point at this cell (only defined in a non-empty leaf)
+    bint is_leaf               # Does this cell have children?
+    SIZE_t depth               # Depth of the cell in the tree
+    SIZE_t cumulative_size     # Number of points including all cells below this one
+    # cdef long size           # Number of points at this cell
+
+
+cdef class QuadTree:
+    # The QuadTree object is a quad tree structure constructed by inserting
+    # recursively points in the tree and splitting cells in 4 so that each
+    # leaf cell contains at most one point.
+
+    # Parameters of the tree
+    cdef public int n_dimensions         # Number of dimensions in X
+    cdef public int verbose              # Verbosity of the output
+    cdef SIZE_t n_cells_per_cell         # Number of children per node. (2 ** n_dimensions)
+
+    # Tree inner structure
+    cdef public SIZE_t max_depth         # Max depth of the tree
+    cdef public SIZE_t cell_count        # Counter for node IDs
+    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
+    cdef public SIZE_t n_points          # Total number of points
+    cdef Cell* cells                     # Array of cells
+
+    # Methods
+    cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index,
+                          SIZE_t cell_id=*) nogil except -1
+    cdef int _resize(self, SIZE_t capacity) nogil except -1
+    cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1
+
+    # cdef np.ndarray _get_value_ndarray(self)
+    # cdef np.ndarray _get_node_ndarray(self)
+
+    cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell,
+                                          SIZE_t point_index, SIZE_t size=*) nogil
+    cdef void init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
+    cdef bint is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil
+    cdef SIZE_t select_child(self, DTYPE_t[3] point, Cell* cell) nogil
+    cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds) nogil
+
+    cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell) nogil except -1
+    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results, SIZE_t cell_id=*,
+                        long idx=*, float squared_theta=*) nogil
+    cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=*) nogil except -1
+    cdef np.ndarray _get_cell_ndarray(self)
diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx
new file mode 100644
index 0000000000000..541ae098e31b0
--- /dev/null
+++ b/sklearn/neighbors/quad_tree.pyx
@@ -0,0 +1,528 @@
+# cython: boundscheck=False
+# cython: wraparound=False
+# cython: cdivision=True
+# Author: Thomas Moreau
+# Author: Olivier Grisel
+
+
+from cpython cimport Py_INCREF, PyObject
+
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy
+from libc.stdio cimport printf
+
+from sklearn.tree._utils cimport safe_realloc, sizet_ptr_to_ndarray
+from ..utils import check_array
+
+import numpy as np
+cimport numpy as np
+
+cdef extern from "math.h":
+    float fabsf(float x) nogil
+
+cdef extern from "numpy/arrayobject.h":
+    object PyArray_NewFromDescr(object subtype, np.dtype descr,
+                                int nd, np.npy_intp* dims,
+                                np.npy_intp* strides,
+                                void* data, int flags, object obj)
+
+
+cdef SIZE_t DEFAULT = <SIZE_t>(-1)
+
+# Repeat struct definition for numpy
+CELL_DTYPE = np.dtype({
+    'names': ['parent', 'children', 'min_bounds', 'max_bounds', 'center',
+              'cell_id', 'max_width', 'barycenter', 'point_index', 'is_leaf',
+              'depth', 'cumulative_size'],
+    'formats': [np.intp, np.intp, np.float64, np.float64, np.float64, np.intp,
+                np.float64, np.float64, np.intp, np.bool, np.intp, np.intp],
+    'offsets': [
+        <Py_ssize_t> &(<Cell*> NULL).parent,
+        <Py_ssize_t> &(<Cell*> NULL).children,
+        <Py_ssize_t> &(<Cell*> NULL).min_bounds,
+        <Py_ssize_t> &(<Cell*> NULL).max_bounds,
+        <Py_ssize_t> &(<Cell*> NULL).center,
+        <Py_ssize_t> &(<Cell*> NULL).cell_id,
+        <Py_ssize_t> &(<Cell*> NULL).max_width,
+        <Py_ssize_t> &(<Cell*> NULL).barycenter,
+        <Py_ssize_t> &(<Cell*> NULL).point_index,
+        <Py_ssize_t> &(<Cell*> NULL).is_leaf,
+        <Py_ssize_t> &(<Cell*> NULL).depth,
+        <Py_ssize_t> &(<Cell*> NULL).cumulative_size
+    ]
+})
+
+
+cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
+        np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
+    return <DTYPE_t[:X.shape[0], :X.shape[1]:1]> (<DTYPE_t*> X.data)
+
+
+
+cdef class QuadTree:
+    """Array-based representation of a QuadTree.
+ """ + def __cinit__(self, int n_dimensions, int verbose): + """Constructor.""" + # Parameters of the tree + self.n_dimensions = n_dimensions + self.verbose = verbose + self.n_cells_per_cell = 2 ** self.n_dimensions + + # Inner structures + self.max_depth = 0 + self.cell_count = 0 + self.capacity = 0 + self.n_points = 0 + self.cells = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.cells) + + cdef int _resize(self, SIZE_t capacity) nogil except -1: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + # XXX using (size_t)(-1) is ugly, but SIZE_MAX is not available in C89 + # (i.e., older MSVC). + cdef int _resize_c(self, SIZE_t capacity=DEFAULT) nogil except -1: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.cells != NULL: + return 0 + + if capacity == DEFAULT: + if self.capacity == 0: + capacity = 9 # default initial value to min + else: + capacity = 2 * self.capacity + + safe_realloc(&self.cells, capacity) + + # if capacity smaller than cell_count, adjust the counter + if capacity < self.cell_count: + self.cell_count = capacity + + self.capacity = capacity + return 0 + + cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell + ) nogil except -1: + if self.verbose >= 10: + printf("[QuadTree] Checking point (%f, %f, %f) in cell %i " + "([%f/%f, %f/%f, %f/%f], size %i)\n", + point[0], point[1], point[2], cell.cell_id, + cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], + cell.cumulative_size) + + for i in range(self.n_dimensions): + if (cell.min_bounds[i] > point[i] or + cell.max_bounds[i] <= point[i]): + with gil: + msg = "[QuadTree] InsertionError: point out of cell boundary.\n" + msg += "Axis %i: cell [%f, %f]; point %f\n" + + msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i] + raise ValueError(msg) + + + cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index, + SIZE_t cell_id=0) nogil except -1: + """Insert a point in the QuadTree.""" + cdef int i + cdef DTYPE_t n_frac + cdef SIZE_t selected_child + cdef Cell* cell = &self.cells[cell_id] + cdef SIZE_t n_point = cell.cumulative_size + + if self.verbose >= 10: + printf("[QuadTree] Inserting depth %i\n", cell.depth) + + # Assert that the point is in the right range + if DEBUGFLAG: + self.check_point_in_cell(point, cell) + + + # If the cell is an empty leaf, insert the point in it + if cell.cumulative_size == 0: + cell.cumulative_size = 1 + self.n_points += 1 + for i in range(self.n_dimensions): + cell.barycenter[i] = point[i] + cell.point_index = point_index + if self.verbose >= 10: + printf("[QuadTree] inserted point in cell %i\n", cell_id) + return cell_id + + # If the cell is not a leaf, update cell internals and + # recurse in selected child + if not cell.is_leaf: + for i in range(self.n_dimensions): + # barycenter update using a weighted mean + cell.barycenter[i] = (n_point * cell.barycenter[i] + point[i]) / (n_point + 1) + + # Increase the size of the subtree starting from this cell + cell.cumulative_size += 1 + + # Insert child in the correct subtree + selected_child = self.select_child(point, cell) + if 
self.verbose >= 10:
+            printf("[QuadTree] selected child %i\n", selected_child)
+        if selected_child == -1:
+            self.n_points += 1
+            return self.insert_point_in_new_child(point, cell, point_index)
+        return self.insert_point(point, point_index, selected_child)
+
+        # Finally, if the cell is a leaf with a point already inserted,
+        # split the cell in n_cells_per_cell if the point is not a duplicate.
+        # If it is a duplicate, increase the size of the leaf and return.
+        if self.is_duplicate(point, cell.barycenter):
+            if self.verbose >= 10:
+                printf("[QuadTree] found a duplicate!\n")
+            cell.cumulative_size += 1
+            self.n_points += 1
+            return cell_id
+
+        # In a leaf, the barycenter corresponds to the only point included
+        # in it.
+        self.insert_point_in_new_child(cell.barycenter, cell, cell.point_index, cell.cumulative_size)
+        return self.insert_point(point, point_index, cell_id)
+
+    # XXX: This operation is not thread safe
+    cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell,
+                                          SIZE_t point_index, SIZE_t size=1) nogil:
+
+        # Local variable definition
+        cdef SIZE_t cell_id, cell_child_id, parent_id
+        cdef DTYPE_t[3] save_point
+        cdef DTYPE_t width
+        cdef Cell* child
+        cdef int i
+
+        # If the maximal capacity of the Tree has been reached, double the
+        # capacity. We need to save the current cell id and the current point
+        # to retrieve them in case of reallocation.
+        if self.cell_count + 1 > self.capacity:
+            parent_id = cell.cell_id
+            for i in range(self.n_dimensions):
+                save_point[i] = point[i]
+            self._resize(DEFAULT)
+            cell = &self.cells[parent_id]
+            point = save_point
+
+        # Get an empty cell and initialize it
+        cell_id = self.cell_count
+        self.cell_count += 1
+        child = &self.cells[cell_id]
+
+        self.init_cell(child, cell.cell_id, cell.depth + 1)
+        child.cell_id = cell_id
+
+        # Set the cell as an inner cell of the Tree
+        cell.is_leaf = False
+        cell.point_index = -1
+
+        # Set the correct boundary for the cell and store the point in it
+        cell_child_id = 0
+        for i in range(self.n_dimensions):
+            cell_child_id *= 2
+            if point[i] >= cell.center[i]:
+                cell_child_id += 1
+                child.min_bounds[i] = cell.center[i]
+                child.max_bounds[i] = cell.max_bounds[i]
+            else:
+                child.min_bounds[i] = cell.min_bounds[i]
+                child.max_bounds[i] = cell.center[i]
+            child.center[i] = (child.min_bounds[i] + child.max_bounds[i]) / 2.
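+            # max_width caches the squared width of the cell: summarize()
+            # compares it against the squared distance to the query point to
+            # decide whether the cell can stand in for all its points.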
+ width = child.max_bounds[i] - child.min_bounds[i] + + child.barycenter[i] = point[i] + child.max_width = max(child.max_width, width*width) + + # TODO: max_width + child.point_index = point_index + child.cumulative_size = size + + # Store the child cell in the correct place in children + cell.children[cell_child_id] = child.cell_id + + if DEBUGFLAG: + # Assert that the point is in the right range + self.check_point_in_cell(point, child) + if self.verbose >= 10: + printf("[QuadTree] inserted point %i in new child %i\n", point_index, cell_id) + + return cell_id + + cdef void init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil: + cell.parent = parent + cell.is_leaf = True + cell.depth = depth + cell.max_width = 0 + cell.cumulative_size = 0 + for i in range(self.n_cells_per_cell): + cell.children[i] = DEFAULT + + + cdef bint is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil: + cdef int i + cdef bint res = True + for i in range(self.n_dimensions): + res &= fabsf(point1[i] - point2[i]) <= EPSILON + return res + + + cdef SIZE_t select_child(self, DTYPE_t[3] point, Cell* cell) nogil: + cdef int i + cdef SIZE_t selected_child = 0 + for i in range(self.n_dimensions): + # Select the correct child cell to insert the point by comparing + # it to the borders of the cells using precomputed center. + selected_child *= 2 + if point[i] >= cell.center[i]: + selected_child += 1 + return cell.children[selected_child] + + cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds) nogil: + cdef int i + cdef DTYPE_t width + cdef Cell* root = &self.cells[0] + self.init_cell(root, -1, 0) + for i in range(self.n_dimensions): + root.min_bounds[i] = min_bounds[i] + root.max_bounds[i] = max_bounds[i] + root.center[i] = (max_bounds[i] + min_bounds[i]) / 2. + width = max_bounds[i] - min_bounds[i] + root.max_width = max(root.max_width, width*width) + root.cell_id = 0 + + self.cell_count += 1 + + def build_tree(self, X): + """Build a tree from the points in X.""" + cdef DTYPE_t[3] pt + cdef DTYPE_t[3] min_bounds, max_bounds + + # validate X and prepare for query + # X = check_array(X, dtype=DTYPE_t, order='C') + n_samples = X.shape[0] + + capacity = 100 + self._resize(capacity) + m, M = np.min(X, axis=0) - 1e-3, np.max(X, axis=0) + 1e-3 + for i in range(self.n_dimensions): + min_bounds[i] = m[i] + max_bounds[i] = M[i] + + # Create the initial node with boundaries from the dataset + self._init_root(min_bounds, max_bounds) + + # cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(X) + for i in range(n_samples): + for j in range(self.n_dimensions): + pt[j] = X[i, j] + self.insert_point(pt, i) + + self._resize(capacity=self.cell_count) + + def plot_tree(self): + """Plot the tree with cell boundaries and the points inserted in it.""" + self.check_coherence() + import matplotlib.pyplot as plt + + plt.figure() + for c in self.cells[:self.cell_count]: + if not c.is_leaf: + # Plot the cell division if the cell is an inner cell + plt.vlines(c.center[0], c.min_bounds[1], c.max_bounds[1]) + plt.hlines(c.center[1], c.min_bounds[0], c.max_bounds[0]) + else: + # If the cell is a leaf, display the point contained in it. 
+                plt.scatter(c.barycenter[0], c.barycenter[1], c='b', marker='.')
+
+        # Print bounding box of the Tree
+        root = self.cells[0]
+        plt.vlines([root.min_bounds[0], root.max_bounds[0]],
+                   root.min_bounds[1], root.max_bounds[1])
+        plt.hlines([root.min_bounds[1], root.max_bounds[1]],
+                   root.min_bounds[0], root.max_bounds[0])
+        plt.show()
+
+    def check_coherence(self):
+        for c in self.cells[:self.cell_count]:
+            self.check_point_in_cell(c.barycenter, &c)
+            if not c.is_leaf:
+                n_points = 0
+                for idx in range(self.n_cells_per_cell):
+                    if c.children[idx] != -1:
+                        child = self.cells[c.children[idx]]
+                        n_points += child.cumulative_size
+                if n_points != c.cumulative_size:
+                    raise RuntimeError(
+                        "Cell {} is incoherent. Size={} but found {} points "
+                        "in children. ({})"
+                        .format(c.cell_id, c.cumulative_size, n_points, c.children))
+        if self.n_points != self.cells[0].cumulative_size:
+            raise RuntimeError(
+                "QuadTree is incoherent. Size={} but found {} points "
+                "in children."
+                .format(self.n_points, self.cells[0].cumulative_size))
+
+    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results, SIZE_t cell_id=0,
+                        long idx=0, float squared_theta=.5) nogil:
+        """Summarize the tree compared to a query point.
+
+        Input arguments
+        ---------------
+        point: array (n_dimensions)
+            query point to construct the summary.
+        cell_id: integer, optional (default: 0)
+            current cell of the tree summarized. This should be set to 0 for
+            external calls.
+        idx: integer, optional (default: 0)
+            current index in the result array. This should be set to 0 for
+            external calls.
+
+        Output arguments
+        ----------------
+        results: array (n_samples * (n_dimensions+2))
+            results will contain a summary of the tree information compared to
+            the query point:
+            - results[idx:idx+n_dimensions] contains the delta between a
+              summary node idx and the query point.
+            - results[idx+n_dimensions] contains the squared euclidean
+              distance to the summary node idx.
+            - results[idx+n_dimensions+1] contains the size of the summary
+              node idx.
+
+        Return
+        ------
+        idx: integer
+            number of elements in the results array.
+        """
+        cdef:
+            int i, idx_d = idx + self.n_dimensions
+            bint duplicate = True
+            Cell* cell = &self.cells[cell_id]
+
+        idx_d = idx + self.n_dimensions
+        results[idx_d] = 0.
+        for i in range(self.n_dimensions):
+            results[idx + i] = point[i] - cell.barycenter[i]
+            results[idx_d] += results[idx + i] * results[idx + i]
+            duplicate &= fabsf(results[idx + i]) < EPSILON
+
+        # Do not compute self interactions
+        if duplicate and cell.is_leaf:
+            return idx
+
+        # Check whether we can use this node as a summary
+        # It's a summary node if the angular size as measured from the point
+        # is relatively small (w.r.t. theta) or if it is a leaf node.
+        # If it can be summarized, we use the cell center of mass
+        # Otherwise, we go a higher level of resolution and into the leaves.
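+        # Barnes-Hut criterion: with max_width holding the squared cell width
+        # and results[idx_d] the squared distance, this accepts the cell when
+        # (width / distance) ** 2 < theta ** 2, i.e. when the angular size of
+        # the cell seen from the query point is below theta.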
+        if cell.is_leaf or ((cell.max_width / results[idx_d]) < squared_theta):
+            results[idx_d + 1] = cell.cumulative_size
+            return idx + 2 + self.n_dimensions
+
+        else:
+            # Recursively compute the summary in nodes
+            for c in range(self.n_cells_per_cell):
+                child_id = cell.children[c]
+                if child_id != -1:
+                    idx = self.summarize(point, results, child_id, idx,
+                                         squared_theta)
+
+        return idx
+
+    def get_cell(self, point):
+        cdef DTYPE_t[3] query_pt
+        cdef int i
+
+        for i in range(self.n_dimensions):
+            query_pt[i] = point[i]
+
+        return self._get_cell(query_pt, 0)
+
+    cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=0) nogil except -1:
+        cdef:
+            SIZE_t selected_child
+            Cell* cell = &self.cells[cell_id]
+
+        if cell.is_leaf:
+            if self.is_duplicate(cell.barycenter, point):
+                return cell_id
+            with gil:
+                raise ValueError("Query point not in the Tree.")
+
+        selected_child = self.select_child(point, cell)
+        if selected_child > 0:
+            return self._get_cell(point, selected_child)
+        with gil:
+            raise ValueError("Query point not in the Tree.")
+
+    def __reduce__(self):
+        """Reduce re-implementation, for pickling."""
+        return (QuadTree, (self.n_dimensions, self.verbose),
+                self.__getstate__())
+
+    def __getstate__(self):
+        """Getstate re-implementation, for pickling."""
+        d = {}
+        # capacity is inferred during the __setstate__ using nodes
+        d["max_depth"] = self.max_depth
+        d["cell_count"] = self.cell_count
+        d["cells"] = self._get_cell_ndarray()
+        return d
+
+    def __setstate__(self, d):
+        """Setstate re-implementation, for unpickling."""
+        self.max_depth = d["max_depth"]
+        self.cell_count = d["cell_count"]
+
+        if 'cells' not in d:
+            raise ValueError('You have loaded a Tree version which '
+                             'cannot be imported')
+
+        cell_ndarray = d['cells']
+
+        if (cell_ndarray.ndim != 1 or
+                cell_ndarray.dtype != CELL_DTYPE or
+                not cell_ndarray.flags.c_contiguous):
+            raise ValueError('Did not recognise loaded array layout')
+
+        self.capacity = cell_ndarray.shape[0]
+        if self._resize_c(self.capacity) != 0:
+            raise MemoryError("resizing tree to %d" % self.capacity)
+        cells = memcpy(self.cells, (<np.ndarray> cell_ndarray).data,
+                       self.capacity * sizeof(Cell))
+
+    cdef np.ndarray _get_cell_ndarray(self):
+        """Wraps nodes as a NumPy struct array.
+
+        The array keeps a reference to this Tree, which manages the underlying
+        memory. Individual fields are publicly accessible as properties of the
+        Tree.
+        """
+        cdef np.npy_intp shape[1]
+        shape[0] = <np.npy_intp> self.cell_count
+        cdef np.npy_intp strides[1]
+        strides[0] = sizeof(Cell)
+        cdef np.ndarray arr
+        Py_INCREF(CELL_DTYPE)
+        arr = PyArray_NewFromDescr(np.ndarray, CELL_DTYPE, 1, shape,
+                                   strides, <void*> self.cells,
+                                   np.NPY_DEFAULT, None)
+        Py_INCREF(self)
+        arr.base = <PyObject*> self
+        return arr
\ No newline at end of file
diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py
index 1180b8c365dfb..ca97e0c1a85a6 100644
--- a/sklearn/neighbors/setup.py
+++ b/sklearn/neighbors/setup.py
@@ -31,6 +31,11 @@ def configuration(parent_package='', top_path=None):
                          sources=['typedefs.pyx'],
                          include_dirs=[numpy.get_include()],
                          libraries=libraries)
+    config.add_extension("quad_tree",
+                         sources=["quad_tree.pyx"],
+                         include_dirs=[numpy.get_include()],
+                         libraries=libraries,
+                         extra_compile_args=["-O3"])
 
     config.add_subpackage('tests')
 
diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py
new file mode 100644
index 0000000000000..48202dfcfb8b5
--- /dev/null
+++ b/sklearn/neighbors/tests/test_quad_tree.py
@@ -0,0 +1,56 @@
+import pickle
+import numpy as np
+from sklearn.neighbors.quad_tree import QuadTree
+
+
+def test_quadtree_similar_point():
+    # Introduce a point into a quad tree where a similar point already exists.
+    # Test will hang if it doesn't complete.
+    Xs = []
+
+    # check the case where points are actually different
+    Xs.append(np.array([[1, 2], [3, 4]], dtype=np.float32))
+    # check the case where points are the same on X axis
+    Xs.append(np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32))
+    # check the case where points are arbitrarily close on X axis
+    Xs.append(np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32))
+    # check the case where points are the same on Y axis
+    Xs.append(np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32))
+    # check the case where points are arbitrarily close on Y axis
+    Xs.append(np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32))
+    # check the case where points are arbitrarily close on both axes
+    Xs.append(np.array([[1.00001, 2.00001], [1.00002, 2.00002]],
+                       dtype=np.float32))
+
+    # check the case where points are arbitrarily close on both axes
+    # close to machine epsilon - x axis
+    Xs.append(np.array([[1, 0.0003817754041], [2, 0.0003817753750]],
+                       dtype=np.float32))
+
+    # check the case where points are arbitrarily close on both axes
+    # close to machine epsilon - y axis
+    Xs.append(np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]],
+                       dtype=np.float32))
+
+    for X in Xs:
+        tree = QuadTree(n_dimensions=2, verbose=0)
+        tree.build_tree(X)
+        tree.check_coherence()
+
+
+def test_quad_tree_pickle():
+    np.random.seed(0)
+    X = np.random.random((10, 3))
+
+    tree = QuadTree(n_dimensions=2, verbose=0)
+    tree.build_tree(X)
+
+    def check_pickle_protocol(protocol):
+        s = pickle.dumps(tree, protocol=protocol)
+        bt2 = pickle.loads(s)
+
+        for x in X:
+            assert tree.get_cell(x) == bt2.get_cell(x)
+
+    for protocol in (0, 1, 2):
+        yield check_pickle_protocol, protocol
diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd
index 017888ab41db7..04806ade180c2 100644
--- a/sklearn/tree/_utils.pxd
+++ b/sklearn/tree/_utils.pxd
@@ -10,7 +10,8 @@
 import numpy as np
 cimport numpy as np
-from _tree cimport Node
+from _tree cimport Node
+from sklearn.neighbors.quad_tree cimport Cell
 
 ctypedef np.npy_float32 DTYPE_t          # Type of X
 ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
@@ -39,6 +40,7 @@ ctypedef fused realloc_ptr:
     (DOUBLE_t*)
     (DOUBLE_t**)
     (Node*)
+    (Cell*)
(Node**) (StackRecord*) (PriorityHeapRecord*) From 02af9933cf02410799c58d43d81930c08344ce2f Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Sun, 18 Jun 2017 23:33:06 +0200 Subject: [PATCH 10/55] ENH TSNE running with new QuadTree --- benchmarks/.gitignore | 2 +- benchmarks/bench_tsne_mnist.py | 38 +- sklearn/manifold/_barnes_hut_tsne.pyx | 594 ++------------------------ sklearn/manifold/t_sne.py | 26 +- sklearn/manifold/tests/test_t_sne.py | 53 +-- 5 files changed, 83 insertions(+), 630 deletions(-) diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore index 9a7eaac081c3d..2b6f7ba9c1606 100644 --- a/benchmarks/.gitignore +++ b/benchmarks/.gitignore @@ -1,4 +1,4 @@ /bhtsne *.npy *.json -/mnist_tsne_benchmark_data/ +/mnist_tsne_output/ diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index 3f956c26ce4fc..b3bdfe2f4828c 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -17,13 +17,19 @@ from sklearn.externals.joblib import Memory from sklearn.datasets import fetch_mldata from sklearn.manifold import TSNE -from sklearn.manifold.t_sne import trustworthiness +from sklearn.neighbors import NearestNeighbors from sklearn.decomposition import PCA from sklearn.utils import check_array from sklearn.utils import shuffle as _shuffle -memory = Memory('mnist_tsne_benchmark_data', mmap_mode='r') +LOG_DIR = "mnist_tsne_output" +if not os.path.exists(LOG_DIR): + os.mkdir(LOG_DIR) + + +memory = Memory(os.path.join(LOG_DIR, 'mnist_tsne_benchmark_data'), + mmap_mode='r') @memory.cache @@ -43,6 +49,20 @@ def load_data(dtype=np.float32, order='C', shuffle=True, seed=0): return X, y +def precision_at_k(X, X_embedded, k=5): + """Compute the precision at k for the dataset. + """ + + knn = NearestNeighbors(n_neighbors=k, n_jobs=-1) + _, neighbors_X = knn.fit(X).kneighbors() + _, neighbors_X_embedded = knn.fit(X_embedded).kneighbors() + + precisions = [len(np.intersect1d(X_n, Xe_n)) / k + for X_n, Xe_n in zip(neighbors_X, neighbors_X_embedded)] + + return np.mean(precisions) + + def tsne_fit_transform(model, data): transformed = model.fit_transform(data) return transformed, model.n_iter_ @@ -83,7 +103,7 @@ def tsne_fit_transform(model, data): # Put TSNE in methods if isinstance(args.n_jobs, int): tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, - verbose=args.verbose, n_jobs=args.n_jobs) + verbose=args.verbose, n_jobs=args.n_jobs, n_iter=1000) methods += [("sklearn TSNE", lambda data: tsne_fit_transform(tsne, data))] elif isinstance(args.n_jobs, list): @@ -133,21 +153,23 @@ def bhtsne(X): results = [] basename, _ = os.path.splitext(__file__) - log_filename = basename + '.json' + log_filename = os.path.join(LOG_DIR, basename + '.json') for n in data_size: X_train = X[:n] n = X_train.shape[0] for name, method in methods: print("Fitting {} on {} samples...".format(name, n)) t0 = time() + np.save("dump_X.npy", X_train) X_embedded, n_iter = method(X_train) duration = time() - t0 - tw = trustworthiness(X_train, X_embedded) + precision_5 = precision_at_k(X_train, X_embedded, k=5) print("Fitting {} on {} samples took {:.3f}s in {:d} iterations, " - "trustworthiness: {:0.3f}".format( - name, n, duration, n_iter, tw)) + "precision at 5: {:0.3f}".format( + name, n, duration, n_iter, precision_5)) results.append(dict(method=name, duration=duration, n_samples=n)) with open(log_filename, 'w', encoding='utf-8') as f: json.dump(results, f) - np.save('mnist_{}_{}.npy'.format(name.replace("/", '-'), n), + np.save(os.path.join(LOG_DIR, 
'mnist_{}_{}.npy' + .format(name.replace("/", '-'), n)), X_embedded) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index 7b478b455a572..728bb78461fee 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -14,6 +14,9 @@ from libc.math cimport sqrt, log import numpy as np cimport numpy as np +from sklearn.neighbors import quad_tree +from sklearn.neighbors cimport quad_tree + cdef char* EMPTY_STRING = "" cdef extern from "math.h": @@ -37,415 +40,12 @@ cdef extern from "time.h": double CLOCKS_PER_SEC -cdef extern from "cblas.h": - float snrm2 "cblas_snrm2"(int N, float *X, int incX) nogil - - -cdef struct Node: - # Keep track of the center of mass - float* barycenter - # If this is a leaf, the position of the point within this leaf - float* leaf_point_position - # The number of points including all - # nodes below this one - long cumulative_size - # Number of points at this node - long size - # Index of the point at this node - # Only defined for non-empty leaf nodes - long point_index - # level = 0 is the root node - # And each subdivision adds 1 to the level - long level - # Left edge of this node - float* left_edge - # The center of this node, equal to le + w/2.0 - float* center - # The width of this node -- used to calculate the opening - # angle. Equal to width = re - le - float* width - # The value of the maximum width w - float max_width - - # Does this node have children? - # Default to leaf until we add points - int is_leaf - # Array of pointers to pointers of children - Node **children - # Keep a pointer to the parent - Node *parent - # Pointer to the tree this node belongs too - Tree* tree - -cdef struct Tree: - # Holds a pointer to the root node - Node* root_node - # Number of dimensions in the output - int n_dimensions - # Total number of cells - long n_cells - # Total number of points - long n_points - # Spit out diagnostic information? - int verbose - # How many cells per node? Should go as 2 ** n_dimensionss - int n_cell_per_node - -cdef Tree* init_tree(float[:] left_edge, float[:] width, int n_dimensions, - int verbose) nogil: - # tree is freed by free_tree - cdef Tree* tree = malloc(sizeof(Tree)) - tree.n_dimensions = n_dimensions - tree.n_cells = 0 - tree.n_points = 0 - tree.verbose = verbose - tree.root_node = create_root(left_edge, width, n_dimensions) - tree.root_node.tree = tree - tree.n_cells += 1 - tree.n_cell_per_node = 2 ** n_dimensions - if DEBUGFLAG: - printf("[t-SNE] Tree initialised. Left_edge = (%1.9e, %1.9e, %1.9e)\n", - left_edge[0], left_edge[1], left_edge[2]) - printf("[t-SNE] Tree initialised. 
Width = (%1.9e, %1.9e, %1.9e)\n", - width[0], width[1], width[2]) - return tree - -cdef Node* create_root(float[:] left_edge, float[:] width, int n_dimensions) nogil: - # Create a default root node - cdef int ax - cdef int n_cell_per_node = 2 ** n_dimensions - # root is freed by free_tree - root = malloc(sizeof(Node)) - root.is_leaf = 1 - root.parent = NULL - root.level = 0 - root.cumulative_size = 0 - root.size = 0 - root.point_index = -1 - root.max_width = 0.0 - root.width = malloc(sizeof(float) * n_dimensions) - root.left_edge = malloc(sizeof(float) * n_dimensions) - root.center = malloc(sizeof(float) * n_dimensions) - root.barycenter = malloc(sizeof(float) * n_dimensions) - root.leaf_point_position= malloc(sizeof(float) * n_dimensions) - root.children = NULL - for ax in range(n_dimensions): - root.width[ax] = width[ax] - root.left_edge[ax] = left_edge[ax] - root.center[ax] = 0.0 - root.barycenter[ax] = 0. - root.leaf_point_position[ax] = -1 - for ax in range(n_dimensions): - root.max_width = max(root.max_width, root.width[ax]) - if DEBUGFLAG: - printf("[t-SNE] Created root node %p\n", root) - return root - -cdef Node* create_child(Node *parent, int[3] offset) nogil: - # Create a new child node with default parameters - cdef int ax - # these children are freed by free_recursive - child = malloc(sizeof(Node)) - child.is_leaf = 1 - child.parent = parent - child.level = parent.level + 1 - child.size = 0 - child.cumulative_size = 0 - child.point_index = -1 - child.tree = parent.tree - child.max_width = 0.0 - child.width = malloc(sizeof(float) * parent.tree.n_dimensions) - child.left_edge = malloc(sizeof(float) * parent.tree.n_dimensions) - child.center = malloc(sizeof(float) * parent.tree.n_dimensions) - child.barycenter = malloc(sizeof(float) * parent.tree.n_dimensions) - child.leaf_point_position = malloc(sizeof(float) * parent.tree.n_dimensions) - child.children = NULL - for ax in range(parent.tree.n_dimensions): - child.width[ax] = parent.width[ax] / 2.0 - child.left_edge[ax] = parent.left_edge[ax] + offset[ax] * parent.width[ax] / 2.0 - child.center[ax] = child.left_edge[ax] + child.width[ax] / 2.0 - child.barycenter[ax] = 0. - child.leaf_point_position[ax] = -1. - for ax in range(parent.tree.n_dimensions): - child.max_width = max(child.max_width, child.width[ax]) - child.tree.n_cells += 1 - return child - -cdef Node* select_child(Node *node, float[3] pos, long index) nogil: - # Find which sub-node a position should go into - # And return the appropriate node - cdef int* offset = malloc(sizeof(int) * node.tree.n_dimensions) - cdef int ax, idx - cdef Node* child - cdef int error - for ax in range(node.tree.n_dimensions): - offset[ax] = (pos[ax] - (node.left_edge[ax] + node.width[ax] / 2.0)) > 0. 
- idx = offset2index(offset, node.tree.n_dimensions) - child = node.children[idx] - if DEBUGFLAG: - printf("[t-SNE] Offset [%i, %i] with LE [%f, %f]\n", - offset[0], offset[1], child.left_edge[0], child.left_edge[1]) - free(offset) - return child - - -cdef inline void index2offset(int* offset, int index, int n_dimensions) nogil: - # Convert a 1D index into N-D index; useful for indexing - # children of a quadtree, octree, N-tree - # Quite likely there's a fancy bitshift way of doing this - # since the offset is equivalent to the binary representation - # of the integer index - # We read the offset array left-to-right - # such that the least significat bit is on the right - cdef int rem, k, shift - for k in range(n_dimensions): - shift = n_dimensions -k -1 - rem = ((index >> shift) << shift) - offset[k] = rem > 0 - if DEBUGFLAG: - printf("i2o index %i k %i rem %i offset", index, k, rem) - for j in range(n_dimensions): - printf(" %i", offset[j]) - printf(" n_dimensions %i\n", n_dimensions) - index -= rem - - -cdef inline int offset2index(int* offset, int n_dimensions) nogil: - # Calculate the 1:1 index for a given offset array - # We read the offset array right-to-left - # such that the least significat bit is on the right - cdef int dim - cdef int index = 0 - for dim in range(n_dimensions): - index += (2 ** dim) * offset[n_dimensions - dim - 1] - if DEBUGFLAG: - printf("o2i index %i dim %i offset", index, dim) - for j in range(n_dimensions): - printf(" %i", offset[j]) - printf(" n_dimensions %i\n", n_dimensions) - return index - - -cdef void subdivide(Node* node) nogil: - # This instantiates 2**n_dimensions = n_cell_per_node nodes for the current node - cdef int idx = 0 - cdef int* offset = malloc(sizeof(int) * node.tree.n_dimensions) - node.is_leaf = False - node.children = malloc(sizeof(Node*) * node.tree.n_cell_per_node) - for idx in range(node.tree.n_cell_per_node): - index2offset(offset, idx, node.tree.n_dimensions) - node.children[idx] = create_child(node, offset) - free(offset) - - -cdef int insert(Node *root, float pos[3], long point_index, long depth, long - duplicate_count) nogil: - # Introduce a new point into the tree - # by recursively inserting it and subdividng as necessary - # Carefully treat the case of identical points at the same node - # by increasing the root.size and tracking duplicate_count - cdef Node *child - cdef long i - cdef int ax - cdef int not_identical = 1 - cdef int n_dimensions = root.tree.n_dimensions - if DEBUGFLAG: - printf("[t-SNE] [d=%li] Inserting pos %li [%f, %f] duplicate_count=%li" - " into child %p\n", depth, point_index, pos[0], pos[1], - duplicate_count, root) - # Increment the total number points including this - # node and below it - root.cumulative_size += duplicate_count - # Evaluate the new center of mass, weighting the previous - # center of mass against the new point data - cdef double frac_seen = (root.cumulative_size - 1) / ( - root.cumulative_size) - cdef double frac_new = 1.0 / root.cumulative_size - # Assert that duplicate_count > 0 - if duplicate_count < 1: - return -1 - # Assert that the point is inside the left & right edges - for ax in range(n_dimensions): - root.barycenter[ax] *= frac_seen - if (pos[ax] > (root.left_edge[ax] + root.width[ax] + EPSILON)): - printf("[t-SNE] Error: point (%1.9e) is above right edge of node " - "(%1.9e)\n", pos[ax], root.left_edge[ax] + root.width[ax]) - return -1 - if (pos[ax] < root.left_edge[ax] - EPSILON): - printf("[t-SNE] Error: point (%1.9e) is below left edge of node " - "(%1.9e)\n", 
pos[ax], root.left_edge[ax]) - return -1 - for ax in range(n_dimensions): - root.barycenter[ax] += pos[ax] * frac_new - - # If this node is unoccupied, fill it. - # Otherwise, we need to insert recursively. - # Two insertion scenarios: - # 1) Insert into this node if it is a leaf and empty - # 2) Subdivide this node if it is currently occupied - if (root.size == 0) & root.is_leaf: - # Root node is empty and a leaf - if DEBUGFLAG: - printf("[t-SNE] [d=%li] Inserting [%f, %f] into blank cell\n", depth, - pos[0], pos[1]) - for ax in range(n_dimensions): - root.leaf_point_position[ax] = pos[ax] - root.point_index = point_index - root.size = duplicate_count - return 0 - else: - # Root node is occupied or not a leaf - if DEBUGFLAG: - printf("[t-SNE] [d=%li] Node %p is occupied or is a leaf.\n", depth, - root) - printf("[t-SNE] [d=%li] Node %p leaf = %i. Size %li\n", depth, root, - root.is_leaf, root.size) - if root.is_leaf & (root.size > 0): - # is a leaf node and is occupied - for ax in range(n_dimensions): - not_identical &= (fabsf(pos[ax] - root.leaf_point_position[ax]) < EPSILON) - not_identical &= (root.point_index != point_index) - if not_identical == 1: - root.size += duplicate_count - if DEBUGFLAG: - printf("[t-SNE] Warning: [d=%li] Detected identical " - "points. Returning. Leaf now has size %li\n", - depth, root.size) - return 0 - # If necessary, subdivide this node before - # descending - if root.is_leaf: - if DEBUGFLAG: - printf("[t-SNE] [d=%li] Subdividing this leaf node %p\n", depth, - root) - subdivide(root) - # We have two points to relocate: the one previously - # at this node, and the new one we're attempting - # to insert - if root.size > 0: - child = select_child(root, root.leaf_point_position, root.point_index) - if DEBUGFLAG: - printf("[t-SNE] [d=%li] Relocating old point to node %p\n", - depth, child) - insert(child, root.leaf_point_position, root.point_index, depth + 1, root.size) - # Insert the new point - if DEBUGFLAG: - printf("[t-SNE] [d=%li] Selecting node for new point\n", depth) - child = select_child(root, pos, point_index) - if root.size > 0: - # Remove the point from this node - for ax in range(n_dimensions): - root.leaf_point_position[ax] = -1 - root.size = 0 - root.point_index = -1 - return insert(child, pos, point_index, depth + 1, 1) - -cdef int insert_many(Tree* tree, float[:,:] pos_array) nogil: - # Insert each data point into the tree one at a time - cdef long nrows = pos_array.shape[0] - cdef long i - cdef int ax - cdef float row[3] - cdef long err = 0 - for i in range(nrows): - for ax in range(tree.n_dimensions): - row[ax] = pos_array[i, ax] - if DEBUGFLAG: - printf("[t-SNE] inserting point %li: [%f, %f]\n", i, row[0], row[1]) - err = insert(tree.root_node, row, i, 0, 1) - if err != 0: - printf("[t-SNE] ERROR\n%s", EMPTY_STRING) - return err - tree.n_points += 1 - return err - -cdef int free_tree(Tree* tree) nogil: - cdef int check - cdef long* cnt = malloc(sizeof(long) * 3) - for i in range(3): - cnt[i] = 0 - free_recursive(tree, tree.root_node, cnt) - check = cnt[0] == tree.n_cells - check &= cnt[2] == tree.n_points - free(tree) - free(cnt) - return check - -cdef void free_post_children(Node *node) nogil: - free(node.width) - free(node.left_edge) - free(node.center) - free(node.barycenter) - free(node.leaf_point_position) - free(node) - -cdef void free_recursive(Tree* tree, Node *root, long* counts) nogil: - # Free up all of the tree nodes recursively - # while counting the number of nodes visited - # and total number of data points removed - cdef int 
idx - cdef Node* child - if not root.is_leaf: - for idx in range(tree.n_cell_per_node): - child = root.children[idx] - free_recursive(tree, child, counts) - counts[0] += 1 - if child.is_leaf: - counts[1] += 1 - if child.size > 0: - counts[2] +=1 - else: - free(child.children) - - free_post_children(child) - - if root == tree.root_node: - if not root.is_leaf: - free(root.children) - - free_post_children(root) - -cdef long count_points(Node* root, long count) nogil: - # Walk through the whole tree and count the number - # of points at the leaf nodes - if DEBUGFLAG: - printf("[t-SNE] Counting nodes at root node %p\n", root) - cdef Node* child - cdef int idx - if root.is_leaf: - count += root.size - if DEBUGFLAG : - printf("[t-SNE] %p is a leaf node, no children\n", root) - printf("[t-SNE] %li points in node %p\n", count, root) - return count - # Otherwise, get the children - for idx in range(root.tree.n_cell_per_node): - child = root.children[idx] - if DEBUGFLAG: - printf("[t-SNE] Counting points for child %p\n", child) - if child.is_leaf and child.size > 0: - if DEBUGFLAG: - printf("[t-SNE] Child has size %ld\n", child.size) - count += child.size - elif not child.is_leaf: - if DEBUGFLAG: - printf("[t-SNE] Child is not a leaf. Descending\n%s", EMPTY_STRING) - count = count_points(child, count) - # else case is we have an empty leaf node - # which happens when we create a quadtree for - # one point, and then the other neighboring cells - # don't get filled in - if DEBUGFLAG: - printf("[t-SNE] %li points in this node\n", count) - return count - - cdef float compute_gradient(float[:] val_P, float[:,:] pos_reference, np.int64_t[:] neighbors, np.int64_t[:] indptr, float[:,:] tot_force, - Node* root_node, + quad_tree.QuadTree qt, float theta, float dof, long start, @@ -455,8 +55,8 @@ cdef float compute_gradient(float[:] val_P, cdef long i, coord cdef int ax cdef long n_samples = pos_reference.shape[0] - cdef int n_dimensions = root_node.tree.n_dimensions - if root_node.tree.verbose > 11: + cdef int n_dimensions = qt.n_dimensions + if qt.verbose > 11: printf("[t-SNE] Allocating %li elements in force arrays\n", n_samples * n_dimensions * 2) cdef float* sum_Q = malloc(sizeof(float)) @@ -467,18 +67,18 @@ cdef float compute_gradient(float[:] val_P, sum_Q[0] = 0.0 t1 = clock() - compute_gradient_negative(pos_reference, neg_f, root_node, sum_Q, + compute_gradient_negative(pos_reference, neg_f, qt, sum_Q, dof, theta, start, stop) t2 = clock() - if root_node.tree.verbose > 15: + if qt.verbose > 15: printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1))) sQ = sum_Q[0] t1 = clock() error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr, pos_f, n_dimensions, dof, sQ, start, - root_node.tree.verbose) + qt.verbose) t2 = clock() - if root_node.tree.verbose > 15: + if qt.verbose > 15: printf("[t-SNE] Computing positive gradient: %e ticks\n", ((float) (t2 - t1))) for i in range(start, n_samples): for ax in range(n_dimensions): @@ -544,7 +144,7 @@ cdef float compute_gradient_positive(float[:] val_P, cdef void compute_gradient_negative(float[:,:] pos_reference, float* neg_f, - Node *root_node, + quad_tree.QuadTree qt, float* sum_Q, float dof, float theta, @@ -559,25 +159,21 @@ cdef void compute_gradient_negative(float[:,:] pos_reference, float* force float* iQ float* pos - float* dist2s - long* sizes - float* deltas + float size, dist2s long* l - int n_dimensions = root_node.tree.n_dimensions + int n_dimensions = qt.n_dimensions float qijZ, mult long idx, long dta = 0 long 
dtb = 0 clock_t t1, t2, t3 float* neg_force + long offset = n_dimensions + 2 iQ = malloc(sizeof(float)) force = malloc(sizeof(float) * n_dimensions) pos = malloc(sizeof(float) * n_dimensions) - dist2s = malloc(sizeof(float) * n) - sizes = malloc(sizeof(long) * n) - deltas = malloc(sizeof(float) * n * n_dimensions) - l = malloc(sizeof(long)) + summary = malloc(sizeof(float) * n * offset) neg_force= malloc(sizeof(float) * n_dimensions) for i in range(start, stop): @@ -587,98 +183,40 @@ cdef void compute_gradient_negative(float[:,:] pos_reference, neg_force[ax] = 0.0 pos[ax] = pos_reference[i, ax] iQ[0] = 0.0 - l[0] = 0 # Find which nodes are summarizing and collect their centers of mass # deltas, and sizes, into vectorized arrays t1 = clock() - compute_non_edge_forces(root_node, theta, i, pos, force, dist2s, - sizes, deltas, l) + idx = qt.summarize(pos, summary, 0, 0, theta*theta) t2 = clock() # Compute the t-SNE negative force # for the digits dataset, walking the tree # is about 10-15x more expensive than the # following for loop exponent = (dof + 1.0) / -2.0 - for j in range(l[0]): - qijZ = ((1.0 + dist2s[j]) / dof) ** exponent - sum_Q[0] += sizes[j] * qijZ - mult = sizes[j] * qijZ * qijZ + for j in range(idx // offset): + + dist2s = summary[j * offset + n_dimensions] + size = summary[j * offset + n_dimensions + 1] + qijZ = ((1.0 + dist2s) / dof) ** exponent # 1/(1+dist) + sum_Q[0] += size * qijZ # size of the node * q + mult = size * qijZ * qijZ for ax in range(n_dimensions): - idx = j * n_dimensions + ax - neg_force[ax] += mult * deltas[idx] + neg_force[ax] += mult * summary[j * offset + ax] t3 = clock() for ax in range(n_dimensions): neg_f[i * n_dimensions + ax] = neg_force[ax] dta += t2 - t1 dtb += t3 - t2 - if root_node.tree.verbose > 20: + if qt.verbose > 20: printf("[t-SNE] Tree: %li clock ticks | ", dta) printf("Force computation: %li clock ticks\n", dtb) free(iQ) free(force) free(pos) - free(dist2s) - free(sizes) - free(deltas) - free(l) + free(summary) free(neg_force) -cdef void compute_non_edge_forces(Node* node, - float theta, - long point_index, - float* pos, - float* force, - float* dist2s, - long* sizes, - float* deltas, - long* l) nogil: - # Compute the t-SNE force on the point in pos given by point_index - cdef: - Node* child - int i, j - int n_dimensions = node.tree.n_dimensions - long idx, idx1 - float dist_check - - # There are no points below this node if cumulative_size == 0 - # so do not bother to calculate any force contributions - # Also do not compute self-interactions - if node.cumulative_size > 0 and not (node.is_leaf and (node.point_index == - point_index)): - # Compute distance between node center of mass and the reference point - # I've tried rewriting this in terms of BLAS functions, but it's about - # 1.5x worse when we do so, probbaly because the vectors are small - idx1 = l[0] * n_dimensions - deltas[idx1] = pos[0] - node.barycenter[0] - idx = idx1 - for i in range(1, n_dimensions): - idx += 1 - deltas[idx] = pos[i] - node.barycenter[i] - # do np.sqrt(np.sum(deltas**2.0)) - dist2s[l[0]] = snrm2(n_dimensions, &deltas[idx1], 1) - # Check whether we can use this node as a summary - # It's a summary node if the angular size as measured from the point - # is relatively small (w.r.t. to theta) or if it is a leaf node. - # If it can be summarized, we use the cell center of mass - # Otherwise, we go a higher level of resolution and into the leaves. 
- if node.is_leaf or ((node.max_width / dist2s[l[0]]) < theta): - # Compute the t-SNE force between the reference point and the - # current node - sizes[l[0]] = node.cumulative_size - dist2s[l[0]] = dist2s[l[0]] * dist2s[l[0]] - l[0] += 1 - else: - # Recursively apply Barnes-Hut to child nodes - for idx in range(node.tree.n_cell_per_node): - child = node.children[idx] - if child.cumulative_size == 0: - continue - compute_non_edge_forces(child, theta, - point_index, pos, force, dist2s, sizes, deltas, - l) - - def calculate_edge(pos_output): # Make the boundaries slightly outside of the data # to avoid floating point error near the edge @@ -687,9 +225,12 @@ def calculate_edge(pos_output): center = (right_edge + left_edge) * 0.5 width = np.maximum(np.subtract(right_edge, left_edge), EPSILON) # Exagerate width to avoid boundary edge + printf("WIDTH %f, %f\n", float(width[0]), float(width[1])) width = width.astype(np.float32) * 1.001 left_edge = center - width / 2.0 right_edge = center + width / 2.0 + printf("ROOT_x %f, %f\n", float(left_edge[0]), float(right_edge[0])) + printf("ROOT_y %f, %f\n", float(left_edge[1]), float(right_edge[1])) return left_edge, right_edge, width @@ -708,8 +249,6 @@ def gradient(float[:] val_P, # up in-place cdef float C n = pos_output.shape[0] - left_edge, right_edge, width = calculate_edge(pos_output) - assert width.itemsize == 4 assert val_P.itemsize == 4 assert pos_output.itemsize == 4 assert forces.itemsize == 4 @@ -719,89 +258,22 @@ def gradient(float[:] val_P, assert n == indptr.shape[0] - 1, m if verbose > 10: printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions) - cdef Tree* qt = init_tree(left_edge, width, n_dimensions, verbose) + cdef quad_tree.QuadTree qt = quad_tree.QuadTree(pos_output.shape[1], 0) if verbose > 10: printf("[t-SNE] Inserting %li points\n", pos_output.shape[0]) - err = insert_many(qt, pos_output) - assert err == 0, "[t-SNE] Insertion failed" + qt.build_tree(pos_output) if verbose > 10: # XXX: format hack to workaround lack of `const char *` type # in the generated C code that triggers error with gcc 4.9 # and -Werror=format-security printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING) C = compute_gradient(val_P, pos_output, neighbors, indptr, forces, - qt.root_node, theta, dof, skip_num_points, -1) + qt, theta, dof, skip_num_points, -1) if verbose > 10: # XXX: format hack to workaround lack of `const char *` type # in the generated C code # and -Werror=format-security printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING) - cdef long count = count_points(qt.root_node, 0) - m = ("Tree consistency failed: unexpected number of points=%li " - "at root node=%li" % (count, qt.root_node.cumulative_size)) - assert count == qt.root_node.cumulative_size, m m = "Tree consistency failed: unexpected number of points on the tree" - assert count == qt.n_points, m - free_tree(qt) + assert qt.cells[0].cumulative_size == qt.n_points, m return C - - -# Helper functions -def check_quadtree(X, np.int64_t[:] counts): - """ - Helper function to access quadtree functions for testing - """ - - X = X.astype(np.float32) - left_edge, right_edge, width = calculate_edge(X) - # Initialise a tree - qt = init_tree(left_edge, width, 2, 2) - # Insert data into the tree - insert_many(qt, X) - - cdef long count = count_points(qt.root_node, 0) - counts[0] = count - counts[1] = qt.root_node.cumulative_size - counts[2] = qt.n_points - free_tree(qt) - return counts - - -cdef int helper_test_index2offset(int* check, int index, int n_dimensions): - cdef 
int* offset = malloc(sizeof(int) * n_dimensions) - cdef int error_check = 1 - for i in range(n_dimensions): - offset[i] = 0 - index2offset(offset, index, n_dimensions) - for i in range(n_dimensions): - error_check &= offset[i] == check[i] - free(offset) - return error_check - - -def test_index2offset(): - ret = 1 - ret &= helper_test_index2offset([1, 0, 1], 5, 3) == 1 - ret &= helper_test_index2offset([0, 0, 0], 0, 3) == 1 - ret &= helper_test_index2offset([0, 0, 1], 1, 3) == 1 - ret &= helper_test_index2offset([0, 1, 0], 2, 3) == 1 - ret &= helper_test_index2offset([0, 1, 1], 3, 3) == 1 - ret &= helper_test_index2offset([1, 0, 0], 4, 3) == 1 - return ret - - -def test_index_offset(): - cdef int n_dimensions, idx, tidx, k - cdef int error_check = 1 - cdef int* offset - for n_dimensions in range(2, 10): - offset = malloc(sizeof(int) * n_dimensions) - for k in range(n_dimensions): - offset[k] = 0 - for idx in range(2 ** n_dimensions): - index2offset(offset, idx, n_dimensions) - tidx = offset2index(offset, n_dimensions) - error_check &= tidx == idx - assert error_check == 1 - free(offset) - return error_check diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index c169df3b44f4e..81eaf432ed201 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -30,6 +30,7 @@ MACHINE_EPSILON = np.finfo(np.double).eps +EXPLORATION_N_ITER = 250 def _joint_probabilities(distances, desired_perplexity, verbose): @@ -395,11 +396,15 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None, gains = np.ones_like(p) error = np.finfo(np.float).max best_error = np.finfo(np.float).max - best_iter = it + best_iter = i = it tic = time() for i in range(it, n_iter): - new_error, grad = objective(p, *args, **kwargs) + try: + new_error, grad = objective(p, *args, **kwargs) + except AssertionError: + np.save("dump_X_embedded.npy", p) + raise grad_norm = linalg.norm(grad) inc = update * grad < 0.0 @@ -755,8 +760,8 @@ def _fit(self, X, skip_num_points=0): raise ValueError("early_exaggeration must be at least 1, but is {}" .format(self.early_exaggeration)) - if self.n_iter < 200: - raise ValueError("n_iter should be at least 200") + if self.n_iter < 250: + raise ValueError("n_iter should be at least 250") if self.method == "exact": if self.metric == "precomputed": @@ -878,7 +883,6 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, opt_args = {"it": 0, "learning_rate": self.learning_rate, - "n_iter_without_progress": self.n_iter_without_progress, "verbose": self.verbose, "n_iter_check": 50, "kwargs": dict(skip_num_points=skip_num_points)} if self.method == 'barnes_hut': @@ -887,7 +891,7 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, self.n_components] opt_args['args'] = args - opt_args['n_iter_without_progress'] = 30 + opt_args['n_iter_without_progress'] = EXPLORATION_N_ITER # Don't always calculate the cost since that calculation # can be nearly as expensive as the gradient opt_args['objective_error'] = _kl_divergence_error @@ -897,15 +901,14 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, obj_func = _kl_divergence opt_args['args'] = [P, degrees_of_freedom, n_samples, self.n_components] + opt_args['n_iter_without_progress'] = self.n_iter_without_progress opt_args['min_error_diff'] = 0.0 opt_args['min_grad_norm'] = self.min_grad_norm # Learning schedule (part 1): do 250 iteration with lower momentum but # higher learning rate controlled via the early exageration parameter - 
exploration_n_iter = 250 - opt_args['n_iter'] = exploration_n_iter + opt_args['n_iter'] = EXPLORATION_N_ITER opt_args['momentum'] = 0.5 - opt_args['n_iter_without_progress'] = exploration_n_iter P *= self.early_exaggeration params, kl_divergence, it = _gradient_descent(obj_func, params, @@ -917,11 +920,12 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, # Learning schedule (part 2): disable early exaggeration and finish # optimization with a higher momentum at 0.8 P /= self.early_exaggeration - remaining = self.n_iter - exploration_n_iter - if it < exploration_n_iter or remaining > 0: + remaining = self.n_iter - EXPLORATION_N_ITER + if it < EXPLORATION_N_ITER or remaining > 0: opt_args['n_iter'] = self.n_iter opt_args['it'] = it + 1 opt_args['momentum'] = 0.8 + opt_args['n_iter_without_progress'] = self.n_iter_without_progress params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 5a05b2951e0b8..6e5fa9800b385 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -263,7 +263,7 @@ def test_optimization_minimizes_kl_divergence(): random_state = check_random_state(0) X, _ = make_blobs(n_features=3, random_state=random_state) kl_divergences = [] - for n_iter in [200, 250, 300]: + for n_iter in [250, 300, 350]: tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0, n_iter=n_iter, random_state=0) tsne.fit_transform(X) @@ -550,51 +550,6 @@ def test_barnes_hut_angle(): P = squareform(P) Pbh = Pbh.toarray() assert_array_almost_equal(Pbh, P, decimal=5) - assert_array_almost_equal(gradex, gradbh, decimal=5) - - -def test_quadtree_similar_point(): - # Introduce a point into a quad tree where a similar point already exists. - # Test will hang if it doesn't complete. 

-    Xs = []
-
-    # check the case where points are actually different
-    Xs.append(np.array([[1, 2], [3, 4]], dtype=np.float32))
-    # check the case where points are the same on X axis
-    Xs.append(np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32))
-    # check the case where points are arbitrarily close on X axis
-    Xs.append(np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32))
-    # check the case where points are the same on Y axis
-    Xs.append(np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32))
-    # check the case where points are arbitrarily close on Y axis
-    Xs.append(np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32))
-    # check the case where points are arbitrarily close on both axes
-    Xs.append(np.array([[1.00001, 2.00001], [1.00002, 2.00002]],
-                       dtype=np.float32))
-
-    # check the case where points are arbitrarily close on both axes
-    # close to machine epsilon - x axis
-    Xs.append(np.array([[1, 0.0003817754041], [2, 0.0003817753750]],
-                       dtype=np.float32))
-
-    # check the case where points are arbitrarily close on both axes
-    # close to machine epsilon - y axis
-    Xs.append(np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]],
-                       dtype=np.float32))
-
-    for X in Xs:
-        counts = np.zeros(3, dtype='int64')
-        _barnes_hut_tsne.check_quadtree(X, counts)
-        m = "Tree consistency failed: unexpected number of points at root node"
-        assert_equal(counts[0], counts[1], m)
-        m = "Tree consistency failed: unexpected number of points on the tree"
-        assert_equal(counts[0], counts[2], m)
-
-
-def test_index_offset():
-    # Make sure translating between 1D and N-D indices are preserved
-    assert_equal(_barnes_hut_tsne.test_index2offset(), 1)
-    assert_equal(_barnes_hut_tsne.test_index_offset(), 1)
 
 
 @skip_if_32bit
@@ -602,8 +557,8 @@ def test_n_iter_without_progress():
     # Use a dummy negative n_iter_without_progress and check output on stdout
     random_state = check_random_state(0)
     X = random_state.randn(100, 2)
-    tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e7,
-                random_state=1, method='exact', n_iter=200)
+    tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8,
+                random_state=1, method='exact', n_iter=300)
 
     old_stdout = sys.stdout
     sys.stdout = StringIO()
@@ -649,7 +604,7 @@ def test_min_grad_norm():
         start_grad_norm = line.find('gradient norm')
         if start_grad_norm >= 0:
             line = line[start_grad_norm:]
-            line = line.replace('gradient norm = ', '')
+            line = line.replace('gradient norm = ', '').split(' ')[0]
             gradient_norm_values.append(float(line))
 
     # Compute how often the gradient norm is smaller than min_grad_norm

From 854d708061af2fc58a0db205f6f4f3acb6cc3c15 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Mon, 19 Jun 2017 14:34:14 +0200
Subject: [PATCH 11/55] FIX picklable QuadTree + more tests

---
 sklearn/neighbors/quad_tree.pxd           | 30 +++++---
 sklearn/neighbors/quad_tree.pyx           | 89 ++++++++++++++---------
 sklearn/neighbors/tests/test_quad_tree.py | 48 +++++++++---
 3 files changed, 110 insertions(+), 57 deletions(-)

diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd
index 002f7784c80fe..94297f14cb975 100644
--- a/sklearn/neighbors/quad_tree.pxd
+++ b/sklearn/neighbors/quad_tree.pxd
@@ -18,10 +18,13 @@ ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
 # It allows us to write printf debugging lines
 # and remove them at compile time
 cdef enum:
-    DEBUGFLAG = 0
+    DEBUGFLAG = 1
 
 cdef float EPSILON = 1e-6
 
+# XXX: Careful to not change the order of the arguments. 
It is important to +# have is_leaf and max_width consecutive as it permits to avoid padding by +# the compiler and keep the size coherent for both C and numpy data structures. cdef struct Cell: # Base storage stucture for cells in a QuadTree object @@ -29,21 +32,24 @@ cdef struct Cell: SIZE_t parent # Parent cell of this cell SIZE_t[8] children # Array pointing to childrens of this cell - # Cell boundaries - DTYPE_t[3] min_bounds # Inferior boundaries of this cell (inclusive) - DTYPE_t[3] max_bounds # Superior boundaries of this cell (exclusive) - DTYPE_t[3] center # Store the center for quick split of cells - # Cell description SIZE_t cell_id # Id of the cell in the cells array in the Tree + SIZE_t point_index # Index of the point at this cell (only defined + # in non empty leaf) + bint is_leaf # Does this cell have children? DTYPE_t max_width # The value of the maximum width w + SIZE_t depth # Depth of the cell in the tree + SIZE_t cumulative_size # Number of points included in the subtree with + # this cell as a root. + + # Internal constants + DTYPE_t[3] center # Store the center for quick split of cells DTYPE_t[3] barycenter # Keep track of the center of mass of the cell - SIZE_t point_index # Index of the point at this cell (only defined in non empty leaf) - bint is_leaf # Does this cell have children? - SIZE_t depth # Depth of the cell in the tree - SIZE_t cumulative_size # Number of points including all cell below this one - # cdef long size # Number of points at this cell - + + # Cell boundaries + DTYPE_t[3] min_bounds # Inferior boundaries of this cell (inclusive) + DTYPE_t[3] max_bounds # Superior boundaries of this cell (exclusive) + cdef class QuadTree: # The QuadTree object is a quad tree structure constructed by inserting diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx index 541ae098e31b0..7b204b3586876 100644 --- a/sklearn/neighbors/quad_tree.pyx +++ b/sklearn/neighbors/quad_tree.pyx @@ -16,6 +16,7 @@ from ..utils import check_array import numpy as np cimport numpy as np +np.import_array() cdef extern from "math.h": float fabsf(float x) nogil @@ -29,34 +30,32 @@ cdef extern from "numpy/arrayobject.h": cdef SIZE_t DEFAULT = (-1) + # Repeat struct definition for numpy CELL_DTYPE = np.dtype({ - 'names': ['parent', 'children', 'min_bounds', 'max_bounds', 'center', - 'cell_id', 'max_width', 'barycenter', 'point_index', 'is_leaf', - 'depth', 'cumulative_size'], - 'formats': [np.intp, np.intp, np.float64, np.float64, np.float64, np.intp, - np.float64, np.float64, np.intp, np.bool, np.intp, np.intp], + 'names': ['parent', 'children', 'cell_id', 'point_index', 'is_leaf', + 'max_width', 'depth', 'cumulative_size', 'center', 'barycenter', + 'min_bounds', 'max_bounds'], + 'formats': [np.intp, (np.intp, 8), np.intp, np.intp, np.int32, np.float32, + np.intp, np.intp, (np.float32, 3), (np.float32, 3), + (np.float32, 3), (np.float32, 3)], 'offsets': [ &( NULL).parent, &( NULL).children, - &( NULL).min_bounds, - &( NULL).max_bounds, - &( NULL).center, &( NULL).cell_id, - &( NULL).max_width, - &( NULL).barycenter, &( NULL).point_index, &( NULL).is_leaf, + &( NULL).max_width, &( NULL).depth, - &( NULL).cumulative_size + &( NULL).cumulative_size, + &( NULL).center, + &( NULL).barycenter, + &( NULL).min_bounds, + &( NULL).max_bounds, ] }) - -cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( - np.ndarray[DTYPE_t, ndim=2, mode='c'] X): - return ( X.data) - +assert CELL_DTYPE.itemsize == sizeof(Cell) cdef class QuadTree: @@ -81,6 +80,14 @@ cdef class QuadTree: # Free all 
inner structures free(self.cells) + property cumulative_size: + def __get__(self): + return self._get_cell_ndarray()['cumulative_size'][:self.cell_count] + + property leafs: + def __get__(self): + return self._get_cell_ndarray()['is_leaf'][:self.cell_count] + cdef int _resize(self, SIZE_t capacity) nogil except -1: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -122,8 +129,8 @@ cdef class QuadTree: cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell ) nogil except -1: if self.verbose >= 10: - printf("[QuadTree] Checking point (%f, %f, %f) in cell %i " - "([%f/%f, %f/%f, %f/%f], size %i)\n", + printf("[QuadTree] Checking point (%f, %f, %f) in cell %li " + "([%f/%f, %f/%f, %f/%f], size %li)\n", point[0], point[1], point[2], cell.cell_id, cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], @@ -134,7 +141,7 @@ cdef class QuadTree: cell.max_bounds[i] <= point[i]): with gil: msg = "[QuadTree] InsertionError: point out of cell boundary.\n" - msg += "Axis %i: cell [%f, %f]; point %f\n" + msg += "Axis %li: cell [%f, %f]; point %f\n" msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i] raise ValueError(msg) @@ -150,7 +157,7 @@ cdef class QuadTree: cdef SIZE_t n_point = cell.cumulative_size if self.verbose >= 10: - printf("[QuadTree] Inserting depth %i\n", cell.depth) + printf("[QuadTree] Inserting depth %li\n", cell.depth) # Assert that the point is in the right range if DEBUGFLAG: @@ -165,7 +172,7 @@ cdef class QuadTree: cell.barycenter[i] = point[i] cell.point_index = point_index if self.verbose >= 10: - printf("[QuadTree] inserted point in cell %i\n", cell_id) + printf("[QuadTree] inserted point in cell %li\n", cell_id) return cell_id # If the cell is not a leaf, update cell internals and @@ -181,7 +188,7 @@ cdef class QuadTree: # Insert child in the correct subtree selected_child = self.select_child(point, cell) if self.verbose >= 10: - printf("[QuadTree] selected child %i\n", selected_child) + printf("[QuadTree] selected child %li\n", selected_child) if selected_child == -1: self.n_points += 1 return self.insert_point_in_new_child(point, cell, point_index) @@ -264,7 +271,7 @@ cdef class QuadTree: # Assert that the point is in the right range self.check_point_in_cell(point, child) if self.verbose >= 10: - printf("[QuadTree] inserted point %i in new child %i\n", point_index, cell_id) + printf("[QuadTree] inserted point %li in new child %li\n", point_index, cell_id) return cell_id @@ -331,7 +338,6 @@ cdef class QuadTree: # Create the initial node with boundaries from the dataset self._init_root(min_bounds, max_bounds) - # cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(X) for i in range(n_samples): for j in range(self.n_dimensions): pt[j] = X[i, j] @@ -361,19 +367,23 @@ cdef class QuadTree: plt.show() def check_coherence(self): - for c in self.cells[:self.cell_count]: - self.check_point_in_cell(c.barycenter, &c) - if not c.is_leaf: + for cell in self.cells[:self.cell_count]: + self.check_point_in_cell(cell.barycenter, &cell) + if not cell.is_leaf: n_points = 0 for idx in range(self.n_cells_per_cell): - if c.children[idx] != -1: - child = self.cells[c.children[idx]] + child_id = cell.children[idx] + if child_id != -1: + child = self.cells[child_id] n_points += child.cumulative_size - if n_points != c.cumulative_size: + assert child.cell_id == child_id, ( + "Cell id not correctly initiliazed.") + if n_points != cell.cumulative_size: raise RuntimeError( 
"Cell {} is incoherent. Size={} but found {} points " "in children. ({})" - .format(c.cell_id, c.cumulative_size, n_points, c.children)) + .format(cell.cell_id, cell.cumulative_size, + n_points, cell.children)) if self.n_points != self.cells[0].cumulative_size: raise RuntimeError( "QuadTree is incoherent. Size={} but found {} points " @@ -457,24 +467,28 @@ cdef class QuadTree: cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=0) nogil except -1: cdef: SIZE_t selected_child - Cell* cell = &self.cells[0] + Cell* cell = &self.cells[cell_id] if cell.is_leaf: if self.is_duplicate(cell.barycenter, point): + if self.verbose > 99: + printf("[QuadTree] Found point in cell: %li\n", cell.cell_id) return cell_id with gil: raise ValueError("Query point not in the Tree.") selected_child = self.select_child(point, cell) if selected_child > 0: + if self.verbose > 99: + printf("[QuadTree] Selected_child: %li\n", selected_child) return self._get_cell(point, selected_child) with gil: raise ValueError("Query point not in the Tree.") def __reduce__(self): """Reduce re-implementation, for pickling.""" - return (QuadTree, (self.n_dimensions, self.verbose, - self.__getstate__())) + return (QuadTree, (self.n_dimensions, self.verbose), + self.__getstate__()) def __getstate__(self): """Getstate re-implementation, for pickling.""" @@ -482,6 +496,8 @@ cdef class QuadTree: # capacity is infered during the __setstate__ using nodes d["max_depth"] = self.max_depth d["cell_count"] = self.cell_count + d["capacity"] = self.capacity + d["n_points"] = self.n_points d["cells"] = self._get_cell_ndarray() return d @@ -489,6 +505,8 @@ cdef class QuadTree: """Setstate re-implementation, for unpickling.""" self.max_depth = d["max_depth"] self.cell_count = d["cell_count"] + self.capacity = d["capacity"] + self.n_points = d["n_points"] if 'cells' not in d: raise ValueError('You have loaded Tree version which ' @@ -504,6 +522,7 @@ cdef class QuadTree: self.capacity = cell_ndarray.shape[0] if self._resize_c(self.capacity) != 0: raise MemoryError("resizing tree to %d" % self.capacity) + cells = memcpy(self.cells, ( cell_ndarray).data, self.capacity * sizeof(Cell)) @@ -520,7 +539,7 @@ cdef class QuadTree: strides[0] = sizeof(Cell) cdef np.ndarray arr Py_INCREF(CELL_DTYPE) - arr = PyArray_NewFromDescr(np.ndarray, CELL_DTYPE, 1, shape, + arr = PyArray_NewFromDescr(np.ndarray, CELL_DTYPE, 1, shape, strides, self.cells, np.NPY_DEFAULT, None) Py_INCREF(self) diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py index 48202dfcfb8b5..c6dec28747973 100644 --- a/sklearn/neighbors/tests/test_quad_tree.py +++ b/sklearn/neighbors/tests/test_quad_tree.py @@ -40,17 +40,45 @@ def test_quadtree_similar_point(): def test_quad_tree_pickle(): np.random.seed(0) - X = np.random.random((10, 3)) - tree = QuadTree(n_dimensions=2, verbose=0) - tree.build_tree(X) + for n_dimensions in (2, 3): + X = np.random.random((10, n_dimensions)) - def check_pickle_protocol(protocol): - s = pickle.dumps(tree, protocol=protocol) - bt2 = pickle.loads(s) + tree = QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(X) + + def check_pickle_protocol(protocol): + s = pickle.dumps(tree, protocol=protocol) + bt2 = pickle.loads(s) + + for x in X: + cell_x_tree = tree.get_cell(x) + cell_x_bt2 = bt2.get_cell(x) + assert cell_x_tree == cell_x_bt2 + + for protocol in (0, 1, 2): + yield check_pickle_protocol, protocol + + +def test_qt_insert_duplicate(): + np.random.seed(0) + + def check_insert_duplicate(n_dimensions=2): + + 
X = np.random.random((10, n_dimensions))
+        Xd = np.r_[X, X[:5]]
+        tree = QuadTree(n_dimensions=n_dimensions, verbose=0)
+        tree.build_tree(Xd)
+
+        cumulative_size = tree.cumulative_size
+        leafs = tree.leafs
 
-        for x in X:
-            assert tree.get_cell(x) == bt2.get_cell(x)
+        # Assert that the first 5 are indeed duplicated and that the next
+        # ones are single point leaf
+        for i, x in enumerate(X):
+            cell_id = tree.get_cell(x)
+            assert leafs[cell_id]
+            assert cumulative_size[cell_id] == 1 + (i < 5)
 
-    for protocol in (0, 1, 2):
-        yield check_pickle_protocol, protocol
+    for n_dimensions in (2, 3):
+        yield check_insert_duplicate, n_dimensions

From 04ffd5462061b955b09c2c559201e57b15598755 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 20 Jun 2017 14:01:22 +0200
Subject: [PATCH 12/55] Compute first NN accuracy

---
 benchmarks/bench_tsne_mnist.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py
index b3bdfe2f4828c..3e16a6b0ef595 100644
--- a/benchmarks/bench_tsne_mnist.py
+++ b/benchmarks/bench_tsne_mnist.py
@@ -49,18 +49,12 @@ def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
     return X, y
 
 
-def precision_at_k(X, X_embedded, k=5):
-    """Compute the precision at k for the dataset.
-    """
-
-    knn = NearestNeighbors(n_neighbors=k, n_jobs=-1)
+def nn_accuracy(X, X_embedded, k=1):
+    """Accuracy of the first nearest neighbor"""
+    knn = NearestNeighbors(n_neighbors=1, n_jobs=-1)
     _, neighbors_X = knn.fit(X).kneighbors()
     _, neighbors_X_embedded = knn.fit(X_embedded).kneighbors()
-
-    precisions = [len(np.intersect1d(X_n, Xe_n)) / k
-                  for X_n, Xe_n in zip(neighbors_X, neighbors_X_embedded)]
-
-    return np.mean(precisions)
+    return np.mean(neighbors_X == neighbors_X_embedded)
 
 
 def tsne_fit_transform(model, data):
@@ -163,9 +157,9 @@ def bhtsne(X):
             np.save("dump_X.npy", X_train)
             X_embedded, n_iter = method(X_train)
             duration = time() - t0
-            precision_5 = precision_at_k(X_train, X_embedded, k=5)
+            precision_5 = nn_accuracy(X_train, X_embedded)
             print("Fitting {} on {} samples took {:.3f}s in {:d} iterations, "
-                  "precision at 5: {:0.3f}".format(
+                  "nn accuracy: {:0.3f}".format(
                       name, n, duration, n_iter, precision_5))
             results.append(dict(method=name, duration=duration, n_samples=n))
             with open(log_filename, 'w', encoding='utf-8') as f:

From d4f62d23951262b74515674b68dd85ba00efd3e5 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Tue, 20 Jun 2017 15:45:22 +0200
Subject: [PATCH 13/55] TST independently seeded rng

---
 sklearn/neighbors/tests/test_quad_tree.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py
index c6dec28747973..f0ce77d2700ab 100644
--- a/sklearn/neighbors/tests/test_quad_tree.py
+++ b/sklearn/neighbors/tests/test_quad_tree.py
@@ -1,6 +1,7 @@
 import pickle
 import numpy as np
 from sklearn.neighbors.quad_tree import QuadTree
+from sklearn.utils import check_random_state
 
 
 def test_quadtree_similar_point():
@@ -39,10 +40,10 @@ def test_quadtree_similar_point():
 
 
 def test_quad_tree_pickle():
-    np.random.seed(0)
+    rng = check_random_state(0)
 
     for n_dimensions in (2, 3):
-        X = np.random.random((10, n_dimensions))
+        X = rng.random_sample((10, n_dimensions))
 
         tree = QuadTree(n_dimensions=n_dimensions, verbose=0)
         tree.build_tree(X)
@@ -61,11 +62,11 @@ def check_pickle_protocol(protocol):
 
 
 def test_qt_insert_duplicate():
-    np.random.seed(0)
+    rng = check_random_state(0)
 
     def check_insert_duplicate(n_dimensions=2):
 
-        X = np.random.random((10, n_dimensions))
+        X = rng.random_sample((10, n_dimensions))
         Xd = np.r_[X, X[:5]]
         tree = QuadTree(n_dimensions=n_dimensions, verbose=0)
         tree.build_tree(Xd)

From ad449acf323eee3d6927bf8eb2c898644ef38126 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 20 Jun 2017 16:14:32 +0200
Subject: [PATCH 14/55] Add script to plot TSNE benchmark results

For qualitative visual check of the results.
---
 benchmarks/bench_tsne_mnist.py | 15 ++++++++++++---
 benchmarks/plot_tsne_mnist.py  | 30 ++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/plot_tsne_mnist.py

diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py
index 3e16a6b0ef595..601c9dfe037bb 100644
--- a/benchmarks/bench_tsne_mnist.py
+++ b/benchmarks/bench_tsne_mnist.py
@@ -9,6 +9,7 @@
 # License: BSD 3 clause
 
 import os
+import os.path as op
 from time import time
 import numpy as np
 import json
@@ -62,6 +63,10 @@ def tsne_fit_transform(model, data):
     return transformed, model.n_iter_
 
 
+def sanitize(filename):
+    return filename.replace("/", '-').replace(" ", "_")
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser('Benchmark for t-SNE')
     parser.add_argument('--order', type=str, default='C',
@@ -150,11 +155,15 @@ def bhtsne(X):
     log_filename = os.path.join(LOG_DIR, basename + '.json')
     for n in data_size:
         X_train = X[:n]
+        y_train = y[:n]
         n = X_train.shape[0]
         for name, method in methods:
             print("Fitting {} on {} samples...".format(name, n))
             t0 = time()
-            np.save("dump_X.npy", X_train)
+            np.save(os.path.join(LOG_DIR, 'mnist_{}_{}.npy'
+                                 .format('original', n)), X_train)
+            np.save(os.path.join(LOG_DIR, 'mnist_{}_{}.npy'
+                                 .format('original_labels', n)), y_train)
             X_embedded, n_iter = method(X_train)
             duration = time() - t0
             precision_5 = nn_accuracy(X_train, X_embedded)
@@ -164,6 +173,6 @@ def bhtsne(X):
             results.append(dict(method=name, duration=duration, n_samples=n))
             with open(log_filename, 'w', encoding='utf-8') as f:
                 json.dump(results, f)
-            np.save(os.path.join(LOG_DIR, 'mnist_{}_{}.npy'
-                    .format(name.replace("/", '-'), n)),
+            method_name = sanitize(name)
+            np.save(op.join(LOG_DIR, 'mnist_{}_{}.npy'.format(method_name, n)),
                     X_embedded)

diff --git a/benchmarks/plot_tsne_mnist.py b/benchmarks/plot_tsne_mnist.py
new file mode 100644
index 0000000000000..0ffd32b3de779
--- /dev/null
+++ b/benchmarks/plot_tsne_mnist.py
@@ -0,0 +1,30 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import os.path as op
+
+import argparse
+
+
+LOG_DIR = "mnist_tsne_output"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('Plot benchmark results for t-SNE')
+    parser.add_argument(
+        '--labels', type=str,
+        default=op.join(LOG_DIR, 'mnist_original_labels_10000.npy'),
+        help='1D integer numpy array for labels')
+    parser.add_argument(
+        '--embedding', type=str,
+        default=op.join(LOG_DIR, 'mnist_sklearn_TSNE_10000.npy'),
+        help='2D float numpy array for embedded data')
+    args = parser.parse_args()
+
+    X = np.load(args.embedding)
+    y = np.load(args.labels)
+
+    for i in np.unique(y):
+        mask = y == i
+        plt.scatter(X[mask, 0], X[mask, 1], alpha=0.2, label=int(i))
+    plt.legend(loc='best')
+    plt.show()

From a98d571c0c3b0cb7f28d1d09e1a077c6bd43859e Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Tue, 20 Jun 2017 20:13:51 +0200
Subject: [PATCH 15/55] CLN remove unused code

---
 sklearn/manifold/t_sne.py | 71 ++-------------------------------------
 1 file changed, 2 insertions(+), 69 deletions(-)

diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index 81eaf432ed201..b61a91b832f6c 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -183,62 +183,6 @@ def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components,
     return kl_divergence, grad
 
 
-def _kl_divergence_error(params, P, neighbors, degrees_of_freedom, n_samples,
-                         n_components):
-    """t-SNE objective function: the absolute error of the
-    KL divergence of p_ijs and q_ijs.
-
-    Parameters
-    ----------
-    params : array, shape (n_params,)
-        Unraveled embedding.
-
-    P : array, shape (n_samples * (n_samples-1) / 2,)
-        Condensed joint probability matrix.
-
-    neighbors : array (n_samples, K)
-        The neighbors is not actually required to calculate the
-        divergence, but is here to match the signature of the
-        gradient function
-
-    degrees_of_freedom : float
-        Degrees of freedom of the Student's-t distribution.
-
-    n_samples : int
-        Number of samples.
-
-    n_components : int
-        Dimension of the embedded space.
-
-    Returns
-    -------
-    kl_divergence : float
-        Kullback-Leibler divergence of p_ij and q_ij.
-
-    grad : array, shape (n_params,)
-        Unraveled gradient of the Kullback-Leibler divergence with respect to
-        the embedding.
-    """
-    X_embedded = params.reshape(n_samples, n_components)
-
-    # Q is a heavy-tailed distribution: Student's t-distribution
-    n = pdist(X_embedded, "sqeuclidean")
-    n += 1.
-    n /= degrees_of_freedom
-    n **= (degrees_of_freedom + 1.0) / -2.0
-    Q = np.maximum(n / (2.0 * np.sum(n)), MACHINE_EPSILON)
-
-    # Optimization trick below: np.dot(x, y) is faster than
-    # np.sum(x * y) because it calls BLAS
-
-    # Objective: C (Kullback-Leibler divergence of P and Q)
-    if len(P.shape) == 2:
-        P = squareform(P)
-    kl_divergence = 2.0 * np.dot(P, np.log(P / Q))
-
-    return kl_divergence
-
-
 def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components,
                       angle=0.5, skip_num_points=0, verbose=False):
     """t-SNE objective function: KL divergence of p_ijs and q_ijs.
@@ -307,7 +251,7 @@ def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components,
     return error, grad
 
 
-def _gradient_descent(objective, p0, it, n_iter, objective_error=None,
-                      n_iter_check=1, n_iter_without_progress=51,
+def _gradient_descent(objective, p0, it, n_iter, n_iter_check=1,
+                      n_iter_without_progress=51,
                       momentum=0.8, learning_rate=200.0, min_gain=0.01,
                       min_grad_norm=1e-7, min_error_diff=1e-7, verbose=0,
@@ -336,10 +280,6 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None,
         Number of iterations before evaluating the global error. If the error
         is sufficiently low, we abort the optimization.
 
-    objective_error : function or callable
-        Should return a tuple of cost and gradient for a given parameter
-        vector.
-
     n_iter_without_progress : int, optional (default: 51)
         Maximum number of iterations without progress before we abort the
        optimization.
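As a reading aid for this cleanup: the update rule that `_gradient_descent` keeps applying is plain gradient descent with momentum plus per-parameter adaptive gains. Below is a minimal NumPy sketch of a single step; the helper name is made up, and the 0.2/0.8 gain constants and default hyper-parameters follow the usual t-SNE recipe rather than being a quote of this file.

    import numpy as np

    def one_gradient_descent_step(p, grad, update, gains, momentum=0.8,
                                  learning_rate=200.0, min_gain=0.01):
        # A gain grows where the gradient points opposite to the last update
        # (a larger step is safe there) and shrinks where they agree.
        inc = update * grad < 0.0
        gains[inc] += 0.2
        gains[~inc] *= 0.8
        np.clip(gains, min_gain, np.inf, out=gains)
        # Momentum step using the per-parameter scaled gradient.
        update = momentum * update - learning_rate * (gains * grad)
        return p + update, update, gains
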
@@ -400,11 +340,7 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None, tic = time() for i in range(it, n_iter): - try: - new_error, grad = objective(p, *args, **kwargs) - except AssertionError: - np.save("dump_X_embedded.npy", p) - raise + new_error, grad = objective(p, *args, **kwargs) grad_norm = linalg.norm(grad) inc = update * grad < 0.0 @@ -420,8 +356,6 @@ def _gradient_descent(objective, p0, it, n_iter, objective_error=None, toc = time() duration = toc - tic tic = toc - if new_error is None: - new_error = objective_error(p, *args) error_diff = np.abs(new_error - error) error = new_error @@ -894,7 +828,6 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, opt_args['n_iter_without_progress'] = EXPLORATION_N_ITER # Don't always calculate the cost since that calculation # can be nearly as expensive as the gradient - opt_args['objective_error'] = _kl_divergence_error opt_args['kwargs']['angle'] = self.angle opt_args['kwargs']['verbose'] = self.verbose else: From bd96c3ae4bc28762aa0b8e9096e25f744337b52c Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Wed, 21 Jun 2017 09:56:47 +0200 Subject: [PATCH 16/55] TST improve parameter testing --- sklearn/manifold/t_sne.py | 75 ++++++++++++++++------------ sklearn/manifold/tests/test_t_sne.py | 56 ++++++++++++++++++++- 2 files changed, 98 insertions(+), 33 deletions(-) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index b61a91b832f6c..d1dea6d57f008 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -625,11 +625,6 @@ def __init__(self, n_components=2, perplexity=30.0, metric="euclidean", init="random", verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=1, neighbors_method='ball_tree'): - if not ((isinstance(init, string_types) and - init in ["pca", "random"]) or - isinstance(init, np.ndarray)): - msg = "'init' must be 'pca', 'random', or a numpy array" - raise ValueError(msg) self.n_components = n_components self.perplexity = perplexity self.early_exaggeration = early_exaggeration @@ -678,6 +673,10 @@ def _fit(self, X, skip_num_points=0): "used with metric=\"precomputed\".") if X.shape[0] != X.shape[1]: raise ValueError("X should be a square distance matrix") + if np.any(X < 0): + raise ValueError("All distances should be positive, the " + "precomputed distances given as X is not " + "correct") if self.method == 'barnes_hut' and sp.issparse(X): raise TypeError('A sparse matrix was passed, but dense ' 'data is required for method="barnes_hut". Use ' @@ -688,6 +687,10 @@ def _fit(self, X, skip_num_points=0): else: X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float32, np.float64]) + if self.method == 'barnes_hut' and self.n_components > 3: + raise ValueError("'n_components' should be inferior to 4 for the " + "barnes_hut algorithm as it relies on " + "quad-tree or oct-tree.") random_state = check_random_state(self.random_state) if self.early_exaggeration < 1.0: @@ -697,7 +700,12 @@ def _fit(self, X, skip_num_points=0): if self.n_iter < 250: raise ValueError("n_iter should be at least 250") + n_samples = X.shape[0] + + neighbors_nn = None if self.method == "exact": + # Retrieve the distance matrix, either using the precomputed one or + # computing it. 
            if self.metric == "precomputed":
                distances = X
            else:
                if self.verbose:
                    print("[t-SNE] Computing pairwise distances...")
 
                if self.metric == "euclidean":
                    distances = pairwise_distances(X, metric=self.metric,
                                                   squared=True)
                else:
                    distances = pairwise_distances(X, metric=self.metric)
 
-            if not np.all(distances >= 0):
-                raise ValueError("All distances should be positive, either "
-                                 "the metric or precomputed distances given "
-                                 "as X are not correct")
+            if np.any(distances < 0):
+                raise ValueError("All distances should be positive, the "
+                                 "metric given is not correct")
 
-        # Degrees of freedom of the Student's t-distribution. The suggestion
-        # degrees_of_freedom = n_components - 1 comes from
-        # "Learning a Parametric Embedding by Preserving Local Structure"
-        # Laurens van der Maaten, 2009.
-        degrees_of_freedom = max(self.n_components - 1.0, 1)
-        n_samples = X.shape[0]
+            # compute the joint probability distribution for the input space
+            P = _joint_probabilities(distances, self.perplexity, self.verbose)
+            assert np.all(np.isfinite(P)), "All probabilities should be finite"
+            assert np.all(P >= 0), "All probabilities should be non-negative"
+            assert np.all(P <= 1), ("All probabilities should be less "
+                                    "than or equal to one")
 
-        neighbors_nn = None
-        if self.method == 'barnes_hut':
+        else:
            # Compute the number of nearest neighbors to find.
            # LvdM uses 3 * perplexity as the number of neighbors.
            # In the event that we have very small # of points
            # set the neighbors to n - 1.
@@ -747,15 +753,17 @@ def _fit(self, X, skip_num_points=0):
            elif isinstance(self.neighbors_method, NeighborsBase):
                knn = self.neighbors_method
            else:
-                ValueError("neighbors_method should be either a string or "
-                           "a subclass of NeighborsBase. {} is not valid."
-                           .format(self.neighbors_method))
+                raise ValueError("'neighbors_method' should be either a "
+                                 "string or a subclass of NeighborsBase. {} "
+                                 "is not valid.".format(self.neighbors_method))
+
            t0 = time()
            knn.fit(X)
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                    n_samples, duration))
+
            t0 = time()
            distances_nn, neighbors_nn = knn.kneighbors(
                None, n_neighbors=k)
@@ -764,20 +772,17 @@ def _fit(self, X, skip_num_points=0):
                print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..."
                      .format(n_samples, duration))
 
-            if self.metric != "precomputed":
-                # knn return the euclidean distance but we need it squared.
-                # TODO: the computation are valid for euclidean distance.
-                # Should we enforce that with an assert?
+            if self.metric == "euclidean":
+                # knn returns the euclidean distance but we need it squared
+                # to be consistent with the 'exact' method. Note that the
+                # method was derived using the euclidean metric in the
+                # input space. Not sure of the implication of using a
+                # different metric.
                distances_nn **= 2
 
+            # compute the joint probability distribution for the input space
            P = _joint_probabilities_nn(distances_nn, neighbors_nn,
                                        self.perplexity, self.verbose)
-        else:
-            P = _joint_probabilities(distances, self.perplexity, self.verbose)
-            assert np.all(np.isfinite(P)), "All probabilities should be finite"
-            assert np.all(P >= 0), "All probabilities should be non-negative"
-            assert np.all(P <= 1), ("All probabilities should be less "
-                                    "or then equal to one")
 
        if isinstance(self.init, np.ndarray):
            X_embedded = self.init
@@ -789,8 +794,14 @@ def _fit(self, X, skip_num_points=0):
            X_embedded = 1e-4 * random_state.randn(
                n_samples, self.n_components).astype(np.float32)
        else:
-            raise ValueError("Unsupported initialization scheme: {}"
-                             .format(self.init))
+            raise ValueError("'init' must be 'pca', 'random', or "
+                             "a numpy array")
+
+        # Degrees of freedom of the Student's t-distribution. The suggestion
+        # degrees_of_freedom = n_components - 1 comes from
+        # "Learning a Parametric Embedding by Preserving Local Structure"
+        # Laurens van der Maaten, 2009.
+        degrees_of_freedom = max(self.n_components - 1.0, 1)
 
        return self._tsne(P, degrees_of_freedom, n_samples, random_state,
                          X_embedded=X_embedded,

diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index 6e5fa9800b385..9888ea332b4f5 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -321,10 +321,42 @@ def test_non_square_precomputed_distances():
                         tsne.fit_transform, np.array([[0.0], [1.0]]))
 
 
+def test_non_positive_precomputed_distances():
+    # Precomputed distance matrices must be positive.
+    bad_dist = np.array([[0., -1.], [1., 0.]])
+    for method in ['barnes_hut', 'exact']:
+        tsne = TSNE(metric="precomputed", method=method)
+        assert_raises_regexp(ValueError, "All distances .*precomputed.*",
+                             tsne.fit_transform, bad_dist)
+
+
+def test_non_positive_computed_distances():
+    # Computed distance matrices must be positive.
+    def metric(x, y):
+        return -1
+
+    tsne = TSNE(metric=metric, method='exact')
+    X = np.array([[0.0, 0.0], [1.0, 1.0]])
+    assert_raises_regexp(ValueError, "All distances .*metric given.*",
+                         tsne.fit_transform, X)
+
+
+def test_not_available_neighbors_method():
+    # 'neighbors_method' must be a known string or a NeighborsBase instance.
+    tsne = TSNE(neighbors_method='not available', method='barnes_hut')
+    assert_raises_regexp(ValueError, "unrecognized algorithm: 'not available'",
+                         tsne.fit_transform, np.array([[0.0, 1.0]]))
+    tsne = TSNE(neighbors_method=1, method='barnes_hut')
+    assert_raises_regexp(ValueError, "'neighbors_method' should be .*",
+                         tsne.fit_transform, np.array([[0.0, 1.0]]))
+
+
 def test_init_not_available():
     # 'init' must be 'pca', 'random', or numpy array.
+    tsne = TSNE(init="not available")
     m = "'init' must be 'pca', 'random', or a numpy array"
-    assert_raises_regexp(ValueError, m, TSNE, init="not available")
+    assert_raises_regexp(ValueError, m, tsne.fit_transform,
+                         np.array([[0.0], [1.0]]))
 
 
 def test_init_ndarray():
@@ -353,6 +385,21 @@ def test_distance_not_available():
                         tsne.fit_transform, np.array([[0.0], [1.0]]))
 
 
+def test_method_not_available():
+    # 'method' must be 'barnes_hut' or 'exact'
+    tsne = TSNE(method='not available')
+    assert_raises_regexp(ValueError, "'method' must be 'barnes_hut' or ",
+                         tsne.fit_transform, np.array([[0.0], [1.0]]))
+
+
+def test_angle_out_of_range_checks():
+    # check the angle parameter range
+    for angle in [-1, -1e-6, 1 + 1e-6, 2]:
+        tsne = TSNE(angle=angle)
+        assert_raises_regexp(ValueError, "'angle' must be between 0.0 - 1.0",
+                             tsne.fit_transform, np.array([[0.0], [1.0]]))
+
+
 def test_pca_initialization_not_compatible_with_precomputed_kernel():
     # Precomputed distance matrices must be square matrices.
     tsne = TSNE(metric="precomputed", init="pca")
@@ -361,6 +408,13 @@ def test_pca_initialization_not_compatible_with_precomputed_kernel():
                         tsne.fit_transform, np.array([[0.0], [1.0]]))
 
 
+def test_n_components_range():
+    # barnes_hut method should only be used with n_components <= 3
+    tsne = TSNE(n_components=4, method="barnes_hut")
+    assert_raises_regexp(ValueError, "'n_components' should be .*",
+                         tsne.fit_transform, np.array([[0.0], [1.0]]))
+
+
 def test_answer_gradient_two_points():
     # Test the tree with only a single set of children.
     #

From acd266640fe1e845a89b474aaf1c537cdf1d9542 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Thu, 22 Jun 2017 10:33:27 +0200
Subject: [PATCH 17/55] CLN improve code readability

---
 sklearn/manifold/t_sne.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index d1dea6d57f008..b2e4104f8ce18 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -75,11 +75,11 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose):
 
     Parameters
     ----------
-    distances : array, shape (n_samples, K)
-        Distances of samples to its K nearest neighbors.
+    distances : array, shape (n_samples, k)
+        Distances of each sample to its k nearest neighbors.
 
-    neighbors : array, shape (n_samples, K)
-        K nearest-neighbors for each samples.
+    neighbors : array, shape (n_samples, k)
+        Indices of the k nearest-neighbors for each sample.
 
     desired_perplexity : float
        Desired perplexity of the joint probability distributions.
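The hunks below also touch the body of this function, where the sparse joint distribution is assembled from the kNN conditionals. For reference, here is that construction in isolation; a sketch under assumptions: `conditional_P` and `neighbors` both have shape (n_samples, k), and the helper name is made up.

    import numpy as np
    from scipy.sparse import csr_matrix

    def symmetrized_joint_probabilities(conditional_P, neighbors):
        # Row i stores p_{j|i} for the k nearest neighbors j of sample i.
        n_samples, k = neighbors.shape
        P = csr_matrix((conditional_P.ravel(), neighbors.ravel(),
                        range(0, n_samples * k + 1, k)),
                       shape=(n_samples, n_samples))
        # Symmetrize, then normalize so the joint distribution sums to one.
        P = P + P.T
        P /= np.maximum(P.sum(), np.finfo(np.double).eps)
        return P
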
@@ -95,7 +95,7 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose):
     t0 = time()
     # Compute conditional probabilities such that they approximately match
     # the desired perplexity
-    n_samples, K = neighbors.shape
+    n_samples, k = neighbors.shape
     distances = distances.astype(np.float32, copy=False)
     neighbors = neighbors.astype(np.int64, copy=False)
     conditional_P = _utils._binary_search_perplexity(
@@ -103,13 +103,16 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose):
     assert np.all(np.isfinite(conditional_P)), \
         "All probabilities should be finite"
 
+    # Symmetrize the joint probability distribution using sparse operations
     P = csr_matrix((conditional_P.ravel(), neighbors.ravel(),
-                    range(0, n_samples * K + 1, K)),
+                    range(0, n_samples * k + 1, k)),
                    shape=(n_samples, n_samples))
     P = P + P.T
+
+    # Normalize the joint probability distribution
     sum_P = np.maximum(P.sum(), MACHINE_EPSILON)
     P /= sum_P
+
     assert np.all(np.abs(P.data) <= 1.0)
 
     if verbose >= 2:
         duration = time() - t0
@@ -196,7 +199,8 @@ def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components,
         Unraveled embedding.
 
     P : csr sparse matrix, shape (n_samples, n_sample)
-        Condensed joint probability matrix.
+        Sparse approximate joint probability matrix, computed only for the
+        k nearest-neighbors and symmetrized.
 
     degrees_of_freedom : float
         Degrees of freedom of the Student's-t distribution.

From ffcb6f4544c1a72a891d0719d700070c065c2b6c Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Thu, 22 Jun 2017 11:08:38 +0200
Subject: [PATCH 18/55] CLN more comments in quad_tree

---
 sklearn/neighbors/quad_tree.pxd |   2 +-
 sklearn/neighbors/quad_tree.pyx | 287 ++++++++++++++++++--------------
 2 files changed, 167 insertions(+), 122 deletions(-)

diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd
index 94297f14cb975..84595cce18d48 100644
--- a/sklearn/neighbors/quad_tree.pxd
+++ b/sklearn/neighbors/quad_tree.pxd
@@ -79,7 +79,7 @@ cdef class QuadTree:
     cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell,
                                           SIZE_t point_index, SIZE_t size=*) nogil
-    cdef void init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
+    cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
     cdef bint is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil
     cdef SIZE_t select_child(self, DTYPE_t[3] point, Cell* cell) nogil
     cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds) nogil

diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx
index 7b204b3586876..6164005735a7d 100644
--- a/sklearn/neighbors/quad_tree.pyx
+++ b/sklearn/neighbors/quad_tree.pyx
@@ -28,6 +28,8 @@ cdef extern from "numpy/arrayobject.h":
                                 void* data, int flags, object obj)
 
 
+# XXX using (size_t)(-1) is ugly, but SIZE_MAX is not available in C89
+# (i.e., older MSVC).
cdef SIZE_t DEFAULT = (-1) @@ -36,7 +38,7 @@ CELL_DTYPE = np.dtype({ 'names': ['parent', 'children', 'cell_id', 'point_index', 'is_leaf', 'max_width', 'depth', 'cumulative_size', 'center', 'barycenter', 'min_bounds', 'max_bounds'], - 'formats': [np.intp, (np.intp, 8), np.intp, np.intp, np.int32, np.float32, + 'formats': [np.intp, (np.intp, 8), np.intp, np.intp, np.int32, np.float32, np.intp, np.intp, (np.float32, 3), (np.float32, 3), (np.float32, 3), (np.float32, 3)], 'offsets': [ @@ -88,64 +90,31 @@ cdef class QuadTree: def __get__(self): return self._get_cell_ndarray()['is_leaf'][:self.cell_count] - cdef int _resize(self, SIZE_t capacity) nogil except -1: - """Resize all inner arrays to `capacity`, if `capacity` == -1, then - double the size of the inner arrays. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - if self._resize_c(capacity) != 0: - # Acquire gil only if we need to raise - with gil: - raise MemoryError() - - # XXX using (size_t)(-1) is ugly, but SIZE_MAX is not available in C89 - # (i.e., older MSVC). - cdef int _resize_c(self, SIZE_t capacity=DEFAULT) nogil except -1: - """Guts of _resize - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - if capacity == self.capacity and self.cells != NULL: - return 0 + def build_tree(self, X): + """Build a tree from an arary of points X.""" + cdef DTYPE_t[3] pt + cdef DTYPE_t[3] min_bounds, max_bounds - if capacity == DEFAULT: - if self.capacity == 0: - capacity = 9 # default initial value to min - else: - capacity = 2 * self.capacity + # validate X and prepare for query + # X = check_array(X, dtype=DTYPE_t, order='C') + n_samples = X.shape[0] - safe_realloc(&self.cells, capacity) + capacity = 100 + self._resize(capacity) + m, M = np.min(X, axis=0) - 1e-3, np.max(X, axis=0) + 1e-3 + for i in range(self.n_dimensions): + min_bounds[i] = m[i] + max_bounds[i] = M[i] - # if capacity smaller than cell_count, adjust the counter - if capacity < self.cell_count: - self.cell_count = capacity + # Create the initial node with boundaries from the dataset + self._init_root(min_bounds, max_bounds) - self.capacity = capacity - return 0 + for i in range(n_samples): + for j in range(self.n_dimensions): + pt[j] = X[i, j] + self.insert_point(pt, i) - cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell - ) nogil except -1: - if self.verbose >= 10: - printf("[QuadTree] Checking point (%f, %f, %f) in cell %li " - "([%f/%f, %f/%f, %f/%f], size %li)\n", - point[0], point[1], point[2], cell.cell_id, - cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], - cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], - cell.cumulative_size) - - for i in range(self.n_dimensions): - if (cell.min_bounds[i] > point[i] or - cell.max_bounds[i] <= point[i]): - with gil: - msg = "[QuadTree] InsertionError: point out of cell boundary.\n" - msg += "Axis %li: cell [%f, %f]; point %f\n" - - msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i] - raise ValueError(msg) - + self._resize(capacity=self.cell_count) cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index, SIZE_t cell_id=0) nogil except -1: @@ -181,7 +150,7 @@ cdef class QuadTree: for i in range(self.n_dimensions): # barycenter update using a weighted mean cell.barycenter[i] = (n_point * cell.barycenter[i] + point[i]) / (n_point + 1) - + # Increase the size of the subtree starting from this cell cell.cumulative_size += 1 @@ -212,17 +181,19 @@ cdef class QuadTree: # XXX: This operation 
is not Thread safe cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, SIZE_t point_index, SIZE_t size=1) nogil: + """Create a child of cell which will contain point.""" # Local variable definition - cdef SIZE_t cell_id, cell_child_id, parent_id - cdef DTYPE_t[3] save_point - cdef DTYPE_t width - cdef Cell* child - cdef int i + cdef: + SIZE_t cell_id, cell_child_id, parent_id + DTYPE_t[3] save_point + DTYPE_t width + Cell* child + int i # If the maximal capacity of the Tree have been reach, double the capacity # We need to save the current cell id and the current point to retrieve them - # in case the reallocation + # in case the reallocation if self.cell_count + 1 > self.capacity: parent_id = cell.cell_id for i in range(self.n_dimensions): @@ -235,15 +206,16 @@ cdef class QuadTree: cell_id = self.cell_count self.cell_count += 1 child = &self.cells[cell_id] - - self.init_cell(child, cell.cell_id, cell.depth + 1) + + self._init_cell(child, cell.cell_id, cell.depth + 1) child.cell_id = cell_id # Set the cell as an inner cell of the Tree cell.is_leaf = False cell.point_index = -1 - # Set the correct boundary for the cell and store the point in it + # Set the correct boundary for the cell, store the point in the cell + # and compute its index in the children array. cell_child_id = 0 for i in range(self.n_dimensions): cell_child_id *= 2 @@ -259,14 +231,14 @@ cdef class QuadTree: child.barycenter[i] = point[i] child.max_width = max(child.max_width, width*width) - - # TODO: max_width + + # Store the point info and the size to account for duplicated points child.point_index = point_index child.cumulative_size = size # Store the child cell in the correct place in children cell.children[cell_child_id] = child.cell_id - + if DEBUGFLAG: # Assert that the point is in the right range self.check_point_in_cell(point, child) @@ -274,18 +246,10 @@ cdef class QuadTree: printf("[QuadTree] inserted point %li in new child %li\n", point_index, cell_id) return cell_id - - cdef void init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil: - cell.parent = parent - cell.is_leaf = True - cell.depth = depth - cell.max_width = 0 - cell.cumulative_size = 0 - for i in range(self.n_cells_per_cell): - cell.children[i] = DEFAULT cdef bint is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil: + """Check if the two given points are equals.""" cdef int i cdef bint res = True for i in range(self.n_dimensions): @@ -294,8 +258,11 @@ cdef class QuadTree: cdef SIZE_t select_child(self, DTYPE_t[3] point, Cell* cell) nogil: - cdef int i - cdef SIZE_t selected_child = 0 + """Select the child of cell which contains the given query point.""" + cdef: + int i + SIZE_t selected_child = 0 + for i in range(self.n_dimensions): # Select the correct child cell to insert the point by comparing # it to the borders of the cells using precomputed center. 
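As an aside, the child indexing used by `select_child` and `insert_point_in_new_child` above packs one comparison per dimension into an integer, so a cell has 2**n_dimensions children (4 for a quadtree, 8 for an octree). A minimal pure-Python sketch of the same computation (illustrative only, not part of the patch):

    def child_index(point, center):
        # Mirrors the Cython loop: index *= 2, then +1 when the
        # coordinate is on the high side of the cell center.
        index = 0
        for coord, mid in zip(point, center):
            index = 2 * index + (coord >= mid)
        return index

    assert child_index([0.1, 0.9], [0.5, 0.5]) == 1            # low x, high y
    assert child_index([0.9, 0.9, 0.9], [0.5, 0.5, 0.5]) == 7  # last octant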
@@ -304,11 +271,25 @@ cdef class QuadTree: selected_child += 1 return cell.children[selected_child] - cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds) nogil: - cdef int i - cdef DTYPE_t width - cdef Cell* root = &self.cells[0] - self.init_cell(root, -1, 0) + cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil: + """Initialize a cell structure with some constants.""" + cell.parent = parent + cell.is_leaf = True + cell.depth = depth + cell.max_width = 0 + cell.cumulative_size = 0 + for i in range(self.n_cells_per_cell): + cell.children[i] = DEFAULT + + cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds + ) nogil: + """Initialize the root node with the given space boundaries""" + cdef: + int i + DTYPE_t width + Cell* root = &self.cells[0] + + self._init_cell(root, -1, 0) for i in range(self.n_dimensions): root.min_bounds[i] = min_bounds[i] root.max_bounds[i] = max_bounds[i] @@ -316,34 +297,8 @@ cdef class QuadTree: width = max_bounds[i] - min_bounds[i] root.max_width = max(root.max_width, width*width) root.cell_id = 0 - - self.cell_count += 1 - def build_tree(self, X): - """Build a tree from the points in X.""" - cdef DTYPE_t[3] pt - cdef DTYPE_t[3] min_bounds, max_bounds - - # validate X and prepare for query - # X = check_array(X, dtype=DTYPE_t, order='C') - n_samples = X.shape[0] - - capacity = 100 - self._resize(capacity) - m, M = np.min(X, axis=0) - 1e-3, np.max(X, axis=0) + 1e-3 - for i in range(self.n_dimensions): - min_bounds[i] = m[i] - max_bounds[i] = M[i] - - # Create the initial node with boundaries from the dataset - self._init_root(min_bounds, max_bounds) - - for i in range(n_samples): - for j in range(self.n_dimensions): - pt[j] = X[i, j] - self.insert_point(pt, i) - - self._resize(capacity=self.cell_count) + self.cell_count += 1 def plot_tree(self): """Plot the tree with cell boundaries and the points inserted in it.""" @@ -359,17 +314,50 @@ cdef class QuadTree: else: # If the cell is a leaf, display the point contained in it. plt.scatter(c.barycenter[0], c.barycenter[1], c='b', marker='.') - + # Print bounding box of the Tree root = self.cells[0] plt.vlines([root.min_bounds[0], root.max_bounds[0]], root.min_bounds[1], root.max_bounds[1]) plt.hlines([root.min_bounds[1], root.max_bounds[1]], root.min_bounds[0], root.max_bounds[0]) plt.show() + cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell + ) nogil except -1: + """Check that the given point is in the cell boundaries.""" + + if self.verbose >= 10: + printf("[QuadTree] Checking point (%f, %f, %f) in cell %li " + "([%f/%f, %f/%f, %f/%f], size %li)\n", + point[0], point[1], point[2], cell.cell_id, + cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], + cell.cumulative_size) + + for i in range(self.n_dimensions): + if (cell.min_bounds[i] > point[i] or + cell.max_bounds[i] <= point[i]): + with gil: + msg = "[QuadTree] InsertionError: point out of cell boundary.\n" + msg += "Axis %li: cell [%f, %f]; point %f\n" + + msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i] + raise ValueError(msg) + def check_coherence(self): + """Check the coherence of the cells of the tree. + + Check that the info stored in each cell are compatible with the info + stored in descendent and sibling cells. Raise a RuntimeError if this + fails. 
+ """ for cell in self.cells[:self.cell_count]: + # Check that the barycenter of inserted point is within the cell + # boundaries self.check_point_in_cell(cell.barycenter, &cell) + if not cell.is_leaf: + # Compute the number of point in children and compare with + # its cummulative_size. n_points = 0 for idx in range(self.n_cells_per_cell): child_id = cell.children[idx] @@ -384,16 +372,19 @@ cdef class QuadTree: "in children. ({})" .format(cell.cell_id, cell.cumulative_size, n_points, cell.children)) + + # Make sure that the number of point in the tree correspond to the + # cummulative size in root cell. if self.n_points != self.cells[0].cumulative_size: raise RuntimeError( "QuadTree is incoherent. Size={} but found {} points " "in children." .format(self.n_points, self.cells[0].cumulative_size)) - + cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results, SIZE_t cell_id=0, long idx=0, float squared_theta=.5) nogil: """Summarize the tree compared to a query point. - + Input arguments --------------- point: array (n_dimensions) @@ -410,11 +401,11 @@ cdef class QuadTree: result: array (n_samples * (n_dimensions+2)) result will contain a summary of the tree information compared to the query point: - - results[idx:idx+n_dimensions] contains the delta between a summary - node idx and the query point. + - results[idx:idx+n_dimensions] contains the delta between the summary + cell idx and the query point. - result[idx+n_dimensions+1] contains the squared euclidean distance - to the summary node idx. - - result[idx+n_dimensions+2] contains the size of the summary node idx. + to the summary cell idx. + - result[idx+n_dimensions+2] contains the size of the summary cell idx. Return ------ @@ -440,7 +431,7 @@ cdef class QuadTree: # Check whether we can use this node as a summary # It's a summary node if the angular size as measured from the point # is relatively small (w.r.t. to theta) or if it is a leaf node. - # If it can be summarized, we use the cell center of mass + # If it can be summarized, we use the cell center of mass # Otherwise, we go a higher level of resolution and into the leaves. if cell.is_leaf or ((cell.max_width / results[idx_d]) < squared_theta): results[idx_d + 1] = cell.cumulative_size @@ -454,17 +445,29 @@ cdef class QuadTree: idx = self.summarize(point, results, child_id, idx) return idx - + def get_cell(self, point): + """return the id of the cell containing the query point or raise + ValueError if the point is not in the tree + """ cdef DTYPE_t[3] query_pt cdef int i + assert len(point) == self.n_dimensions, ( + "Query point should be a point in dimension {}." + .format(self.n_dimensions)) + for i in range(self.n_dimensions): query_pt[i] = point[i] return self._get_cell(query_pt, 0) - cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=0) nogil except -1: + cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=0 + ) nogil except -1: + """guts of get_cell. 
+ + Return the id of the cell containing the query point or raise ValueError + if the point is not in the tree""" cdef: SIZE_t selected_child Cell* cell = &self.cells[cell_id] @@ -485,6 +488,8 @@ cdef class QuadTree: with gil: raise ValueError("Query point not in the Tree.") + # Pickling primitives + def __reduce__(self): """Reduce re-implementation, for pickling.""" return (QuadTree, (self.n_dimensions, self.verbose), @@ -526,6 +531,10 @@ cdef class QuadTree: cells = memcpy(self.cells, ( cell_ndarray).data, self.capacity * sizeof(Cell)) + + # Array manipulation methods, to convert it to numpy or to resize + # self.cells array + cdef np.ndarray _get_cell_ndarray(self): """Wraps nodes as a NumPy struct array. @@ -544,4 +553,40 @@ cdef class QuadTree: np.NPY_DEFAULT, None) Py_INCREF(self) arr.base = self - return arr \ No newline at end of file + return arr + + cdef int _resize(self, SIZE_t capacity) nogil except -1: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + cdef int _resize_c(self, SIZE_t capacity=DEFAULT) nogil except -1: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.cells != NULL: + return 0 + + if capacity == DEFAULT: + if self.capacity == 0: + capacity = 9 # default initial value to min + else: + capacity = 2 * self.capacity + + safe_realloc(&self.cells, capacity) + + # if capacity smaller than cell_count, adjust the counter + if capacity < self.cell_count: + self.cell_count = capacity + + self.capacity = capacity + return 0 From bebf2d2c1e2be907c3931dee352d52d9ad48fd09 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Thu, 22 Jun 2017 11:22:57 +0200 Subject: [PATCH 19/55] CLN remove malloc when possible + trailing white-space --- sklearn/manifold/_barnes_hut_tsne.pyx | 67 ++++++++++++--------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index 728bb78461fee..c605a69b7b421 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -23,7 +23,7 @@ cdef extern from "math.h": float fabsf(float x) nogil # Round points differing by less than this amount -# effectively ignoring differences near the 32bit +# effectively ignoring differences near the 32bit # floating point precision cdef float EPSILON = 1e-6 @@ -52,18 +52,20 @@ cdef float compute_gradient(float[:] val_P, long stop) nogil: # Having created the tree, calculate the gradient # in two components, the positive and negative forces - cdef long i, coord - cdef int ax - cdef long n_samples = pos_reference.shape[0] - cdef int n_dimensions = qt.n_dimensions + cdef: + long i, coord + int ax + long n_samples = pos_reference.shape[0] + int n_dimensions = qt.n_dimensions + float[1] sum_Q + clock_t t1, t2 + float sQ, error + if qt.verbose > 11: printf("[t-SNE] Allocating %li elements in force arrays\n", n_samples * n_dimensions * 2) - cdef float* sum_Q = malloc(sizeof(float)) cdef float* neg_f = malloc(sizeof(float) * n_samples * n_dimensions) cdef float* pos_f = malloc(sizeof(float) * n_samples * n_dimensions) - cdef clock_t t1, t2 - cdef float sQ, error sum_Q[0] = 0.0 t1 = clock() @@ -84,7 +86,7 @@ cdef float 
compute_gradient(float[:] val_P, for ax in range(n_dimensions): coord = i * n_dimensions + ax tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sum_Q[0]) - free(sum_Q) + free(neg_f) free(pos_f) return error @@ -112,15 +114,16 @@ cdef float compute_gradient_positive(float[:] val_P, float dij, qij, pij float C = 0.0 float exponent = (dof + 1.0) / -2.0 - cdef clock_t t1, t2 - cdef float* buff = malloc(sizeof(float) * n_dimensions) + float[3] buff + clock_t t1, t2 + t1 = clock() for i in range(start, n_samples): for ax in range(n_dimensions): pos_f[i * n_dimensions + ax] = 0.0 for k in range(indptr[i], indptr[i+1]): j = neighbors[k] - # we don't need to exclude the i==j case since we've + # we don't need to exclude the i==j case since we've # already thrown it out from the list of neighbors dij = 0.0 pij = val_P[k] @@ -138,7 +141,6 @@ cdef float compute_gradient_positive(float[:] val_P, if verbose > 10: printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt) - free(buff) return C @@ -147,34 +149,26 @@ cdef void compute_gradient_negative(float[:,:] pos_reference, quad_tree.QuadTree qt, float* sum_Q, float dof, - float theta, - long start, + float theta, + long start, long stop) nogil: if stop == -1: - stop = pos_reference.shape[0] + stop = pos_reference.shape[0] cdef: int ax - long i, j - long n = stop - start - float* force - float* iQ - float* pos - float size, dist2s - long* l int n_dimensions = qt.n_dimensions - float qijZ, mult - long idx, + long i, j, idx + long n = stop - start long dta = 0 long dtb = 0 - clock_t t1, t2, t3 - float* neg_force long offset = n_dimensions + 2 + long* l + float size, dist2s, qijZ, mult + float[1] iQ + float[3] force, neg_force, pos + clock_t t1, t2, t3 - iQ = malloc(sizeof(float)) - force = malloc(sizeof(float) * n_dimensions) - pos = malloc(sizeof(float) * n_dimensions) summary = malloc(sizeof(float) * n * offset) - neg_force= malloc(sizeof(float) * n_dimensions) for i in range(start, stop): # Clear the arrays @@ -190,11 +184,11 @@ cdef void compute_gradient_negative(float[:,:] pos_reference, t2 = clock() # Compute the t-SNE negative force # for the digits dataset, walking the tree - # is about 10-15x more expensive than the + # is about 10-15x more expensive than the # following for loop exponent = (dof + 1.0) / -2.0 for j in range(idx // offset): - + dist2s = summary[j * offset + n_dimensions] size = summary[j * offset + n_dimensions + 1] qijZ = ((1.0 + dist2s) / dof) ** exponent # 1/(1+dist) @@ -210,18 +204,15 @@ cdef void compute_gradient_negative(float[:,:] pos_reference, if qt.verbose > 20: printf("[t-SNE] Tree: %li clock ticks | ", dta) printf("Force computation: %li clock ticks\n", dtb) - free(iQ) - free(force) - free(pos) + free(summary) - free(neg_force) def calculate_edge(pos_output): # Make the boundaries slightly outside of the data # to avoid floating point error near the edge left_edge = np.min(pos_output, axis=0) - right_edge = np.max(pos_output, axis=0) + right_edge = np.max(pos_output, axis=0) center = (right_edge + left_edge) * 0.5 width = np.maximum(np.subtract(right_edge, left_edge), EPSILON) # Exagerate width to avoid boundary edge From 0efe4e547e97b8ed26e61015aac07b187927b301 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Mon, 26 Jun 2017 13:37:11 +0200 Subject: [PATCH 20/55] CLN fix pep8+add comments+rename methods quad_tree --- sklearn/manifold/_barnes_hut_tsne.pyx | 6 +- sklearn/manifold/tests/test_t_sne.py | 2 +- sklearn/neighbors/quad_tree.pxd | 5 +- sklearn/neighbors/quad_tree.pyx | 72 ++++++++++++++--------- 
sklearn/neighbors/tests/test_quad_tree.py | 10 ++-- 5 files changed, 55 insertions(+), 40 deletions(-) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index c605a69b7b421..d5fd7e40e7b05 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -45,7 +45,7 @@ cdef float compute_gradient(float[:] val_P, np.int64_t[:] neighbors, np.int64_t[:] indptr, float[:,:] tot_force, - quad_tree.QuadTree qt, + quad_tree._QuadTree qt, float theta, float dof, long start, @@ -146,7 +146,7 @@ cdef float compute_gradient_positive(float[:] val_P, cdef void compute_gradient_negative(float[:,:] pos_reference, float* neg_f, - quad_tree.QuadTree qt, + quad_tree._QuadTree qt, float* sum_Q, float dof, float theta, @@ -249,7 +249,7 @@ def gradient(float[:] val_P, assert n == indptr.shape[0] - 1, m if verbose > 10: printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions) - cdef quad_tree.QuadTree qt = quad_tree.QuadTree(pos_output.shape[1], 0) + cdef quad_tree._QuadTree qt = quad_tree._QuadTree(pos_output.shape[1], 0) if verbose > 10: printf("[t-SNE] Inserting %li points\n", pos_output.shape[0]) qt.build_tree(pos_output) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 9888ea332b4f5..bea2c11c0878d 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -288,7 +288,7 @@ def test_fit_csr_matrix(): def test_preserve_trustworthiness_approximately_with_precomputed_distances(): # Nearest neighbors should be preserved approximately. random_state = check_random_state(0) - for i in range(5): + for i in range(3): X = random_state.randn(100, 2) D = squareform(pdist(X), "sqeuclidean") tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd index 84595cce18d48..b30ba6fd4ee7a 100644 --- a/sklearn/neighbors/quad_tree.pxd +++ b/sklearn/neighbors/quad_tree.pxd @@ -51,7 +51,7 @@ cdef struct Cell: DTYPE_t[3] max_bounds # Superior boundaries of this cell (exclusive) -cdef class QuadTree: +cdef class _QuadTree: # The QuadTree object is a quad tree structure constructed by inserting # recursively points in the tree and splitting cells in 4 so that each # leaf cell contains at most one point. @@ -74,9 +74,6 @@ cdef class QuadTree: cdef int _resize(self, SIZE_t capacity) nogil except -1 cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1 - # cdef np.ndarray _get_value_ndarray(self) - # cdef np.ndarray _get_node_ndarray(self) - cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, SIZE_t point_index, SIZE_t size=*) nogil cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx index 6164005735a7d..06af274236dc6 100644 --- a/sklearn/neighbors/quad_tree.pyx +++ b/sklearn/neighbors/quad_tree.pyx @@ -60,8 +60,17 @@ CELL_DTYPE = np.dtype({ assert CELL_DTYPE.itemsize == sizeof(Cell) -cdef class QuadTree: +cdef class _QuadTree: """Array-based representation of a QuadTree. + + This class is currently working for indexing 2D data (regular QuadTree) and + for indexing 3D data (OcTree). It is planned to split the 2 implementation + using `Cython.Tempita` to save some memory for QuadTree. + + Note that this code is currently internally used only by the Barnes-Hut + method in `sklearn.manifold.TSNE`. 
It is planned to be refactored and + generalized in the future to be compatible with nearest neighbors API of + `sklearn.neighbors` with 2D and 3D data. """ def __cinit__(self, int n_dimensions, int verbose): """Constructor.""" @@ -119,7 +128,7 @@ cdef class QuadTree: cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index, SIZE_t cell_id=0) nogil except -1: """Insert a point in the QuadTree.""" - cdef int i + cdef int ax cdef DTYPE_t n_frac cdef SIZE_t selected_child cdef Cell* cell = &self.cells[cell_id] @@ -132,7 +141,6 @@ cdef class QuadTree: if DEBUGFLAG: self.check_point_in_cell(point, cell) - # If the cell is an empty leaf, insert the point in it if cell.cumulative_size == 0: cell.cumulative_size = 1 @@ -147,9 +155,10 @@ cdef class QuadTree: # If the cell is not a leaf, update cell internals and # recurse in selected child if not cell.is_leaf: - for i in range(self.n_dimensions): + for ax in range(self.n_dimensions): # barycenter update using a weighted mean - cell.barycenter[i] = (n_point * cell.barycenter[i] + point[i]) / (n_point + 1) + cell.barycenter[ax] = ( + n_point * cell.barycenter[ax] + point[ax]) / (n_point + 1) # Increase the size of the subtree starting from this cell cell.cumulative_size += 1 @@ -175,12 +184,14 @@ cdef class QuadTree: # In a leaf, the barycenter correspond to the only point included # in it. - self.insert_point_in_new_child(cell.barycenter, cell, cell.point_index, cell.cumulative_size) + self.insert_point_in_new_child(cell.barycenter, cell, cell.point_index, + cell.cumulative_size) return self.insert_point(point, point_index, cell_id) # XXX: This operation is not Thread safe cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, - SIZE_t point_index, SIZE_t size=1) nogil: + SIZE_t point_index, SIZE_t size=1 + ) nogil: """Create a child of cell which will contain point.""" # Local variable definition @@ -243,7 +254,8 @@ cdef class QuadTree: # Assert that the point is in the right range self.check_point_in_cell(point, child) if self.verbose >= 10: - printf("[QuadTree] inserted point %li in new child %li\n", point_index, cell_id) + printf("[QuadTree] inserted point %li in new child %li\n", + point_index, cell_id) return cell_id @@ -302,7 +314,7 @@ cdef class QuadTree: def plot_tree(self): """Plot the tree with cell boundaries and the points inserted in it.""" - self.check_coherence() + self._check_coherence() import matplotlib.pyplot as plt plt.figure() @@ -317,8 +329,10 @@ cdef class QuadTree: # Print bounding box of the Tree root = self.cells[0] - plt.vlines([root.min_bounds[0], root.max_bounds[0]], root.min_bounds[1], root.max_bounds[1]) - plt.hlines([root.min_bounds[1], root.max_bounds[1]], root.min_bounds[0], root.max_bounds[0]) + plt.vlines([root.min_bounds[0], root.max_bounds[0]], root.min_bounds[1], + root.max_bounds[1]) + plt.hlines([root.min_bounds[1], root.max_bounds[1]], root.min_bounds[0], + root.max_bounds[0]) plt.show() cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell @@ -337,17 +351,17 @@ cdef class QuadTree: if (cell.min_bounds[i] > point[i] or cell.max_bounds[i] <= point[i]): with gil: - msg = "[QuadTree] InsertionError: point out of cell boundary.\n" - msg += "Axis %li: cell [%f, %f]; point %f\n" + msg = "[QuadTree] InsertionError: point out of cell " + msg += "boundary.\nAxis %li: cell [%f, %f]; point %f\n" msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i] raise ValueError(msg) - def check_coherence(self): + def _check_coherence(self): """Check the coherence of the cells of the tree. 
-        Check that the info stored in each cell are compatible with the info
-        stored in descendent and sibling cells. Raise a RuntimeError if this
+        Check that the info stored in each cell is compatible with the info
+        stored in descendant and sibling cells. Raise a ValueError if this
         fails.
         """
         for cell in self.cells[:self.cell_count]:
@@ -367,7 +381,7 @@ def check_coherence(self):
                 assert child.cell_id == child_id, (
                     "Cell id not correctly initiliazed.")
             if n_points != cell.cumulative_size:
-                raise RuntimeError(
+                raise ValueError(
                     "Cell {} is incoherent. Size={} but found {} points "
                     "in children. ({})"
                     .format(cell.cell_id, cell.cumulative_size,
@@ -376,13 +390,14 @@ def check_coherence(self):
         # Make sure that the number of point in the tree correspond to the
         # cummulative size in root cell.
         if self.n_points != self.cells[0].cumulative_size:
-            raise RuntimeError(
+            raise ValueError(
                 "QuadTree is incoherent. Size={} but found {} points "
                 "in children."
                 .format(self.n_points, self.cells[0].cumulative_size))

-    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results, SIZE_t cell_id=0,
-                        long idx=0, float squared_theta=.5) nogil:
+    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results,
+                        SIZE_t cell_id=0, long idx=0, float squared_theta=.5
+                        ) nogil:
         """Summarize the tree compared to a query point.

         Input arguments
@@ -401,11 +416,13 @@ cdef class QuadTree:
         result: array (n_samples * (n_dimensions+2))
             result will contain a summary of the tree information compared to
             the query point:
-            - results[idx:idx+n_dimensions] contains the delta between the summary
-              cell idx and the query point.
-            - result[idx+n_dimensions+1] contains the squared euclidean distance
-              to the summary cell idx.
-            - result[idx+n_dimensions+2] contains the size of the summary cell idx.
+            - results[idx:idx+n_dimensions] contains the coordinate-wise
+              difference between the query point and the summary cell idx.
+              This is useful in t-SNE to compute the negative forces.
+            - result[idx+n_dimensions+1] contains the squared euclidean
+              distance to the summary cell idx.
+            - result[idx+n_dimensions+2] contains the number of points of the
+              tree contained in the summary cell idx.
Return ------ @@ -475,7 +492,8 @@ cdef class QuadTree: if cell.is_leaf: if self.is_duplicate(cell.barycenter, point): if self.verbose > 99: - printf("[QuadTree] Found point in cell: %li\n", cell.cell_id) + printf("[QuadTree] Found point in cell: %li\n", + cell.cell_id) return cell_id with gil: raise ValueError("Query point not in the Tree.") @@ -492,7 +510,7 @@ cdef class QuadTree: def __reduce__(self): """Reduce re-implementation, for pickling.""" - return (QuadTree, (self.n_dimensions, self.verbose), + return (_QuadTree, (self.n_dimensions, self.verbose), self.__getstate__()) def __getstate__(self): diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py index f0ce77d2700ab..43a52ec74f988 100644 --- a/sklearn/neighbors/tests/test_quad_tree.py +++ b/sklearn/neighbors/tests/test_quad_tree.py @@ -1,6 +1,6 @@ import pickle import numpy as np -from sklearn.neighbors.quad_tree import QuadTree +from sklearn.neighbors.quad_tree import _QuadTree from sklearn.utils import check_random_state @@ -34,9 +34,9 @@ def test_quadtree_similar_point(): dtype=np.float32)) for X in Xs: - tree = QuadTree(n_dimensions=2, verbose=0) + tree = _QuadTree(n_dimensions=2, verbose=0) tree.build_tree(X) - tree.check_coherence() + tree._check_coherence() def test_quad_tree_pickle(): @@ -45,7 +45,7 @@ def test_quad_tree_pickle(): for n_dimensions in (2, 3): X = rng.random_sample((10, n_dimensions)) - tree = QuadTree(n_dimensions=n_dimensions, verbose=0) + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) tree.build_tree(X) def check_pickle_protocol(protocol): @@ -68,7 +68,7 @@ def check_insert_duplicate(n_dimensions=2): X = rng.random_sample((10, n_dimensions)) Xd = np.r_[X, X[:5]] - tree = QuadTree(n_dimensions=n_dimensions, verbose=0) + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) tree.build_tree(Xd) cumulative_size = tree.cumulative_size From 067ec73e2b9a6862a3dcdf6da33de677c1f52d34 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Thu, 29 Jun 2017 09:53:36 +0200 Subject: [PATCH 21/55] CLN pep8+typo+remove plot --- benchmarks/bench_tsne_mnist.py | 12 ++++++------ sklearn/manifold/_barnes_hut_tsne.pyx | 12 ++++++------ sklearn/manifold/_utils.pyx | 22 ++++++++++----------- sklearn/neighbors/quad_tree.pyx | 28 +++------------------------ sklearn/neighbors/setup.py | 3 +-- 5 files changed, 27 insertions(+), 50 deletions(-) diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index 601c9dfe037bb..4707150ac9bbd 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -103,14 +103,14 @@ def sanitize(filename): if isinstance(args.n_jobs, int): tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, verbose=args.verbose, n_jobs=args.n_jobs, n_iter=1000) - methods += [("sklearn TSNE", - lambda data: tsne_fit_transform(tsne, data))] + methods.append(("sklearn TSNE", + lambda data: tsne_fit_transform(tsne, data))) elif isinstance(args.n_jobs, list): for n_jobs in args.n_jobs: tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, verbose=args.verbose, n_jobs=n_jobs) - methods += [("sklearn TSNE (n_jobs={})".format(n_jobs), - lambda data: tsne_fit_transform(tsne, data))] + methods.append(("sklearn TSNE (n_jobs={})".format(n_jobs), + lambda data: tsne_fit_transform(tsne, data))) if args.bhtsne: try: @@ -134,7 +134,7 @@ def bhtsne(X): n_iter = -1 # TODO find a way to report the number of iterations return run_bh_tsne(X, use_pca=False, perplexity=args.perplexity, verbose=args.verbose > 0), n_iter - 
methods += [("lvdmaaten/bhtsne", bhtsne)] + methods.append(("lvdmaaten/bhtsne", bhtsne)) if args.profile: @@ -148,7 +148,7 @@ def bhtsne(X): data_size = [100, 500, 1000, 5000, 10000] if args.all: - data_size += [70000] + data_size.append(70000) results = [] basename, _ = os.path.splitext(__file__) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index d5fd7e40e7b05..10272e70964e6 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -41,10 +41,10 @@ cdef extern from "time.h": cdef float compute_gradient(float[:] val_P, - float[:,:] pos_reference, + float[:, :] pos_reference, np.int64_t[:] neighbors, np.int64_t[:] indptr, - float[:,:] tot_force, + float[:, :] tot_force, quad_tree._QuadTree qt, float theta, float dof, @@ -93,7 +93,7 @@ cdef float compute_gradient(float[:] val_P, cdef float compute_gradient_positive(float[:] val_P, - float[:,:] pos_reference, + float[:, :] pos_reference, np.int64_t[:] neighbors, np.int64_t[:] indptr, float* pos_f, @@ -144,7 +144,7 @@ cdef float compute_gradient_positive(float[:] val_P, return C -cdef void compute_gradient_negative(float[:,:] pos_reference, +cdef void compute_gradient_negative(float[:, :] pos_reference, float* neg_f, quad_tree._QuadTree qt, float* sum_Q, @@ -226,10 +226,10 @@ def calculate_edge(pos_output): def gradient(float[:] val_P, - float[:,:] pos_output, + float[:, :] pos_output, np.int64_t[:] neighbors, np.int64_t[:] indptr, - float[:,:] forces, + float[:, :] forces, float theta, int n_dimensions, int verbose, diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index 452ae38f31220..1f51889a4f906 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -23,11 +23,11 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( Parameters ---------- - affinities : array-like, shape (n_samples, K) - Distances between training samples. + affinities : array-like, shape (n_samples, k) + Distances between training samples and its k nearest neighbors. - neighbors : array-like, shape (n_samples, K) or None - Each row contains the indices to the K nearest neigbors. If this + neighbors : array-like, shape (n_samples, k) or None + Each row contains the indices to the k nearest neigbors. If this array is None, then the perplexity is estimated over all data not just the nearest neighbors. @@ -59,17 +59,17 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( cdef float entropy cdef float sum_Pi cdef float sum_disti_Pi - cdef long i, j, k, l = 0 - cdef long K = n_samples + cdef long i, j, k, l + cdef long n_neighbors = n_samples cdef int using_neighbors = neighbors is not None if using_neighbors: - K = neighbors.shape[1] + n_neighbors = neighbors.shape[1] # This array is later used as a 32bit array. 
It has multiple intermediate # floating point additions that benefit from the extra precision - cdef np.ndarray[np.float64_t, ndim=2] P = np.zeros((n_samples, K), - dtype=np.float64) + cdef np.ndarray[np.float64_t, ndim=2] P = np.zeros( + (n_samples, n_neighbors), dtype=np.float64) for i in range(n_samples): beta_min = -NPY_INFINITY @@ -82,7 +82,7 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( # computed just over the nearest neighbors or over all data # if we're not using neighbors sum_Pi = 0.0 - for j in range(K): + for j in range(n_neighbors): if j != i or using_neighbors: P[i, j] = math.exp(-affinities[i, j] * beta) sum_Pi += P[i, j] @@ -91,7 +91,7 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( sum_Pi = EPSILON_DBL sum_disti_Pi = 0.0 - for j in range(K): + for j in range(n_neighbors): P[i, j] /= sum_Pi sum_disti_Pi += affinities[i, j] * P[i, j] diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx index 06af274236dc6..92ecf5d6f2ac9 100644 --- a/sklearn/neighbors/quad_tree.pyx +++ b/sklearn/neighbors/quad_tree.pyx @@ -64,7 +64,7 @@ cdef class _QuadTree: """Array-based representation of a QuadTree. This class is currently working for indexing 2D data (regular QuadTree) and - for indexing 3D data (OcTree). It is planned to split the 2 implementation + for indexing 3D data (OcTree). It is planned to split the 2 implementations using `Cython.Tempita` to save some memory for QuadTree. Note that this code is currently internally used only by the Barnes-Hut @@ -110,7 +110,8 @@ cdef class _QuadTree: capacity = 100 self._resize(capacity) - m, M = np.min(X, axis=0) - 1e-3, np.max(X, axis=0) + 1e-3 + m = np.min(X, axis=0) - 1e-3 + M = np.max(X, axis=0) + 1e-3 for i in range(self.n_dimensions): min_bounds[i] = m[i] max_bounds[i] = M[i] @@ -312,29 +313,6 @@ cdef class _QuadTree: self.cell_count += 1 - def plot_tree(self): - """Plot the tree with cell boundaries and the points inserted in it.""" - self._check_coherence() - import matplotlib.pyplot as plt - - plt.figure() - for c in self.cells[:self.cell_count]: - if not c.is_leaf: - # Plot the cell division if the cell is an inner cell - plt.vlines(c.center[0], c.min_bounds[1], c.max_bounds[1]) - plt.hlines(c.center[1], c.min_bounds[0], c.max_bounds[0]) - else: - # If the cell is a leaf, display the point contained in it. 
- plt.scatter(c.barycenter[0], c.barycenter[1], c='b', marker='.') - - # Print bounding box of the Tree - root = self.cells[0] - plt.vlines([root.min_bounds[0], root.max_bounds[0]], root.min_bounds[1], - root.max_bounds[1]) - plt.hlines([root.min_bounds[1], root.max_bounds[1]], root.min_bounds[0], - root.max_bounds[0]) - plt.show() - cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell ) nogil except -1: """Check that the given point is in the cell boundaries.""" diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index ca97e0c1a85a6..8b1ad7bac9fab 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -34,8 +34,7 @@ def configuration(parent_package='', top_path=None): config.add_extension("quad_tree", sources=["quad_tree.pyx"], include_dirs=[numpy.get_include()], - libraries=libraries, - extra_compile_args=["-O3"]) + libraries=libraries) config.add_subpackage('tests') From 71fcd29df1c24927d235e72e020cda41c6587dab Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Thu, 29 Jun 2017 10:02:11 +0200 Subject: [PATCH 22/55] CLN remove knn extra args for TSNE --- sklearn/manifold/t_sne.py | 40 ++++++--------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index b2e4104f8ce18..5285315365a67 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -573,19 +573,6 @@ class TSNE(BaseEstimator): in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing computation time and angle greater 0.8 has quickly increasing error. - n_jobs : integer (default: 1) - Only used if method='barnes_hut' - Number of CPU used to compute the nearest neighbors of each point. - If ``n_jobs=-1``, use all the CPUs. - - neighbors_method : string or NeighborsBase object (default: 'ball_tree') - Only used if method='barnes_hut' - Method used to compute k nearest neighbors for Barnes-Hut T-SNE. - If it is a string, it should be compatible with the algorithm parameter - of `NearestNeighbors`. If it is an object, it should implement a - `kneighbors` method returning distances_nn and neighbors_nn. - - Attributes ---------- embedding_ : array-like, shape (n_samples, n_components) @@ -627,8 +614,7 @@ def __init__(self, n_components=2, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, n_iter_without_progress=30, min_grad_norm=1e-7, metric="euclidean", init="random", verbose=0, - random_state=None, method='barnes_hut', angle=0.5, n_jobs=1, - neighbors_method='ball_tree'): + random_state=None, method='barnes_hut', angle=0.5): self.n_components = n_components self.perplexity = perplexity self.early_exaggeration = early_exaggeration @@ -642,8 +628,6 @@ def __init__(self, n_components=2, perplexity=30.0, self.random_state = random_state self.method = method self.angle = angle - self.n_jobs = n_jobs - self.neighbors_method = neighbors_method def _fit(self, X, skip_num_points=0): """Fit the model using X as training data. @@ -744,23 +728,11 @@ def _fit(self, X, skip_num_points=0): print("[t-SNE] Computing {} nearest neighbors...".format(k)) # Find the nearest neighbors for every point - if isinstance(self.neighbors_method, string_types): - if (self.metric == 'precomputed' and - self.neighbors_method == "ball_tree"): - warnings.warn("Cannot use neighbors_method='ball_tree' " - "with metric='precomputed'. 
Switching to " - "neighbors_method='brute'.", RuntimeWarning) - self.neighbors_method = "brute" - knn = NearestNeighbors(algorithm=self.neighbors_method, - n_neighbors=k, metric=self.metric, - n_jobs=self.n_jobs) - elif isinstance(self.neighbors_method, NeighborsBase): - knn = self.neighbors_method - else: - raise ValueError("'neighbors_method' should be either a " - "string or a subclass of NeighborsBase. {} " - "is not valid.".format(self.neighbors_method)) - + neighbors_method = 'ball_tree' + if (self.metric == 'precomputed'): + neighbors_method = 'brute' + knn = NearestNeighbors(algorithm=neighbors_method, n_neighbors=k, + metric=self.metric) t0 = time() knn.fit(X) duration = time() - t0 From b3276eb22bef639890c382dd1da8affa93a1b4e6 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Thu, 29 Jun 2017 10:08:49 +0200 Subject: [PATCH 23/55] TST reduce test time --- sklearn/manifold/tests/test_t_sne.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index bea2c11c0878d..93bf1f72d8e66 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -149,7 +149,7 @@ def test_binary_search_neighbors(): assert_array_almost_equal(P_nn, P2, decimal=4) # Test that the highest P_ij are the same when few neighbors are used - for k in np.linspace(80, n_samples, 10): + for k in np.linspace(80, n_samples, 5): k = int(k) topn = k * 10 # check the top 10 *k entries out of k * k entries neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64) @@ -247,10 +247,10 @@ def test_preserve_trustworthiness_approximately(): # perplexity=5, so that the number of neighbors is 5%. n_components = 2 methods = ['exact', 'barnes_hut'] - X = random_state.randn(100, n_components).astype(np.float32) + X = random_state.randn(50, n_components).astype(np.float32) for init in ('random', 'pca'): for method in methods: - tsne = TSNE(n_components=n_components, perplexity=50, + tsne = TSNE(n_components=n_components, perplexity=25, learning_rate=100.0, init=init, random_state=0, method=method) X_embedded = tsne.fit_transform(X) @@ -293,7 +293,7 @@ def test_preserve_trustworthiness_approximately_with_precomputed_distances(): D = squareform(pdist(X), "sqeuclidean") tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, early_exaggeration=2.0, metric="precomputed", - random_state=i, verbose=0, neighbors_method='brute') + random_state=i, verbose=0) X_embedded = tsne.fit_transform(D) t = trustworthiness(D, X_embedded, n_neighbors=1, precomputed=True) @@ -341,16 +341,6 @@ def metric(x, y): tsne.fit_transform, X) -def test_not_available_neighbors_method(): - # Computed distance matrices must be positive. - tsne = TSNE(neighbors_method='not available', method='barnes_hut') - assert_raises_regexp(ValueError, "unrecognized algorithm: 'not available'", - tsne.fit_transform, np.array([[0.0, 1.0]])) - tsne = TSNE(neighbors_method=1, method='barnes_hut') - assert_raises_regexp(ValueError, "'neighbors_method' should be .*", - tsne.fit_transform, np.array([[0.0, 1.0]])) - - def test_init_not_available(): # 'init' must be 'pca', 'random', or numpy array. 
tsne = TSNE(init="not available") @@ -369,8 +359,7 @@ def test_init_ndarray(): def test_init_ndarray_precomputed(): # Initialize TSNE with ndarray and metric 'precomputed' # Make sure no FutureWarning is thrown from _fit - tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed", - neighbors_method='brute') + tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed") tsne.fit(np.zeros((100, 100))) @@ -557,7 +546,7 @@ def test_64bit(): methods = ['barnes_hut', 'exact'] for method in methods: for dt in [np.float32, np.float64]: - X = random_state.randn(100, 2).astype(dt) + X = random_state.randn(50, 2).astype(dt) tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, random_state=0, method=method, verbose=0) X_embedded = tsne.fit_transform(X) From 1db287153359b59fdf95702af5d6082405512419 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Thu, 29 Jun 2017 10:18:55 +0200 Subject: [PATCH 24/55] FIX flake8 unused import --- sklearn/manifold/t_sne.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 5285315365a67..ce41ec1647c40 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -9,7 +9,6 @@ # http://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf from time import time -import warnings import numpy as np from scipy import linalg import scipy.sparse as sp @@ -17,7 +16,6 @@ from scipy.spatial.distance import squareform from scipy.sparse import csr_matrix from ..neighbors import NearestNeighbors -from ..neighbors.base import NeighborsBase from ..base import BaseEstimator from ..utils import check_array from ..utils import check_random_state From 98c64ab4d2d3e76246306f3ce0aa24de8f779d13 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 29 Jun 2017 15:58:50 +0200 Subject: [PATCH 25/55] TST make test_preserve_trustworthiness_approximately less strict --- sklearn/manifold/tests/test_t_sne.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 93bf1f72d8e66..dac743ff9f0aa 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_less +from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import assert_in from sklearn.utils.testing import skip_if_32bit @@ -241,10 +242,6 @@ def test_trustworthiness(): def test_preserve_trustworthiness_approximately(): # Nearest neighbors should be preserved approximately. random_state = check_random_state(0) - # The Barnes-Hut approximation uses a different method to estimate - # P_ij using only a number of nearest neighbors instead of all - # points (so that k = 3 * perplexity). As a result we set the - # perplexity=5, so that the number of neighbors is 5%. 
n_components = 2 methods = ['exact', 'barnes_hut'] X = random_state.randn(50, n_components).astype(np.float32) @@ -254,8 +251,8 @@ def test_preserve_trustworthiness_approximately(): learning_rate=100.0, init=init, random_state=0, method=method) X_embedded = tsne.fit_transform(X) - T = trustworthiness(X, X_embedded, n_neighbors=1) - assert_almost_equal(T, 1.0, decimal=1) + t = trustworthiness(X, X_embedded, n_neighbors=1) + assert_greater(t, 0.9) def test_optimization_minimizes_kl_divergence(): From d04395590d6e8587718f27049e9f45cd27fd333b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 30 Jun 2017 21:52:31 +0200 Subject: [PATCH 26/55] DOC adjust the perplexity range in t-SNE example --- examples/manifold/plot_t_sne_perplexity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py index 4165dac14178c..85178549c70c1 100644 --- a/examples/manifold/plot_t_sne_perplexity.py +++ b/examples/manifold/plot_t_sne_perplexity.py @@ -14,7 +14,7 @@ As shown below, t-SNE for higher perplexities finds meaningful topology of two concentric circles, however the size and the distance of the circles varies slightly from the original. Contrary to the two circles dataset, the shapes -visually diverge from S-curve topology on the S-curve dateset even for +visually diverge from S-curve topology on the S-curve dataset even for larger perplexity values. For further details, "How to Use t-SNE Effectively" @@ -37,7 +37,7 @@ n_samples = 500 n_components = 2 (fig, subplots) = plt.subplots(2, 5, figsize=(15, 8)) -perplexities = [5, 50, 100, 150] +perplexities = [5, 30, 50, 100] X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05) From 487a46b17e40b5a5d2be92e1bff98de1f244373f Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Mon, 3 Jul 2017 10:49:32 +0200 Subject: [PATCH 27/55] CLN clarify quad_tree header --- sklearn/neighbors/quad_tree.pxd | 33 ++++++++++++++++++++++----------- sklearn/neighbors/quad_tree.pyx | 26 +++++++++++++------------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd index b30ba6fd4ee7a..1d1b8b7d2a6f2 100644 --- a/sklearn/neighbors/quad_tree.pxd +++ b/sklearn/neighbors/quad_tree.pxd @@ -55,6 +55,8 @@ cdef class _QuadTree: # The QuadTree object is a quad tree structure constructed by inserting # recursively points in the tree and splitting cells in 4 so that each # leaf cell contains at most one point. + # This structure also handle 3D data, inserted in trees with 8 children + # for each node. 
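To make the interface described in this header concrete, here is a usage sketch that mirrors the tests added earlier in this series (sklearn/neighbors/tests/test_quad_tree.py); it assumes the renamed `_QuadTree` class and `_check_coherence` method from the previous commits and is not an additional test:

    import numpy as np
    from sklearn.neighbors.quad_tree import _QuadTree

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 2))

    tree = _QuadTree(n_dimensions=2, verbose=0)
    tree.build_tree(X)       # insert every row, splitting cells as needed
    tree._check_coherence()  # parent sizes must match the sum over children
    cell_id = tree.get_cell(X[0])  # leaf cell containing the first point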
# Parameters of the tree cdef public int n_dimensions # Number of dimensions in X @@ -68,21 +70,30 @@ cdef class _QuadTree: cdef public SIZE_t n_points # Total number of points cdef Cell* cells # Array of nodes - # Methods + # Point insertion methods cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index, SIZE_t cell_id=*) nogil except -1 - cdef int _resize(self, SIZE_t capacity) nogil except -1 - cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1 + cdef SIZE_t _insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, + SIZE_t point_index, SIZE_t size=* + ) nogil + cdef SIZE_t _select_child(self, DTYPE_t[3] point, Cell* cell) nogil + cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil - cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, - SIZE_t point_index, SIZE_t size=*) nogil - cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil - cdef bint is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil - cdef SIZE_t select_child(self, DTYPE_t[3] point, Cell* cell) nogil - cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds) nogil - - cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell) nogil except -1 + # Create a summary of the Tree compare to a query point cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results, int cell_id=*, long idx=*, float squared_theta=*) nogil + + # Internal cell initialization methods + cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil + cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds + ) nogil + + # Private methods + cdef int _check_point_in_cell(self, DTYPE_t[3] point, Cell* cell + ) nogil except -1 + + # Private array manipulation to manage the ``cells`` array + cdef int _resize(self, SIZE_t capacity) nogil except -1 + cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1 cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=*) nogil except -1 cdef np.ndarray _get_cell_ndarray(self) diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx index 92ecf5d6f2ac9..e93e294cbb99d 100644 --- a/sklearn/neighbors/quad_tree.pyx +++ b/sklearn/neighbors/quad_tree.pyx @@ -140,7 +140,7 @@ cdef class _QuadTree: # Assert that the point is in the right range if DEBUGFLAG: - self.check_point_in_cell(point, cell) + self._check_point_in_cell(point, cell) # If the cell is an empty leaf, insert the point in it if cell.cumulative_size == 0: @@ -165,18 +165,18 @@ cdef class _QuadTree: cell.cumulative_size += 1 # Insert child in the correct subtree - selected_child = self.select_child(point, cell) + selected_child = self._select_child(point, cell) if self.verbose >= 10: printf("[QuadTree] selected child %li\n", selected_child) if selected_child == -1: self.n_points += 1 - return self.insert_point_in_new_child(point, cell, point_index) + return self._insert_point_in_new_child(point, cell, point_index) return self.insert_point(point, point_index, selected_child) # Finally, if the cell is a leaf with a point already inserted, # split the cell in n_cells_per_cell if the point is not a duplicate. # If it is a duplicate, increase the size of the leaf and return. - if self.is_duplicate(point, cell.barycenter): + if self._is_duplicate(point, cell.barycenter): if self.verbose >= 10: printf("[QuadTree] found a duplicate!\n") cell.cumulative_size += 1 @@ -185,12 +185,12 @@ cdef class _QuadTree: # In a leaf, the barycenter correspond to the only point included # in it. 
- self.insert_point_in_new_child(cell.barycenter, cell, cell.point_index, + self._insert_point_in_new_child(cell.barycenter, cell, cell.point_index, cell.cumulative_size) return self.insert_point(point, point_index, cell_id) # XXX: This operation is not Thread safe - cdef SIZE_t insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, + cdef SIZE_t _insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, SIZE_t point_index, SIZE_t size=1 ) nogil: """Create a child of cell which will contain point.""" @@ -253,7 +253,7 @@ cdef class _QuadTree: if DEBUGFLAG: # Assert that the point is in the right range - self.check_point_in_cell(point, child) + self._check_point_in_cell(point, child) if self.verbose >= 10: printf("[QuadTree] inserted point %li in new child %li\n", point_index, cell_id) @@ -261,7 +261,7 @@ cdef class _QuadTree: return cell_id - cdef bint is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil: + cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil: """Check if the two given points are equals.""" cdef int i cdef bint res = True @@ -270,7 +270,7 @@ cdef class _QuadTree: return res - cdef SIZE_t select_child(self, DTYPE_t[3] point, Cell* cell) nogil: + cdef SIZE_t _select_child(self, DTYPE_t[3] point, Cell* cell) nogil: """Select the child of cell which contains the given query point.""" cdef: int i @@ -313,7 +313,7 @@ cdef class _QuadTree: self.cell_count += 1 - cdef int check_point_in_cell(self, DTYPE_t[3] point, Cell* cell + cdef int _check_point_in_cell(self, DTYPE_t[3] point, Cell* cell ) nogil except -1: """Check that the given point is in the cell boundaries.""" @@ -345,7 +345,7 @@ cdef class _QuadTree: for cell in self.cells[:self.cell_count]: # Check that the barycenter of inserted point is within the cell # boundaries - self.check_point_in_cell(cell.barycenter, &cell) + self._check_point_in_cell(cell.barycenter, &cell) if not cell.is_leaf: # Compute the number of point in children and compare with @@ -468,7 +468,7 @@ cdef class _QuadTree: Cell* cell = &self.cells[cell_id] if cell.is_leaf: - if self.is_duplicate(cell.barycenter, point): + if self._is_duplicate(cell.barycenter, point): if self.verbose > 99: printf("[QuadTree] Found point in cell: %li\n", cell.cell_id) @@ -476,7 +476,7 @@ cdef class _QuadTree: with gil: raise ValueError("Query point not in the Tree.") - selected_child = self.select_child(point, cell) + selected_child = self._select_child(point, cell) if selected_child > 0: if self.verbose > 99: printf("[QuadTree] Selected_child: %li\n", selected_child) From f7cdb2e99bf3e42a08064172966e4a9e469cdff4 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Mon, 3 Jul 2017 11:08:03 +0200 Subject: [PATCH 28/55] CLN comment and typo+early free knn --- sklearn/manifold/t_sne.py | 3 +++ sklearn/mixture/base.py | 6 +++--- sklearn/neighbors/quad_tree.pyx | 12 +++++++----- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index ce41ec1647c40..949fed07dfc6c 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -746,6 +746,9 @@ def _fit(self, X, skip_num_points=0): print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..." .format(n_samples, duration)) + # Free the memory used by the ball_tree + del knn + if self.metric == "euclidean": # knn return the euclidean distance but we need it squared # to be consistent with the 'exact' method. 
Note that the diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py index d00ccc9de9765..88cb62623e138 100644 --- a/sklearn/mixture/base.py +++ b/sklearn/mixture/base.py @@ -340,7 +340,7 @@ def predict(self, X): return self._estimate_weighted_log_prob(X).argmax(axis=1) def predict_proba(self, X): - """Predict posterior probability of data per each component. + """Predict posterior probability of each component given the data. Parameters ---------- @@ -351,8 +351,8 @@ def predict_proba(self, X): Returns ------- resp : array, shape (n_samples, n_components) - Returns the probability of the sample for each Gaussian - (state) in the model. + Returns the probability each Gaussian (state) in + the model given each sample. """ self._check_is_fitted() X = _check_X(X, None, self.means_.shape[1]) diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx index e93e294cbb99d..3af9e23282767 100644 --- a/sklearn/neighbors/quad_tree.pyx +++ b/sklearn/neighbors/quad_tree.pyx @@ -124,6 +124,7 @@ cdef class _QuadTree: pt[j] = X[i, j] self.insert_point(pt, i) + # Shrink the cells array to reduce memory usage self._resize(capacity=self.cell_count) cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index, @@ -266,6 +267,7 @@ cdef class _QuadTree: cdef int i cdef bint res = True for i in range(self.n_dimensions): + # Use EPSILON to avoid numerical error that would overgrow the tree res &= fabsf(point1[i] - point2[i]) <= EPSILON return res @@ -380,18 +382,18 @@ cdef class _QuadTree: Input arguments --------------- - point: array (n_dimensions) + point : array (n_dimensions) query point to construct the summary. - cell_id: integer, optional (default: 0) + cell_id : integer, optional (default: 0) current cell of the tree summarized. This should be set to 0 for external calls. - idx: integer, optional (default: 0) + idx : integer, optional (default: 0) current index in the result array. This should be set to 0 for external calls Output arguments ---------------- - result: array (n_samples * (n_dimensions+2)) + results : array (n_samples * (n_dimensions+2)) result will contain a summary of the tree information compared to the query point: - results[idx:idx+n_dimensions] contains the coordinate-wise @@ -404,7 +406,7 @@ cdef class _QuadTree: Return ------ - idx: integer + idx : integer number of elements in the results array. """ cdef: From 6619f6a10d9987b76ae092430faf9bf0590bb79a Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Wed, 5 Jul 2017 17:55:42 +0200 Subject: [PATCH 29/55] CLN add what's new entry --- doc/whats_new.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 3c87d4174c388..e959f4a79a2f5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -19,6 +19,7 @@ occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. * :class:`sklearn.ensemble.IsolationForest` (bug fix) + * :class:`sklearn.manifold.TSNE` (bug fix) Details are listed in the changelog below. @@ -245,6 +246,14 @@ Enhancements - Speed improvements to :class:`model_selection.StratifiedShuffleSplit`. :issue:`5991` by :user:`Arthur Mensch ` and `Joel Nothman`_. + - Memory improvements to :class:`manifold.TSNE` + :issue:`7089` by :user:`Thomas Moreau and :user:`Olivier Grisel `. + + - Optimization schedule improvements for so the results are closer to the + one from the reference implementation + `lvdmaaten/bhtsne`_, by + :user:`Thomas Moreau and :user:`Olivier Grisel `. + Bug fixes ......... 
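As a quick illustration of the `predict_proba` docstring clarified above (made-up data, not part of the patch): each row of the returned array holds the posterior probability of every mixture component given that sample, so rows sum to one.

    import numpy as np
    from sklearn.mixture import GaussianMixture

    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 1) - 3, rng.randn(50, 1) + 3])

    gm = GaussianMixture(n_components=2, random_state=0).fit(X)
    resp = gm.predict_proba(X)                 # shape (n_samples, n_components)
    assert np.allclose(resp.sum(axis=1), 1.0)  # one posterior per sample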
@@ -478,6 +487,14 @@ Bug fixes
    and :class:`linear_model.Ridge` when using ``normalize=True``
    by `Alexandre Gramfort`_.
 
+   - Fixed the implementation of :class:`manifold.TSNE`:
+   - ``early_exaggeration`` parameter had no effect and is now used for the
+     first 250 optimization iterations.
+   - Fixed the ``InsertionError`` reported in :issue:`8992`.
+   - Improved the learning schedule to match the one from the reference
+     implementation `lvdmaaten/bhtsne`_.
+     by :user:`Thomas Moreau and :user:`Olivier Grisel `.
+
 API changes summary
 -------------------
 
From dec5fb9b0658fe5447d8be873a6dc9568fe3a877 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Wed, 5 Jul 2017 17:56:07 +0200
Subject: [PATCH 30/55] TST improve test_t_sne for changed parameters

---
 sklearn/manifold/tests/test_t_sne.py | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index dac743ff9f0aa..d8ccf35fd8dd5 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -401,6 +401,41 @@ def test_n_components_range():
                          tsne.fit_transform, np.array([[0.0], [1.0]]))
 
 
+def test_early_exaggeration_used():
+    # check that the ``early_exaggeration`` parameter has an effect
+    random_state = check_random_state(0)
+    n_components = 2
+    methods = ['exact', 'barnes_hut']
+    X = random_state.randn(25, n_components).astype(np.float32)
+    for method in methods:
+        tsne = TSNE(n_components=n_components, perplexity=1,
+                    learning_rate=100.0, init="pca", random_state=0,
+                    method=method, early_exaggeration=1.0)
+        X_embedded1 = tsne.fit_transform(X)
+        tsne = TSNE(n_components=n_components, perplexity=1,
+                    learning_rate=100.0, init="pca", random_state=0,
+                    method=method, early_exaggeration=10.0)
+        X_embedded2 = tsne.fit_transform(X)
+
+        assert not np.allclose(X_embedded1, X_embedded2)
+
+
+def test_n_iter_used():
+    # check that the ``early_exaggeration`` parameter has an effect
+    random_state = check_random_state(0)
+    n_components = 2
+    methods = ['exact', 'barnes_hut']
+    X = random_state.randn(25, n_components).astype(np.float32)
+    for method in methods:
+        for n_iter in [251, 500, 1000]:
+            tsne = TSNE(n_components=n_components, perplexity=1,
+                        learning_rate=1.0, init="pca", random_state=0,
+                        method=method, early_exaggeration=1.0, n_iter=n_iter)
+            tsne.fit_transform(X)
+
+            assert tsne.n_iter_final == n_iter - 1
+
+
 def test_answer_gradient_two_points():
     # Test the tree with only a single set of children.
     #
From d7991bbe3ab98c91254ba1b669f4e0779acfa1f8 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Thu, 6 Jul 2017 11:14:56 +0200
Subject: [PATCH 31/55] FIX what's new entry

---
 doc/whats_new.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index e959f4a79a2f5..1244c4596b741 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -246,13 +246,13 @@ Enhancements
    - Speed improvements to :class:`model_selection.StratifiedShuffleSplit`.
      :issue:`5991` by :user:`Arthur Mensch ` and `Joel Nothman`_.
 
-   - Memory improvements to :class:`manifold.TSNE`
-     :issue:`7089` by :user:`Thomas Moreau and :user:`Olivier Grisel `.
+   - Memory improvements for method barnes_hut in :class:`manifold.TSNE`
+     :issue:`7089` by :user:`Thomas Moreau ` and `Olivier Grisel`_.
 
   - Optimization schedule improvements so the results are closer to the
     one from the reference implementation
-     `lvdmaaten/bhtsne`_, by
-     :user:`Thomas Moreau and :user:`Olivier Grisel `.
+     `lvdmaaten/bhtsne `_ by
+     :user:`Thomas Moreau ` and `Olivier Grisel`_.
 
 Bug fixes
 .........
@@ -492,8 +492,8 @@ Bug fixes
      first 250 optimization iterations.
    - Fixed the ``InsertionError`` reported in :issue:`8992`.
    - Improved the learning schedule to match the one from the reference
-     implementation `lvdmaaten/bhtsne`_.
-     by :user:`Thomas Moreau and :user:`Olivier Grisel `.
+     implementation `lvdmaaten/bhtsne `_.
+     by :user:`Thomas Moreau ` and `Olivier Grisel`_.
 
 API changes summary
 -------------------
 
From 476aeb675042f4fe9a669a05f74f2145bfe17350 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Thu, 6 Jul 2017 11:17:54 +0200
Subject: [PATCH 32/55] FIX typo in test and reduce test time

---
 sklearn/manifold/tests/test_t_sne.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index d8ccf35fd8dd5..8b9c9d6a76862 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -421,15 +421,15 @@ def test_early_exaggeration_used():
 
 
 def test_n_iter_used():
-    # check that the ``early_exaggeration`` parameter has an effect
+    # check that the ``n_iter`` parameter has an effect
     random_state = check_random_state(0)
     n_components = 2
     methods = ['exact', 'barnes_hut']
     X = random_state.randn(25, n_components).astype(np.float32)
     for method in methods:
-        for n_iter in [251, 500, 1000]:
+        for n_iter in [251, 500]:
             tsne = TSNE(n_components=n_components, perplexity=1,
-                        learning_rate=1.0, init="pca", random_state=0,
+                        learning_rate=0.5, init="random", random_state=0,
                         method=method, early_exaggeration=1.0, n_iter=n_iter)
             tsne.fit_transform(X)
 
From f753da8fbb509e44710b55bf5d00505f06826f0f Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 6 Jul 2017 14:36:11 +0200
Subject: [PATCH 33/55] Add uniform grid to perplexity example

---
 examples/manifold/plot_t_sne_perplexity.py | 40 +++++++++++++++++++---
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py
index 85178549c70c1..cc3dafc12a3ea 100644
--- a/examples/manifold/plot_t_sne_perplexity.py
+++ b/examples/manifold/plot_t_sne_perplexity.py
@@ -28,15 +28,16 @@
 
 print(__doc__)
 
+import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib.ticker import NullFormatter
 from sklearn import manifold, datasets
 from time import time
 
-n_samples = 500
+n_samples = 300
 n_components = 2
-(fig, subplots) = plt.subplots(2, 5, figsize=(15, 8))
+(fig, subplots) = plt.subplots(3, 5, figsize=(15, 8))
 perplexities = [5, 30, 50, 100]
 
 X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
@@ -71,7 +72,7 @@
 X, color = datasets.samples_generator.make_s_curve(n_samples, random_state=0)
 
 ax = subplots[1][0]
-ax.scatter(X[:, 0], X[:, 2], c=color, cmap=plt.cm.Spectral)
+ax.scatter(X[:, 0], X[:, 2], c=color, cmap=plt.cm.viridis)
 ax.xaxis.set_major_formatter(NullFormatter())
 ax.yaxis.set_major_formatter(NullFormatter())
 
@@ -86,9 +87,40 @@
     print("S-curve, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))
 
     ax.set_title("Perplexity=%d" % perplexity)
-    ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
+    ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.viridis)
     ax.xaxis.set_major_formatter(NullFormatter())
     ax.yaxis.set_major_formatter(NullFormatter())
     ax.axis('tight')
 
+
+# Another example using a 2D uniform grid
+x = np.linspace(0, 1, int(np.sqrt(n_samples)))
+xx, yy = np.meshgrid(x, x)
+X = np.hstack([
+    xx.ravel().reshape(-1, 1),
+
yy.ravel().reshape(-1, 1), +]) +color = xx.ravel() +ax = subplots[2][0] +ax.scatter(X[:, 0], X[:, 1], c=color, cmap=plt.cm.viridis) +ax.xaxis.set_major_formatter(NullFormatter()) +ax.yaxis.set_major_formatter(NullFormatter()) + +for i, perplexity in enumerate(perplexities): + ax = subplots[2][i + 1] + + t0 = time() + tsne = manifold.TSNE(n_components=n_components, init='random', + random_state=0, perplexity=perplexity) + Y = tsne.fit_transform(X) + t1 = time() + print("uniform grid, perplexity=%d in %.2g sec" % (perplexity, t1 - t0)) + + ax.set_title("Perplexity=%d" % perplexity) + ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.viridis) + ax.xaxis.set_major_formatter(NullFormatter()) + ax.yaxis.set_major_formatter(NullFormatter()) + ax.axis('tight') + + plt.show() From fdc16d61fe52447dbb8b4344fd2cee79e096d2d1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 6 Jul 2017 15:53:06 +0200 Subject: [PATCH 34/55] TST 2D uniform grid recovery by TSNE --- sklearn/manifold/tests/test_t_sne.py | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 8b9c9d6a76862..9f6026dc0a68b 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -4,6 +4,7 @@ import scipy.sparse as sp from sklearn.neighbors import BallTree +from sklearn.neighbors import NearestNeighbors from sklearn.utils.testing import assert_less_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_almost_equal @@ -31,6 +32,14 @@ from sklearn.metrics.pairwise import pairwise_distances +x = np.linspace(0, 1, 10) +xx, yy = np.meshgrid(x, x) +X_2d_grid = np.hstack([ + xx.ravel().reshape(-1, 1), + yy.ravel().reshape(-1, 1), +]) + + def test_gradient_descent_stops(): # Test stopping conditions of gradient descent. class ObjectiveSmallGradient: @@ -717,3 +726,35 @@ def test_accessible_kl_divergence(): error, _, _ = error.partition(',') break assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5) + + +def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000): + """Make sure that TSNE can approximately recover a uniform 2D grid""" + for seed in seeds: + tsne = TSNE(n_components=2, init='random', random_state=seed, + perplexity=10, n_iter=n_iter) + Y = tsne.fit_transform(X_2d_grid) + + # Ensure that the convergence criterion has been triggered + assert tsne.n_iter_ < n_iter + + # Ensure that the resulting embedding leads to approximately + # uniformly spaced points: the distance to the closest neighbors + # should be approximately be non-zero and constant. 
+ nn = NearestNeighbors(n_neighbors=2).fit(Y) + dist_to_nn, _ = nn.kneighbors(Y, return_distance=True) + + # the first neighbor is the query vector it-self + dist_to_nn = dist_to_nn[:, 1] + assert dist_to_nn.min() > 0.1 + + smallest_to_mean = dist_to_nn.min() / np.mean(dist_to_nn) + largest_to_mean = dist_to_nn.max() / np.mean(dist_to_nn) + + assert 0.5 < smallest_to_mean + assert largest_to_mean < 2 + + +def test_uniform_grid(): + for method in ['barnes_hut', 'exact']: + yield check_uniform_grid, method From 63aecca43395decd285659391ee2b5e08e6f8f6a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 6 Jul 2017 16:09:39 +0200 Subject: [PATCH 35/55] TST simpler code --- sklearn/manifold/tests/test_t_sne.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 9f6026dc0a68b..fe0d609159dee 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -741,11 +741,8 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000): # Ensure that the resulting embedding leads to approximately # uniformly spaced points: the distance to the closest neighbors # should be approximately be non-zero and constant. - nn = NearestNeighbors(n_neighbors=2).fit(Y) - dist_to_nn, _ = nn.kneighbors(Y, return_distance=True) - - # the first neighbor is the query vector it-self - dist_to_nn = dist_to_nn[:, 1] + nn = NearestNeighbors(n_neighbors=1).fit(Y) + dist_to_nn = nn.kneighbors(return_distance=True)[0].ravel() assert dist_to_nn.min() > 0.1 smallest_to_mean = dist_to_nn.min() / np.mean(dist_to_nn) From 4639bb4385ff8bf8da232888dbefad0508a7a443 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 6 Jul 2017 16:23:40 +0200 Subject: [PATCH 36/55] TST fix comment in check_uniform_grid --- sklearn/manifold/tests/test_t_sne.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index fe0d609159dee..ece8f687807b5 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -740,7 +740,7 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000): # Ensure that the resulting embedding leads to approximately # uniformly spaced points: the distance to the closest neighbors - # should be approximately be non-zero and constant. + # should be non-zero and approximately constant. 
        nn = NearestNeighbors(n_neighbors=1).fit(Y)
         dist_to_nn = nn.kneighbors(return_distance=True)[0].ravel()
         assert dist_to_nn.min() > 0.1
 
From 0509d1215b0437608924951063b7b85ede635d5d Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 7 Jul 2017 15:35:11 +0200
Subject: [PATCH 37/55] ENH simplify example usage in TSNE docstring

---
 sklearn/manifold/t_sne.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index 949fed07dfc6c..75db00b2e200f 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -588,9 +588,7 @@ class TSNE(BaseEstimator):
     >>> import numpy as np
     >>> from sklearn.manifold import TSNE
     >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
-    >>> model = TSNE(n_components=2, random_state=0)
-    >>> np.set_printoptions(suppress=True)
-    >>> X_embedded = model.fit_transform(X)
+    >>> X_embedded = TSNE(n_components=2).fit_transform(X)
     >>> X_embedded.shape
     (4, 2)
 
From e8026890a1a1b88bc0b9c8918ed18a6eb3dad7b7 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Fri, 7 Jul 2017 16:35:11 +0200
Subject: [PATCH 38/55] CLN max_width->squared_max_width to improve code
 readability

---
 sklearn/neighbors/quad_tree.pxd |  2 +-
 sklearn/neighbors/quad_tree.pyx | 18 +++++++++++++-----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd
index 1d1b8b7d2a6f2..541149b3ca932 100644
--- a/sklearn/neighbors/quad_tree.pxd
+++ b/sklearn/neighbors/quad_tree.pxd
@@ -37,7 +37,7 @@ cdef struct Cell:
     SIZE_t point_index        # Index of the point at this cell (only defined
                               # in non empty leaf)
     bint is_leaf              # Does this cell have children?
-    DTYPE_t max_width         # The value of the maximum width w
+    DTYPE_t squared_max_width  # Squared value of the maximum width w
     SIZE_t depth              # Depth of the cell in the tree
    SIZE_t cumulative_size    # Number of points included in the subtree with
                               # this cell as a root.

diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx
index 3af9e23282767..40035f9e2a448 100644
--- a/sklearn/neighbors/quad_tree.pyx
+++ b/sklearn/neighbors/quad_tree.pyx
@@ -47,7 +47,7 @@ CELL_DTYPE = np.dtype({
        <Py_ssize_t> &(<Cell*> NULL).cell_id,
        <Py_ssize_t> &(<Cell*> NULL).point_index,
        <Py_ssize_t> &(<Cell*> NULL).is_leaf,
-       <Py_ssize_t> &(<Cell*> NULL).max_width,
+       <Py_ssize_t> &(<Cell*> NULL).squared_max_width,
        <Py_ssize_t> &(<Cell*> NULL).depth,
        <Py_ssize_t> &(<Cell*> NULL).cumulative_size,
        <Py_ssize_t> &(<Cell*> NULL).center,
@@ -243,7 +243,7 @@ cdef class _QuadTree:
             width = child.max_bounds[i] - child.min_bounds[i]
             child.barycenter[i] = point[i]
 
-            child.max_width = max(child.max_width, width*width)
+            child.squared_max_width = max(child.squared_max_width, width*width)
 
         # Store the point info and the size to account for duplicated points
         child.point_index = point_index
@@ -291,7 +291,7 @@ cdef class _QuadTree:
         cell.parent = parent
         cell.is_leaf = True
         cell.depth = depth
-        cell.max_width = 0
+        cell.squared_max_width = 0
         cell.cumulative_size = 0
         for i in range(self.n_cells_per_cell):
             cell.children[i] = DEFAULT
@@ -310,7 +310,7 @@ cdef class _QuadTree:
             root.max_bounds[i] = max_bounds[i]
             root.center[i] = (max_bounds[i] + min_bounds[i]) / 2.
             width = max_bounds[i] - min_bounds[i]
-            root.max_width = max(root.max_width, width*width)
+            root.squared_max_width = max(root.squared_max_width, width*width)
         root.cell_id = 0
 
         self.cell_count += 1
@@ -390,6 +390,13 @@ cdef class _QuadTree:
         idx : integer, optional (default: 0)
             current index in the result array.
            This should be set to 0 for external calls
+        squared_theta : float, optional (default: .5)
+            threshold to decide whether the node is sufficiently far
+            from the query point to be a good summary. The formula is such that
+            the node is a summary if
+            node_width^2 / dist_node_point^2 < squared_theta.
+            Note that the argument should be passed as theta^2 to avoid
+            computing square roots of the distances.
 
         Output arguments
         ----------------
@@ -430,7 +437,8 @@ cdef class _QuadTree:
         # is relatively small (w.r.t. theta) or if it is a leaf node.
         # If it can be summarized, we use the cell center of mass
         # Otherwise, we go a higher level of resolution and into the leaves.
-        if cell.is_leaf or ((cell.max_width / results[idx_d]) < squared_theta):
+        if cell.is_leaf or (
+                (cell.squared_max_width / results[idx_d]) < squared_theta):
             results[idx_d + 1] = cell.cumulative_size
             return idx + 2 + self.n_dimensions
 
From c0cc2f2a0155c8a40519cfe83df17a9883eedc0e Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Sun, 9 Jul 2017 09:18:04 +0200
Subject: [PATCH 39/55] FIX duplicated point in quad_tree summary

---
 sklearn/manifold/_barnes_hut_tsne.pyx |  2 +-
 sklearn/neighbors/quad_tree.pxd       |  5 +++--
 sklearn/neighbors/quad_tree.pyx       | 17 +++++++++--------
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx
index 10272e70964e6..fe9aab5ed21e3 100644
--- a/sklearn/manifold/_barnes_hut_tsne.pyx
+++ b/sklearn/manifold/_barnes_hut_tsne.pyx
@@ -180,7 +180,7 @@ cdef void compute_gradient_negative(float[:, :] pos_reference,
         # Find which nodes are summarizing and collect their centers of mass
         # deltas, and sizes, into vectorized arrays
         t1 = clock()
-        idx = qt.summarize(pos, summary, 0, 0, theta*theta)
+        idx = qt.summarize(pos, summary, theta*theta)
         t2 = clock()
         # Compute the t-SNE negative force
         # for the digits dataset, walking the tree
diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd
index 541149b3ca932..49d7f1da10e8a 100644
--- a/sklearn/neighbors/quad_tree.pxd
+++ b/sklearn/neighbors/quad_tree.pxd
@@ -80,8 +80,9 @@ cdef class _QuadTree:
     cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil
 
     # Create a summary of the Tree compare to a query point
-    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results, int cell_id=*,
-                        long idx=*, float squared_theta=*) nogil
+    cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results,
+                        float squared_theta=*, int cell_id=*, long idx=*
+                        ) nogil
 
     # Internal cell initialization methods
     cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx
index 40035f9e2a448..519fcd551082d 100644
--- a/sklearn/neighbors/quad_tree.pyx
+++ b/sklearn/neighbors/quad_tree.pyx
@@ -110,8 +110,9 @@ cdef class _QuadTree:
             capacity = 100
             self._resize(capacity)
 
-        m = np.min(X, axis=0) - 1e-3
-        M = np.max(X, axis=0) + 1e-3
+        m = np.min(X, axis=0)
+        M = np.max(X, axis=0)
+        M = np.maximum(M * 1.001, M + 1e-3)
         for i in range(self.n_dimensions):
             min_bounds[i] = m[i]
             max_bounds[i] = M[i]
@@ -187,7 +188,7 @@ cdef class _QuadTree:
         # In a leaf, the barycenter corresponds to the only point included
         # in it.
        self._insert_point_in_new_child(cell.barycenter, cell, cell.point_index,
-                                        cell.cumulative_size)
+                                            cell.cumulative_size)
         return self.insert_point(point, point_index, cell_id)
 
     # XXX: This operation is not Thread safe
@@ -376,7 +377,7 @@ cdef class _QuadTree:
                              .format(self.n_points, self.cells[0].cumulative_size))
 
     cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results,
-                        SIZE_t cell_id=0, long idx=0, float squared_theta=.5
+                        float squared_theta=.5, SIZE_t cell_id=0, long idx=0
                         ) nogil:
         """Summarize the tree compared to a query point.
 
@@ -421,12 +422,11 @@ cdef class _QuadTree:
             bint duplicate = True
             Cell* cell = &self.cells[cell_id]
 
-        idx_d = idx + self.n_dimensions
         results[idx_d] = 0.
         for i in range(self.n_dimensions):
             results[idx + i] = point[i] - cell.barycenter[i]
             results[idx_d] += results[idx + i] * results[idx + i]
-            duplicate &= fabsf(results[idx + i]) < EPSILON
+            duplicate &= fabsf(results[idx + i]) <= EPSILON
 
         # Do not compute self interactions
         if duplicate and cell.is_leaf:
@@ -440,14 +440,15 @@ cdef class _QuadTree:
         if cell.is_leaf or (
                 (cell.squared_max_width / results[idx_d]) < squared_theta):
             results[idx_d + 1] = cell.cumulative_size
-            return idx + 2 + self.n_dimensions
+            return idx + self.n_dimensions + 2
 
         else:
             # Recursively compute the summary in nodes
             for c in range(self.n_cells_per_cell):
                 child_id = cell.children[c]
                 if child_id != -1:
-                    idx = self.summarize(point, results, child_id, idx)
+                    idx = self.summarize(point, results, squared_theta,
+                                         child_id, idx)
 
         return idx
 
From 0737deb64d75d0e6748ce9dd676d81c4c7ba9b30 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Sun, 9 Jul 2017 09:52:25 +0200
Subject: [PATCH 40/55] TST add summary test for quad_tree

---
 sklearn/neighbors/quad_tree.pyx           | 57 +++++++++++++++++++++++
 sklearn/neighbors/tests/test_quad_tree.py |  4 ++
 2 files changed, 61 insertions(+)

diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx
index 519fcd551082d..20e47bc95f1f9 100644
--- a/sklearn/neighbors/quad_tree.pyx
+++ b/sklearn/neighbors/quad_tree.pyx
@@ -597,3 +597,60 @@ cdef class _QuadTree:
 
         self.capacity = capacity
         return 0
+
+    @staticmethod
+    def test_summarize():
+
+        cdef:
+            DTYPE_t[3] query_pt
+            float* summary
+            int i, n_samples, n_dimensions
+
+        n_dimensions = 2
+        n_samples = 4
+        angle = 0.9
+        offset = n_dimensions + 2
+        X = np.array([[-10., -10.], [9., 10.], [10., 9.], [10., 10.]])
+
+        n_dimensions = X.shape[1]
+        qt = _QuadTree(n_dimensions, verbose=0)
+        qt.build_tree(X)
+
+        summary = <float*> malloc(sizeof(float) * n_samples * 4)
+
+        for i in range(n_dimensions):
+            query_pt[i] = X[0, i]
+
+        # Summary should contain only 1 node with size 3 and distance to
+        # X[1:] barycenter
+        idx = qt.summarize(query_pt, summary, angle * angle)
+
+        node_dist = summary[n_dimensions]
+        node_size = summary[n_dimensions + 1]
+
+        barycenter = X[1:].mean(axis=0)
+        ds2c = ((X[0] - barycenter) ** 2).sum()
+
+        assert idx == offset
+        assert node_size == 3, "summary size = {}".format(node_size)
+        assert np.isclose(node_dist, ds2c)
+
+        # Summary should contain all 3 nodes with size 1 and distance to
+        # each point in X[1:] for ``angle=0``
+        idx = qt.summarize(query_pt, summary, 0)
+
+        node_dist = summary[n_dimensions]
+        node_size = summary[n_dimensions + 1]
+
+        barycenter = X[1:].mean(axis=0)
+        ds2c = ((X[0] - barycenter) ** 2).sum()
+
+        assert idx == 3 * (offset)
+        for i in range(3):
+            node_dist = summary[i * offset + n_dimensions]
+            node_size = summary[i * offset + n_dimensions + 1]
+
+            ds2c = ((X[0] - X[i + 1]) ** 2).sum()
+
+            assert node_size == 1, "summary 
size = {}".format(node_size) + assert np.isclose(node_dist, ds2c) diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py index 43a52ec74f988..544a7fd16a0fa 100644 --- a/sklearn/neighbors/tests/test_quad_tree.py +++ b/sklearn/neighbors/tests/test_quad_tree.py @@ -83,3 +83,7 @@ def check_insert_duplicate(n_dimensions=2): for n_dimensions in (2, 3): yield check_insert_duplicate, n_dimensions + + +def test_summarize(): + _QuadTree.test_summarize() From 39f83deeb4226d36669815a0f693e15de7278250 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Sun, 9 Jul 2017 10:12:25 +0200 Subject: [PATCH 41/55] TST fix t-SNE tests for uniform grid --- sklearn/manifold/_barnes_hut_tsne.pyx | 25 +++++-------------------- sklearn/manifold/tests/test_t_sne.py | 3 ++- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index fe9aab5ed21e3..da15b0444935e 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -85,7 +85,7 @@ cdef float compute_gradient(float[:] val_P, for i in range(start, n_samples): for ax in range(n_dimensions): coord = i * n_dimensions + ax - tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sum_Q[0]) + tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ) free(neg_f) free(pos_f) @@ -119,12 +119,12 @@ cdef float compute_gradient_positive(float[:] val_P, t1 = clock() for i in range(start, n_samples): + # Init the gradient vector for ax in range(n_dimensions): pos_f[i * n_dimensions + ax] = 0.0 + # Compute the positive interaction for the nearest neighbors for k in range(indptr[i], indptr[i+1]): j = neighbors[k] - # we don't need to exclude the i==j case since we've - # already thrown it out from the list of neighbors dij = 0.0 pij = val_P[k] for ax in range(n_dimensions): @@ -205,26 +205,11 @@ cdef void compute_gradient_negative(float[:, :] pos_reference, printf("[t-SNE] Tree: %li clock ticks | ", dta) printf("Force computation: %li clock ticks\n", dtb) + # Put sum_Q to machine EPSILON to avoid 0 divisions + sum_Q[0] = max(sum_Q[0], EPSILON) free(summary) -def calculate_edge(pos_output): - # Make the boundaries slightly outside of the data - # to avoid floating point error near the edge - left_edge = np.min(pos_output, axis=0) - right_edge = np.max(pos_output, axis=0) - center = (right_edge + left_edge) * 0.5 - width = np.maximum(np.subtract(right_edge, left_edge), EPSILON) - # Exagerate width to avoid boundary edge - printf("WIDTH %f, %f\n", float(width[0]), float(width[1])) - width = width.astype(np.float32) * 1.001 - left_edge = center - width / 2.0 - right_edge = center + width / 2.0 - printf("ROOT_x %f, %f\n", float(left_edge[0]), float(right_edge[0])) - printf("ROOT_y %f, %f\n", float(left_edge[1]), float(right_edge[1])) - return left_edge, right_edge, width - - def gradient(float[:] val_P, float[:, :] pos_output, np.int64_t[:] neighbors, diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index ece8f687807b5..b431c2ac72add 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -732,7 +732,8 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000): """Make sure that TSNE can approximately recover a uniform 2D grid""" for seed in seeds: tsne = TSNE(n_components=2, init='random', random_state=seed, - perplexity=10, n_iter=n_iter) + perplexity=10, n_iter=n_iter, method=method, + n_iter_without_progress=60) Y = 
tsne.fit_transform(X_2d_grid) # Ensure that the convergence criterion has been triggered From e24288114071134bb1fa4ebbcd0e1480030d50af Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Mon, 10 Jul 2017 11:49:17 +0200 Subject: [PATCH 42/55] CLN unified TSNE stopping criterion + comments --- sklearn/manifold/_barnes_hut_tsne.pyx | 2 +- sklearn/manifold/t_sne.py | 25 ++++++------------- sklearn/manifold/tests/test_t_sne.py | 35 ++++++++++++++------------- 3 files changed, 26 insertions(+), 36 deletions(-) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index da15b0444935e..173b9211bee1f 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -205,7 +205,7 @@ cdef void compute_gradient_negative(float[:, :] pos_reference, printf("[t-SNE] Tree: %li clock ticks | ", dta) printf("Force computation: %li clock ticks\n", dtb) - # Put sum_Q to machine EPSILON to avoid 0 divisions + # Put sum_Q to machine EPSILON to avoid divisions by 0 sum_Q[0] = max(sum_Q[0], EPSILON) free(summary) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 75db00b2e200f..50f13f55e3911 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -389,7 +389,6 @@ def _gradient_descent(objective, p0, it, n_iter, if new_error is not None: error = new_error - return p, error, i @@ -766,6 +765,8 @@ def _fit(self, X, skip_num_points=0): random_state=random_state) X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) elif self.init == 'random': + # The embedding is initialized with iid samples from Gaussians with + # standard deviation 1e-4. X_embedded = 1e-4 * random_state.randn( n_samples, self.n_components).astype(np.float32) else: @@ -797,32 +798,20 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, # we use is batch gradient descent with two stages: # * initial optimization with early exaggeration and momentum at 0.5 # * final optimization with momentum at 0.8 - # The embedding is initialized with iid samples from Gaussians with - # standard deviation 1e-4. 
        params = X_embedded.ravel()
 
-        opt_args = {"it": 0,
+        opt_args = {"it": 0, "n_iter_without_progress": EXPLORATION_N_ITER,
                     "learning_rate": self.learning_rate,
                     "verbose": self.verbose, "n_iter_check": 50,
-                    "kwargs": dict(skip_num_points=skip_num_points)}
+                    "kwargs": dict(skip_num_points=skip_num_points),
+                    "min_grad_norm": self.min_grad_norm}
+        opt_args['args'] = [P, degrees_of_freedom, n_samples,
+                            self.n_components]
 
         if self.method == 'barnes_hut':
             obj_func = _kl_divergence_bh
-            args = [P, degrees_of_freedom, n_samples,
-                    self.n_components]
-
-            opt_args['args'] = args
-            opt_args['n_iter_without_progress'] = EXPLORATION_N_ITER
-            # Don't always calculate the cost since that calculation
-            # can be nearly as expensive as the gradient
             opt_args['kwargs']['angle'] = self.angle
-            opt_args['kwargs']['verbose'] = self.verbose
         else:
             obj_func = _kl_divergence
-            opt_args['args'] = [P, degrees_of_freedom, n_samples,
-                                self.n_components]
-            opt_args['n_iter_without_progress'] = self.n_iter_without_progress
-            opt_args['min_error_diff'] = 0.0
-            opt_args['min_grad_norm'] = self.min_grad_norm
 
         # Learning schedule (part 1): do 250 iterations with lower momentum but
         # higher learning rate controlled via the early exaggeration parameter
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index b431c2ac72add..c680044b45eda 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -442,7 +442,7 @@ def test_n_iter_used():
                         method=method, early_exaggeration=1.0, n_iter=n_iter)
             tsne.fit_transform(X)
 
-            assert tsne.n_iter_final == n_iter - 1
+            assert tsne.n_iter_ == n_iter - 1
 
 
 def test_answer_gradient_two_points():
@@ -640,22 +640,23 @@ def test_barnes_hut_angle():
 def test_n_iter_without_progress():
     # Use a dummy negative n_iter_without_progress and check output on stdout
     random_state = check_random_state(0)
-    X = random_state.randn(100, 2)
-    tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8,
-                random_state=1, method='exact', n_iter=300)
-
-    old_stdout = sys.stdout
-    sys.stdout = StringIO()
-    try:
-        tsne.fit_transform(X)
-    finally:
-        out = sys.stdout.getvalue()
-        sys.stdout.close()
-        sys.stdout = old_stdout
-
-    # The output needs to contain the value of n_iter_without_progress
-    assert_in("did not make any progress during the "
-              "last -1 episodes. Finished.", out)
+    X = random_state.randn(100, 10)
+    for method in ["barnes_hut", "exact"]:
+        tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8,
+                    random_state=1, method='barnes_hut', n_iter=351)
+
+        old_stdout = sys.stdout
+        sys.stdout = StringIO()
+        try:
+            tsne.fit_transform(X)
+        finally:
+            out = sys.stdout.getvalue()
+            sys.stdout.close()
+            sys.stdout = old_stdout
+
+        # The output needs to contain the value of n_iter_without_progress
+        assert_in("did not make any progress during the "
+                  "last -1 episodes. 
Finished.", out) def test_min_grad_norm(): From b7629cc3c0910a2c71ce25cc24693a65be14e871 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Mon, 10 Jul 2017 14:54:35 +0200 Subject: [PATCH 43/55] FIX pep8+typo/ more log for tests failures --- sklearn/manifold/t_sne.py | 10 ++++++---- sklearn/manifold/tests/test_t_sne.py | 15 ++++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 50f13f55e3911..73fbb656265f2 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -800,11 +800,13 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded, # * final optimization with momentum at 0.8 params = X_embedded.ravel() - opt_args = {"it": 0, "n_iter_without_progress": EXPLORATION_N_ITER, + opt_args = {"it": 0, + "n_iter_check": 50, + "n_iter_without_progress": EXPLORATION_N_ITER, + "min_grad_norm": self.min_grad_norm, "learning_rate": self.learning_rate, - "verbose": self.verbose, "n_iter_check": 50, - "kwargs": dict(skip_num_points=skip_num_points), - "min_grad_norm": self.min_grad_norm} + "verbose": self.verbose, + "kwargs": dict(skip_num_points=skip_num_points)} opt_args['args'] = [P, degrees_of_freedom, n_samples, self.n_components] if self.method == 'barnes_hut': diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index c680044b45eda..092d86b79168c 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -256,8 +256,7 @@ def test_preserve_trustworthiness_approximately(): X = random_state.randn(50, n_components).astype(np.float32) for init in ('random', 'pca'): for method in methods: - tsne = TSNE(n_components=n_components, perplexity=25, - learning_rate=100.0, init=init, random_state=0, + tsne = TSNE(n_components=n_components, init=init, random_state=0, method=method) X_embedded = tsne.fit_transform(X) t = trustworthiness(X, X_embedded, n_neighbors=1) @@ -643,7 +642,7 @@ def test_n_iter_without_progress(): X = random_state.randn(100, 10) for method in ["barnes_hut", "exact"]: tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8, - random_state=1, method='barnes_hut', n_iter=351) + random_state=0, method=method, n_iter=351, init="random") old_stdout = sys.stdout sys.stdout = StringIO() @@ -653,10 +652,11 @@ def test_n_iter_without_progress(): out = sys.stdout.getvalue() sys.stdout.close() sys.stdout = old_stdout + print(out) # The output needs to contain the value of n_iter_without_progress assert_in("did not make any progress during the " - "last -1 episodes. Finished.", out) + "last -1 episodes. 
Finished.", out) def test_min_grad_norm(): @@ -734,7 +734,7 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000): for seed in seeds: tsne = TSNE(n_components=2, init='random', random_state=seed, perplexity=10, n_iter=n_iter, method=method, - n_iter_without_progress=60) + n_iter_without_progress=60, verbose=10) Y = tsne.fit_transform(X_2d_grid) # Ensure that the convergence criterion has been triggered @@ -750,8 +750,9 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000): smallest_to_mean = dist_to_nn.min() / np.mean(dist_to_nn) largest_to_mean = dist_to_nn.max() / np.mean(dist_to_nn) - assert 0.5 < smallest_to_mean - assert largest_to_mean < 2 + try_name = "{}_{}".format(method, seed) + assert_greater(smallest_to_mean, .5, msg=try_name) + assert_less(largest_to_mean, 2, msg=try_name) def test_uniform_grid(): From 73cb4213bf5e44f136b60f6de0439941357b9d95 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Tue, 11 Jul 2017 11:21:06 +0200 Subject: [PATCH 44/55] FIX quad_tree boundary computations --- sklearn/neighbors/quad_tree.pyx | 4 +++- sklearn/neighbors/tests/test_quad_tree.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx index 20e47bc95f1f9..991ce94fa1191 100644 --- a/sklearn/neighbors/quad_tree.pyx +++ b/sklearn/neighbors/quad_tree.pyx @@ -112,7 +112,9 @@ cdef class _QuadTree: self._resize(capacity) m = np.min(X, axis=0) M = np.max(X, axis=0) - M = np.maximum(M * 1.001, M + 1e-3) + # Scale the maximum to get all points strictly in the tree bounding box + # The 3 bounds are for positive, negative and small values + M = np.maximum(M * (1. + 1e-3 * np.sign(M)), M + 1e-3) for i in range(self.n_dimensions): min_bounds[i] = m[i] max_bounds[i] = M[i] diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py index 544a7fd16a0fa..6cfa4bcc562e2 100644 --- a/sklearn/neighbors/tests/test_quad_tree.py +++ b/sklearn/neighbors/tests/test_quad_tree.py @@ -4,6 +4,25 @@ from sklearn.utils import check_random_state +def test_quadtree_boundary_computation(): + # Introduce a point into a quad tree with boundaries not easy to compute. + Xs = [] + + # check a random case + Xs.append(np.array([[-1, 1], [-4, -1]], dtype=np.float32)) + # check the case where only 0 are inserted + Xs.append(np.array([[0, 0], [0, 0]], dtype=np.float32)) + # check the case where only negative are inserted + Xs.append(np.array([[-1, -2], [-4, 0]], dtype=np.float32)) + # check the case where only small numbers are inserted + Xs.append(np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]], dtype=np.float32)) + + for X in Xs: + tree = _QuadTree(n_dimensions=2, verbose=0) + tree.build_tree(X) + tree._check_coherence() + + def test_quadtree_similar_point(): # Introduce a point into a quad tree where a similar point already exists. # Test will hang if it doesn't complete. 
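
The bounding-box rule introduced by this boundary fix is easy to sanity-check outside of Cython. The following is a minimal NumPy sketch of the same computation (only the formula is taken from the patch; the helper name and test values are illustrative), showing that the scaled upper bound stays strictly above the per-axis data maximum for positive, negative, zero and tiny values alike, exactly the cases exercised by the new test:

    import numpy as np

    def scaled_upper_bounds(X):
        # Mirrors the patched build_tree logic: M * 1.001 alone fails when
        # the per-axis maximum is zero or negative, so the sign-aware scaling
        # is combined with an absolute 1e-3 offset.
        M = np.max(X, axis=0)
        return np.maximum(M * (1. + 1e-3 * np.sign(M)), M + 1e-3)

    for X in (np.array([[-1., 1.], [-4., -1.]]),      # random case
              np.array([[0., 0.], [0., 0.]]),         # only zeros
              np.array([[-1., -2.], [-4., 0.]]),      # only negatives
              np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]])):  # small values
        assert np.all(scaled_upper_bounds(X) > np.max(X, axis=0))
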
From ae12767a62920cc8785cb716a986031a0768333f Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Tue, 11 Jul 2017 11:23:43 +0200
Subject: [PATCH 45/55] ENH use dbl tsne_bh sum_Q to match exact solver

---
 sklearn/manifold/_barnes_hut_tsne.pyx | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx
index 173b9211bee1f..5041c1db5adb7 100644
--- a/sklearn/manifold/_barnes_hut_tsne.pyx
+++ b/sklearn/manifold/_barnes_hut_tsne.pyx
@@ -26,6 +26,7 @@ cdef extern from "math.h":
 # effectively ignoring differences near the 32bit
 # floating point precision
 cdef float EPSILON = 1e-6
+cdef float DBL_EPSILON = 1e-16
 
 # This is effectively an ifdef statement in Cython
 # It allows us to write printf debugging lines
@@ -57,7 +58,7 @@ cdef float compute_gradient(float[:] val_P,
         int ax
         long n_samples = pos_reference.shape[0]
         int n_dimensions = qt.n_dimensions
-        float[1] sum_Q
+        double[1] sum_Q
         clock_t t1, t2
         float sQ, error
 
@@ -99,7 +100,7 @@ cdef float compute_gradient_positive(float[:] val_P,
                                      float* pos_f,
                                      int n_dimensions,
                                      float dof,
-                                     float sum_Q,
+                                     double sum_Q,
                                      np.int64_t start,
                                      int verbose) nogil:
     # Sum over the following expression for i not equal to j
@@ -133,7 +134,7 @@ cdef float compute_gradient_positive(float[:] val_P,
             qij = (((1.0 + dij) / dof) ** exponent)
             dij = pij * qij
             qij /= sum_Q
-            C += pij * log((pij + EPSILON) / (qij + EPSILON))
+            C += pij * log(max(pij, EPSILON) / max(qij, EPSILON))
             for ax in range(n_dimensions):
                 pos_f[i * n_dimensions + ax] += dij * buff[ax]
     t2 = clock()
@@ -147,7 +148,7 @@ cdef float compute_gradient_positive(float[:] val_P,
 cdef void compute_gradient_negative(float[:, :] pos_reference,
                                     float* neg_f,
                                     quad_tree._QuadTree qt,
-                                    float* sum_Q,
+                                    double* sum_Q,
                                     float dof,
                                     float theta,
                                     long start,
@@ -163,7 +164,8 @@ cdef void compute_gradient_negative(float[:, :] pos_reference,
         long dtb = 0
         long offset = n_dimensions + 2
         long* l
-        float size, dist2s, qijZ, mult
+        float size, dist2s, mult
+        double qijZ
         float[1] iQ
         float[3] force, neg_force, pos
         clock_t t1, t2, t3
@@ -206,7 +208,7 @@ cdef void compute_gradient_negative(float[:, :] pos_reference,
         printf("Force computation: %li clock ticks\n", dtb)
 
     # Put sum_Q to machine EPSILON to avoid divisions by 0
-    sum_Q[0] = max(sum_Q[0], EPSILON)
+    sum_Q[0] = max(sum_Q[0], DBL_EPSILON)
     free(summary)
 
From 8a9c9e740562612b686911db947843310fa28abf Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Tue, 11 Jul 2017 11:24:04 +0200
Subject: [PATCH 46/55] CLN param description+Add private optim controls

- CLN parameter description for `n_iter_without_progress`
- ENH add TSNE._EXPLORATION_N_ITER to better control optim args in tests
- ENH add TSNE._N_ITER_CHECK to better control optim args in tests
- ENH add test that ensures barnes_hut and exact behave the same, up to
  numerical errors

---
 sklearn/manifold/t_sne.py            | 19 ++++++++++++-------
 sklearn/manifold/tests/test_t_sne.py | 26 +++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index 73fbb656265f2..6f4e375f0ba22 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -168,7 +168,7 @@ def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components,
     # np.sum(x * y) because it calls BLAS
 
     # Objective: C (Kullback-Leibler divergence of P and Q)
-    kl_divergence = 2.0 * np.dot(P, np.log(P / Q))
+    kl_divergence = 2.0 * np.dot(P, np.log(np.maximum(P, MACHINE_EPSILON) / Q))
 
    # 
Gradient: dC/dY
     # pdist always returns double precision distances. Thus we need to take
@@ -507,10 +507,10 @@ class TSNE(BaseEstimator):
         least 250.
 
     n_iter_without_progress : int, optional (default: 30)
-        Only used if method='exact'
         Maximum number of iterations without progress before we abort the
-        optimization. If method='barnes_hut' this parameter is fixed to
-        a value of 30 and cannot be changed.
+        optimization, used after 250 initial iterations with early
+        exaggeration. Note that progress is only checked every 50 iterations so
+        this value is rounded to the next multiple of 50.
 
         .. versionadded:: 0.17
            parameter *n_iter_without_progress* to control stopping criteria.
@@ -604,6 +604,11 @@ class TSNE(BaseEstimator):
         Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
         http://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
     """
+    # Control the number of exploration iterations with early_exaggeration on
+    _EXPLORATION_N_ITER = 250
+
+    # Control the number of iterations between progress checks
+    _N_ITER_CHECK = 50
 
     def __init__(self, n_components=2, perplexity=30.0,
                  early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
@@ -801,8 +806,8 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
 
         params = X_embedded.ravel()
         opt_args = {"it": 0,
-                    "n_iter_check": 50,
-                    "n_iter_without_progress": EXPLORATION_N_ITER,
+                    "n_iter_check": self._N_ITER_CHECK,
+                    "n_iter_without_progress": self._EXPLORATION_N_ITER,
                     "min_grad_norm": self.min_grad_norm,
                     "learning_rate": self.learning_rate,
                     "verbose": self.verbose,
@@ -817,7 +822,7 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
 
         # Learning schedule (part 1): do 250 iterations with lower momentum but
         # higher learning rate controlled via the early exaggeration parameter
-        opt_args['n_iter'] = EXPLORATION_N_ITER
+        opt_args['n_iter'] = self._EXPLORATION_N_ITER
         opt_args['momentum'] = 0.5
 
         P *= self.early_exaggeration
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index 092d86b79168c..1d67d74a32085 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -643,6 +643,8 @@ def test_n_iter_without_progress():
     for method in ["barnes_hut", "exact"]:
         tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8,
                     random_state=0, method=method, n_iter=351, init="random")
+        tsne._N_ITER_CHECK = 1
+        tsne._EXPLORATION_N_ITER = 0
 
         old_stdout = sys.stdout
         sys.stdout = StringIO()
@@ -734,7 +736,7 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000):
     for seed in seeds:
         tsne = TSNE(n_components=2, init='random', random_state=seed,
                     perplexity=10, n_iter=n_iter, method=method,
-                    n_iter_without_progress=60, verbose=10)
+                    n_iter_without_progress=101, verbose=10)
         Y = tsne.fit_transform(X_2d_grid)
 
         # Ensure that the convergence criterion has been triggered
@@ -758,3 +760,25 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000):
 def test_uniform_grid():
     for method in ['barnes_hut', 'exact']:
         yield check_uniform_grid, method
+
+
+def test_bh_match_exact():
+    # check that the ``barnes_hut`` method matches the exact one when
+    # ``angle = 0`` and ``perplexity > n_samples / 3``
+    random_state = check_random_state(0)
+    n_features = 10
+    X = random_state.randn(30, n_features).astype(np.float32)
+    X_embeddeds = {}
+    n_iter = {}
+    for method in ['exact', 'barnes_hut']:
+        tsne = TSNE(n_components=2, method=method, learning_rate=1.0,
+                    init="random", random_state=0, n_iter=251,
+                    perplexity=30.0, angle=0)
+        # Kill the 
early_exaggeration + tsne._EXPLORATION_N_ITER = 0 + X_embeddeds[method] = tsne.fit_transform(X) + n_iter[method] = tsne.n_iter_ + + assert n_iter['exact'] == n_iter['barnes_hut'] + assert_array_almost_equal(X_embeddeds['exact'], X_embeddeds['barnes_hut'], + decimal=3) From ec80a29fe03f4c03bfa0e6928a74477abc9524ea Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 11 Jul 2017 13:40:41 +0200 Subject: [PATCH 47/55] FIX TSNE set n_iter_without_progress=300 by default --- sklearn/manifold/t_sne.py | 8 ++++---- sklearn/manifold/tests/test_t_sne.py | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 6f4e375f0ba22..77b75a2d0ca08 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -254,7 +254,7 @@ def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, def _gradient_descent(objective, p0, it, n_iter, - n_iter_check=1, n_iter_without_progress=51, + n_iter_check=1, n_iter_without_progress=300, momentum=0.8, learning_rate=200.0, min_gain=0.01, min_grad_norm=1e-7, min_error_diff=1e-7, verbose=0, args=None, kwargs=None): @@ -282,7 +282,7 @@ def _gradient_descent(objective, p0, it, n_iter, Number of iterations before evaluating the global error. If the error is sufficiently low, we abort the optimization. - n_iter_without_progress : int, optional (default: 51) + n_iter_without_progress : int, optional (default: 300) Maximum number of iterations without progress before we abort the optimization. @@ -506,7 +506,7 @@ class TSNE(BaseEstimator): Maximum number of iterations for the optimization. Should be at least 250. - n_iter_without_progress : int, optional (default: 30) + n_iter_without_progress : int, optional (default: 300) Maximum number of iterations without progress before we abort the optimization, used after 250 initial iterations with early exaggeration. 
Note that progress is only checked every 50 iterations so @@ -612,7 +612,7 @@ class TSNE(BaseEstimator): def __init__(self, n_components=2, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, - n_iter_without_progress=30, min_grad_norm=1e-7, + n_iter_without_progress=300, min_grad_norm=1e-7, metric="euclidean", init="random", verbose=0, random_state=None, method='barnes_hut', angle=0.5): self.n_components = n_components diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 1d67d74a32085..86c7f01e19a81 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -735,8 +735,7 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000): """Make sure that TSNE can approximately recover a uniform 2D grid""" for seed in seeds: tsne = TSNE(n_components=2, init='random', random_state=seed, - perplexity=10, n_iter=n_iter, method=method, - n_iter_without_progress=101, verbose=10) + perplexity=10, n_iter=n_iter, method=method) Y = tsne.fit_transform(X_2d_grid) # Ensure that the convergence criterion has been triggered From 2b26219cb52310053b3af097e52cba350bd07ba1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 11 Jul 2017 13:46:42 +0200 Subject: [PATCH 48/55] FIX bench_tsne_mnist.py: no n_jobs for now --- benchmarks/bench_tsne_mnist.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index 4707150ac9bbd..26dde6aac3123 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -82,8 +82,6 @@ def sanitize(filename): help="if set, run the benchmark with a memory " "profiler.") parser.add_argument('--verbose', type=int, default=0) - parser.add_argument('--n_jobs', type=int, nargs="+", default=2, - help="Number of CPU used to fit sklearn.TSNE") parser.add_argument('--pca-components', type=int, default=50, help="Number of principal components for " "preprocessing.") @@ -100,17 +98,10 @@ def sanitize(filename): methods = [] # Put TSNE in methods - if isinstance(args.n_jobs, int): - tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, - verbose=args.verbose, n_jobs=args.n_jobs, n_iter=1000) - methods.append(("sklearn TSNE", - lambda data: tsne_fit_transform(tsne, data))) - elif isinstance(args.n_jobs, list): - for n_jobs in args.n_jobs: - tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, - verbose=args.verbose, n_jobs=n_jobs) - methods.append(("sklearn TSNE (n_jobs={})".format(n_jobs), - lambda data: tsne_fit_transform(tsne, data))) + tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, + verbose=args.verbose, n_iter=1000) + methods.append(("sklearn TSNE", + lambda data: tsne_fit_transform(tsne, data))) if args.bhtsne: try: From 8e0e79803c85262d2724eeb1f432257f8b12a8fc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 11 Jul 2017 20:03:07 +0200 Subject: [PATCH 49/55] FIX various optimization schedule issues in TSNE Respect class level _EXPLORATION_N_ITER. Disable min_error_diff. Fix docstring about min_grad_norm. 
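
With these fixes the schedule reduces to two calls to _gradient_descent that differ only in momentum, iteration budget, and the exaggeration applied to P. A minimal sketch of the resulting control flow (a simplified pseudo-implementation, not the sklearn code itself; `grad` stands for the chosen KL-divergence gradient, and the gains, verbosity and convergence checks handled by _gradient_descent are omitted):

    import numpy as np

    def two_stage_tsne_schedule(params, grad, P, n_iter,
                                early_exaggeration=12.0,
                                exploration_n_iter=250,
                                learning_rate=200.0):
        update = np.zeros_like(params)
        # Stage 1: exaggerated P and momentum 0.5 so clusters can form
        # freely during the first 250 exploration iterations.
        P = P * early_exaggeration
        for _ in range(exploration_n_iter):
            update = 0.5 * update - learning_rate * grad(params, P)
            params = params + update
        # Stage 2: plain P and momentum 0.8 for the remaining iterations.
        P = P / early_exaggeration
        for _ in range(n_iter - exploration_n_iter):
            update = 0.8 * update - learning_rate * grad(params, P)
            params = params + update
        return params
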
--- sklearn/manifold/t_sne.py | 51 ++++++++++------------------ sklearn/manifold/tests/test_t_sne.py | 22 ++---------- 2 files changed, 20 insertions(+), 53 deletions(-) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 77b75a2d0ca08..4585aa5a25ea2 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -28,7 +28,6 @@ MACHINE_EPSILON = np.finfo(np.double).eps -EXPLORATION_N_ITER = 250 def _joint_probabilities(distances, desired_perplexity, verbose): @@ -256,8 +255,7 @@ def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, def _gradient_descent(objective, p0, it, n_iter, n_iter_check=1, n_iter_without_progress=300, momentum=0.8, learning_rate=200.0, min_gain=0.01, - min_grad_norm=1e-7, min_error_diff=1e-7, verbose=0, - args=None, kwargs=None): + min_grad_norm=1e-7, verbose=0, args=None, kwargs=None): """Batch gradient descent with momentum and individual gains. Parameters @@ -304,10 +302,6 @@ def _gradient_descent(objective, p0, it, n_iter, If the gradient norm is below this threshold, the optimization will be aborted. - min_error_diff : float, optional (default: 1e-7) - If the absolute difference of two successive cost function values - is below this threshold, the optimization will be aborted. - verbose : int, optional (default: 0) Verbosity level. @@ -342,7 +336,7 @@ def _gradient_descent(objective, p0, it, n_iter, tic = time() for i in range(it, n_iter): - new_error, grad = objective(p, *args, **kwargs) + error, grad = objective(p, *args, **kwargs) grad_norm = linalg.norm(grad) inc = update * grad < 0.0 @@ -358,8 +352,6 @@ def _gradient_descent(objective, p0, it, n_iter, toc = time() duration = toc - tic tic = toc - error_diff = np.abs(new_error - error) - error = new_error if verbose >= 2: print("[t-SNE] Iteration %d: error = %.7f," @@ -381,14 +373,7 @@ def _gradient_descent(objective, p0, it, n_iter, print("[t-SNE] Iteration %d: gradient norm %f. Finished." % (i + 1, grad_norm)) break - if error_diff <= min_error_diff: - if verbose >= 2: - m = "[t-SNE] Iteration %d: error difference %f. Finished." - print(m % (i + 1, error_diff)) - break - if new_error is not None: - error = new_error return p, error, i @@ -516,10 +501,8 @@ class TSNE(BaseEstimator): parameter *n_iter_without_progress* to control stopping criteria. min_grad_norm : float, optional (default: 1e-7) - Only used if method='exact' If the gradient norm is below this threshold, the optimization will - be aborted. If method='barnes_hut' this parameter is fixed to a value - of 1e-3 and cannot be changed. + be stopped. 
    metric : string or callable, optional
         The metric to use when calculating distance between instances in a
@@ -805,15 +788,18 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
         # * final optimization with momentum at 0.8
 
         params = X_embedded.ravel()
-        opt_args = {"it": 0,
-                    "n_iter_check": self._N_ITER_CHECK,
-                    "n_iter_without_progress": self._EXPLORATION_N_ITER,
-                    "min_grad_norm": self.min_grad_norm,
-                    "learning_rate": self.learning_rate,
-                    "verbose": self.verbose,
-                    "kwargs": dict(skip_num_points=skip_num_points)}
-        opt_args['args'] = [P, degrees_of_freedom, n_samples,
-                            self.n_components]
+        opt_args = {
+            "it": 0,
+            "n_iter_check": self._N_ITER_CHECK,
+            "min_grad_norm": self.min_grad_norm,
+            "learning_rate": self.learning_rate,
+            "verbose": self.verbose,
+            "kwargs": dict(skip_num_points=skip_num_points),
+            "args": [P, degrees_of_freedom, n_samples, self.n_components],
+            "n_iter_without_progress": self._EXPLORATION_N_ITER,
+            "n_iter": self._EXPLORATION_N_ITER,
+            "momentum": 0.5,
+        }
         if self.method == 'barnes_hut':
             obj_func = _kl_divergence_bh
             opt_args['kwargs']['angle'] = self.angle
@@ -822,10 +808,7 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
 
         # Learning schedule (part 1): do 250 iterations with lower momentum but
         # higher learning rate controlled via the early exaggeration parameter
-        opt_args['n_iter'] = self._EXPLORATION_N_ITER
-        opt_args['momentum'] = 0.5
         P *= self.early_exaggeration
-
         params, kl_divergence, it = _gradient_descent(obj_func, params,
                                                       **opt_args)
         if self.verbose:
@@ -835,8 +818,8 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
         # Learning schedule (part 2): disable early exaggeration and finish
         # optimization with a higher momentum at 0.8
         P /= self.early_exaggeration
-        remaining = self.n_iter - EXPLORATION_N_ITER
-        if it < EXPLORATION_N_ITER or remaining > 0:
+        remaining = self.n_iter - self._EXPLORATION_N_ITER
+        if it < self._EXPLORATION_N_ITER or remaining > 0:
             opt_args['n_iter'] = self.n_iter
             opt_args['it'] = it + 1
             opt_args['momentum'] = 0.8
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index 86c7f01e19a81..22085048b7367 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -60,7 +60,7 @@ def flat_function(_):
         _, error, it = _gradient_descent(
             ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=100,
             n_iter_without_progress=100, momentum=0.0, learning_rate=0.0,
-            min_gain=0.0, min_grad_norm=1e-5, min_error_diff=0.0, verbose=2)
+            min_gain=0.0, min_grad_norm=1e-5, verbose=2)
     finally:
         out = sys.stdout.getvalue()
         sys.stdout.close()
@@ -69,22 +69,6 @@ def flat_function(_):
     assert_equal(it, 0)
     assert("gradient norm" in out)
 
-    # Error difference
-    old_stdout = sys.stdout
-    sys.stdout = StringIO()
-    try:
-        _, error, it = _gradient_descent(
-            ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=100,
-            n_iter_without_progress=100, momentum=0.0, learning_rate=0.0,
-            min_gain=0.0, min_grad_norm=0.0, min_error_diff=0.2, verbose=2)
-    finally:
-        out = sys.stdout.getvalue()
-        sys.stdout.close()
-        sys.stdout = old_stdout
-    assert_equal(error, 0.9)
-    assert_equal(it, 1)
-    assert("error difference" in out)
-
     # Maximum number of iterations without improvement
     old_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         _, error, it = _gradient_descent(
             flat_function, np.zeros(1), 0, n_iter=100,
             n_iter_without_progress=10, momentum=0.0, learning_rate=0.0,
-            min_gain=0.0, min_grad_norm=0.0, 
min_error_diff=-1.0, verbose=2) + min_gain=0.0, min_grad_norm=0.0, verbose=2) finally: out = sys.stdout.getvalue() sys.stdout.close() @@ -108,7 +92,7 @@ def flat_function(_): _, error, it = _gradient_descent( ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=11, n_iter_without_progress=100, momentum=0.0, learning_rate=0.0, - min_gain=0.0, min_grad_norm=0.0, min_error_diff=0.0, verbose=2) + min_gain=0.0, min_grad_norm=0.0, verbose=2) finally: out = sys.stdout.getvalue() sys.stdout.close() From 4a71ecb7f2f648ee671787304e5e21694b6f120a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 11 Jul 2017 20:21:53 +0200 Subject: [PATCH 50/55] TST fix broken test in tsne --- benchmarks/bench_tsne_mnist.py | 1 + sklearn/manifold/tests/test_t_sne.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index 26dde6aac3123..be12191befe38 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -138,6 +138,7 @@ def bhtsne(X): methods = [(n, profile(m)) for n, m in methods] data_size = [100, 500, 1000, 5000, 10000] + data_size = [] if args.all: data_size.append(70000) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 22085048b7367..0aecf91a0c90f 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -531,9 +531,7 @@ def test_verbose(): assert("nearest neighbors..." in out) assert("Computed conditional probabilities" in out) assert("Mean sigma" in out) - assert("Finished" in out) assert("early exaggeration" in out) - assert("Finished" in out) def test_chebyshev_metric(): From 90241125305ece7fff3c8fa90c2dc2ceae0bf246 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 11 Jul 2017 21:55:46 +0200 Subject: [PATCH 51/55] FIX debug code in bench script [ci skip] --- benchmarks/bench_tsne_mnist.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index be12191befe38..26dde6aac3123 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -138,7 +138,6 @@ def bhtsne(X): methods = [(n, profile(m)) for n, m in methods] data_size = [100, 500, 1000, 5000, 10000] - data_size = [] if args.all: data_size.append(70000) From a274ab66b5b51b6769ad7378120f3979bbd970c5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 11 Jul 2017 23:20:33 +0200 Subject: [PATCH 52/55] TST compare KL error of BH approx vs exact with angle=0 --- sklearn/manifold/tests/test_t_sne.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 0aecf91a0c90f..b62f2b508fbab 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -595,8 +595,8 @@ def test_barnes_hut_angle(): np.fill_diagonal(distances, 0.0) params = random_state.randn(n_samples, n_components) P = _joint_probabilities(distances, perplexity, verbose=0) - kl, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples, - n_components) + kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom, + n_samples, n_components) k = n_samples - 1 bt = BallTree(distances) @@ -606,15 +606,17 @@ def test_barnes_hut_angle(): for i in range(n_samples)]) assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\ abs(distances[0, neighbors_nn[0]] - distances_nn[0]) - Pbh = _joint_probabilities_nn(distances_nn, neighbors_nn, - perplexity, 
From a274ab66b5b51b6769ad7378120f3979bbd970c5 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 11 Jul 2017 23:20:33 +0200
Subject: [PATCH 52/55] TST compare KL error of BH approx vs exact with angle=0

---
 sklearn/manifold/tests/test_t_sne.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index 0aecf91a0c90f..b62f2b508fbab 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -595,8 +595,8 @@ def test_barnes_hut_angle():
         np.fill_diagonal(distances, 0.0)
         params = random_state.randn(n_samples, n_components)
         P = _joint_probabilities(distances, perplexity, verbose=0)
-        kl, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples,
-                                    n_components)
+        kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
+                                              n_samples, n_components)

         k = n_samples - 1
         bt = BallTree(distances)
@@ -606,15 +606,17 @@ def test_barnes_hut_angle():
                                  for i in range(n_samples)])
         assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\
             abs(distances[0, neighbors_nn[0]] - distances_nn[0])
-        Pbh = _joint_probabilities_nn(distances_nn, neighbors_nn,
-                                      perplexity, verbose=0)
-        kl, gradbh = _kl_divergence_bh(params, Pbh, degrees_of_freedom,
-                                       n_samples, n_components, angle=angle,
-                                       skip_num_points=0, verbose=0)
+        P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn,
+                                       perplexity, verbose=0)
+        kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom,
+                                           n_samples, n_components,
+                                           angle=angle, skip_num_points=0,
+                                           verbose=0)

         P = squareform(P)
-        Pbh = Pbh.toarray()
-        assert_array_almost_equal(Pbh, P, decimal=5)
+        P_bh = P_bh.toarray()
+        assert_array_almost_equal(P_bh, P, decimal=5)
+        assert_almost_equal(kl_exact, kl_bh, decimal=3)


 @skip_if_32bit
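Note: with angle=0 the Barnes-Hut tree is always traversed down to single
points, so the approximation should essentially reproduce the exact
gradient, which is what the new assert_almost_equal(kl_exact, kl_bh,
decimal=3) checks. A rough way to run the same comparison at the estimator
level (a sketch on arbitrary synthetic data, not part of this series):

    import numpy as np
    from sklearn.manifold import TSNE

    X = np.random.RandomState(0).randn(100, 5).astype(np.float32)
    tsne_exact = TSNE(method='exact', perplexity=10, random_state=0)
    tsne_bh = TSNE(method='barnes_hut', angle=0.0, perplexity=10,
                   random_state=0)
    tsne_exact.fit(X)
    tsne_bh.fit(X)

    # The two KL divergences should be close but not identical: even at
    # angle=0 the Barnes-Hut path builds P from nearest neighbors only.
    print(tsne_exact.kl_divergence_, tsne_bh.kl_divergence_)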
From 5b36cc7bab7a0c9b23b450cdaf02891339069058 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Wed, 12 Jul 2017 10:55:48 +0200
Subject: [PATCH 53/55] CLN verbose log more informative

---
 sklearn/manifold/_barnes_hut_tsne.pyx |  3 +-
 sklearn/manifold/t_sne.py             |  2 ++
 sklearn/neighbors/quad_tree.pxd       |  2 +-
 sklearn/neighbors/quad_tree.pyx       | 44 ++++++++++++++++++---------
 4 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx
index 5041c1db5adb7..fea6f81f783df 100644
--- a/sklearn/manifold/_barnes_hut_tsne.pyx
+++ b/sklearn/manifold/_barnes_hut_tsne.pyx
@@ -236,7 +236,8 @@ def gradient(float[:] val_P,
     assert n == indptr.shape[0] - 1, m
     if verbose > 10:
         printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
-    cdef quad_tree._QuadTree qt = quad_tree._QuadTree(pos_output.shape[1], 0)
+    cdef quad_tree._QuadTree qt = quad_tree._QuadTree(pos_output.shape[1],
+                                                      verbose)
     if verbose > 10:
         printf("[t-SNE] Inserting %li points\n", pos_output.shape[0])
     qt.build_tree(pos_output)
diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index 4585aa5a25ea2..163e8340f7b29 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -803,6 +803,8 @@ def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
         if self.method == 'barnes_hut':
             obj_func = _kl_divergence_bh
             opt_args['kwargs']['angle'] = self.angle
+            # Repeat verbose argument for _kl_divergence_bh
+            opt_args['kwargs']['verbose'] = self.verbose
         else:
             obj_func = _kl_divergence

diff --git a/sklearn/neighbors/quad_tree.pxd b/sklearn/neighbors/quad_tree.pxd
index 49d7f1da10e8a..0dc4bd3fe5f3c 100644
--- a/sklearn/neighbors/quad_tree.pxd
+++ b/sklearn/neighbors/quad_tree.pxd
@@ -18,7 +18,7 @@ ctypedef np.npy_uint32 UINT32_t  # Unsigned 32 bit integer
 # It allows us to write printf debugging lines
 # and remove them at compile time
 cdef enum:
-    DEBUGFLAG = 1
+    DEBUGFLAG = 0

 cdef float EPSILON = 1e-6

diff --git a/sklearn/neighbors/quad_tree.pyx b/sklearn/neighbors/quad_tree.pyx
index 991ce94fa1191..b2cdaac84cb67 100644
--- a/sklearn/neighbors/quad_tree.pyx
+++ b/sklearn/neighbors/quad_tree.pyx
@@ -101,8 +101,10 @@ cdef class _QuadTree:

     def build_tree(self, X):
         """Build a tree from an array of points X."""
-        cdef DTYPE_t[3] pt
-        cdef DTYPE_t[3] min_bounds, max_bounds
+        cdef:
+            int i
+            DTYPE_t[3] pt
+            DTYPE_t[3] min_bounds, max_bounds

         # validate X and prepare for query
         # X = check_array(X, dtype=DTYPE_t, order='C')
             min_bounds[i] = m[i]
             max_bounds[i] = M[i]

+            if self.verbose > 10:
+                printf("[QuadTree] bounding box axis %i : [%f, %f]\n",
+                       i, min_bounds[i], max_bounds[i])
+
         # Create the initial node with boundaries from the dataset
         self._init_root(min_bounds, max_bounds)
@@ -139,7 +145,7 @@ cdef class _QuadTree:
         cdef Cell* cell = &self.cells[cell_id]
         cdef SIZE_t n_point = cell.cumulative_size

-        if self.verbose >= 10:
+        if self.verbose > 10:
             printf("[QuadTree] Inserting depth %li\n", cell.depth)

         # Assert that the point is in the right range
@@ -153,8 +159,9 @@ cdef class _QuadTree:
             for i in range(self.n_dimensions):
                 cell.barycenter[i] = point[i]
             cell.point_index = point_index
-            if self.verbose >= 10:
-                printf("[QuadTree] inserted point in cell %li\n", cell_id)
+            if self.verbose > 10:
+                printf("[QuadTree] inserted point %li in cell %li\n",
+                       point_index, cell_id)
             return cell_id

         # If the cell is not a leaf, update cell internals and
@@ -170,7 +177,7 @@ cdef class _QuadTree:

         # Insert child in the correct subtree
         selected_child = self._select_child(point, cell)
-        if self.verbose >= 10:
+        if self.verbose > 49:
            printf("[QuadTree] selected child %li\n", selected_child)
         if selected_child == -1:
             self.n_points += 1
@@ -181,7 +188,7 @@ cdef class _QuadTree:
        # split the cell in n_cells_per_cell if the point is not a duplicate.
        # If it is a duplicate, increase the size of the leaf and return.
        if self._is_duplicate(point, cell.barycenter):
-            if self.verbose >= 10:
+            if self.verbose > 10:
                printf("[QuadTree] found a duplicate!\n")
            cell.cumulative_size += 1
            self.n_points += 1
@@ -258,7 +265,7 @@ cdef class _QuadTree:
         if DEBUGFLAG:
             # Assert that the point is in the right range
             self._check_point_in_cell(point, child)
-        if self.verbose >= 10:
+        if self.verbose > 10:
             printf("[QuadTree] inserted point %li in new child %li\n",
                    point_index, cell_id)

@@ -322,13 +329,20 @@ cdef class _QuadTree:
     ) nogil except -1:
         """Check that the given point is in the cell boundaries."""

-        if self.verbose >= 10:
-            printf("[QuadTree] Checking point (%f, %f, %f) in cell %li "
-                   "([%f/%f, %f/%f, %f/%f], size %li)\n",
-                   point[0], point[1], point[2], cell.cell_id,
-                   cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1],
-                   cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2],
-                   cell.cumulative_size)
+        if self.verbose >= 50:
+            if self.n_dimensions == 3:
+                printf("[QuadTree] Checking point (%f, %f, %f) in cell %li "
+                       "([%f/%f, %f/%f, %f/%f], size %li)\n",
+                       point[0], point[1], point[2], cell.cell_id,
+                       cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1],
+                       cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2],
+                       cell.cumulative_size)
+            else:
+                printf("[QuadTree] Checking point (%f, %f) in cell %li "
+                       "([%f/%f, %f/%f], size %li)\n",
+                       point[0], point[1], cell.cell_id, cell.min_bounds[0],
+                       cell.max_bounds[0], cell.min_bounds[1],
+                       cell.max_bounds[1], cell.cumulative_size)

         for i in range(self.n_dimensions):
             if (cell.min_bounds[i] > point[i] or

From f88351989b54f45b34b7aba8c98b05ee5a73b2fe Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Wed, 12 Jul 2017 21:54:32 +0200
Subject: [PATCH 54/55] CLN left out debug print

---
 sklearn/manifold/tests/test_t_sne.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index b62f2b508fbab..2311b48ee2eae 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -638,7 +638,6 @@ def test_n_iter_without_progress():
         out = sys.stdout.getvalue()
         sys.stdout.close()
         sys.stdout = old_stdout
-        print(out)

     # The output needs to contain the value of n_iter_without_progress
     assert_in("did not make any progress during the "
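Note: most of the quad_tree.pyx hunks above only re-tier the printf
verbosity, but they sit in the tree's insertion path, whose structure is
worth spelling out: a leaf stores a single position, and a duplicate point
only increments the leaf's cumulative size instead of forcing an infinite
subdivision. A pure-Python 2-D sketch of that scheme (the real
implementation is Cython, n-dimensional, and array-backed):

    import numpy as np

    EPSILON = 1e-6  # duplicate tolerance, mirroring quad_tree.pxd

    class Cell(object):
        def __init__(self, center, width, depth=0):
            self.center = np.asarray(center, dtype=float)
            self.width = width       # side length of this square cell
            self.depth = depth
            self.point = None        # position stored while a leaf
            self.children = None     # four sub-cells once subdivided
            self.cumulative_size = 0

        def select_child(self, point):
            # Quadrant index: children ordered (-,-), (-,+), (+,-), (+,+).
            idx = ((point[0] >= self.center[0]) * 2
                   + (point[1] >= self.center[1]))
            return self.children[idx]

        def subdivide(self):
            half = self.width / 2.
            offsets = [(-1, -1), (-1, 1), (1, -1), (1, 1)]
            self.children = [Cell(self.center + half / 2. * np.array(o),
                                  half, self.depth + 1) for o in offsets]

        def insert(self, point, verbose=0):
            point = np.asarray(point, dtype=float)
            self.cumulative_size += 1
            if self.children is None:
                if self.point is None:      # empty leaf: store and stop
                    self.point = point
                    return
                if np.all(np.abs(point - self.point) <= EPSILON):
                    if verbose > 10:        # duplicate: only grow the count
                        print("[QuadTree] found a duplicate!")
                    return
                self.subdivide()            # occupied leaf: split, relocate
                old, self.point = self.point, None
                self.cumulative_size -= 1   # re-counted on re-insertion
                self.insert(old, verbose)
            self.select_child(point).insert(point, verbose)

    root = Cell(center=(0.5, 0.5), width=1.)
    for p in [(0.1, 0.2), (0.1, 0.2), (0.8, 0.3)]:
        root.insert(p, verbose=11)
    print(root.cumulative_size)  # 3, the duplicate counted in one leaf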
From e9d5c921ec5d6221141a3cf6bc1506b91d6d43e8 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Wed, 12 Jul 2017 21:55:14 +0200
Subject: [PATCH 55/55] FIX numerical precision in Barnes Hut error computation

---
 sklearn/manifold/_barnes_hut_tsne.pyx | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx
index fea6f81f783df..f08a2ced26767 100644
--- a/sklearn/manifold/_barnes_hut_tsne.pyx
+++ b/sklearn/manifold/_barnes_hut_tsne.pyx
@@ -22,11 +22,13 @@ cdef char* EMPTY_STRING = ""
 cdef extern from "math.h":
     float fabsf(float x) nogil

-# Round points differing by less than this amount
-# effectively ignoring differences near the 32bit
-# floating point precision
-cdef float EPSILON = 1e-6
-cdef float DBL_EPSILON = 1e-16
+# Smallest strictly positive value that can be represented by floating
+# point numbers for different precision levels. This is useful to avoid
+# taking the log of zero when computing the KL divergence.
+cdef float FLOAT32_TINY = np.finfo(np.float32).tiny
+
+# Useful to avoid division by zero or divergence to +inf.
+cdef float FLOAT64_EPS = np.finfo(np.float64).eps

 # This is effectively an ifdef statement in Cython
 # It allows us to write printf debugging lines
@@ -134,14 +136,14 @@ cdef float compute_gradient_positive(float[:] val_P,
             qij = (((1.0 + dij) / dof) ** exponent)
             dij = pij * qij
             qij /= sum_Q
-            C += pij * log(max(pij, EPSILON) / max(qij, EPSILON))
+            C += pij * log(max(pij, FLOAT32_TINY)
+                           / max(qij, FLOAT32_TINY))
             for ax in range(n_dimensions):
                 pos_f[i * n_dimensions + ax] += dij * buff[ax]
     t2 = clock()
     dt = ((float) (t2 - t1))
     if verbose > 10:
         printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
-
     return C

@@ -208,7 +210,7 @@ cdef void compute_gradient_negative(float[:, :] pos_reference,
             printf("Force computation: %li clock ticks\n", dtb)

     # Put sum_Q to machine EPSILON to avoid divisions by 0
    sum_Q[0] = max(sum_Q[0], FLOAT64_EPS)

     free(summary)
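Note: the two new constants deserve a gloss. np.finfo(np.float32).tiny is
the smallest positive normal float32, used as a floor before taking a log,
while np.finfo(np.float64).eps is the float64 machine epsilon, used as a
floor before dividing by sum_Q. A quick NumPy check of why the guarded form
stays finite (standalone illustration, not library code):

    import numpy as np

    print(np.finfo(np.float32).tiny)  # ~1.1754944e-38
    print(np.finfo(np.float64).eps)   # ~2.220446e-16

    tiny = np.finfo(np.float32).tiny
    p, q = 0.0, 1e-45
    # Unguarded: 0.0 * log(0.0 / 1e-45) evaluates to 0 * -inf = nan.
    # Guarded, as in compute_gradient_positive above:
    kl_term = p * np.log(max(p, tiny) / max(q, tiny))
    print(kl_term)  # 0.0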