diff --git a/benchmarks/bench_minibatch_nmf.py b/benchmarks/bench_minibatch_nmf.py new file mode 100644 index 0000000000000..3814c1eb28bca --- /dev/null +++ b/benchmarks/bench_minibatch_nmf.py @@ -0,0 +1,117 @@ + +from time import time +import pandas as pd + +from sklearn.decomposition.nmf import _beta_divergence +from sklearn.feature_extraction.text import HashingVectorizer +from sklearn.utils import gen_batches + +from nmf import NMF +from nmf_original import NMFOriginal +from nmf_original import non_negative_factorization + +import matplotlib.pyplot as plt + +# Download file from: +# https://www.dropbox.com/s/n8ynmz6jxkynvyy/enwiki_1M_first_paragraphs.csv.zip?dl=0 +# https://filesender.renater.fr/?s=download&token=88222d6d-5aee-c59b-4f34-c233b4d184e1 +df = pd.read_csv('enwiki_1M_first_paragraphs.csv') +cats = df['0'].sample(frac=1, random_state=5).astype(str) +counter = HashingVectorizer(analyzer='word', ngram_range=(1, 1), + n_features=2**12, norm=None, + alternate_sign=False) +X = counter.fit_transform(cats) +n_components = 10 +beta_loss = 'kullback-leibler' +n_train = 500000 +n_test = 10000 +batch_size = 10000 +random_state = 12 +n_batch = (n_train - 1) // batch_size + 1 +X_test = X[:n_test, :] +X = X[n_test:n_train + n_test, :] + +max_iter_nmf = [1, 5, 10, 30, 50, 100] +n_iter_minibatch_nmf = 50 + + +def get_optimal_w(X, H): + W, _, _ = non_negative_factorization( + X=X, W=None, H=H, + n_components=n_components, + init='custom', update_H=False, solver='mu', + beta_loss=beta_loss, tol=1e-4, max_iter=200, alpha=0., + l1_ratio=0., regularization=None, random_state=None, + verbose=0, shuffle=False) + return W + + +minibatch_nmf = NMF( + n_components=n_components, beta_loss=beta_loss, batch_size=batch_size, + solver='mu', random_state=random_state, max_iter=3) + +fig, ax = plt.subplots() +plt.xscale('log') +fontsize = 16 + +total_time = 0 +time_nmf = [] +loss_nmf = [] +for n_iter in range(n_iter_minibatch_nmf): + + for j, slice in enumerate(gen_batches(n=n_train, + batch_size=batch_size)): + t0 = time() + minibatch_nmf.partial_fit(X[slice]) + tf = time() - t0 + total_time += tf + if ((j % 11 == 9) and (n_iter <= 1)) or j == n_batch - 1: + time_nmf.append(total_time) + W = get_optimal_w(X_test, minibatch_nmf.components_) + loss = _beta_divergence(X_test, W, minibatch_nmf.components_, + minibatch_nmf.beta_loss) / n_test + loss_nmf.append(loss) + plt.plot(time_nmf, loss_nmf, 'b', marker='o', + label='Mini-batch NMF') + plt.pause(.01) + + print('Time MiniBatchNMF: %.1fs.' % total_time) + print('KL-div MiniBatchNMF: %.2f' % loss) + del W + +total_time = 0 +time_nmf = [] +loss_nmf = [] +for i, max_iter in enumerate(max_iter_nmf): + nmf = NMFOriginal(n_components=n_components, beta_loss=beta_loss, + solver='mu', max_iter=max_iter, + random_state=random_state, tol=0) + t0 = time() + nmf.fit(X) + tf = time() - t0 + total_time += tf + time_nmf.append(total_time) + print('Time NMF: %.1fs.' 
% total_time) + W = get_optimal_w(X_test, nmf.components_) + loss = _beta_divergence(X_test, W, nmf.components_, + nmf.beta_loss) / n_test + loss_nmf.append(loss) + print('KL-div NMF: %.2f' % loss) + plt.plot(time_nmf, loss_nmf, 'r', marker='o', label='NMF') + plt.pause(.01) + del W + +handles, labels = ax.get_legend_handles_labels() +plt.legend(handles=(handles[-1], handles[0]), + labels=(labels[-1], labels[0]), fontsize=fontsize) +plt.tick_params(axis='both', which='major', labelsize=fontsize-2) +plt.xlabel('Time (seconds)', fontsize=fontsize) +plt.ylabel(beta_loss, fontsize=fontsize) +title = 'Wikipedia articles (first paragraph)' +ax.set_title(title, fontsize=fontsize+4) + +figname = 'benchmark_nmf_wikipedia_articles.png' +print('Saving: ' + figname) +plt.savefig(figname, + transparent=False, bbox_inches='tight', pad_inches=0) +plt.show() diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 63d9d457687eb..b1fb100c5c025 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -14,13 +14,13 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin -from ..utils import check_random_state, check_array -from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm -from ..utils.extmath import safe_min -from ..utils.validation import check_is_fitted, check_non_negative -from ..exceptions import ConvergenceWarning -from .cdnmf_fast import _update_cdnmf_fast +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_random_state, check_array, gen_batches +from sklearn.utils.extmath import randomized_svd, safe_sparse_dot, squared_norm +from sklearn.utils.extmath import safe_min +from sklearn.utils.validation import check_is_fitted, check_non_negative +from sklearn.exceptions import ConvergenceWarning +from sklearn.decomposition.cdnmf_fast import _update_cdnmf_fast EPSILON = np.finfo(np.float32).eps @@ -328,7 +328,9 @@ def _initialize_nmf(X, n_components, init=None, eps=1e-6, # supported as a kwarg on ufuncs np.abs(H, H) np.abs(W, W) - return W, H + A = H.copy() + B = np.ones((n_components, n_features)) + return W, H, A, B # NNDSVD initialization U, S, V = randomized_svd(X, n_components, random_state=random_state) @@ -384,8 +386,9 @@ def _initialize_nmf(X, n_components, init=None, eps=1e-6, raise ValueError( 'Invalid init parameter: got %r instead of one of %r' % (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar'))) - - return W, H + A = H.copy() + B = np.ones((n_components, n_features)) + return W, H, A, B def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, @@ -564,7 +567,8 @@ def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, WH_safe_X_data[WH_safe_X_data == 0] = EPSILON if beta_loss == 1: - np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data, + where=(WH_safe_X_data != 0)) elif beta_loss == 0: # speeds up computation time # refer to /numpy/numpy/issues/9363 @@ -620,7 +624,9 @@ def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, return delta_W, H_sum, HHt, XHt -def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): +def _multiplicative_update_h(X, W, H, A, B, + beta_loss, l1_reg_H, l2_reg_H, gamma, + n_iter): """update H in Multiplicative Update NMF""" if beta_loss == 2: numerator = safe_sparse_dot(W.T, X) @@ -645,7 +651,8 @@ def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): 
WH_safe_X_data[WH_safe_X_data == 0] = EPSILON if beta_loss == 1: - np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data, + where=(WH_safe_X_data != 0)) elif beta_loss == 0: # speeds up computation time # refer to /numpy/numpy/issues/9363 @@ -692,17 +699,24 @@ def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): denominator = denominator + l2_reg_H * H denominator[denominator == 0] = EPSILON - numerator /= denominator - delta_H = numerator + # r = .1 + # rho = r ** (1 / n_iter) + rho = .99 + A *= rho + B *= rho + A += numerator * H + B += denominator + H = np.divide(A, B) # gamma is in ]0, 1] if gamma != 1: delta_H **= gamma - return delta_H + return H, A, B -def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', +def _fit_multiplicative_update(X, W, H, A, B, beta_loss='frobenius', + batch_size=1024, max_iter=200, tol=1e-4, l1_reg_W=0, l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True, verbose=0): @@ -783,49 +797,61 @@ def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', gamma = 1. / (beta_loss - 1.) else: gamma = 1. - + n_samples = X.shape[0] # used for the convergence criterion error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) previous_error = error_at_init H_sum, HHt, XHt = None, None, None + + n_iter_update_h_ = 1 + max_iter_update_w_ = 5 + for n_iter in range(1, max_iter + 1): # update W # H_sum, HHt and XHt are saved and reused if not update_H - delta_W, H_sum, HHt, XHt = _multiplicative_update_w( - X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, - H_sum, HHt, XHt, update_H) - W *= delta_W + for i, slice in enumerate(gen_batches(n=n_samples, + batch_size=batch_size)): - # necessary for stability with beta_loss < 1 - if beta_loss < 1: - W[W < np.finfo(np.float64).eps] = 0. - - # update H - if update_H: - delta_H = _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, - l2_reg_H, gamma) - H *= delta_H - - # These values will be recomputed since H changed - H_sum, HHt, XHt = None, None, None + for j in range(max_iter_update_w_): + delta_W, H_sum, HHt, XHt = _multiplicative_update_w( + X[slice], W[slice], H, beta_loss, l1_reg_W, l2_reg_W, + gamma, H_sum, HHt, XHt, update_H) + W[slice] *= delta_W # necessary for stability with beta_loss < 1 - if beta_loss <= 1: - H[H < np.finfo(np.float64).eps] = 0. - - # test convergence criterion every 10 iterations - if tol > 0 and n_iter % 10 == 0: - error = _beta_divergence(X, W, H, beta_loss, square_root=True) - - if verbose: - iter_time = time.time() - print("Epoch %02d reached after %.3f seconds, error: %f" % - (n_iter, iter_time - start_time, error)) - - if (previous_error - error) / error_at_init < tol: - break - previous_error = error + if beta_loss < 1: + W[slice][W[slice] < np.finfo(np.float64).eps] = 0. + + # update H + if update_H: + H, A, B = _multiplicative_update_h(X[slice], W[slice], H, + A, B, + beta_loss, l1_reg_H, + l2_reg_H, gamma, + n_iter_update_h_) + n_iter_update_h_ += 1 + + # These values will be recomputed since H changed + H_sum, HHt, XHt = None, None, None + + # necessary for stability with beta_loss < 1 + if beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0. 
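# A minimal standalone sketch (illustration only; `numerator` and `denominator`
# below are random placeholders for the per-batch multiplicative-update terms
# for H) of the discounted-accumulator update performed above: A and B keep
# rho-weighted running sums of the batch numerators and denominators, and H is
# re-estimated as their ratio.
import numpy as np

rng = np.random.RandomState(0)
n_components, n_features = 3, 5
H = rng.rand(n_components, n_features)
A = H.copy()                              # numerator history, initialised to H as in _initialize_nmf
B = np.ones((n_components, n_features))   # denominator history, initialised to ones
rho = 0.99                                # fixed forgetting factor used above
for _ in range(10):                       # one update per mini-batch
    numerator = rng.rand(n_components, n_features)     # placeholder for the batch numerator
    denominator = rng.rand(n_components, n_features)   # placeholder for the batch denominator
    A = rho * A + numerator * H
    B = rho * B + denominator
    H = A / B                             # new H is the ratio of the two running sums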
+ + # test convergence criterion every 10 iterations + if tol > 0 and n_iter % 10 == 0: + error = _beta_divergence(X, W, H, beta_loss, + square_root=True) + + if verbose: + iter_time = time.time() + print("Epoch %02d reached after %.3f seconds, error: %f" % + (n_iter, iter_time - start_time, error)) + + if (previous_error - error) / error_at_init < tol: + break + previous_error = error # do not print if we have already printed in the convergence test if verbose and (tol == 0 or n_iter % 10 != 0): @@ -836,7 +862,9 @@ def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', return W, H, n_iter -def non_negative_factorization(X, W=None, H=None, n_components=None, +def non_negative_factorization(X, W=None, H=None, A=None, B=None, + n_components=None, + batch_size=1024, init='warn', update_H=True, solver='cd', beta_loss='frobenius', tol=1e-4, max_iter=200, alpha=0., l1_ratio=0., @@ -1031,6 +1059,8 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, # check W and H, or initialize them if init == 'custom' and update_H: _check_init(H, (n_components, n_features), "NMF (input H)") + _check_init(A, (n_components, n_features), "NMF (input A)") + _check_init(B, (n_components, n_features), "NMF (input B)") _check_init(W, (n_samples, n_components), "NMF (input W)") elif not update_H: _check_init(H, (n_components, n_features), "NMF (input H)") @@ -1040,9 +1070,11 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, W = np.full((n_samples, n_components), avg) else: W = np.zeros((n_samples, n_components)) + A = None + B = None else: - W, H = _initialize_nmf(X, n_components, init=init, - random_state=random_state) + W, H, A, B = _initialize_nmf(X, n_components, init=init, + random_state=random_state) l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( alpha, l1_ratio, regularization) @@ -1056,7 +1088,8 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, shuffle=shuffle, random_state=random_state) elif solver == 'mu': - W, H, n_iter = _fit_multiplicative_update(X, W, H, beta_loss, max_iter, + W, H, n_iter = _fit_multiplicative_update(X, W, H, A, B, beta_loss, + batch_size, max_iter, tol, l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, update_H, verbose) @@ -1068,7 +1101,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, warnings.warn("Maximum number of iteration %d reached. Increase it to" " improve convergence." 
% max_iter, ConvergenceWarning) - return W, H, n_iter + return W, H, A, B, n_iter class NMF(BaseEstimator, TransformerMixin): @@ -1223,12 +1256,14 @@ class NMF(BaseEstimator, TransformerMixin): """ def __init__(self, n_components=None, init=None, solver='cd', + batch_size=1024, beta_loss='frobenius', tol=1e-4, max_iter=200, random_state=None, alpha=0., l1_ratio=0., verbose=0, shuffle=False): self.n_components = n_components self.init = init self.solver = solver + self.batch_size = batch_size self.beta_loss = beta_loss self.tol = tol self.max_iter = max_iter @@ -1263,19 +1298,22 @@ def fit_transform(self, X, y=None, W=None, H=None): """ X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) - W, H, n_iter_ = non_negative_factorization( - X=X, W=W, H=H, n_components=self.n_components, init=self.init, + W, H, A, B, n_iter_ = non_negative_factorization( + X=X, W=W, H=H, A=None, B=None, n_components=self.n_components, + batch_size=self.batch_size, init=self.init, update_H=True, solver=self.solver, beta_loss=self.beta_loss, - tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, + tol=0, max_iter=1, alpha=self.alpha, l1_ratio=self.l1_ratio, regularization='both', random_state=self.random_state, verbose=self.verbose, shuffle=self.shuffle) - + # TODO internal iters for W self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss, square_root=True) self.n_components_ = H.shape[0] self.components_ = H + self.components_numerator_ = A + self.components_denominator_ = B self.n_iter_ = n_iter_ return W @@ -1297,6 +1335,38 @@ def fit(self, X, y=None, **params): self.fit_transform(X, **params) return self + def partial_fit(self, X, y=None, **params): + if hasattr(self, 'components_'): + W = np.ones((X.shape[0], self.n_components)) + W *= np.maximum(1e-6, X.sum(axis=1).A) + W /= W.sum(axis=1, keepdims=True) + W, H, A, B, n_iter_ = non_negative_factorization( + X=X, W=W, H=self.components_, + A=self.components_numerator_, B=self.components_denominator_, + n_components=self.n_components, + batch_size=self.batch_size, init='custom', + update_H=True, solver=self.solver, beta_loss=self.beta_loss, + tol=0, max_iter=1, alpha=self.alpha, + l1_ratio=self.l1_ratio, regularization='both', + random_state=self.random_state, verbose=self.verbose, + shuffle=self.shuffle) + + # probably not necessary to compute at each time + # self.reconstruction_err_ = _beta_divergence(X, W, H, + # self.beta_loss, + # square_root=True) + + self.n_components_ = H.shape[0] + self.components_ = H + self.components_numerator_ = A + self.components_denominator_ = B + self.n_iter_ = n_iter_ + + else: + self.fit_transform(X, **params) + + return self + def transform(self, X): """Transform the data X according to the fitted NMF model @@ -1312,8 +1382,10 @@ def transform(self, X): """ check_is_fitted(self, 'n_components_') - W, _, n_iter_ = non_negative_factorization( - X=X, W=None, H=self.components_, n_components=self.n_components_, + W, _, _, _, n_iter_ = non_negative_factorization( + X=X, W=None, H=self.components_, A=None, B=None, + n_components=self.n_components_, + batch_size=self.batch_size, init=self.init, update_H=False, solver=self.solver, beta_loss=self.beta_loss, tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, l1_ratio=self.l1_ratio, regularization='both', diff --git a/sklearn/decomposition/nmf_original.py b/sklearn/decomposition/nmf_original.py new file mode 100644 index 0000000000000..d568573513f5f --- /dev/null +++ b/sklearn/decomposition/nmf_original.py @@ -0,0 +1,1341 @@ +""" Non-negative matrix 
factorization +""" +# Author: Vlad Niculae +# Lars Buitinck +# Mathieu Blondel +# Tom Dupre la Tour +# License: BSD 3 clause + +from math import sqrt +import warnings +import numbers +import time + +import numpy as np +import scipy.sparse as sp + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_random_state, check_array +from sklearn.utils.extmath import randomized_svd, safe_sparse_dot, squared_norm +from sklearn.utils.extmath import safe_min +from sklearn.utils.validation import check_is_fitted, check_non_negative +from sklearn.exceptions import ConvergenceWarning +from sklearn.decomposition.cdnmf_fast import _update_cdnmf_fast + +EPSILON = np.finfo(np.float32).eps + +INTEGER_TYPES = (numbers.Integral, np.integer) + + +def norm(x): + """Dot product-based Euclidean norm implementation + + See: http://fseoane.net/blog/2011/computing-the-vector-norm/ + + Parameters + ---------- + x : array-like + Vector for which to compute the norm + """ + return sqrt(squared_norm(x)) + + +def trace_dot(X, Y): + """Trace of np.dot(X, Y.T). + + Parameters + ---------- + X : array-like + First matrix + Y : array-like + Second matrix + """ + return np.dot(X.ravel(), Y.ravel()) + + +def _check_init(A, shape, whom): + A = check_array(A) + if np.shape(A) != shape: + raise ValueError('Array with wrong shape passed to %s. Expected %s, ' + 'but got %s ' % (whom, shape, np.shape(A))) + check_non_negative(A, whom) + if np.max(A) == 0: + raise ValueError('Array passed to %s is full of zeros.' % whom) + + +def _beta_divergence(X, W, H, beta, square_root=False): + """Compute the beta-divergence of X and dot(W, H). + + Parameters + ---------- + X : float or array-like, shape (n_samples, n_features) + + W : float or dense array-like, shape (n_samples, n_components) + + H : float or dense array-like, shape (n_components, n_features) + + beta : float, string in {'frobenius', 'kullback-leibler', 'itakura-saito'} + Parameter of the beta-divergence. + If beta == 2, this is half the Frobenius *squared* norm. + If beta == 1, this is the generalized Kullback-Leibler divergence. + If beta == 0, this is the Itakura-Saito divergence. + Else, this is the general beta-divergence. + + square_root : boolean, default False + If True, return np.sqrt(2 * res) + For beta == 2, it corresponds to the Frobenius norm. + + Returns + ------- + res : float + Beta divergence of X and np.dot(X, H) + """ + beta = _beta_loss_to_float(beta) + + # The method can be called with scalars + if not sp.issparse(X): + X = np.atleast_2d(X) + W = np.atleast_2d(W) + H = np.atleast_2d(H) + + # Frobenius norm + if beta == 2: + # Avoid the creation of the dense np.dot(W, H) if X is sparse. + if sp.issparse(X): + norm_X = np.dot(X.data, X.data) + norm_WH = trace_dot(np.dot(np.dot(W.T, W), H), H) + cross_prod = trace_dot((X * H.T), W) + res = (norm_X + norm_WH - 2. * cross_prod) / 2. + else: + res = squared_norm(X - np.dot(W, H)) / 2. 
+ + if square_root: + return np.sqrt(res * 2) + else: + return res + + if sp.issparse(X): + # compute np.dot(W, H) only where X is nonzero + WH_data = _special_sparse_dot(W, H, X).data + X_data = X.data + else: + WH = np.dot(W, H) + WH_data = WH.ravel() + X_data = X.ravel() + + # do not affect the zeros: here 0 ** (-1) = 0 and not infinity + indices = X_data > EPSILON + WH_data = WH_data[indices] + X_data = X_data[indices] + + # used to avoid division by zero + WH_data[WH_data == 0] = EPSILON + + # generalized Kullback-Leibler divergence + if beta == 1: + # fast and memory efficient computation of np.sum(np.dot(W, H)) + sum_WH = np.dot(np.sum(W, axis=0), np.sum(H, axis=1)) + # computes np.sum(X * log(X / WH)) only where X is nonzero + div = X_data / WH_data + res = np.dot(X_data, np.log(div)) + # add full np.sum(np.dot(W, H)) - np.sum(X) + res += sum_WH - X_data.sum() + + # Itakura-Saito divergence + elif beta == 0: + div = X_data / WH_data + res = np.sum(div) - np.product(X.shape) - np.sum(np.log(div)) + + # beta-divergence, beta not in (0, 1, 2) + else: + if sp.issparse(X): + # slow loop, but memory efficient computation of : + # np.sum(np.dot(W, H) ** beta) + sum_WH_beta = 0 + for i in range(X.shape[1]): + sum_WH_beta += np.sum(np.dot(W, H[:, i]) ** beta) + + else: + sum_WH_beta = np.sum(WH ** beta) + + sum_X_WH = np.dot(X_data, WH_data ** (beta - 1)) + res = (X_data ** beta).sum() - beta * sum_X_WH + res += sum_WH_beta * (beta - 1) + res /= beta * (beta - 1) + + if square_root: + return np.sqrt(2 * res) + else: + return res + + +def _special_sparse_dot(W, H, X): + """Computes np.dot(W, H), only where X is non zero.""" + if sp.issparse(X): + ii, jj = X.nonzero() + dot_vals = np.multiply(W[ii, :], H.T[jj, :]).sum(axis=1) + WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape) + return WH.tocsr() + else: + return np.dot(W, H) + + +def _compute_regularization(alpha, l1_ratio, regularization): + """Compute L1 and L2 regularization coefficients for W and H""" + alpha_H = 0. + alpha_W = 0. + if regularization in ('both', 'components'): + alpha_H = float(alpha) + if regularization in ('both', 'transformation'): + alpha_W = float(alpha) + + l1_reg_W = alpha_W * l1_ratio + l1_reg_H = alpha_H * l1_ratio + l2_reg_W = alpha_W * (1. - l1_ratio) + l2_reg_H = alpha_H * (1. - l1_ratio) + return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H + + +def _check_string_param(solver, regularization, beta_loss, init): + allowed_solver = ('cd', 'mu') + if solver not in allowed_solver: + raise ValueError( + 'Invalid solver parameter: got %r instead of one of %r' % + (solver, allowed_solver)) + + allowed_regularization = ('both', 'components', 'transformation', None) + if regularization not in allowed_regularization: + raise ValueError( + 'Invalid regularization parameter: got %r instead of one of %r' % + (regularization, allowed_regularization)) + + # 'mu' is the only solver that handles other beta losses than 'frobenius' + if solver != 'mu' and beta_loss not in (2, 'frobenius'): + raise ValueError( + 'Invalid beta_loss parameter: solver %r does not handle beta_loss' + ' = %r' % (solver, beta_loss)) + + if solver == 'mu' and init == 'nndsvd': + warnings.warn("The multiplicative update ('mu') solver cannot update " + "zeros present in the initialization, and so leads to " + "poorer results when used jointly with init='nndsvd'. 
" + "You may try init='nndsvda' or init='nndsvdar' instead.", + UserWarning) + + beta_loss = _beta_loss_to_float(beta_loss) + return beta_loss + + +def _beta_loss_to_float(beta_loss): + """Convert string beta_loss to float""" + allowed_beta_loss = {'frobenius': 2, + 'kullback-leibler': 1, + 'itakura-saito': 0} + if isinstance(beta_loss, str) and beta_loss in allowed_beta_loss: + beta_loss = allowed_beta_loss[beta_loss] + + if not isinstance(beta_loss, numbers.Number): + raise ValueError('Invalid beta_loss parameter: got %r instead ' + 'of one of %r, or a float.' % + (beta_loss, allowed_beta_loss.keys())) + return beta_loss + + +def _initialize_nmf(X, n_components, init=None, eps=1e-6, + random_state=None): + """Algorithms for NMF initialization. + + Computes an initial guess for the non-negative + rank k matrix approximation for X: X = WH + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data matrix to be decomposed. + + n_components : integer + The number of components desired in the approximation. + + init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' + Method used to initialize the procedure. + Default: None. + Valid options: + + - None: 'nndsvd' if n_components <= min(n_samples, n_features), + otherwise 'random'. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + eps : float + Truncate all values less then this in output to zero. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Used when ``random`` == 'nndsvdar' or 'random'. + + Returns + ------- + W : array-like, shape (n_samples, n_components) + Initial guesses for solving X ~= WH + + H : array-like, shape (n_components, n_features) + Initial guesses for solving X ~= WH + + References + ---------- + C. Boutsidis, E. 
Gallopoulos: SVD based initialization: A head start for + nonnegative matrix factorization - Pattern Recognition, 2008 + http://tinyurl.com/nndsvd + """ + check_non_negative(X, "NMF initialization") + n_samples, n_features = X.shape + + if (init is not None and init != 'random' + and n_components > min(n_samples, n_features)): + raise ValueError("init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)" + .format(init)) + + if init is None: + if n_components <= min(n_samples, n_features): + init = 'nndsvd' + else: + init = 'random' + + # Random initialization + if init == 'random': + avg = np.sqrt(X.mean() / n_components) + rng = check_random_state(random_state) + H = avg * rng.randn(n_components, n_features) + W = avg * rng.randn(n_samples, n_components) + # we do not write np.abs(H, out=H) to stay compatible with + # numpy 1.5 and earlier where the 'out' keyword is not + # supported as a kwarg on ufuncs + np.abs(H, H) + np.abs(W, W) + return W, H + + # NNDSVD initialization + U, S, V = randomized_svd(X, n_components, random_state=random_state) + W, H = np.zeros(U.shape), np.zeros(V.shape) + + # The leading singular triplet is non-negative + # so it can be used as is for initialization. + W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) + H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) + + for j in range(1, n_components): + x, y = U[:, j], V[j, :] + + # extract positive and negative parts of column vectors + x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) + x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) + + # and their norms + x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) + x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) + + m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm + + # choose update + if m_p > m_n: + u = x_p / x_p_nrm + v = y_p / y_p_nrm + sigma = m_p + else: + u = x_n / x_n_nrm + v = y_n / y_n_nrm + sigma = m_n + + lbd = np.sqrt(S[j] * sigma) + W[:, j] = lbd * u + H[j, :] = lbd * v + + W[W < eps] = 0 + H[H < eps] = 0 + + if init == "nndsvd": + pass + elif init == "nndsvda": + avg = X.mean() + W[W == 0] = avg + H[H == 0] = avg + elif init == "nndsvdar": + rng = check_random_state(random_state) + avg = X.mean() + W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100) + H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100) + else: + raise ValueError( + 'Invalid init parameter: got %r instead of one of %r' % + (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar'))) + + return W, H + + +def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, + random_state): + """Helper function for _fit_coordinate_descent + + Update W to minimize the objective function, iterating once over all + coordinates. By symmetry, to update H, one can call + _update_coordinate_descent(X.T, Ht, W, ...) + + """ + n_components = Ht.shape[1] + + HHt = np.dot(Ht.T, Ht) + XHt = safe_sparse_dot(X, Ht) + + # L2 regularization corresponds to increase of the diagonal of HHt + if l2_reg != 0.: + # adds l2_reg only on the diagonal + HHt.flat[::n_components + 1] += l2_reg + # L1 regularization corresponds to decrease of each element of XHt + if l1_reg != 0.: + XHt -= l1_reg + + if shuffle: + permutation = random_state.permutation(n_components) + else: + permutation = np.arange(n_components) + # The following seems to be required on 64-bit Windows w/ Python 3.5. 
+ permutation = np.asarray(permutation, dtype=np.intp) + return _update_cdnmf_fast(W, HHt, XHt, permutation) + + +def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0, + l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True, + verbose=0, shuffle=False, random_state=None): + """Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent + + The objective function is minimized with an alternating minimization of W + and H. Each minimization is done with a cyclic (up to a permutation of the + features) Coordinate Descent. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Constant matrix. + + W : array-like, shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like, shape (n_components, n_features) + Initial guess for the solution. + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + max_iter : integer, default: 200 + Maximum number of iterations before timing out. + + l1_reg_W : double, default: 0. + L1 regularization parameter for W. + + l1_reg_H : double, default: 0. + L1 regularization parameter for H. + + l2_reg_W : double, default: 0. + L2 regularization parameter for W. + + l2_reg_H : double, default: 0. + L2 regularization parameter for H. + + update_H : boolean, default: True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : integer, default: 0 + The verbosity level. + + shuffle : boolean, default: False + If true, randomize the order of coordinates in the CD solver. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Returns + ------- + W : array-like, shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : array-like, shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. + + References + ---------- + Cichocki, Andrzej, and Phan, Anh-Huy. "Fast local algorithms for + large scale nonnegative matrix and tensor factorizations." + IEICE transactions on fundamentals of electronics, communications and + computer sciences 92.3: 708-721, 2009. + """ + # so W and Ht are both in C order in memory + Ht = check_array(H.T, order='C') + X = check_array(X, accept_sparse='csr') + + rng = check_random_state(random_state) + + for n_iter in range(max_iter): + violation = 0. 
+ + # Update W + violation += _update_coordinate_descent(X, W, Ht, l1_reg_W, + l2_reg_W, shuffle, rng) + # Update H + if update_H: + violation += _update_coordinate_descent(X.T, Ht, W, l1_reg_H, + l2_reg_H, shuffle, rng) + + if n_iter == 0: + violation_init = violation + + if violation_init == 0: + break + + if verbose: + print("violation:", violation / violation_init) + + if violation / violation_init <= tol: + if verbose: + print("Converged at iteration", n_iter + 1) + break + + return W, Ht.T, n_iter + + +def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, + H_sum=None, HHt=None, XHt=None, update_H=True): + """update W in Multiplicative Update NMF""" + if beta_loss == 2: + # Numerator + if XHt is None: + XHt = safe_sparse_dot(X, H.T) + if update_H: + # avoid a copy of XHt, which will be re-computed (update_H=True) + numerator = XHt + else: + # preserve the XHt, which is not re-computed (update_H=False) + numerator = XHt.copy() + + # Denominator + if HHt is None: + HHt = np.dot(H, H.T) + denominator = np.dot(W, HHt) + + else: + # Numerator + # if X is sparse, compute WH only where X is non zero + WH_safe_X = _special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1. < 0: + WH[WH == 0] = EPSILON + + # to avoid taking a negative power of zero + if beta_loss - 2. < 0: + WH_safe_X_data[WH_safe_X_data == 0] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(X * (dot(W, H) ** (beta_loss - 2)), H.T) + numerator = safe_sparse_dot(WH_safe_X, H.T) + + # Denominator + if beta_loss == 1: + if H_sum is None: + H_sum = np.sum(H, axis=1) # shape(n_components, ) + denominator = H_sum[np.newaxis, :] + + else: + # computation of WHHt = dot(dot(W, H) ** beta_loss - 1, H.T) + if sp.issparse(X): + # memory efficient computation + # (compute row by row, avoiding the dense matrix WH) + WHHt = np.empty(W.shape) + for i in range(X.shape[0]): + WHi = np.dot(W[i, :], H) + if beta_loss - 1 < 0: + WHi[WHi == 0] = EPSILON + WHi **= beta_loss - 1 + WHHt[i, :] = np.dot(WHi, H.T) + else: + WH **= beta_loss - 1 + WHHt = np.dot(WH, H.T) + denominator = WHHt + + # Add L1 and L2 regularization + if l1_reg_W > 0: + denominator += l1_reg_W + if l2_reg_W > 0: + denominator = denominator + l2_reg_W * W + denominator[denominator == 0] = EPSILON + + numerator /= denominator + delta_W = numerator + + # gamma is in ]0, 1] + if gamma != 1: + delta_W **= gamma + + return delta_W, H_sum, HHt, XHt + + +def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): + """update H in Multiplicative Update NMF""" + if beta_loss == 2: + numerator = safe_sparse_dot(W.T, X) + denominator = np.dot(np.dot(W.T, W), H) + + else: + # Numerator + WH_safe_X = _special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1. < 0: + WH[WH == 0] = EPSILON + + # to avoid division by zero + if beta_loss - 2. 
< 0: + WH_safe_X_data[WH_safe_X_data == 0] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(W.T, (dot(W, H) ** (beta_loss - 2)) * X) + numerator = safe_sparse_dot(W.T, WH_safe_X) + + # Denominator + if beta_loss == 1: + W_sum = np.sum(W, axis=0) # shape(n_components, ) + W_sum[W_sum == 0] = 1. + denominator = W_sum[:, np.newaxis] + + # beta_loss not in (1, 2) + else: + # computation of WtWH = dot(W.T, dot(W, H) ** beta_loss - 1) + if sp.issparse(X): + # memory efficient computation + # (compute column by column, avoiding the dense matrix WH) + WtWH = np.empty(H.shape) + for i in range(X.shape[1]): + WHi = np.dot(W, H[:, i]) + if beta_loss - 1 < 0: + WHi[WHi == 0] = EPSILON + WHi **= beta_loss - 1 + WtWH[:, i] = np.dot(W.T, WHi) + else: + WH **= beta_loss - 1 + WtWH = np.dot(W.T, WH) + denominator = WtWH + + # Add L1 and L2 regularization + if l1_reg_H > 0: + denominator += l1_reg_H + if l2_reg_H > 0: + denominator = denominator + l2_reg_H * H + denominator[denominator == 0] = EPSILON + + numerator /= denominator + delta_H = numerator + + # gamma is in ]0, 1] + if gamma != 1: + delta_H **= gamma + + return delta_H + + +def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', + max_iter=200, tol=1e-4, + l1_reg_W=0, l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, + update_H=True, verbose=0): + """Compute Non-negative Matrix Factorization with Multiplicative Update + + The objective function is _beta_divergence(X, WH) and is minimized with an + alternating minimization of W and H. Each minimization is done with a + Multiplicative Update. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Constant input matrix. + + W : array-like, shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like, shape (n_components, n_features) + Initial guess for the solution. + + beta_loss : float or string, default 'frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. + + max_iter : integer, default: 200 + Number of iterations. + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + l1_reg_W : double, default: 0. + L1 regularization parameter for W. + + l1_reg_H : double, default: 0. + L1 regularization parameter for H. + + l2_reg_W : double, default: 0. + L2 regularization parameter for W. + + l2_reg_H : double, default: 0. + L2 regularization parameter for H. + + update_H : boolean, default: True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : integer, default: 0 + The verbosity level. + + Returns + ------- + W : array, shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : array, shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. 
+ + References + ---------- + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + start_time = time.time() + + beta_loss = _beta_loss_to_float(beta_loss) + + # gamma for Maximization-Minimization (MM) algorithm [Fevotte 2011] + if beta_loss < 1: + gamma = 1. / (2. - beta_loss) + elif beta_loss > 2: + gamma = 1. / (beta_loss - 1.) + else: + gamma = 1. + + # used for the convergence criterion + error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) + previous_error = error_at_init + + H_sum, HHt, XHt = None, None, None + for n_iter in range(1, max_iter + 1): + # update W + # H_sum, HHt and XHt are saved and reused if not update_H + delta_W, H_sum, HHt, XHt = _multiplicative_update_w( + X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, + H_sum, HHt, XHt, update_H) + W *= delta_W + + # necessary for stability with beta_loss < 1 + if beta_loss < 1: + W[W < np.finfo(np.float64).eps] = 0. + + # update H + if update_H: + delta_H = _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, + l2_reg_H, gamma) + H *= delta_H + + # These values will be recomputed since H changed + H_sum, HHt, XHt = None, None, None + + # necessary for stability with beta_loss < 1 + if beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0. + + # test convergence criterion every 10 iterations + if tol > 0 and n_iter % 10 == 0: + error = _beta_divergence(X, W, H, beta_loss, square_root=True) + + if verbose: + iter_time = time.time() + print("Epoch %02d reached after %.3f seconds, error: %f" % + (n_iter, iter_time - start_time, error)) + + if (previous_error - error) / error_at_init < tol: + break + previous_error = error + + # do not print if we have already printed in the convergence test + if verbose and (tol == 0 or n_iter % 10 != 0): + end_time = time.time() + print("Epoch %02d reached after %.3f seconds." % + (n_iter, end_time - start_time)) + + return W, H, n_iter + + +def non_negative_factorization(X, W=None, H=None, n_components=None, + init='warn', update_H=True, solver='cd', + beta_loss='frobenius', tol=1e-4, + max_iter=200, alpha=0., l1_ratio=0., + regularization=None, random_state=None, + verbose=0, shuffle=False): + r"""Compute Non-negative Matrix Factorization (NMF) + + Find two non-negative matrices (W, H) whose product approximates the non- + negative matrix X. This factorization can be used for example for + dimensionality reduction, source separation or topic extraction. + + The objective function is:: + + 0.5 * ||X - WH||_Fro^2 + + alpha * l1_ratio * ||vec(W)||_1 + + alpha * l1_ratio * ||vec(H)||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2 + + Where:: + + ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm) + ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm) + + For multiplicative-update ('mu') solver, the Frobenius norm + (0.5 * ||X - WH||_Fro^2) can be changed into another beta-divergence loss, + by changing the beta_loss parameter. + + The objective function is minimized with an alternating minimization of W + and H. If H is given and update_H=False, it solves for W only. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Constant matrix. + + W : array-like, shape (n_samples, n_components) + If init='custom', it is used as initial guess for the solution. + + H : array-like, shape (n_components, n_features) + If init='custom', it is used as initial guess for the solution. 
+ If update_H=False, it is used as a constant, to solve for W only. + + n_components : integer + Number of components, if n_components is not set all features + are kept. + + init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' + Method used to initialize the procedure. + Default: 'random'. + + The default value will change from 'random' to None in version 0.23 + to make it consistent with decomposition.NMF. + + Valid options: + + - None: 'nndsvd' if n_components < n_features, otherwise 'random'. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + update_H : boolean, default: True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + solver : 'cd' | 'mu' + Numerical solver to use: + 'cd' is a Coordinate Descent solver that uses Fast Hierarchical + Alternating Least Squares (Fast HALS). + 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or string, default 'frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + max_iter : integer, default: 200 + Maximum number of iterations before timing out. + + alpha : double, default: 0. + Constant that multiplies the regularization terms. + + l1_ratio : double, default: 0. + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + regularization : 'both' | 'components' | 'transformation' | None + Select whether the regularization affects the components (H), the + transformation (W), both or none of them. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + verbose : integer, default: 0 + The verbosity level. + + shuffle : boolean, default: False + If true, randomize the order of coordinates in the CD solver. + + Returns + ------- + W : array-like, shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : array-like, shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + Actual number of iterations. 
+ + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import non_negative_factorization + >>> W, H, n_iter = non_negative_factorization(X, n_components=2, + ... init='random', random_state=0) + + References + ---------- + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for + large scale nonnegative matrix and tensor factorizations." + IEICE transactions on fundamentals of electronics, communications and + computer sciences 92.3: 708-721, 2009. + + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + + X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) + check_non_negative(X, "NMF (input X)") + beta_loss = _check_string_param(solver, regularization, beta_loss, init) + + if safe_min(X) == 0 and beta_loss <= 0: + raise ValueError("When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values to " + "X, or use a positive beta_loss.") + + n_samples, n_features = X.shape + if n_components is None: + n_components = n_features + + if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0: + raise ValueError("Number of components must be a positive integer;" + " got (n_components=%r)" % n_components) + if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0: + raise ValueError("Maximum number of iterations must be a positive " + "integer; got (max_iter=%r)" % max_iter) + if not isinstance(tol, numbers.Number) or tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol=%r)" % tol) + + if init == "warn": + if n_components < n_features: + warnings.warn("The default value of init will change from " + "random to None in 0.23 to make it consistent " + "with decomposition.NMF.", FutureWarning) + init = "random" + + # check W and H, or initialize them + if init == 'custom' and update_H: + _check_init(H, (n_components, n_features), "NMF (input H)") + _check_init(W, (n_samples, n_components), "NMF (input W)") + elif not update_H: + _check_init(H, (n_components, n_features), "NMF (input H)") + # 'mu' solver should not be initialized by zeros + if solver == 'mu': + avg = np.sqrt(X.mean() / n_components) + W = np.full((n_samples, n_components), avg) + else: + W = np.zeros((n_samples, n_components)) + else: + W, H = _initialize_nmf(X, n_components, init=init, + random_state=random_state) + + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( + alpha, l1_ratio, regularization) + + if solver == 'cd': + W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter, + l1_reg_W, l1_reg_H, + l2_reg_W, l2_reg_H, + update_H=update_H, + verbose=verbose, + shuffle=shuffle, + random_state=random_state) + elif solver == 'mu': + W, H, n_iter = _fit_multiplicative_update(X, W, H, beta_loss, max_iter, + tol, l1_reg_W, l1_reg_H, + l2_reg_W, l2_reg_H, update_H, + verbose) + + else: + raise ValueError("Invalid solver parameter '%s'." % solver) + + if n_iter == max_iter and tol > 0: + warnings.warn("Maximum number of iteration %d reached. Increase it to" + " improve convergence." % max_iter, ConvergenceWarning) + + return W, H, n_iter + + +class NMFOriginal(BaseEstimator, TransformerMixin): + r"""Non-Negative Matrix Factorization (NMF) + + Find two non-negative matrices (W, H) whose product approximates the non- + negative matrix X. 
This factorization can be used for example for + dimensionality reduction, source separation or topic extraction. + + The objective function is:: + + 0.5 * ||X - WH||_Fro^2 + + alpha * l1_ratio * ||vec(W)||_1 + + alpha * l1_ratio * ||vec(H)||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2 + + Where:: + + ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm) + ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm) + + For multiplicative-update ('mu') solver, the Frobenius norm + (0.5 * ||X - WH||_Fro^2) can be changed into another beta-divergence loss, + by changing the beta_loss parameter. + + The objective function is minimized with an alternating minimization of W + and H. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or None + Number of components, if n_components is not set all features + are kept. + + init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' + Method used to initialize the procedure. + Default: None. + Valid options: + + - None: 'nndsvd' if n_components <= min(n_samples, n_features), + otherwise random. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + solver : 'cd' | 'mu' + Numerical solver to use: + 'cd' is a Coordinate Descent solver. + 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or string, default 'frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + max_iter : integer, default: 200 + Maximum number of iterations before timing out. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + alpha : double, default: 0. + Constant that multiplies the regularization terms. Set it to zero to + have no regularization. + + .. versionadded:: 0.17 + *alpha* used in the Coordinate Descent solver. + + l1_ratio : double, default: 0. + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + .. versionadded:: 0.17 + Regularization parameter *l1_ratio* used in the Coordinate Descent + solver. + + verbose : bool, default=False + Whether to be verbose. 
+ + shuffle : boolean, default: False + If true, randomize the order of coordinates in the CD solver. + + .. versionadded:: 0.17 + *shuffle* parameter used in the Coordinate Descent solver. + + Attributes + ---------- + components_ : array, [n_components, n_features] + Factorization matrix, sometimes called 'dictionary'. + + reconstruction_err_ : number + Frobenius norm of the matrix difference, or beta-divergence, between + the training data ``X`` and the reconstructed data ``WH`` from + the fitted model. + + n_iter_ : int + Actual number of iterations. + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import NMF + >>> model = NMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + + References + ---------- + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for + large scale nonnegative matrix and tensor factorizations." + IEICE transactions on fundamentals of electronics, communications and + computer sciences 92.3: 708-721, 2009. + + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + + def __init__(self, n_components=None, init=None, solver='cd', + beta_loss='frobenius', tol=1e-4, max_iter=200, + random_state=None, alpha=0., l1_ratio=0., verbose=0, + shuffle=False): + self.n_components = n_components + self.init = init + self.solver = solver + self.beta_loss = beta_loss + self.tol = tol + self.max_iter = max_iter + self.random_state = random_state + self.alpha = alpha + self.l1_ratio = l1_ratio + self.verbose = verbose + self.shuffle = shuffle + + def fit_transform(self, X, y=None, W=None, H=None): + """Learn a NMF model for the data X and returns the transformed data. + + This is more efficient than calling fit followed by transform. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + W : array-like, shape (n_samples, n_components) + If init='custom', it is used as initial guess for the solution. + + H : array-like, shape (n_components, n_features) + If init='custom', it is used as initial guess for the solution. + + Returns + ------- + W : array, shape (n_samples, n_components) + Transformed data. + """ + X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) + + W, H, n_iter_ = non_negative_factorization( + X=X, W=W, H=H, n_components=self.n_components, init=self.init, + update_H=True, solver=self.solver, beta_loss=self.beta_loss, + tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, + l1_ratio=self.l1_ratio, regularization='both', + random_state=self.random_state, verbose=self.verbose, + shuffle=self.shuffle) + + self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss, + square_root=True) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_iter_ = n_iter_ + + return W + + def fit(self, X, y=None, **params): + """Learn a NMF model for the data X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + Returns + ------- + self + """ + self.fit_transform(X, **params) + return self + + def transform(self, X): + """Transform the data X according to the fitted NMF model + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix to be transformed by the model + + Returns + ------- + W : array, shape (n_samples, n_components) + Transformed data + """ + check_is_fitted(self, 'n_components_') + + W, _, n_iter_ = non_negative_factorization( + X=X, W=None, H=self.components_, n_components=self.n_components_, + init=self.init, update_H=False, solver=self.solver, + beta_loss=self.beta_loss, tol=self.tol, max_iter=self.max_iter, + alpha=self.alpha, l1_ratio=self.l1_ratio, regularization='both', + random_state=self.random_state, verbose=self.verbose, + shuffle=self.shuffle) + + return W + + def inverse_transform(self, W): + """Transform data back to its original space. + + Parameters + ---------- + W : {array-like, sparse matrix}, shape (n_samples, n_components) + Transformed data matrix + + Returns + ------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix of original shape + + .. versionadded:: 0.18 + """ + check_is_fitted(self, 'n_components_') + return np.dot(W, self.components_)
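# A minimal usage sketch of the partial_fit / batch_size API introduced in the
# patch above. It assumes the patched estimator is importable as in the
# benchmark script (`from nmf import NMF`); X here is a random non-negative
# sparse matrix standing in for the hashed word-count features.
import numpy as np
import scipy.sparse as sp
from sklearn.utils import gen_batches

from nmf import NMF  # patched class from sklearn/decomposition/nmf.py

rng = np.random.RandomState(0)
X = sp.random(10000, 4096, density=0.01, format='csr', random_state=rng)

nmf = NMF(n_components=10, solver='mu', beta_loss='kullback-leibler',
          batch_size=1000, max_iter=3, random_state=0)

# stream the data: each call updates components_ and the A/B accumulators
for batch in gen_batches(X.shape[0], 1000):
    nmf.partial_fit(X[batch])

W = nmf.transform(X)  # encode the full matrix against the learned components
print(W.shape, nmf.components_.shape)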