diff --git a/benchmarks/bench_minibatch_nmf.py b/benchmarks/bench_minibatch_nmf.py new file mode 100644 index 0000000000000..3814c1eb28bca --- /dev/null +++ b/benchmarks/bench_minibatch_nmf.py @@ -0,0 +1,117 @@ + +from time import time +import pandas as pd + +from sklearn.decomposition.nmf import _beta_divergence +from sklearn.feature_extraction.text import HashingVectorizer +from sklearn.utils import gen_batches + +from nmf import NMF +from nmf_original import NMFOriginal +from nmf_original import non_negative_factorization + +import matplotlib.pyplot as plt + +# Download file from: +# https://www.dropbox.com/s/n8ynmz6jxkynvyy/enwiki_1M_first_paragraphs.csv.zip?dl=0 +# https://filesender.renater.fr/?s=download&token=88222d6d-5aee-c59b-4f34-c233b4d184e1 +df = pd.read_csv('enwiki_1M_first_paragraphs.csv') +cats = df['0'].sample(frac=1, random_state=5).astype(str) +counter = HashingVectorizer(analyzer='word', ngram_range=(1, 1), + n_features=2**12, norm=None, + alternate_sign=False) +X = counter.fit_transform(cats) +n_components = 10 +beta_loss = 'kullback-leibler' +n_train = 500000 +n_test = 10000 +batch_size = 10000 +random_state = 12 +n_batch = (n_train - 1) // batch_size + 1 +X_test = X[:n_test, :] +X = X[n_test:n_train + n_test, :] + +max_iter_nmf = [1, 5, 10, 30, 50, 100] +n_iter_minibatch_nmf = 50 + + +def get_optimal_w(X, H): + W, _, _ = non_negative_factorization( + X=X, W=None, H=H, + n_components=n_components, + init='custom', update_H=False, solver='mu', + beta_loss=beta_loss, tol=1e-4, max_iter=200, alpha=0., + l1_ratio=0., regularization=None, random_state=None, + verbose=0, shuffle=False) + return W + + +minibatch_nmf = NMF( + n_components=n_components, beta_loss=beta_loss, batch_size=batch_size, + solver='mu', random_state=random_state, max_iter=3) + +fig, ax = plt.subplots() +plt.xscale('log') +fontsize = 16 + +total_time = 0 +time_nmf = [] +loss_nmf = [] +for n_iter in range(n_iter_minibatch_nmf): + + for j, slice in enumerate(gen_batches(n=n_train, + batch_size=batch_size)): + t0 = time() + minibatch_nmf.partial_fit(X[slice]) + tf = time() - t0 + total_time += tf + if ((j % 11 == 9) and (n_iter <= 1)) or j == n_batch - 1: + time_nmf.append(total_time) + W = get_optimal_w(X_test, minibatch_nmf.components_) + loss = _beta_divergence(X_test, W, minibatch_nmf.components_, + minibatch_nmf.beta_loss) / n_test + loss_nmf.append(loss) + plt.plot(time_nmf, loss_nmf, 'b', marker='o', + label='Mini-batch NMF') + plt.pause(.01) + + print('Time MiniBatchNMF: %.1fs.' % total_time) + print('KL-div MiniBatchNMF: %.2f' % loss) + del W + +total_time = 0 +time_nmf = [] +loss_nmf = [] +for i, max_iter in enumerate(max_iter_nmf): + nmf = NMFOriginal(n_components=n_components, beta_loss=beta_loss, + solver='mu', max_iter=max_iter, + random_state=random_state, tol=0) + t0 = time() + nmf.fit(X) + tf = time() - t0 + total_time += tf + time_nmf.append(total_time) + print('Time NMF: %.1fs.' 
% total_time) + W = get_optimal_w(X_test, nmf.components_) + loss = _beta_divergence(X_test, W, nmf.components_, + nmf.beta_loss) / n_test + loss_nmf.append(loss) + print('KL-div NMF: %.2f' % loss) + plt.plot(time_nmf, loss_nmf, 'r', marker='o', label='NMF') + plt.pause(.01) + del W + +handles, labels = ax.get_legend_handles_labels() +plt.legend(handles=(handles[-1], handles[0]), + labels=(labels[-1], labels[0]), fontsize=fontsize) +plt.tick_params(axis='both', which='major', labelsize=fontsize-2) +plt.xlabel('Time (seconds)', fontsize=fontsize) +plt.ylabel(beta_loss, fontsize=fontsize) +title = 'Wikipedia articles (first paragraph)' +ax.set_title(title, fontsize=fontsize+4) + +figname = 'benchmark_nmf_wikipedia_articles.png' +print('Saving: ' + figname) +plt.savefig(figname, + transparent=False, bbox_inches='tight', pad_inches=0) +plt.show() diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 63d9d457687eb..b1fb100c5c025 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -14,13 +14,13 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin -from ..utils import check_random_state, check_array -from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm -from ..utils.extmath import safe_min -from ..utils.validation import check_is_fitted, check_non_negative -from ..exceptions import ConvergenceWarning -from .cdnmf_fast import _update_cdnmf_fast +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_random_state, check_array, gen_batches +from sklearn.utils.extmath import randomized_svd, safe_sparse_dot, squared_norm +from sklearn.utils.extmath import safe_min +from sklearn.utils.validation import check_is_fitted, check_non_negative +from sklearn.exceptions import ConvergenceWarning +from sklearn.decomposition.cdnmf_fast import _update_cdnmf_fast EPSILON = np.finfo(np.float32).eps @@ -328,7 +328,9 @@ def _initialize_nmf(X, n_components, init=None, eps=1e-6, # supported as a kwarg on ufuncs np.abs(H, H) np.abs(W, W) - return W, H + A = H.copy() + B = np.ones((n_components, n_features)) + return W, H, A, B # NNDSVD initialization U, S, V = randomized_svd(X, n_components, random_state=random_state) @@ -384,8 +386,9 @@ def _initialize_nmf(X, n_components, init=None, eps=1e-6, raise ValueError( 'Invalid init parameter: got %r instead of one of %r' % (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar'))) - - return W, H + A = H.copy() + B = np.ones((n_components, n_features)) + return W, H, A, B def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, @@ -564,7 +567,8 @@ def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, WH_safe_X_data[WH_safe_X_data == 0] = EPSILON if beta_loss == 1: - np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data, + where=(WH_safe_X_data != 0)) elif beta_loss == 0: # speeds up computation time # refer to /numpy/numpy/issues/9363 @@ -620,7 +624,9 @@ def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, return delta_W, H_sum, HHt, XHt -def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): +def _multiplicative_update_h(X, W, H, A, B, + beta_loss, l1_reg_H, l2_reg_H, gamma, + n_iter): """update H in Multiplicative Update NMF""" if beta_loss == 2: numerator = safe_sparse_dot(W.T, X) @@ -645,7 +651,8 @@ def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): 
WH_safe_X_data[WH_safe_X_data == 0] = EPSILON if beta_loss == 1: - np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data, + where=(WH_safe_X_data != 0)) elif beta_loss == 0: # speeds up computation time # refer to /numpy/numpy/issues/9363 @@ -692,17 +699,24 @@ def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): denominator = denominator + l2_reg_H * H denominator[denominator == 0] = EPSILON - numerator /= denominator - delta_H = numerator + # r = .1 + # rho = r ** (1 / n_iter) + rho = .99 + A *= rho + B *= rho + A += numerator * H + B += denominator + H = np.divide(A, B) # gamma is in ]0, 1] if gamma != 1: delta_H **= gamma - return delta_H + return H, A, B -def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', +def _fit_multiplicative_update(X, W, H, A, B, beta_loss='frobenius', + batch_size=1024, max_iter=200, tol=1e-4, l1_reg_W=0, l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True, verbose=0): @@ -783,49 +797,61 @@ def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', gamma = 1. / (beta_loss - 1.) else: gamma = 1. - + n_samples = X.shape[0] # used for the convergence criterion error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) previous_error = error_at_init H_sum, HHt, XHt = None, None, None + + n_iter_update_h_ = 1 + max_iter_update_w_ = 5 + for n_iter in range(1, max_iter + 1): # update W # H_sum, HHt and XHt are saved and reused if not update_H - delta_W, H_sum, HHt, XHt = _multiplicative_update_w( - X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, - H_sum, HHt, XHt, update_H) - W *= delta_W + for i, slice in enumerate(gen_batches(n=n_samples, + batch_size=batch_size)): - # necessary for stability with beta_loss < 1 - if beta_loss < 1: - W[W < np.finfo(np.float64).eps] = 0. - - # update H - if update_H: - delta_H = _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, - l2_reg_H, gamma) - H *= delta_H - - # These values will be recomputed since H changed - H_sum, HHt, XHt = None, None, None + for j in range(max_iter_update_w_): + delta_W, H_sum, HHt, XHt = _multiplicative_update_w( + X[slice], W[slice], H, beta_loss, l1_reg_W, l2_reg_W, + gamma, H_sum, HHt, XHt, update_H) + W[slice] *= delta_W # necessary for stability with beta_loss < 1 - if beta_loss <= 1: - H[H < np.finfo(np.float64).eps] = 0. - - # test convergence criterion every 10 iterations - if tol > 0 and n_iter % 10 == 0: - error = _beta_divergence(X, W, H, beta_loss, square_root=True) - - if verbose: - iter_time = time.time() - print("Epoch %02d reached after %.3f seconds, error: %f" % - (n_iter, iter_time - start_time, error)) - - if (previous_error - error) / error_at_init < tol: - break - previous_error = error + if beta_loss < 1: + W[slice][W[slice] < np.finfo(np.float64).eps] = 0. + + # update H + if update_H: + H, A, B = _multiplicative_update_h(X[slice], W[slice], H, + A, B, + beta_loss, l1_reg_H, + l2_reg_H, gamma, + n_iter_update_h_) + n_iter_update_h_ += 1 + + # These values will be recomputed since H changed + H_sum, HHt, XHt = None, None, None + + # necessary for stability with beta_loss < 1 + if beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0. 
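# A minimal standalone sketch (illustration only; `numerator` and `denominator`
# below are random placeholders for the per-batch multiplicative-update terms
# for H) of the discounted-accumulator update performed above: A and B keep
# rho-weighted running sums of the batch numerators and denominators, and H is
# re-estimated as their ratio.
import numpy as np

rng = np.random.RandomState(0)
n_components, n_features = 3, 5
H = rng.rand(n_components, n_features)
A = H.copy()                              # numerator history, initialised to H as in _initialize_nmf
B = np.ones((n_components, n_features))   # denominator history, initialised to ones
rho = 0.99                                # fixed forgetting factor used above
for _ in range(10):                       # one update per mini-batch
    numerator = rng.rand(n_components, n_features)     # placeholder for the batch numerator
    denominator = rng.rand(n_components, n_features)   # placeholder for the batch denominator
    A = rho * A + numerator * H
    B = rho * B + denominator
    H = A / B                             # new H is the ratio of the two running sums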
+ + # test convergence criterion every 10 iterations + if tol > 0 and n_iter % 10 == 0: + error = _beta_divergence(X, W, H, beta_loss, + square_root=True) + + if verbose: + iter_time = time.time() + print("Epoch %02d reached after %.3f seconds, error: %f" % + (n_iter, iter_time - start_time, error)) + + if (previous_error - error) / error_at_init < tol: + break + previous_error = error # do not print if we have already printed in the convergence test if verbose and (tol == 0 or n_iter % 10 != 0): @@ -836,7 +862,9 @@ def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', return W, H, n_iter -def non_negative_factorization(X, W=None, H=None, n_components=None, +def non_negative_factorization(X, W=None, H=None, A=None, B=None, + n_components=None, + batch_size=1024, init='warn', update_H=True, solver='cd', beta_loss='frobenius', tol=1e-4, max_iter=200, alpha=0., l1_ratio=0., @@ -1031,6 +1059,8 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, # check W and H, or initialize them if init == 'custom' and update_H: _check_init(H, (n_components, n_features), "NMF (input H)") + _check_init(A, (n_components, n_features), "NMF (input A)") + _check_init(B, (n_components, n_features), "NMF (input B)") _check_init(W, (n_samples, n_components), "NMF (input W)") elif not update_H: _check_init(H, (n_components, n_features), "NMF (input H)") @@ -1040,9 +1070,11 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, W = np.full((n_samples, n_components), avg) else: W = np.zeros((n_samples, n_components)) + A = None + B = None else: - W, H = _initialize_nmf(X, n_components, init=init, - random_state=random_state) + W, H, A, B = _initialize_nmf(X, n_components, init=init, + random_state=random_state) l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( alpha, l1_ratio, regularization) @@ -1056,7 +1088,8 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, shuffle=shuffle, random_state=random_state) elif solver == 'mu': - W, H, n_iter = _fit_multiplicative_update(X, W, H, beta_loss, max_iter, + W, H, n_iter = _fit_multiplicative_update(X, W, H, A, B, beta_loss, + batch_size, max_iter, tol, l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, update_H, verbose) @@ -1068,7 +1101,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, warnings.warn("Maximum number of iteration %d reached. Increase it to" " improve convergence." 
% max_iter, ConvergenceWarning) - return W, H, n_iter + return W, H, A, B, n_iter class NMF(BaseEstimator, TransformerMixin): @@ -1223,12 +1256,14 @@ class NMF(BaseEstimator, TransformerMixin): """ def __init__(self, n_components=None, init=None, solver='cd', + batch_size=1024, beta_loss='frobenius', tol=1e-4, max_iter=200, random_state=None, alpha=0., l1_ratio=0., verbose=0, shuffle=False): self.n_components = n_components self.init = init self.solver = solver + self.batch_size = batch_size self.beta_loss = beta_loss self.tol = tol self.max_iter = max_iter @@ -1263,19 +1298,22 @@ def fit_transform(self, X, y=None, W=None, H=None): """ X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) - W, H, n_iter_ = non_negative_factorization( - X=X, W=W, H=H, n_components=self.n_components, init=self.init, + W, H, A, B, n_iter_ = non_negative_factorization( + X=X, W=W, H=H, A=None, B=None, n_components=self.n_components, + batch_size=self.batch_size, init=self.init, update_H=True, solver=self.solver, beta_loss=self.beta_loss, - tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, + tol=0, max_iter=1, alpha=self.alpha, l1_ratio=self.l1_ratio, regularization='both', random_state=self.random_state, verbose=self.verbose, shuffle=self.shuffle) - + # TODO internal iters for W self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss, square_root=True) self.n_components_ = H.shape[0] self.components_ = H + self.components_numerator_ = A + self.components_denominator_ = B self.n_iter_ = n_iter_ return W @@ -1297,6 +1335,38 @@ def fit(self, X, y=None, **params): self.fit_transform(X, **params) return self + def partial_fit(self, X, y=None, **params): + if hasattr(self, 'components_'): + W = np.ones((X.shape[0], self.n_components)) + W *= np.maximum(1e-6, X.sum(axis=1).A) + W /= W.sum(axis=1, keepdims=True) + W, H, A, B, n_iter_ = non_negative_factorization( + X=X, W=W, H=self.components_, + A=self.components_numerator_, B=self.components_denominator_, + n_components=self.n_components, + batch_size=self.batch_size, init='custom', + update_H=True, solver=self.solver, beta_loss=self.beta_loss, + tol=0, max_iter=1, alpha=self.alpha, + l1_ratio=self.l1_ratio, regularization='both', + random_state=self.random_state, verbose=self.verbose, + shuffle=self.shuffle) + + # probably not necessary to compute at each time + # self.reconstruction_err_ = _beta_divergence(X, W, H, + # self.beta_loss, + # square_root=True) + + self.n_components_ = H.shape[0] + self.components_ = H + self.components_numerator_ = A + self.components_denominator_ = B + self.n_iter_ = n_iter_ + + else: + self.fit_transform(X, **params) + + return self + def transform(self, X): """Transform the data X according to the fitted NMF model @@ -1312,8 +1382,10 @@ def transform(self, X): """ check_is_fitted(self, 'n_components_') - W, _, n_iter_ = non_negative_factorization( - X=X, W=None, H=self.components_, n_components=self.n_components_, + W, _, _, _, n_iter_ = non_negative_factorization( + X=X, W=None, H=self.components_, A=None, B=None, + n_components=self.n_components_, + batch_size=self.batch_size, init=self.init, update_H=False, solver=self.solver, beta_loss=self.beta_loss, tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, l1_ratio=self.l1_ratio, regularization='both', diff --git a/sklearn/decomposition/nmf_original.py b/sklearn/decomposition/nmf_original.py new file mode 100644 index 0000000000000..d568573513f5f --- /dev/null +++ b/sklearn/decomposition/nmf_original.py @@ -0,0 +1,1341 @@ +""" Non-negative matrix 
factorization +""" +# Author: Vlad Niculae +# Lars Buitinck +# Mathieu Blondel +# Tom Dupre la Tour +# License: BSD 3 clause + +from math import sqrt +import warnings +import numbers +import time + +import numpy as np +import scipy.sparse as sp + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_random_state, check_array +from sklearn.utils.extmath import randomized_svd, safe_sparse_dot, squared_norm +from sklearn.utils.extmath import safe_min +from sklearn.utils.validation import check_is_fitted, check_non_negative +from sklearn.exceptions import ConvergenceWarning +from sklearn.decomposition.cdnmf_fast import _update_cdnmf_fast + +EPSILON = np.finfo(np.float32).eps + +INTEGER_TYPES = (numbers.Integral, np.integer) + + +def norm(x): + """Dot product-based Euclidean norm implementation + + See: http://fseoane.net/blog/2011/computing-the-vector-norm/ + + Parameters + ---------- + x : array-like + Vector for which to compute the norm + """ + return sqrt(squared_norm(x)) + + +def trace_dot(X, Y): + """Trace of np.dot(X, Y.T). + + Parameters + ---------- + X : array-like + First matrix + Y : array-like + Second matrix + """ + return np.dot(X.ravel(), Y.ravel()) + + +def _check_init(A, shape, whom): + A = check_array(A) + if np.shape(A) != shape: + raise ValueError('Array with wrong shape passed to %s. Expected %s, ' + 'but got %s ' % (whom, shape, np.shape(A))) + check_non_negative(A, whom) + if np.max(A) == 0: + raise ValueError('Array passed to %s is full of zeros.' % whom) + + +def _beta_divergence(X, W, H, beta, square_root=False): + """Compute the beta-divergence of X and dot(W, H). + + Parameters + ---------- + X : float or array-like, shape (n_samples, n_features) + + W : float or dense array-like, shape (n_samples, n_components) + + H : float or dense array-like, shape (n_components, n_features) + + beta : float, string in {'frobenius', 'kullback-leibler', 'itakura-saito'} + Parameter of the beta-divergence. + If beta == 2, this is half the Frobenius *squared* norm. + If beta == 1, this is the generalized Kullback-Leibler divergence. + If beta == 0, this is the Itakura-Saito divergence. + Else, this is the general beta-divergence. + + square_root : boolean, default False + If True, return np.sqrt(2 * res) + For beta == 2, it corresponds to the Frobenius norm. + + Returns + ------- + res : float + Beta divergence of X and np.dot(X, H) + """ + beta = _beta_loss_to_float(beta) + + # The method can be called with scalars + if not sp.issparse(X): + X = np.atleast_2d(X) + W = np.atleast_2d(W) + H = np.atleast_2d(H) + + # Frobenius norm + if beta == 2: + # Avoid the creation of the dense np.dot(W, H) if X is sparse. + if sp.issparse(X): + norm_X = np.dot(X.data, X.data) + norm_WH = trace_dot(np.dot(np.dot(W.T, W), H), H) + cross_prod = trace_dot((X * H.T), W) + res = (norm_X + norm_WH - 2. * cross_prod) / 2. + else: + res = squared_norm(X - np.dot(W, H)) / 2. 
+ + if square_root: + return np.sqrt(res * 2) + else: + return res + + if sp.issparse(X): + # compute np.dot(W, H) only where X is nonzero + WH_data = _special_sparse_dot(W, H, X).data + X_data = X.data + else: + WH = np.dot(W, H) + WH_data = WH.ravel() + X_data = X.ravel() + + # do not affect the zeros: here 0 ** (-1) = 0 and not infinity + indices = X_data > EPSILON + WH_data = WH_data[indices] + X_data = X_data[indices] + + # used to avoid division by zero + WH_data[WH_data == 0] = EPSILON + + # generalized Kullback-Leibler divergence + if beta == 1: + # fast and memory efficient computation of np.sum(np.dot(W, H)) + sum_WH = np.dot(np.sum(W, axis=0), np.sum(H, axis=1)) + # computes np.sum(X * log(X / WH)) only where X is nonzero + div = X_data / WH_data + res = np.dot(X_data, np.log(div)) + # add full np.sum(np.dot(W, H)) - np.sum(X) + res += sum_WH - X_data.sum() + + # Itakura-Saito divergence + elif beta == 0: + div = X_data / WH_data + res = np.sum(div) - np.product(X.shape) - np.sum(np.log(div)) + + # beta-divergence, beta not in (0, 1, 2) + else: + if sp.issparse(X): + # slow loop, but memory efficient computation of : + # np.sum(np.dot(W, H) ** beta) + sum_WH_beta = 0 + for i in range(X.shape[1]): + sum_WH_beta += np.sum(np.dot(W, H[:, i]) ** beta) + + else: + sum_WH_beta = np.sum(WH ** beta) + + sum_X_WH = np.dot(X_data, WH_data ** (beta - 1)) + res = (X_data ** beta).sum() - beta * sum_X_WH + res += sum_WH_beta * (beta - 1) + res /= beta * (beta - 1) + + if square_root: + return np.sqrt(2 * res) + else: + return res + + +def _special_sparse_dot(W, H, X): + """Computes np.dot(W, H), only where X is non zero.""" + if sp.issparse(X): + ii, jj = X.nonzero() + dot_vals = np.multiply(W[ii, :], H.T[jj, :]).sum(axis=1) + WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape) + return WH.tocsr() + else: + return np.dot(W, H) + + +def _compute_regularization(alpha, l1_ratio, regularization): + """Compute L1 and L2 regularization coefficients for W and H""" + alpha_H = 0. + alpha_W = 0. + if regularization in ('both', 'components'): + alpha_H = float(alpha) + if regularization in ('both', 'transformation'): + alpha_W = float(alpha) + + l1_reg_W = alpha_W * l1_ratio + l1_reg_H = alpha_H * l1_ratio + l2_reg_W = alpha_W * (1. - l1_ratio) + l2_reg_H = alpha_H * (1. - l1_ratio) + return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H + + +def _check_string_param(solver, regularization, beta_loss, init): + allowed_solver = ('cd', 'mu') + if solver not in allowed_solver: + raise ValueError( + 'Invalid solver parameter: got %r instead of one of %r' % + (solver, allowed_solver)) + + allowed_regularization = ('both', 'components', 'transformation', None) + if regularization not in allowed_regularization: + raise ValueError( + 'Invalid regularization parameter: got %r instead of one of %r' % + (regularization, allowed_regularization)) + + # 'mu' is the only solver that handles other beta losses than 'frobenius' + if solver != 'mu' and beta_loss not in (2, 'frobenius'): + raise ValueError( + 'Invalid beta_loss parameter: solver %r does not handle beta_loss' + ' = %r' % (solver, beta_loss)) + + if solver == 'mu' and init == 'nndsvd': + warnings.warn("The multiplicative update ('mu') solver cannot update " + "zeros present in the initialization, and so leads to " + "poorer results when used jointly with init='nndsvd'. 
" + "You may try init='nndsvda' or init='nndsvdar' instead.", + UserWarning) + + beta_loss = _beta_loss_to_float(beta_loss) + return beta_loss + + +def _beta_loss_to_float(beta_loss): + """Convert string beta_loss to float""" + allowed_beta_loss = {'frobenius': 2, + 'kullback-leibler': 1, + 'itakura-saito': 0} + if isinstance(beta_loss, str) and beta_loss in allowed_beta_loss: + beta_loss = allowed_beta_loss[beta_loss] + + if not isinstance(beta_loss, numbers.Number): + raise ValueError('Invalid beta_loss parameter: got %r instead ' + 'of one of %r, or a float.' % + (beta_loss, allowed_beta_loss.keys())) + return beta_loss + + +def _initialize_nmf(X, n_components, init=None, eps=1e-6, + random_state=None): + """Algorithms for NMF initialization. + + Computes an initial guess for the non-negative + rank k matrix approximation for X: X = WH + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data matrix to be decomposed. + + n_components : integer + The number of components desired in the approximation. + + init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' + Method used to initialize the procedure. + Default: None. + Valid options: + + - None: 'nndsvd' if n_components <= min(n_samples, n_features), + otherwise 'random'. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + eps : float + Truncate all values less then this in output to zero. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Used when ``random`` == 'nndsvdar' or 'random'. + + Returns + ------- + W : array-like, shape (n_samples, n_components) + Initial guesses for solving X ~= WH + + H : array-like, shape (n_components, n_features) + Initial guesses for solving X ~= WH + + References + ---------- + C. Boutsidis, E. 
Gallopoulos: SVD based initialization: A head start for + nonnegative matrix factorization - Pattern Recognition, 2008 + http://tinyurl.com/nndsvd + """ + check_non_negative(X, "NMF initialization") + n_samples, n_features = X.shape + + if (init is not None and init != 'random' + and n_components > min(n_samples, n_features)): + raise ValueError("init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)" + .format(init)) + + if init is None: + if n_components <= min(n_samples, n_features): + init = 'nndsvd' + else: + init = 'random' + + # Random initialization + if init == 'random': + avg = np.sqrt(X.mean() / n_components) + rng = check_random_state(random_state) + H = avg * rng.randn(n_components, n_features) + W = avg * rng.randn(n_samples, n_components) + # we do not write np.abs(H, out=H) to stay compatible with + # numpy 1.5 and earlier where the 'out' keyword is not + # supported as a kwarg on ufuncs + np.abs(H, H) + np.abs(W, W) + return W, H + + # NNDSVD initialization + U, S, V = randomized_svd(X, n_components, random_state=random_state) + W, H = np.zeros(U.shape), np.zeros(V.shape) + + # The leading singular triplet is non-negative + # so it can be used as is for initialization. + W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) + H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) + + for j in range(1, n_components): + x, y = U[:, j], V[j, :] + + # extract positive and negative parts of column vectors + x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) + x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) + + # and their norms + x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) + x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) + + m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm + + # choose update + if m_p > m_n: + u = x_p / x_p_nrm + v = y_p / y_p_nrm + sigma = m_p + else: + u = x_n / x_n_nrm + v = y_n / y_n_nrm + sigma = m_n + + lbd = np.sqrt(S[j] * sigma) + W[:, j] = lbd * u + H[j, :] = lbd * v + + W[W < eps] = 0 + H[H < eps] = 0 + + if init == "nndsvd": + pass + elif init == "nndsvda": + avg = X.mean() + W[W == 0] = avg + H[H == 0] = avg + elif init == "nndsvdar": + rng = check_random_state(random_state) + avg = X.mean() + W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100) + H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100) + else: + raise ValueError( + 'Invalid init parameter: got %r instead of one of %r' % + (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar'))) + + return W, H + + +def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, + random_state): + """Helper function for _fit_coordinate_descent + + Update W to minimize the objective function, iterating once over all + coordinates. By symmetry, to update H, one can call + _update_coordinate_descent(X.T, Ht, W, ...) + + """ + n_components = Ht.shape[1] + + HHt = np.dot(Ht.T, Ht) + XHt = safe_sparse_dot(X, Ht) + + # L2 regularization corresponds to increase of the diagonal of HHt + if l2_reg != 0.: + # adds l2_reg only on the diagonal + HHt.flat[::n_components + 1] += l2_reg + # L1 regularization corresponds to decrease of each element of XHt + if l1_reg != 0.: + XHt -= l1_reg + + if shuffle: + permutation = random_state.permutation(n_components) + else: + permutation = np.arange(n_components) + # The following seems to be required on 64-bit Windows w/ Python 3.5. 
+ permutation = np.asarray(permutation, dtype=np.intp) + return _update_cdnmf_fast(W, HHt, XHt, permutation) + + +def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0, + l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True, + verbose=0, shuffle=False, random_state=None): + """Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent + + The objective function is minimized with an alternating minimization of W + and H. Each minimization is done with a cyclic (up to a permutation of the + features) Coordinate Descent. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Constant matrix. + + W : array-like, shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like, shape (n_components, n_features) + Initial guess for the solution. + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + max_iter : integer, default: 200 + Maximum number of iterations before timing out. + + l1_reg_W : double, default: 0. + L1 regularization parameter for W. + + l1_reg_H : double, default: 0. + L1 regularization parameter for H. + + l2_reg_W : double, default: 0. + L2 regularization parameter for W. + + l2_reg_H : double, default: 0. + L2 regularization parameter for H. + + update_H : boolean, default: True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : integer, default: 0 + The verbosity level. + + shuffle : boolean, default: False + If true, randomize the order of coordinates in the CD solver. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Returns + ------- + W : array-like, shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : array-like, shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. + + References + ---------- + Cichocki, Andrzej, and Phan, Anh-Huy. "Fast local algorithms for + large scale nonnegative matrix and tensor factorizations." + IEICE transactions on fundamentals of electronics, communications and + computer sciences 92.3: 708-721, 2009. + """ + # so W and Ht are both in C order in memory + Ht = check_array(H.T, order='C') + X = check_array(X, accept_sparse='csr') + + rng = check_random_state(random_state) + + for n_iter in range(max_iter): + violation = 0. 
+ + # Update W + violation += _update_coordinate_descent(X, W, Ht, l1_reg_W, + l2_reg_W, shuffle, rng) + # Update H + if update_H: + violation += _update_coordinate_descent(X.T, Ht, W, l1_reg_H, + l2_reg_H, shuffle, rng) + + if n_iter == 0: + violation_init = violation + + if violation_init == 0: + break + + if verbose: + print("violation:", violation / violation_init) + + if violation / violation_init <= tol: + if verbose: + print("Converged at iteration", n_iter + 1) + break + + return W, Ht.T, n_iter + + +def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, + H_sum=None, HHt=None, XHt=None, update_H=True): + """update W in Multiplicative Update NMF""" + if beta_loss == 2: + # Numerator + if XHt is None: + XHt = safe_sparse_dot(X, H.T) + if update_H: + # avoid a copy of XHt, which will be re-computed (update_H=True) + numerator = XHt + else: + # preserve the XHt, which is not re-computed (update_H=False) + numerator = XHt.copy() + + # Denominator + if HHt is None: + HHt = np.dot(H, H.T) + denominator = np.dot(W, HHt) + + else: + # Numerator + # if X is sparse, compute WH only where X is non zero + WH_safe_X = _special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1. < 0: + WH[WH == 0] = EPSILON + + # to avoid taking a negative power of zero + if beta_loss - 2. < 0: + WH_safe_X_data[WH_safe_X_data == 0] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(X * (dot(W, H) ** (beta_loss - 2)), H.T) + numerator = safe_sparse_dot(WH_safe_X, H.T) + + # Denominator + if beta_loss == 1: + if H_sum is None: + H_sum = np.sum(H, axis=1) # shape(n_components, ) + denominator = H_sum[np.newaxis, :] + + else: + # computation of WHHt = dot(dot(W, H) ** beta_loss - 1, H.T) + if sp.issparse(X): + # memory efficient computation + # (compute row by row, avoiding the dense matrix WH) + WHHt = np.empty(W.shape) + for i in range(X.shape[0]): + WHi = np.dot(W[i, :], H) + if beta_loss - 1 < 0: + WHi[WHi == 0] = EPSILON + WHi **= beta_loss - 1 + WHHt[i, :] = np.dot(WHi, H.T) + else: + WH **= beta_loss - 1 + WHHt = np.dot(WH, H.T) + denominator = WHHt + + # Add L1 and L2 regularization + if l1_reg_W > 0: + denominator += l1_reg_W + if l2_reg_W > 0: + denominator = denominator + l2_reg_W * W + denominator[denominator == 0] = EPSILON + + numerator /= denominator + delta_W = numerator + + # gamma is in ]0, 1] + if gamma != 1: + delta_W **= gamma + + return delta_W, H_sum, HHt, XHt + + +def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): + """update H in Multiplicative Update NMF""" + if beta_loss == 2: + numerator = safe_sparse_dot(W.T, X) + denominator = np.dot(np.dot(W.T, W), H) + + else: + # Numerator + WH_safe_X = _special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1. < 0: + WH[WH == 0] = EPSILON + + # to avoid division by zero + if beta_loss - 2. 
< 0: + WH_safe_X_data[WH_safe_X_data == 0] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(W.T, (dot(W, H) ** (beta_loss - 2)) * X) + numerator = safe_sparse_dot(W.T, WH_safe_X) + + # Denominator + if beta_loss == 1: + W_sum = np.sum(W, axis=0) # shape(n_components, ) + W_sum[W_sum == 0] = 1. + denominator = W_sum[:, np.newaxis] + + # beta_loss not in (1, 2) + else: + # computation of WtWH = dot(W.T, dot(W, H) ** beta_loss - 1) + if sp.issparse(X): + # memory efficient computation + # (compute column by column, avoiding the dense matrix WH) + WtWH = np.empty(H.shape) + for i in range(X.shape[1]): + WHi = np.dot(W, H[:, i]) + if beta_loss - 1 < 0: + WHi[WHi == 0] = EPSILON + WHi **= beta_loss - 1 + WtWH[:, i] = np.dot(W.T, WHi) + else: + WH **= beta_loss - 1 + WtWH = np.dot(W.T, WH) + denominator = WtWH + + # Add L1 and L2 regularization + if l1_reg_H > 0: + denominator += l1_reg_H + if l2_reg_H > 0: + denominator = denominator + l2_reg_H * H + denominator[denominator == 0] = EPSILON + + numerator /= denominator + delta_H = numerator + + # gamma is in ]0, 1] + if gamma != 1: + delta_H **= gamma + + return delta_H + + +def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', + max_iter=200, tol=1e-4, + l1_reg_W=0, l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, + update_H=True, verbose=0): + """Compute Non-negative Matrix Factorization with Multiplicative Update + + The objective function is _beta_divergence(X, WH) and is minimized with an + alternating minimization of W and H. Each minimization is done with a + Multiplicative Update. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Constant input matrix. + + W : array-like, shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like, shape (n_components, n_features) + Initial guess for the solution. + + beta_loss : float or string, default 'frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. + + max_iter : integer, default: 200 + Number of iterations. + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + l1_reg_W : double, default: 0. + L1 regularization parameter for W. + + l1_reg_H : double, default: 0. + L1 regularization parameter for H. + + l2_reg_W : double, default: 0. + L2 regularization parameter for W. + + l2_reg_H : double, default: 0. + L2 regularization parameter for H. + + update_H : boolean, default: True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : integer, default: 0 + The verbosity level. + + Returns + ------- + W : array, shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : array, shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. 
+ + References + ---------- + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + start_time = time.time() + + beta_loss = _beta_loss_to_float(beta_loss) + + # gamma for Maximization-Minimization (MM) algorithm [Fevotte 2011] + if beta_loss < 1: + gamma = 1. / (2. - beta_loss) + elif beta_loss > 2: + gamma = 1. / (beta_loss - 1.) + else: + gamma = 1. + + # used for the convergence criterion + error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) + previous_error = error_at_init + + H_sum, HHt, XHt = None, None, None + for n_iter in range(1, max_iter + 1): + # update W + # H_sum, HHt and XHt are saved and reused if not update_H + delta_W, H_sum, HHt, XHt = _multiplicative_update_w( + X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, + H_sum, HHt, XHt, update_H) + W *= delta_W + + # necessary for stability with beta_loss < 1 + if beta_loss < 1: + W[W < np.finfo(np.float64).eps] = 0. + + # update H + if update_H: + delta_H = _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, + l2_reg_H, gamma) + H *= delta_H + + # These values will be recomputed since H changed + H_sum, HHt, XHt = None, None, None + + # necessary for stability with beta_loss < 1 + if beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0. + + # test convergence criterion every 10 iterations + if tol > 0 and n_iter % 10 == 0: + error = _beta_divergence(X, W, H, beta_loss, square_root=True) + + if verbose: + iter_time = time.time() + print("Epoch %02d reached after %.3f seconds, error: %f" % + (n_iter, iter_time - start_time, error)) + + if (previous_error - error) / error_at_init < tol: + break + previous_error = error + + # do not print if we have already printed in the convergence test + if verbose and (tol == 0 or n_iter % 10 != 0): + end_time = time.time() + print("Epoch %02d reached after %.3f seconds." % + (n_iter, end_time - start_time)) + + return W, H, n_iter + + +def non_negative_factorization(X, W=None, H=None, n_components=None, + init='warn', update_H=True, solver='cd', + beta_loss='frobenius', tol=1e-4, + max_iter=200, alpha=0., l1_ratio=0., + regularization=None, random_state=None, + verbose=0, shuffle=False): + r"""Compute Non-negative Matrix Factorization (NMF) + + Find two non-negative matrices (W, H) whose product approximates the non- + negative matrix X. This factorization can be used for example for + dimensionality reduction, source separation or topic extraction. + + The objective function is:: + + 0.5 * ||X - WH||_Fro^2 + + alpha * l1_ratio * ||vec(W)||_1 + + alpha * l1_ratio * ||vec(H)||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2 + + Where:: + + ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm) + ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm) + + For multiplicative-update ('mu') solver, the Frobenius norm + (0.5 * ||X - WH||_Fro^2) can be changed into another beta-divergence loss, + by changing the beta_loss parameter. + + The objective function is minimized with an alternating minimization of W + and H. If H is given and update_H=False, it solves for W only. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Constant matrix. + + W : array-like, shape (n_samples, n_components) + If init='custom', it is used as initial guess for the solution. + + H : array-like, shape (n_components, n_features) + If init='custom', it is used as initial guess for the solution. 
+ If update_H=False, it is used as a constant, to solve for W only. + + n_components : integer + Number of components, if n_components is not set all features + are kept. + + init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' + Method used to initialize the procedure. + Default: 'random'. + + The default value will change from 'random' to None in version 0.23 + to make it consistent with decomposition.NMF. + + Valid options: + + - None: 'nndsvd' if n_components < n_features, otherwise 'random'. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + update_H : boolean, default: True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + solver : 'cd' | 'mu' + Numerical solver to use: + 'cd' is a Coordinate Descent solver that uses Fast Hierarchical + Alternating Least Squares (Fast HALS). + 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or string, default 'frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + max_iter : integer, default: 200 + Maximum number of iterations before timing out. + + alpha : double, default: 0. + Constant that multiplies the regularization terms. + + l1_ratio : double, default: 0. + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + regularization : 'both' | 'components' | 'transformation' | None + Select whether the regularization affects the components (H), the + transformation (W), both or none of them. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + verbose : integer, default: 0 + The verbosity level. + + shuffle : boolean, default: False + If true, randomize the order of coordinates in the CD solver. + + Returns + ------- + W : array-like, shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : array-like, shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + Actual number of iterations. 
+ + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import non_negative_factorization + >>> W, H, n_iter = non_negative_factorization(X, n_components=2, + ... init='random', random_state=0) + + References + ---------- + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for + large scale nonnegative matrix and tensor factorizations." + IEICE transactions on fundamentals of electronics, communications and + computer sciences 92.3: 708-721, 2009. + + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + + X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) + check_non_negative(X, "NMF (input X)") + beta_loss = _check_string_param(solver, regularization, beta_loss, init) + + if safe_min(X) == 0 and beta_loss <= 0: + raise ValueError("When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values to " + "X, or use a positive beta_loss.") + + n_samples, n_features = X.shape + if n_components is None: + n_components = n_features + + if not isinstance(n_components, INTEGER_TYPES) or n_components <= 0: + raise ValueError("Number of components must be a positive integer;" + " got (n_components=%r)" % n_components) + if not isinstance(max_iter, INTEGER_TYPES) or max_iter < 0: + raise ValueError("Maximum number of iterations must be a positive " + "integer; got (max_iter=%r)" % max_iter) + if not isinstance(tol, numbers.Number) or tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol=%r)" % tol) + + if init == "warn": + if n_components < n_features: + warnings.warn("The default value of init will change from " + "random to None in 0.23 to make it consistent " + "with decomposition.NMF.", FutureWarning) + init = "random" + + # check W and H, or initialize them + if init == 'custom' and update_H: + _check_init(H, (n_components, n_features), "NMF (input H)") + _check_init(W, (n_samples, n_components), "NMF (input W)") + elif not update_H: + _check_init(H, (n_components, n_features), "NMF (input H)") + # 'mu' solver should not be initialized by zeros + if solver == 'mu': + avg = np.sqrt(X.mean() / n_components) + W = np.full((n_samples, n_components), avg) + else: + W = np.zeros((n_samples, n_components)) + else: + W, H = _initialize_nmf(X, n_components, init=init, + random_state=random_state) + + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( + alpha, l1_ratio, regularization) + + if solver == 'cd': + W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter, + l1_reg_W, l1_reg_H, + l2_reg_W, l2_reg_H, + update_H=update_H, + verbose=verbose, + shuffle=shuffle, + random_state=random_state) + elif solver == 'mu': + W, H, n_iter = _fit_multiplicative_update(X, W, H, beta_loss, max_iter, + tol, l1_reg_W, l1_reg_H, + l2_reg_W, l2_reg_H, update_H, + verbose) + + else: + raise ValueError("Invalid solver parameter '%s'." % solver) + + if n_iter == max_iter and tol > 0: + warnings.warn("Maximum number of iteration %d reached. Increase it to" + " improve convergence." % max_iter, ConvergenceWarning) + + return W, H, n_iter + + +class NMFOriginal(BaseEstimator, TransformerMixin): + r"""Non-Negative Matrix Factorization (NMF) + + Find two non-negative matrices (W, H) whose product approximates the non- + negative matrix X. 
This factorization can be used for example for + dimensionality reduction, source separation or topic extraction. + + The objective function is:: + + 0.5 * ||X - WH||_Fro^2 + + alpha * l1_ratio * ||vec(W)||_1 + + alpha * l1_ratio * ||vec(H)||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2 + + Where:: + + ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm) + ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm) + + For multiplicative-update ('mu') solver, the Frobenius norm + (0.5 * ||X - WH||_Fro^2) can be changed into another beta-divergence loss, + by changing the beta_loss parameter. + + The objective function is minimized with an alternating minimization of W + and H. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or None + Number of components, if n_components is not set all features + are kept. + + init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' + Method used to initialize the procedure. + Default: None. + Valid options: + + - None: 'nndsvd' if n_components <= min(n_samples, n_features), + otherwise random. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + solver : 'cd' | 'mu' + Numerical solver to use: + 'cd' is a Coordinate Descent solver. + 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or string, default 'frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default: 1e-4 + Tolerance of the stopping condition. + + max_iter : integer, default: 200 + Maximum number of iterations before timing out. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + alpha : double, default: 0. + Constant that multiplies the regularization terms. Set it to zero to + have no regularization. + + .. versionadded:: 0.17 + *alpha* used in the Coordinate Descent solver. + + l1_ratio : double, default: 0. + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + .. versionadded:: 0.17 + Regularization parameter *l1_ratio* used in the Coordinate Descent + solver. + + verbose : bool, default=False + Whether to be verbose. 
+ + shuffle : boolean, default: False + If true, randomize the order of coordinates in the CD solver. + + .. versionadded:: 0.17 + *shuffle* parameter used in the Coordinate Descent solver. + + Attributes + ---------- + components_ : array, [n_components, n_features] + Factorization matrix, sometimes called 'dictionary'. + + reconstruction_err_ : number + Frobenius norm of the matrix difference, or beta-divergence, between + the training data ``X`` and the reconstructed data ``WH`` from + the fitted model. + + n_iter_ : int + Actual number of iterations. + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import NMF + >>> model = NMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + + References + ---------- + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for + large scale nonnegative matrix and tensor factorizations." + IEICE transactions on fundamentals of electronics, communications and + computer sciences 92.3: 708-721, 2009. + + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + + def __init__(self, n_components=None, init=None, solver='cd', + beta_loss='frobenius', tol=1e-4, max_iter=200, + random_state=None, alpha=0., l1_ratio=0., verbose=0, + shuffle=False): + self.n_components = n_components + self.init = init + self.solver = solver + self.beta_loss = beta_loss + self.tol = tol + self.max_iter = max_iter + self.random_state = random_state + self.alpha = alpha + self.l1_ratio = l1_ratio + self.verbose = verbose + self.shuffle = shuffle + + def fit_transform(self, X, y=None, W=None, H=None): + """Learn a NMF model for the data X and returns the transformed data. + + This is more efficient than calling fit followed by transform. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + W : array-like, shape (n_samples, n_components) + If init='custom', it is used as initial guess for the solution. + + H : array-like, shape (n_components, n_features) + If init='custom', it is used as initial guess for the solution. + + Returns + ------- + W : array, shape (n_samples, n_components) + Transformed data. + """ + X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float) + + W, H, n_iter_ = non_negative_factorization( + X=X, W=W, H=H, n_components=self.n_components, init=self.init, + update_H=True, solver=self.solver, beta_loss=self.beta_loss, + tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, + l1_ratio=self.l1_ratio, regularization='both', + random_state=self.random_state, verbose=self.verbose, + shuffle=self.shuffle) + + self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss, + square_root=True) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_iter_ = n_iter_ + + return W + + def fit(self, X, y=None, **params): + """Learn a NMF model for the data X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + Returns + ------- + self + """ + self.fit_transform(X, **params) + return self + + def transform(self, X): + """Transform the data X according to the fitted NMF model + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix to be transformed by the model + + Returns + ------- + W : array, shape (n_samples, n_components) + Transformed data + """ + check_is_fitted(self, 'n_components_') + + W, _, n_iter_ = non_negative_factorization( + X=X, W=None, H=self.components_, n_components=self.n_components_, + init=self.init, update_H=False, solver=self.solver, + beta_loss=self.beta_loss, tol=self.tol, max_iter=self.max_iter, + alpha=self.alpha, l1_ratio=self.l1_ratio, regularization='both', + random_state=self.random_state, verbose=self.verbose, + shuffle=self.shuffle) + + return W + + def inverse_transform(self, W): + """Transform data back to its original space. + + Parameters + ---------- + W : {array-like, sparse matrix}, shape (n_samples, n_components) + Transformed data matrix + + Returns + ------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data matrix of original shape + + .. versionadded:: 0.18 + """ + check_is_fitted(self, 'n_components_') + return np.dot(W, self.components_)
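# A minimal usage sketch of the partial_fit / batch_size API introduced in the
# patch above. It assumes the patched estimator is importable as in the
# benchmark script (`from nmf import NMF`); X here is a random non-negative
# sparse matrix standing in for the hashed word-count features.
import numpy as np
import scipy.sparse as sp
from sklearn.utils import gen_batches

from nmf import NMF  # patched class from sklearn/decomposition/nmf.py

rng = np.random.RandomState(0)
X = sp.random(10000, 4096, density=0.01, format='csr', random_state=rng)

nmf = NMF(n_components=10, solver='mu', beta_loss='kullback-leibler',
          batch_size=1000, max_iter=3, random_state=0)

# stream the data: each call updates components_ and the A/B accumulators
for batch in gen_batches(X.shape[0], 1000):
    nmf.partial_fit(X[batch])

W = nmf.transform(X)  # encode the full matrix against the learned components
print(W.shape, nmf.components_.shape)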