[MRG] Add scaling to alpha regularization parameter in NMF #5296

Closed · wants to merge 2 commits

18 changes: 11 additions & 7 deletions doc/modules/decomposition.rst
@@ -744,20 +744,24 @@
 to regularize the model. The L2 prior uses the Frobenius norm, while the L1
 prior uses an elementwise L1 norm. As in :class:`ElasticNet`, we control the
 combination of L1 and L2 with the :attr:`l1_ratio` (:math:`\rho`) parameter,
 and the intensity of the regularization with the :attr:`alpha`
-(:math:`\alpha`) parameter. Then the priors terms are:
+(:math:`\alpha`) parameter. The priors are appropriately scaled by the number
+of samples (:math:`n_{samples}`) and the number of features
+(:math:`n_{features}`). Then the prior terms are:

 .. math::
-    \alpha \rho ||W||_1 + \alpha \rho ||H||_1
-    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2
-    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2
+    \alpha \rho ||W||_1 * n_{features} + \alpha \rho ||H||_1 * n_{samples}
+    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2 * n_{features}
+    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2 * n_{samples}

 and the regularized objective function is:

 .. math::

     d_{\mathrm{Fro}}(X, WH)
-    + \alpha \rho ||W||_1 + \alpha \rho ||H||_1
-    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2
-    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2
+    + \alpha \rho ||W||_1 * n_{features} + \alpha \rho ||H||_1 * n_{samples}
+    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2 * n_{features}
+    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2 * n_{samples}

 :class:`NMF` regularizes both W and H by default. The :attr:`regularization`
 parameter allows for finer control, with which only W, only H,
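To make the scaled prior terms above concrete, here is a small NumPy sketch that evaluates them for arbitrary non-negative factors. It is purely illustrative and not part of this diff; the matrix sizes and the alpha/rho values are made up.

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 6, 5, 3
    W = rng.rand(n_samples, n_components)
    H = rng.rand(n_components, n_features)

    alpha, rho = 0.1, 0.5  # regularization strength and L1 ratio

    # Prior terms as written in the updated formula: penalties on W are
    # scaled by n_features, penalties on H by n_samples.
    priors = (alpha * rho * np.abs(W).sum() * n_features
              + alpha * rho * np.abs(H).sum() * n_samples
              + 0.5 * alpha * (1 - rho) * (W ** 2).sum() * n_features
              + 0.5 * alpha * (1 - rho) * (H ** 2).sum() * n_samples)
    print(priors)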
29 changes: 17 additions & 12 deletions sklearn/decomposition/_nmf.py
@@ -186,14 +186,15 @@ def _special_sparse_dot(W, H, X):
         return np.dot(W, H)


-def _compute_regularization(alpha, l1_ratio, regularization):
+def _compute_regularization(alpha, l1_ratio, regularization, n_samples,
+                            n_features):
     """Compute L1 and L2 regularization coefficients for W and H."""
     alpha_H = 0.
     alpha_W = 0.
     if regularization in ('both', 'components'):
-        alpha_H = float(alpha)
+        alpha_H = float(alpha) * n_samples
     if regularization in ('both', 'transformation'):
-        alpha_W = float(alpha)
+        alpha_W = float(alpha) * n_features

     l1_reg_W = alpha_W * l1_ratio
     l1_reg_H = alpha_H * l1_ratio
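The rest of the helper is outside the shown hunk, but the effect of the two new arguments can be sketched with a standalone reproduction. The L2 lines below are an assumption inferred by analogy with the visible L1 lines, and the function name is hypothetical, used only for illustration.

    def compute_scaled_regularization(alpha, l1_ratio, regularization,
                                      n_samples, n_features):
        """Illustrative stand-in for the patched _compute_regularization."""
        alpha_H = 0.
        alpha_W = 0.
        if regularization in ('both', 'components'):
            alpha_H = float(alpha) * n_samples   # penalty on H grows with n_samples
        if regularization in ('both', 'transformation'):
            alpha_W = float(alpha) * n_features  # penalty on W grows with n_features
        l1_reg_W = alpha_W * l1_ratio
        l1_reg_H = alpha_H * l1_ratio
        l2_reg_W = alpha_W * (1. - l1_ratio)     # assumed continuation of the helper
        l2_reg_H = alpha_H * (1. - l1_ratio)
        return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H

    # With the scaling, the same alpha now yields shape-dependent coefficients:
    print(compute_scaled_regularization(0.1, 0.5, 'both', 100, 20))
    # (1.0, 5.0, 1.0, 5.0)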
@@ -866,13 +867,15 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, *,

     .. math::

-        0.5 * ||X - WH||_{Fro}^2 + alpha * l1_{ratio} * ||vec(W)||_1
+        0.5 * ||X - WH||_{Fro}^2

+        + alpha * l1_{ratio} * ||vec(W)||_1 * n_features

-        + alpha * l1_{ratio} * ||vec(H)||_1
+        + alpha * l1_{ratio} * ||vec(H)||_1 * n_samples

-        + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2
+        + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2 * n_features

-        + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2
+        + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2 * n_samples

     Where:

@@ -1066,7 +1069,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, *,
                                random_state=random_state)

     l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization(
-        alpha, l1_ratio, regularization)
+        alpha, l1_ratio, regularization, n_samples, n_features)

     if solver == 'cd':
         W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter,
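At the user level the rescaling is invisible in the call itself: a call such as the following sketch (parameter names taken from the signature and docstring shown above, data made up for illustration) simply has its alpha multiplied internally by the relevant dimension for each factor.

    import numpy as np
    from sklearn.decomposition import non_negative_factorization

    X = np.abs(np.random.RandomState(0).randn(100, 20))
    W, H, n_iter = non_negative_factorization(
        X, n_components=5, init='random', random_state=0,
        alpha=0.1, l1_ratio=0.5, regularization='both')
    print(W.shape, H.shape)  # (100, 5) (5, 20)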
@@ -1103,13 +1106,15 @@ class NMF(TransformerMixin, BaseEstimator):

     .. math::

-        0.5 * ||X - WH||_{Fro}^2 + alpha * l1_{ratio} * ||vec(W)||_1
+        0.5 * ||X - WH||_{Fro}^2

+        + alpha * l1_{ratio} * ||vec(W)||_1 * n_features

-        + alpha * l1_{ratio} * ||vec(H)||_1
+        + alpha * l1_{ratio} * ||vec(H)||_1 * n_samples

-        + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2
+        + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2 * n_features

-        + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2
+        + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2 * n_samples

     Where:

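The same applies to the estimator: the intent of the scaling is that a given alpha induces a comparable amount of regularization regardless of the shape of X. A minimal usage sketch, not part of the PR, with made-up data:

    import numpy as np
    from sklearn.decomposition import NMF

    X = np.abs(np.random.RandomState(42).randn(200, 30))
    model = NMF(n_components=10, init='nndsvda', alpha=0.1, l1_ratio=0.5,
                max_iter=500, random_state=0)
    W = model.fit_transform(X)   # shape (200, 10)
    H = model.components_        # shape (10, 30)
    print(W.shape, H.shape)

With the scaling in place, moving from 30 to 300 features should not require retuning alpha just to keep the penalty terms proportionate to the reconstruction error.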