diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index 7e8e79d9d8bdd..96825873b0a86 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -744,20 +744,24 @@ to regularize the model. The L2 prior uses the Frobenius norm, while the L1
 prior uses an elementwise L1 norm. As in :class:`ElasticNet`, we control the
 combination of L1 and L2 with the :attr:`l1_ratio` (:math:`\rho`) parameter,
 and the intensity of the regularization with the :attr:`alpha`
-(:math:`\alpha`) parameter. Then the priors terms are:
+(:math:`\alpha`) parameter. The priors are scaled by the number of samples
+(:math:`n_{samples}`) for :math:`H` and the number of features
+(:math:`n_{features}`) for :math:`W`. Then the prior terms are:
 
 .. math::
-    \alpha \rho ||W||_1 + \alpha \rho ||H||_1
-    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2
-    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2
+    \alpha \rho ||W||_1 * n_{features} + \alpha \rho ||H||_1 * n_{samples}
+    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2 * n_{features}
+    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2 * n_{samples}
 
 and the regularized objective function is:
 
 .. math::
     d_{\mathrm{Fro}}(X, WH)
-    + \alpha \rho ||W||_1 + \alpha \rho ||H||_1
-    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2
-    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2
+    + \alpha \rho ||W||_1 * n_{features} + \alpha \rho ||H||_1 * n_{samples}
+    + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2 * n_{features}
+    + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2 * n_{samples}
 
 :class:`NMF` regularizes both W and H by default. The :attr:`regularization`
 parameter allows for finer control, with which only W, only H,
diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py
index 4431b4fd44b15..0a4e891056346 100644
--- a/sklearn/decomposition/_nmf.py
+++ b/sklearn/decomposition/_nmf.py
@@ -186,14 +186,15 @@ def _special_sparse_dot(W, H, X):
     return np.dot(W, H)
 
 
-def _compute_regularization(alpha, l1_ratio, regularization):
+def _compute_regularization(alpha, l1_ratio, regularization, n_samples,
+                            n_features):
     """Compute L1 and L2 regularization coefficients for W and H."""
     alpha_H = 0.
     alpha_W = 0.
     if regularization in ('both', 'components'):
-        alpha_H = float(alpha)
+        alpha_H = float(alpha) * n_samples
     if regularization in ('both', 'transformation'):
-        alpha_W = float(alpha)
+        alpha_W = float(alpha) * n_features
 
     l1_reg_W = alpha_W * l1_ratio
     l1_reg_H = alpha_H * l1_ratio
@@ -866,13 +867,15 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, *,
 
     .. math::
 
-        0.5 * ||X - WH||_{Fro}^2 + alpha * l1_{ratio} * ||vec(W)||_1
+        0.5 * ||X - WH||_{Fro}^2
 
-        + alpha * l1_{ratio} * ||vec(H)||_1
+        + alpha * l1_{ratio} * ||vec(W)||_1 * n_features
 
-        + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2
+        + alpha * l1_{ratio} * ||vec(H)||_1 * n_samples
 
-        + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2
+        + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2 * n_features
+
+        + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2 * n_samples
 
     Where:
@@ -1066,7 +1069,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, *,
                            random_state=random_state)
 
     l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization(
-        alpha, l1_ratio, regularization)
+        alpha, l1_ratio, regularization, n_samples, n_features)
 
     if solver == 'cd':
         W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter,
@@ -1103,13 +1106,15 @@ class NMF(TransformerMixin, BaseEstimator):
 
     .. math::
math:: - 0.5 * ||X - WH||_{Fro}^2 + alpha * l1_{ratio} * ||vec(W)||_1 + 0.5 * ||X - WH||_{Fro}^2 + + + alpha * l1_{ratio} * ||vec(W)||_1 * n_features - + alpha * l1_{ratio} * ||vec(H)||_1 + + alpha * l1_{ratio} * ||vec(H)||_1 * n_samples - + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2 + + 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2 * n_features - + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2 + + 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2 * n_samples Where: