scikit-learn · wallygauze · Mar 1, 2017 · Mar 6, 2017 · Mar 8, 2017 · Mar 8, 2017
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
@@ -134,8 +134,11 @@ class PCA(_BasePCA):
         to guess the dimension
         if ``0 < n_components < 1`` and svd_solver == 'full', select the number
         of components such that the amount of variance that needs to be
-        explained is greater than the percentage specified by n_components
-        n_components cannot be equal to n_features for svd_solver == 'arpack'.
+        explained is greater than the percentage specified by n_components.
+        If svd_solver == 'arpack', the number of components must be strictly
+        less than the minimum of n_features and n_samples:
+
+            n_components < min(n_samples, n_features)
 
     copy : bool (default True)
         If False, data passed to fit are overwritten and running
@@ -166,7 +169,7 @@ class PCA(_BasePCA):
         arpack :
             run SVD truncated to n_components calling ARPACK solver via
             `scipy.sparse.linalg.svds`. It requires strictly
-            0 < n_components < X.shape[1]
+            0 < n_components < min(X.shape)
         randomized :
             run randomized SVD by the method of Halko et al.
 
@@ -205,7 +208,7 @@ class PCA(_BasePCA):
         Percentage of variance explained by each of the selected components.
 
         If ``n_components`` is not set then all components are stored and the
-        sum of explained variances is equal to 1.0.
+        sum of the ratios is equal to 1.0.
 
     singular_values_ : array, shape (n_components,)
         The singular values corresponding to each of the selected components.
@@ -221,7 +224,8 @@ class PCA(_BasePCA):
         The estimated number of components. When n_components is set
         to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
         number is estimated from input data. Otherwise it equals the parameter
-        n_components, or n_features if n_components is None.
+        n_components, or min(n_samples, n_features) if n_components is None
+        (min(n_samples, n_features) - 1 with svd_solver == 'arpack').
 
     noise_variance_ : float
         The estimated noise covariance following the Probabilistic PCA model
@@ -365,7 +369,10 @@ def _fit(self, X):
 
         # Handle n_components==None
         if self.n_components is None:
-            n_components = X.shape[1]
+            # arpack needs n_components strictly below min(X.shape)
+            n_components = min(X.shape)
+            if self.svd_solver == 'arpack':
+                n_components -= 1
         else:
             n_components = self.n_components
 
@@ -395,10 +402,11 @@ def _fit_full(self, X, n_components):
             if n_samples < n_features:
                 raise ValueError("n_components='mle' is only supported "
                                  "if n_samples >= n_features")
-        elif not 0 <= n_components <= n_features:
+        elif not 0 <= n_components <= min(n_samples, n_features):
             raise ValueError("n_components=%r must be between 0 and "
-                             "n_features=%r with svd_solver='full'"
-                             % (n_components, n_features))
+                             "min(n_samples, n_features)=%r with "
+                             "svd_solver='full'"
+                             % (n_components, min(n_samples, n_features)))
 
         # Center data
         self.mean_ = np.mean(X, axis=0)
@@ -453,14 +461,19 @@ def _fit_truncated(self, X, n_components, svd_solver):
             raise ValueError("n_components=%r cannot be a string "
                              "with svd_solver='%s'"
                              % (n_components, svd_solver))
-        elif not 1 <= n_components <= n_features:
+        elif not 1 <= n_components <= min(n_samples, n_features):
             raise ValueError("n_components=%r must be between 1 and "
-                             "n_features=%r with svd_solver='%s'"
-                             % (n_components, n_features, svd_solver))
-        elif svd_solver == 'arpack' and n_components == n_features:
+                             "min(n_samples, n_features)=%r with "
+                             "svd_solver='%s'"
+                             % (n_components, min(n_samples, n_features),
+                                svd_solver))
+        elif svd_solver == 'arpack' and n_components == min(n_samples,
+                                                            n_features):
             raise ValueError("n_components=%r must be stricly less than "
-                             "n_features=%r with svd_solver='%s'"
-                             % (n_components, n_features, svd_solver))
+                             "min(n_samples, n_features)=%r with "
+                             "svd_solver='%s'"
+                             % (n_components, min(n_samples, n_features),
+                                svd_solver))
 
         random_state = check_random_state(self.random_state)
 
@@ -495,7 +508,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
         self.explained_variance_ratio_ = \
             self.explained_variance_ / total_var.sum()
         self.singular_values_ = S.copy()  # Store the singular values.
-        if self.n_components_ < n_features:
+        if self.n_components_ < min(n_samples, n_features):
             self.noise_variance_ = (total_var.sum() -
                                     self.explained_variance_.sum())
         else:

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
@@ -8,6 +8,7 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import ignore_warnings
@@ -340,11 +341,29 @@ def test_pca_inverse():
 
 
 def test_pca_validation():
-    X = [[0, 1], [1, 0]]
+    # Ensures that extreme inputs for n_components common to all solvers
+    # (less than 0 or more than the lesser dimension of the input
+    # matrix X) raise errors. The pattern uses raw strings for the regex.
+    X = np.array([[0, 1, 0], [1, 0, 0]])
     for solver in solver_list:
         for n_components in [-1, 3]:
-            assert_raises(ValueError,
-                          PCA(n_components, svd_solver=solver).fit, X)
+            assert_raises_regex(ValueError,
+                                r"n_components=.* must be between .* and min\("
+                                r"n_samples, n_features\)=.* with svd_solver"
+                                r"='(?:full|arpack|randomized|auto)'$",
+                                PCA(n_components, svd_solver=solver).fit, X)
+
+
+def test_n_components_none():
+    # n_components=None keeps min(X.shape) components, except with arpack,
+    # which requires strictly fewer and therefore keeps one less.
+    X = iris.data
+    full_rank = min(X.shape)
+    for solver in solver_list:
+        expected = full_rank - 1 if solver == 'arpack' else full_rank
+        pca = PCA(svd_solver=solver)
+        pca.fit(X)
+        assert_equal(pca.n_components_, expected)
 
 
 def test_randomized_pca_check_projection():