From deefdd85bf5bc3c40556c69dd10a7695d87d275b Mon Sep 17 00:00:00 2001 From: Wally Date: Thu, 23 Feb 2017 17:03:28 +0000 Subject: [PATCH 1/3] my test script --- sklearn/decomposition/test.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 sklearn/decomposition/test.py diff --git a/sklearn/decomposition/test.py b/sklearn/decomposition/test.py new file mode 100644 index 0000000000000..180d5f11ca13b --- /dev/null +++ b/sklearn/decomposition/test.py @@ -0,0 +1,14 @@ +import numpy as np +from .pca import PCA +import pandas as pd + +X = np.array([[-1, -1,3,4,-1, -1,3,4], [-2, -1,5,-1, -1,3,4,2], [-3, -2,1,-1, -1,3,4,1], +[1, 1,4,-1, -1,3,4,2], [2, 1,0,-1, -1,3,4,2], [3, 2,10,-1, -1,3,4,10]]) + +ipca = PCA(n_components = 7, svd_solver= "arpack") + +ipca.fit(X) +result = ipca.transform(X) + +print result.shape +print ipca.n_components_ From a903ef5ff5d9f27cba001108ca708997ef9b1120 Mon Sep 17 00:00:00 2001 From: Wally Date: Tue, 28 Feb 2017 14:08:56 +0000 Subject: [PATCH 2/3] r. 
commit --- sklearn/decomposition/test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/decomposition/test.py b/sklearn/decomposition/test.py index 180d5f11ca13b..4c255517e799e 100644 --- a/sklearn/decomposition/test.py +++ b/sklearn/decomposition/test.py @@ -1,3 +1,5 @@ +#Script to test PCA + import numpy as np from .pca import PCA import pandas as pd From 7f18df2ab4634f35394c3e79668bab9e4417b7e1 Mon Sep 17 00:00:00 2001 From: Wally Date: Wed, 1 Mar 2017 20:02:34 +0000 Subject: [PATCH 3/3] introduced pca_modifications and deleted test.py file --- sklearn/decomposition/pca.py | 42 ++++++++++++++++++++++------------- sklearn/decomposition/test.py | 16 ------------- 2 files changed, 26 insertions(+), 32 deletions(-) delete mode 100644 sklearn/decomposition/test.py diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index a3abaa6217df8..3d5018d118102 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -134,8 +134,11 @@ class PCA(_BasePCA): to guess the dimension if ``0 < n_components < 1`` and svd_solver == 'full', select the number of components such that the amount of variance that needs to be - explained is greater than the percentage specified by n_components - n_components cannot be equal to n_features for svd_solver == 'arpack'. + explained is greater than the percentage specified by n_components. + if svd_solver == 'arpack', the number of components must be strictly + less than the minimum of n_features and n_samples: + + n_components < min(n_samples, n_features) copy : bool (default True) If False, data passed to fit are overwritten and running @@ -166,7 +169,7 @@ class PCA(_BasePCA): arpack : run SVD truncated to n_components calling ARPACK solver via `scipy.sparse.linalg.svds`. It requires strictly - 0 < n_components < X.shape[1] + 0 < n_components < min(X.shape) randomized : run randomized SVD by the method of Halko et al. 
@@ -205,7 +208,7 @@ class PCA(_BasePCA): Percentage of variance explained by each of the selected components. If ``n_components`` is not set then all components are stored and the - sum of explained variances is equal to 1.0. + sum of the ratios is equal to 1.0. singular_values_ : array, shape (n_components,) The singular values corresponding to each of the selected components. @@ -221,7 +224,8 @@ class PCA(_BasePCA): The estimated number of components. When n_components is set to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this number is estimated from input data. Otherwise it equals the parameter - n_components, or n_features if n_components is None. + n_components, or the lesser value of n_features and n_samples + if n_components is None. noise_variance_ : float The estimated noise covariance following the Probabilistic PCA model @@ -365,7 +369,7 @@ def _fit(self, X): # Handle n_components==None if self.n_components is None: - n_components = X.shape[1] + n_components = min(X.shape) else: n_components = self.n_components @@ -395,10 +399,11 @@ def _fit_full(self, X, n_components): if n_samples < n_features: raise ValueError("n_components='mle' is only supported " "if n_samples >= n_features") - elif not 0 <= n_components <= n_features: + elif not 0 <= n_components <= min(n_samples, n_features): raise ValueError("n_components=%r must be between 0 and " - "n_features=%r with svd_solver='full'" - % (n_components, n_features)) + "min(n_samples, n_features)=%r with " + "svd_solver='full'" + % (n_components, min(n_samples, n_features))) # Center data self.mean_ = np.mean(X, axis=0) @@ -453,14 +458,19 @@ def _fit_truncated(self, X, n_components, svd_solver): raise ValueError("n_components=%r cannot be a string " "with svd_solver='%s'" % (n_components, svd_solver)) - elif not 1 <= n_components <= n_features: + elif not 1 <= n_components <= min(n_samples, n_features): raise ValueError("n_components=%r must be between 1 and " - "n_features=%r with 
svd_solver='%s'" - % (n_components, n_features, svd_solver)) - elif svd_solver == 'arpack' and n_components == n_features: + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), + svd_solver)) + elif svd_solver == 'arpack' and n_components == min(n_samples, + n_features): raise ValueError("n_components=%r must be stricly less than " - "n_features=%r with svd_solver='%s'" - % (n_components, n_features, svd_solver)) + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), + svd_solver)) random_state = check_random_state(self.random_state) @@ -495,7 +505,7 @@ def _fit_truncated(self, X, n_components, svd_solver): self.explained_variance_ratio_ = \ self.explained_variance_ / total_var.sum() self.singular_values_ = S.copy() # Store the singular values. - if self.n_components_ < n_features: + if self.n_components_ < min(n_samples, n_features): self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum()) else: diff --git a/sklearn/decomposition/test.py b/sklearn/decomposition/test.py deleted file mode 100644 index 4c255517e799e..0000000000000 --- a/sklearn/decomposition/test.py +++ /dev/null @@ -1,16 +0,0 @@ -#Script to test PCA - -import numpy as np -from .pca import PCA -import pandas as pd - -X = np.array([[-1, -1,3,4,-1, -1,3,4], [-2, -1,5,-1, -1,3,4,2], [-3, -2,1,-1, -1,3,4,1], -[1, 1,4,-1, -1,3,4,2], [2, 1,0,-1, -1,3,4,2], [3, 2,10,-1, -1,3,4,10]]) - -ipca = PCA(n_components = 7, svd_solver= "arpack") - -ipca.fit(X) -result = ipca.transform(X) - -print result.shape -print ipca.n_components_