From c47227dcd3c412966a8b83deccf8d511d298d6ac Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Jul 2019 16:51:37 -0400 Subject: [PATCH 01/27] start on n_features_out_ --- sklearn/base.py | 29 +++++++++++++++++++ sklearn/ensemble/forest.py | 4 ++- sklearn/impute/_base.py | 4 ++- sklearn/impute/_iterative.py | 2 ++ sklearn/kernel_approximation.py | 1 + sklearn/preprocessing/_discretization.py | 4 ++- .../preprocessing/_function_transformer.py | 1 + sklearn/preprocessing/data.py | 12 ++++++-- sklearn/utils/estimator_checks.py | 1 + 9 files changed, 53 insertions(+), 5 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index fb0818efc8248..4cf0b5554ee4b 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -558,6 +558,35 @@ def fit_transform(self, X, y=None, **fit_params): # fit method of arity 2 (supervised transformation) return self.fit(X, y, **fit_params).transform(X) + @property + def n_features_out_(self): + if hasattr(self, '_n_features_out'): + return self._n_features_out + # Ideally this would be done in each class. 
+ if hasattr(self, 'n_clusters'): + # this is before n_components_ + # because n_components_ means something else + # in agglomerative clustering + n_features = self.n_clusters + elif hasattr(self, '_max_components'): + # special case for LinearDiscriminantAnalysis + n_components = self.n_components or np.inf + n_features = min(self._max_components, n_components) + elif hasattr(self, 'n_components_'): + # n_components could be auto or None + # this is more likely to be an int + n_features = self.n_components_ + elif hasattr(self, 'n_components') and self.n_components is not None: + n_features = self.n_components + elif hasattr(self, 'components_'): + n_features = self.components_.shape[0] + elif hasattr(self, 'get_support'): + # that should only be done in the OneToOneMixin really + n_features = self.get_support().sum() + elif hasattr(self, 'scale_'): + n_features = self.scale_.shape[0] + return n_features + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 6050fd2773a5f..7cf5648e8ec22 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -1993,7 +1993,9 @@ def fit_transform(self, X, y=None, sample_weight=None): super().fit(X, y, sample_weight=sample_weight) self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output) - return self.one_hot_encoder_.fit_transform(self.apply(X)) + res = self.one_hot_encoder_.fit_transform(self.apply(X)) + self.n_features_out_ = res.shape[1] + return res def transform(self, X): """Transform dataset. 
diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 51fb223860daf..b26c71c1e6650 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -267,10 +267,12 @@ def fit(self, X, y=None): self.missing_values, fill_value) + self._n_features_out = np.sum(~np.isnan(self.statistics_)) if self.add_indicator: self.indicator_ = MissingIndicator( missing_values=self.missing_values, error_on_new=False) self.indicator_.fit(X) + self._n_features_out += self.indicator_._n_features_out else: self.indicator_ = None @@ -613,7 +615,7 @@ def fit(self, X, y=None): "Got {!r} instead.".format(self.sparse)) self.features_ = self._get_missing_features_info(X)[1] - + self._n_features_out = len(self.features_) return self def transform(self, X): diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 4c3fa4f2c1872..dafe5d259ca22 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -605,6 +605,8 @@ def fit_transform(self, X, y=None): if self.add_indicator: Xt = np.hstack((Xt, X_trans_indicator)) + + self._n_features_out = Xt.shape[1] return Xt def transform(self, X): diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 4f8fd96a10fa6..b3e48d6617f52 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -332,6 +332,7 @@ def fit(self, X, y=None): " you need to provide sample_interval") else: self.sample_interval_ = self.sample_interval + self._n_features_out = (self.sample_steps + 1) * X.shape[1] return self def transform(self, X): diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index b7ffd96032d2a..d3c7af374001f 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -203,7 +203,9 @@ def fit(self, X, y=None): # Fit the OneHotEncoder with toy datasets # so that it's ready for use after the KBinsDiscretizer is fitted self._encoder.fit(np.zeros((1, 
len(self.n_bins_)), dtype=int)) - + self._n_features_out = np.sum(self.n_bins_) + else: + self.n_features_out = n_features return self def _validate_n_bins(self, n_features): diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index a079612c045d6..707497dc98cff 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -117,6 +117,7 @@ def fit(self, X, y=None): if (self.check_inverse and not (self.func is None or self.inverse_func is None)): self._check_inverse_transform(X) + self.n_features_out_ = None return self def transform(self, X): diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 823eedc8b7dd9..8fbd6ce39b6c9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1468,6 +1468,7 @@ def fit(self, X, y=None): self.include_bias) self.n_input_features_ = n_features self.n_output_features_ = sum(1 for _ in combinations) + self._n_features_out = self.n_output_features_ return self def transform(self, X): @@ -1721,6 +1722,7 @@ def fit(self, X, y=None): X : array-like """ check_array(X, accept_sparse='csr') + self._n_features_out = X.shape[1] return self def transform(self, X, copy=None): @@ -1855,6 +1857,7 @@ def fit(self, X, y=None): X : array-like """ check_array(X, accept_sparse='csr') + self._n_features_out = X.shape[1] return self def transform(self, X, copy=None): @@ -1927,8 +1930,13 @@ def fit(self, K, y=None): """ K = check_array(K, dtype=FLOAT_DTYPES) n_samples = K.shape[0] + if K.shape[1] != n_samples: + raise ValueError( + "KernelCenterer requires square kernel" + "matrix for training, got kernel of shape {}".format(K.shape)) self.K_fit_rows_ = np.sum(K, axis=0) / n_samples self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples + self._n_features_out = n_samples return self def transform(self, K, copy=True): @@ -2248,7 +2256,7 @@ def fit(self, X, y=None): self._sparse_fit(X, rng) else: 
self._dense_fit(X, rng) - + self._n_features_out = np.sum(self.n_quantiles_) return self def _transform_col(self, X_col, quantiles, inverse): @@ -2702,7 +2710,7 @@ def _fit(self, X, y=None, force_transform=False): X = self._scaler.fit_transform(X) else: self._scaler.fit(X) - + self._n_features_out = X.shape[1] return X def transform(self, X): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 249cb022f8e87..d1d31f8dc809c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1020,6 +1020,7 @@ def _check_transformer(name, transformer_orig, X, y): else: # check for consistent n_samples assert X_pred.shape[0] == n_samples + assert X_pred.shape[1] == transformer_clone.n_features_out_ if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: From 06b4a08cbac38a21812cd539a61740fd7da1519e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Jul 2019 17:22:57 -0400 Subject: [PATCH 02/27] make sure common tests for transformers respect pairwise --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d1d31f8dc809c..33088ad4e4886 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -962,7 +962,7 @@ def check_transformer_general(name, transformer, readonly_memmap=False): random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) X -= X.min() - + X = pairwise_estimator_convert_X(X, transformer, kernel=rbf_kernel) if readonly_memmap: X, y = create_memmap_backed_data([X, y]) @@ -1067,7 +1067,7 @@ def _check_transformer(name, transformer_orig, X, y): "features in transform is different from" " the number of features in " "fit.".format(name)): - transformer.transform(X.T) + transformer.transform(X[:, :-1]) @ignore_warnings From b13b57ed9860ee36afd5fefdd6190fa4427b9f32 Mon Sep 17 00:00:00 2001 From: 
Andreas Mueller Date: Tue, 2 Jul 2019 17:23:15 -0400 Subject: [PATCH 03/27] fix number of features in quantile transformer --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 8fbd6ce39b6c9..205cae690bcca 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2256,7 +2256,7 @@ def fit(self, X, y=None): self._sparse_fit(X, rng) else: self._dense_fit(X, rng) - self._n_features_out = np.sum(self.n_quantiles_) + self._n_features_out = X.shape[1] return self def _transform_col(self, X_col, quantiles, inverse): From ac8d2430534f31da2b294cf8c48ca474d8226f70 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 2 Jul 2019 17:25:10 -0400 Subject: [PATCH 04/27] only check n_features_out_ if it's not None? --- sklearn/preprocessing/_function_transformer.py | 2 +- sklearn/utils/estimator_checks.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 707497dc98cff..57e07129cdba6 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -117,7 +117,7 @@ def fit(self, X, y=None): if (self.check_inverse and not (self.func is None or self.inverse_func is None)): self._check_inverse_transform(X) - self.n_features_out_ = None + self._n_features_out = None return self def transform(self, X): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 33088ad4e4886..4c2524daa0366 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1020,7 +1020,9 @@ def _check_transformer(name, transformer_orig, X, y): else: # check for consistent n_samples assert X_pred.shape[0] == n_samples - assert X_pred.shape[1] == transformer_clone.n_features_out_ + n_features_out = getattr(transformer_clone, 'n_features_out_', 
None) + if n_features_out is not None: + assert X_pred.shape[1] == n_features_out if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: From 822dae6e7f9b0a2162780f680fae945f79f5387a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 13:46:58 -0400 Subject: [PATCH 05/27] provide setter for n_features_out_ --- sklearn/base.py | 3 +++ sklearn/impute/_base.py | 6 +++--- sklearn/impute/_iterative.py | 2 +- sklearn/kernel_approximation.py | 2 +- sklearn/preprocessing/_discretization.py | 2 +- sklearn/preprocessing/_function_transformer.py | 2 +- sklearn/preprocessing/data.py | 12 ++++++------ 7 files changed, 16 insertions(+), 13 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 4cf0b5554ee4b..79691a9bfc269 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -587,6 +587,9 @@ def n_features_out_(self): n_features = self.scale_.shape[0] return n_features + @n_features_out_.setter + def n_features_out_(self, val): + self._n_features_out = val class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index b26c71c1e6650..5106691ed2f48 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -267,12 +267,12 @@ def fit(self, X, y=None): self.missing_values, fill_value) - self._n_features_out = np.sum(~np.isnan(self.statistics_)) + self.n_features_out = np.sum(~np.isnan(self.statistics_)) if self.add_indicator: self.indicator_ = MissingIndicator( missing_values=self.missing_values, error_on_new=False) self.indicator_.fit(X) - self._n_features_out += self.indicator_._n_features_out + self.n_features_out += self.indicator.n_features_out else: self.indicator_ = None @@ -615,7 +615,7 @@ def fit(self, X, y=None): "Got {!r} instead.".format(self.sparse)) self.features_ = self._get_missing_features_info(X)[1] - self._n_features_out = len(self.features_) + self.n_features_out = len(self.features_) return self def transform(self, X): 
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index dafe5d259ca22..76f38f864afc0 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -606,7 +606,7 @@ def fit_transform(self, X, y=None): if self.add_indicator: Xt = np.hstack((Xt, X_trans_indicator)) - self._n_features_out = Xt.shape[1] + self.n_features_out = Xt.shape[1] return Xt def transform(self, X): diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index b3e48d6617f52..7ea6d4667503c 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -332,7 +332,7 @@ def fit(self, X, y=None): " you need to provide sample_interval") else: self.sample_interval_ = self.sample_interval - self._n_features_out = (self.sample_steps + 1) * X.shape[1] + self.n_features_out = (self.sample_steps + 1) * X.shape[1] return self def transform(self, X): diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d3c7af374001f..193f70a2c51db 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -203,7 +203,7 @@ def fit(self, X, y=None): # Fit the OneHotEncoder with toy datasets # so that it's ready for use after the KBinsDiscretizer is fitted self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int)) - self._n_features_out = np.sum(self.n_bins_) + self.n_features_out = np.sum(self.n_bins_) else: self.n_features_out = n_features return self diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 57e07129cdba6..2d24ef0a8e3f4 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -117,7 +117,7 @@ def fit(self, X, y=None): if (self.check_inverse and not (self.func is None or self.inverse_func is None)): self._check_inverse_transform(X) - self._n_features_out = None + self.n_features_out = None return self def 
transform(self, X): diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 205cae690bcca..28774c5163336 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1468,7 +1468,7 @@ def fit(self, X, y=None): self.include_bias) self.n_input_features_ = n_features self.n_output_features_ = sum(1 for _ in combinations) - self._n_features_out = self.n_output_features_ + self.n_features_out = self.n_output_features_ return self def transform(self, X): @@ -1722,7 +1722,7 @@ def fit(self, X, y=None): X : array-like """ check_array(X, accept_sparse='csr') - self._n_features_out = X.shape[1] + self.n_features_out = X.shape[1] return self def transform(self, X, copy=None): @@ -1857,7 +1857,7 @@ def fit(self, X, y=None): X : array-like """ check_array(X, accept_sparse='csr') - self._n_features_out = X.shape[1] + self.n_features_out = X.shape[1] return self def transform(self, X, copy=None): @@ -1936,7 +1936,7 @@ def fit(self, K, y=None): "matrix for training, got kernel of shape {}".format(K.shape)) self.K_fit_rows_ = np.sum(K, axis=0) / n_samples self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples - self._n_features_out = n_samples + self.n_features_out = n_samples return self def transform(self, K, copy=True): @@ -2256,7 +2256,7 @@ def fit(self, X, y=None): self._sparse_fit(X, rng) else: self._dense_fit(X, rng) - self._n_features_out = X.shape[1] + self.n_features_out = X.shape[1] return self def _transform_col(self, X_col, quantiles, inverse): @@ -2710,7 +2710,7 @@ def _fit(self, X, y=None, force_transform=False): X = self._scaler.fit_transform(X) else: self._scaler.fit(X) - self._n_features_out = X.shape[1] + self.n_features_out = X.shape[1] return X def transform(self, X): From 42e50179e37792120b7902c23d9fc4aa79e050a9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 13:57:04 -0400 Subject: [PATCH 06/27] typo --- sklearn/impute/_base.py | 6 +++--- sklearn/impute/_iterative.py | 2 +- 
sklearn/kernel_approximation.py | 2 +- sklearn/preprocessing/_discretization.py | 4 ++-- sklearn/preprocessing/_function_transformer.py | 2 +- sklearn/preprocessing/data.py | 12 ++++++------ sklearn/utils/estimator_checks.py | 4 ++-- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index a6a788c75fdf3..18fc07fa38171 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -267,12 +267,12 @@ def fit(self, X, y=None): self.missing_values, fill_value) - self.n_features_out = np.sum(~np.isnan(self.statistics_)) + self.n_features_out_ = np.sum(~np.isnan(self.statistics_)) if self.add_indicator: self.indicator_ = MissingIndicator( missing_values=self.missing_values, error_on_new=False) self.indicator_.fit(X) - self.n_features_out += self.indicator.n_features_out + self.n_features_out_ += self.indicator.n_features_out else: self.indicator_ = None @@ -618,7 +618,7 @@ def _fit(self, X, y=None): missing_features_info = self._get_missing_features_info(X) self.features_ = missing_features_info[1] - self.n_features_out = len(self.features_) + self.n_features_out_ = len(self.features_) return missing_features_info[0] diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index d3a9c8968c2d6..5421a37386b59 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -610,7 +610,7 @@ def fit_transform(self, X, y=None): if self.add_indicator: Xt = np.hstack((Xt, X_trans_indicator)) - self.n_features_out = Xt.shape[1] + self.n_features_out_ = Xt.shape[1] return Xt def transform(self, X): diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 4d98a026a417c..d9b6925465ac5 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -332,7 +332,7 @@ def fit(self, X, y=None): " you need to provide sample_interval") else: self.sample_interval_ = self.sample_interval - self.n_features_out = (self.sample_steps + 1) * 
X.shape[1] + self.n_features_out_ = (self.sample_steps + 1) * X.shape[1] return self def transform(self, X): diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 193f70a2c51db..55777532a61ed 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -203,9 +203,9 @@ def fit(self, X, y=None): # Fit the OneHotEncoder with toy datasets # so that it's ready for use after the KBinsDiscretizer is fitted self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int)) - self.n_features_out = np.sum(self.n_bins_) + self.n_features_out_ = np.sum(self.n_bins_) else: - self.n_features_out = n_features + self.n_features_out_ = n_features return self def _validate_n_bins(self, n_features): diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 2d24ef0a8e3f4..3605b329d6cbc 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -117,7 +117,7 @@ def fit(self, X, y=None): if (self.check_inverse and not (self.func is None or self.inverse_func is None)): self._check_inverse_transform(X) - self.n_features_out = None + self.n_features_out_ = None return self def transform(self, X): diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 28960b092d88e..a583ba8824f4c 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1473,7 +1473,7 @@ def fit(self, X, y=None): self.include_bias) self.n_input_features_ = n_features self.n_output_features_ = sum(1 for _ in combinations) - self.n_features_out = self.n_output_features_ + self.n_features_out_ = self.n_output_features_ return self def transform(self, X): @@ -1775,7 +1775,7 @@ def fit(self, X, y=None): X : array-like """ check_array(X, accept_sparse='csr') - self.n_features_out = X.shape[1] + self.n_features_out_ = X.shape[1] return self def transform(self, X, copy=None): @@ 
-1910,7 +1910,7 @@ def fit(self, X, y=None): X : array-like """ check_array(X, accept_sparse='csr') - self.n_features_out = X.shape[1] + self.n_features_out_ = X.shape[1] return self def transform(self, X, copy=None): @@ -2004,7 +2004,7 @@ def fit(self, K, y=None): "matrix for training, got kernel of shape {}".format(K.shape)) self.K_fit_rows_ = np.sum(K, axis=0) / n_samples self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples - self.n_features_out = n_samples + self.n_features_out_ = n_samples return self def transform(self, K, copy=True): @@ -2324,7 +2324,7 @@ def fit(self, X, y=None): self._sparse_fit(X, rng) else: self._dense_fit(X, rng) - self.n_features_out = X.shape[1] + self.n_features_out_ = X.shape[1] return self def _transform_col(self, X_col, quantiles, inverse): @@ -2778,7 +2778,7 @@ def _fit(self, X, y=None, force_transform=False): X = self._scaler.fit_transform(X) else: self._scaler.fit(X) - self.n_features_out = X.shape[1] + self.n_features_out_ = X.shape[1] return X def transform(self, X): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3a77807dd9ef6..4d7b701023182 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1026,8 +1026,8 @@ def _check_transformer(name, transformer_orig, X, y): else: # check for consistent n_samples assert X_pred.shape[0] == n_samples - n_features_out = getattr(transformer_clone, 'n_features_out_', None) - if n_features_out is not None: + n_features_out_ = getattr(transformer_clone, 'n_features_out_', None) + if n_features_out_ is not None: assert X_pred.shape[1] == n_features_out if hasattr(transformer, 'transform'): From 124e3251add0f0b911af24f1daa0dca53e09e122 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 13:59:36 -0400 Subject: [PATCH 07/27] more typos --- sklearn/impute/_base.py | 2 +- sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute/_base.py 
b/sklearn/impute/_base.py index 18fc07fa38171..9555f12a81b6b 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -272,7 +272,7 @@ def fit(self, X, y=None): self.indicator_ = MissingIndicator( missing_values=self.missing_values, error_on_new=False) self.indicator_.fit(X) - self.n_features_out_ += self.indicator.n_features_out + self.n_features_out_ += self.indicator.n_features_out_ else: self.indicator_ = None diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4d7b701023182..132cf784e98fc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1028,7 +1028,7 @@ def _check_transformer(name, transformer_orig, X, y): assert X_pred.shape[0] == n_samples n_features_out_ = getattr(transformer_clone, 'n_features_out_', None) if n_features_out_ is not None: - assert X_pred.shape[1] == n_features_out + assert X_pred.shape[1] == n_features_out_ if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: From 041dfff425ea2ef45adda6388df9fae5ea9433be Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 14:01:47 -0400 Subject: [PATCH 08/27] fix some input validation --- sklearn/kernel_approximation.py | 2 +- sklearn/preprocessing/data.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index d9b6925465ac5..f1729411a4a02 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -318,7 +318,7 @@ def fit(self, X, y=None): self : object Returns the transformer. 
""" - check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse='csr') if self.sample_interval is None: # See reference, figure 2 c) if self.sample_steps == 1: diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index a583ba8824f4c..8aa10227a9e6d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1774,7 +1774,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse='csr') self.n_features_out_ = X.shape[1] return self @@ -1909,7 +1909,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse='csr') self.n_features_out_ = X.shape[1] return self From c1d47a117d12c8d402491e37729e18353e98fd59 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 14:05:32 -0400 Subject: [PATCH 09/27] move feature selection n_features_out_ to mixin --- sklearn/base.py | 3 --- sklearn/feature_selection/base.py | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 02cdf069eb250..9d39284376b0d 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -588,9 +588,6 @@ def n_features_out_(self): n_features = self.n_components elif hasattr(self, 'components_'): n_features = self.components_.shape[0] - elif hasattr(self, 'get_support'): - # that should only be done in the OneToOneMixin really - n_features = self.get_support().sum() elif hasattr(self, 'scale_'): n_features = self.scale_.shape[0] return n_features diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 5add330188f78..45c60e004047f 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -46,6 +46,10 @@ def get_support(self, indices=False): mask = self._get_support_mask() return mask if not indices else np.where(mask)[0] + @property + def n_features_out_(self): + return 
self.get_support().sum() + @abstractmethod def _get_support_mask(self): """ From 7735d18cdaf624747b19e9b6b92cad6dbb1a3cdc Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 14:08:27 -0400 Subject: [PATCH 10/27] remove linear discriminant analysis special case --- sklearn/base.py | 4 ---- sklearn/discriminant_analysis.py | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 9d39284376b0d..8a751476f3b09 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -576,10 +576,6 @@ def n_features_out_(self): # because n_components_ means something else # in agglomerative clustering n_features = self.n_clusters - elif hasattr(self, '_max_components'): - # special case for LinearDiscriminantAnalysis - n_components = self.n_components or np.inf - n_features = min(self._max_components, n_components) elif hasattr(self, 'n_components_'): # n_components could be auto or None # this is more likely to be an int diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 9634b303ea946..5ce45ec9e14b3 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -552,6 +552,11 @@ def predict_log_proba(self, X): """ return np.log(self.predict_proba(X)) + @property + def n_features_out_(self): + n_components = self.n_components or np.inf + n_features = min(self._max_components, n_components) + class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): """Quadratic Discriminant Analysis From aef2283bc88f0e5c7ccc7e22cdda416adc58e377 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 14:14:16 -0400 Subject: [PATCH 11/27] remove special case for clustering --- sklearn/base.py | 14 ++++++++------ sklearn/cluster/birch.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 8a751476f3b09..08c85552fa98e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -458,6 +458,14 @@ def 
fit_predict(self, X, y=None): self.fit(X) return self.labels_ + @property + def n_features_out_(self): + if not hasattr(self, 'transform'): + raise AttributeError("{} doesn't have n_features_out_" + " as it doesn't define transform.".format( + self.__class__.__name__)) + return self.n_clusters + class BiclusterMixin: """Mixin class for all bicluster estimators in scikit-learn""" @@ -570,12 +578,6 @@ def fit_transform(self, X, y=None, **fit_params): def n_features_out_(self): if hasattr(self, '_n_features_out'): return self._n_features_out - # Ideally this would be done in each class. - if hasattr(self, 'n_clusters'): - # this is before n_components_ - # because n_components_ means something else - # in agglomerative clustering - n_features = self.n_clusters elif hasattr(self, 'n_components_'): # n_components could be auto or None # this is more likely to be an int diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 27b5038bb67a3..e595c6b1cafb2 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -319,7 +319,7 @@ def radius(self): self.sq_norm_) -class Birch(BaseEstimator, TransformerMixin, ClusterMixin): +class Birch(BaseEstimator, ClusterMixin, TransformerMixin): """Implements the Birch clustering algorithm. It is a memory-efficient, online-learning algorithm provided as an From 6a56572cc641a62ff16c6939a4c6989100ab94ad Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 14:15:47 -0400 Subject: [PATCH 12/27] I have no idea how this passed?! 
--- sklearn/discriminant_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 5ce45ec9e14b3..42647cc9d610c 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -555,7 +555,7 @@ def predict_log_proba(self, X): @property def n_features_out_(self): n_components = self.n_components or np.inf - n_features = min(self._max_components, n_components) + return min(self._max_components, n_components) class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin): From 5c267ce95f0efcd0ffde10e446f899746bb8c2b3 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 14:42:32 -0400 Subject: [PATCH 13/27] remove scaler special case, fix in imputation --- sklearn/base.py | 2 -- sklearn/impute/_base.py | 4 ++-- sklearn/preprocessing/data.py | 14 ++++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 08c85552fa98e..cade0079bbb31 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -586,8 +586,6 @@ def n_features_out_(self): n_features = self.n_components elif hasattr(self, 'components_'): n_features = self.components_.shape[0] - elif hasattr(self, 'scale_'): - n_features = self.scale_.shape[0] return n_features @n_features_out_.setter diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 9555f12a81b6b..bc1bc57171e97 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -267,12 +267,12 @@ def fit(self, X, y=None): self.missing_values, fill_value) - self.n_features_out_ = np.sum(~np.isnan(self.statistics_)) + self.n_features_out_ = np.sum(~_get_mask(self.statistics_, np.nan)) if self.add_indicator: self.indicator_ = MissingIndicator( missing_values=self.missing_values, error_on_new=False) self.indicator_.fit(X) - self.n_features_out_ += self.indicator.n_features_out_ + self.n_features_out_ += self.indicator_.n_features_out_ else: 
self.indicator_ = None diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 8aa10227a9e6d..03ac2bfb4205a 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -196,7 +196,13 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True): return X -class MinMaxScaler(BaseEstimator, TransformerMixin): +class _BaseScaler(BaseEstimator, TransformerMixin): + @property + def n_features_out_(self): + return self.scale_.shape[0] + + +class MinMaxScaler(_BaseScaler): """Transforms features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -493,7 +499,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): return X -class StandardScaler(BaseEstimator, TransformerMixin): +class StandardScaler(_BaseScaler): """Standardize features by removing the mean and scaling to unit variance The standard score of a sample `x` is calculated as: @@ -821,7 +827,7 @@ def _more_tags(self): return {'allow_nan': True} -class MaxAbsScaler(BaseEstimator, TransformerMixin): +class MaxAbsScaler(_BaseScaler): """Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such @@ -1050,7 +1056,7 @@ def maxabs_scale(X, axis=0, copy=True): return X -class RobustScaler(BaseEstimator, TransformerMixin): +class RobustScaler(_BaseScaler): """Scale features using statistics that are robust to outliers. 
This Scaler removes the median and scales the data according to From 9a2e80c300cc9d9b5f7ab884bc00cd13da4a1b7d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 14:59:36 -0400 Subject: [PATCH 14/27] removed the last bit of magic --- sklearn/base.py | 17 +++++++++++------ sklearn/decomposition/base.py | 4 ++-- sklearn/decomposition/dict_learning.py | 4 ++-- sklearn/decomposition/factor_analysis.py | 4 ++-- sklearn/decomposition/fastica_.py | 4 ++-- sklearn/decomposition/kernel_pca.py | 4 ++-- sklearn/decomposition/nmf.py | 4 ++-- sklearn/decomposition/online_lda.py | 4 ++-- sklearn/decomposition/sparse_pca.py | 4 ++-- sklearn/decomposition/truncated_svd.py | 4 ++-- sklearn/kernel_approximation.py | 8 ++++---- sklearn/manifold/isomap.py | 4 ++-- sklearn/manifold/locally_linear.py | 5 +++-- sklearn/neighbors/nca.py | 4 ++-- sklearn/neural_network/rbm.py | 4 ++-- sklearn/random_projection.py | 5 +++-- sklearn/utils/estimator_checks.py | 6 +++--- 17 files changed, 48 insertions(+), 41 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index cade0079bbb31..1c445f1a96f98 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -576,9 +576,17 @@ def fit_transform(self, X, y=None, **fit_params): @property def n_features_out_(self): - if hasattr(self, '_n_features_out'): - return self._n_features_out - elif hasattr(self, 'n_components_'): + return self._n_features_out + + @n_features_out_.setter + def n_features_out_(self, val): + self._n_features_out = val + + +class ComponentsMixin: + @property + def n_features_out_(self): + if hasattr(self, 'n_components_'): # n_components could be auto or None # this is more likely to be an int n_features = self.n_components_ @@ -588,9 +596,6 @@ def n_features_out_(self): n_features = self.components_.shape[0] return n_features - @n_features_out_.setter - def n_features_out_(self, val): - self._n_features_out = val class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git 
a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py index 3cbdb29723825..761b8c19b6a07 100644 --- a/sklearn/decomposition/base.py +++ b/sklearn/decomposition/base.py @@ -11,13 +11,13 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..utils import check_array from ..utils.validation import check_is_fitted from abc import ABCMeta, abstractmethod -class _BasePCA(BaseEstimator, TransformerMixin, metaclass=ABCMeta): +class _BasePCA(BaseEstimator, ComponentsMixin, TransformerMixin, metaclass=ABCMeta): """Base class for PCA methods. Warning: This class should not be used directly. diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 56187948f8554..430994d740875 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -13,7 +13,7 @@ from scipy import linalg from joblib import Parallel, delayed, effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..utils import (check_array, check_random_state, gen_even_slices, gen_batches) from ..utils.extmath import randomized_svd, row_norms @@ -875,7 +875,7 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100, return dictionary.T -class SparseCodingMixin(TransformerMixin): +class SparseCodingMixin(ComponentsMixin, TransformerMixin): """Sparse coding mixin""" def _set_sparse_coding_params(self, n_components, diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index f9d81737850ff..17d80eebec144 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -25,14 +25,14 @@ from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, ComponentsMixin, TransformerMixin from ..utils 
import check_array, check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning -class FactorAnalysis(BaseEstimator, TransformerMixin): +class FactorAnalysis(BaseEstimator, ComponentsMixin, TransformerMixin): """Factor Analysis (FA) A simple linear generative model with Gaussian latent variables. diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index d841926cdfc87..e386bad765294 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -14,7 +14,7 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state @@ -380,7 +380,7 @@ def g(x, fun_args): return None, W, S -class FastICA(BaseEstimator, TransformerMixin): +class FastICA(BaseEstimator, ComponentsMixin, TransformerMixin): """FastICA: a fast algorithm for Independent Component Analysis. Read more in the :ref:`User Guide `. 
diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 555bd619c5a62..14c7e70260938 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -11,12 +11,12 @@ from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, check_array from ..exceptions import NotFittedError -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels -class KernelPCA(BaseEstimator, TransformerMixin): +class KernelPCA(BaseEstimator, ComponentsMixin, TransformerMixin): """Kernel Principal component analysis (KPCA) Non-linear dimensionality reduction through the use of kernels (see diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index f64bc34b7fad7..8dd1e2ddd83d3 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -14,7 +14,7 @@ import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.extmath import safe_min @@ -1069,7 +1069,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, return W, H, n_iter -class NMF(BaseEstimator, TransformerMixin): +class NMF(BaseEstimator, ComponentsMixin, TransformerMixin): r"""Non-Negative Matrix Factorization (NMF) Find two non-negative matrices (W, H) whose product approximates the non- diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index c1d482f0a46c6..35e4acd25933b 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -16,7 +16,7 @@ from scipy.special import gammaln from joblib import Parallel, delayed, 
effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..utils import (check_random_state, check_array, gen_batches, gen_even_slices) from ..utils.fixes import logsumexp @@ -132,7 +132,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, return (doc_topic_distr, suff_stats) -class LatentDirichletAllocation(BaseEstimator, TransformerMixin): +class LatentDirichletAllocation(BaseEstimator, ComponentsMixin, TransformerMixin): """Latent Dirichlet Allocation with online variational Bayes algorithm .. versionadded:: 0.17 diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 238f6cc4ef403..7a872b61a113c 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -9,7 +9,7 @@ from ..utils import check_random_state, check_array from ..utils.validation import check_is_fitted from ..linear_model import ridge_regression -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from .dict_learning import dict_learning, dict_learning_online @@ -29,7 +29,7 @@ def _check_normalize_components(normalize_components, estimator_name): ) -class SparsePCA(BaseEstimator, TransformerMixin): +class SparsePCA(BaseEstimator, ComponentsMixin, TransformerMixin): """Sparse Principal Components Analysis (SparsePCA) Finds the set of sparse components that can optimally reconstruct diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index ce79fba2fad1d..e0cd5cedb508b 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -10,7 +10,7 @@ import scipy.sparse as sp from scipy.sparse.linalg import svds -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..utils import check_array, check_random_state from 
..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis @@ -18,7 +18,7 @@ __all__ = ["TruncatedSVD"] -class TruncatedSVD(BaseEstimator, TransformerMixin): +class TruncatedSVD(BaseEstimator, ComponentsMixin, TransformerMixin): """Dimensionality reduction using truncated SVD (aka LSA). This transformer performs linear dimensionality reduction by means of diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index f1729411a4a02..7d51ad13c0ab6 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -14,14 +14,14 @@ from scipy.linalg import svd from .base import BaseEstimator -from .base import TransformerMixin +from .base import TransformerMixin, ComponentsMixin from .utils import check_array, check_random_state, as_float_array from .utils.extmath import safe_sparse_dot from .utils.validation import check_is_fitted from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS -class RBFSampler(BaseEstimator, TransformerMixin): +class RBFSampler(BaseEstimator, ComponentsMixin, TransformerMixin): """Approximates feature map of an RBF kernel by Monte Carlo approximation of its Fourier transform. @@ -125,7 +125,7 @@ def transform(self, X): return projection -class SkewedChi2Sampler(BaseEstimator, TransformerMixin): +class SkewedChi2Sampler(BaseEstimator, ComponentsMixin, TransformerMixin): """Approximates feature map of the "skewed chi-squared" kernel by Monte Carlo approximation of its Fourier transform. @@ -424,7 +424,7 @@ def _more_tags(self): return {'stateless': True} -class Nystroem(BaseEstimator, TransformerMixin): +class Nystroem(BaseEstimator, ComponentsMixin, TransformerMixin): """Approximate a kernel map using a subset of the training data. 
Constructs an approximate feature map for an arbitrary kernel diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index 88c979c0e1fdb..deb245ae8dfad 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -4,7 +4,7 @@ # License: BSD 3 clause (C) 2011 import numpy as np -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..neighbors import NearestNeighbors, kneighbors_graph from ..utils import check_array from ..utils.graph import graph_shortest_path @@ -12,7 +12,7 @@ from ..preprocessing import KernelCenterer -class Isomap(BaseEstimator, TransformerMixin): +class Isomap(BaseEstimator, ComponentsMixin, TransformerMixin): """Isomap Embedding Non-linear dimensionality reduction through Isometric Mapping diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index cf3c58486c27a..1d6ff45a9e6f1 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -9,7 +9,8 @@ from scipy.sparse import eye, csr_matrix from scipy.sparse.linalg import eigsh -from ..base import BaseEstimator, TransformerMixin, _UnstableArchMixin +from ..base import (BaseEstimator, TransformerMixin, + ComponentsMixin, _UnstableArchMixin) from ..utils import check_random_state, check_array from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted @@ -518,7 +519,7 @@ def locally_linear_embedding( tol=tol, max_iter=max_iter, random_state=random_state) -class LocallyLinearEmbedding(BaseEstimator, TransformerMixin, +class LocallyLinearEmbedding(BaseEstimator, ComponentsMixin, TransformerMixin, _UnstableArchMixin): """Locally Linear Embedding diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 5060270ce1e61..72745d99fa6ba 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -17,7 +17,7 @@ from scipy.optimize import minimize from ..utils.extmath import softmax from 
..metrics import pairwise_distances -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, ComponentsMixin from ..preprocessing import LabelEncoder from ..decomposition import PCA from ..utils.multiclass import check_classification_targets @@ -27,7 +27,7 @@ from ..exceptions import ConvergenceWarning -class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): +class NeighborhoodComponentsAnalysis(BaseEstimator, ComponentsMixin, TransformerMixin): """Neighborhood Components Analysis Neighborhood Component Analysis (NCA) is a machine learning algorithm for diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py index b2b6166d4d253..8e184d930b475 100644 --- a/sklearn/neural_network/rbm.py +++ b/sklearn/neural_network/rbm.py @@ -14,7 +14,7 @@ from scipy.special import expit # logistic function from ..base import BaseEstimator -from ..base import TransformerMixin +from ..base import TransformerMixin, ComponentsMixin from ..utils import check_array from ..utils import check_random_state from ..utils import gen_even_slices @@ -23,7 +23,7 @@ from ..utils.validation import check_is_fitted -class BernoulliRBM(BaseEstimator, TransformerMixin): +class BernoulliRBM(BaseEstimator, ComponentsMixin, TransformerMixin): """Bernoulli Restricted Boltzmann Machine (RBM). 
A Restricted Boltzmann Machine with binary visible units and diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 8297a42ab17f8..ca72973262b4e 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -33,7 +33,7 @@ import numpy as np import scipy.sparse as sp -from .base import BaseEstimator, TransformerMixin +from .base import BaseEstimator, TransformerMixin, ComponentsMixin from .utils import check_random_state from .utils.extmath import safe_sparse_dot @@ -289,7 +289,8 @@ def sparse_random_matrix(n_components, n_features, density='auto', return np.sqrt(1 / density) / np.sqrt(n_components) * components -class BaseRandomProjection(BaseEstimator, TransformerMixin, metaclass=ABCMeta): +class BaseRandomProjection(BaseEstimator, ComponentsMixin, + TransformerMixin, metaclass=ABCMeta): """Base class for random projections. Warning: This class should not be used directly. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 132cf784e98fc..36417c031ed3b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1026,9 +1026,9 @@ def _check_transformer(name, transformer_orig, X, y): else: # check for consistent n_samples assert X_pred.shape[0] == n_samples - n_features_out_ = getattr(transformer_clone, 'n_features_out_', None) - if n_features_out_ is not None: - assert X_pred.shape[1] == n_features_out_ + if transformer_clone.n_features_out_ is not None: + assert X_pred.shape[1] == transformer_clone.n_features_out_ + if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: From 375c1305197e2fd82e5ee23bd769934bf4451a76 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 15:03:40 -0400 Subject: [PATCH 15/27] pep8 --- sklearn/decomposition/base.py | 5 +++-- sklearn/decomposition/online_lda.py | 3 ++- sklearn/neighbors/nca.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/base.py 
b/sklearn/decomposition/base.py index 761b8c19b6a07..761263293b2f5 100644 --- a/sklearn/decomposition/base.py +++ b/sklearn/decomposition/base.py @@ -17,7 +17,8 @@ from abc import ABCMeta, abstractmethod -class _BasePCA(BaseEstimator, ComponentsMixin, TransformerMixin, metaclass=ABCMeta): +class _BasePCA(BaseEstimator, ComponentsMixin, TransformerMixin, + metaclass=ABCMeta): """Base class for PCA methods. Warning: This class should not be used directly. @@ -154,6 +155,6 @@ def inverse_transform(self, X): """ if self.whiten: return np.dot(X, np.sqrt(self.explained_variance_[:, np.newaxis]) * - self.components_) + self.mean_ + self.components_) + self.mean_ else: return np.dot(X, self.components_) + self.mean_ diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 35e4acd25933b..6f51083286912 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -132,7 +132,8 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, return (doc_topic_distr, suff_stats) -class LatentDirichletAllocation(BaseEstimator, ComponentsMixin, TransformerMixin): +class LatentDirichletAllocation(BaseEstimator, ComponentsMixin, + TransformerMixin): """Latent Dirichlet Allocation with online variational Bayes algorithm .. 
versionadded:: 0.17 diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 72745d99fa6ba..537c832f94657 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -27,7 +27,8 @@ from ..exceptions import ConvergenceWarning -class NeighborhoodComponentsAnalysis(BaseEstimator, ComponentsMixin, TransformerMixin): +class NeighborhoodComponentsAnalysis(BaseEstimator, ComponentsMixin, + TransformerMixin): """Neighborhood Components Analysis Neighborhood Component Analysis (NCA) is a machine learning algorithm for From 0c2bb8a2dc76350ea410974f93afc561e2c77b54 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 30 Jul 2019 16:31:36 -0400 Subject: [PATCH 16/27] add n_features_out_ to voting classifier --- sklearn/ensemble/voting.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index 7900d28c1f782..416bb1c34383c 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -279,7 +279,9 @@ def fit(self, X, y, sample_weight=None): self.le_ = LabelEncoder().fit(y) self.classes_ = self.le_.classes_ transformed_y = self.le_.transform(y) - + self.n_features_out_ = len(self.estimators) + if self.voting == 'soft': + self.n_features_out_ *= len(self.classes_) return super().fit(X, transformed_y, sample_weight) def predict(self, X): @@ -459,6 +461,7 @@ def fit(self, X, y, sample_weight=None): self : object """ y = column_or_1d(y, warn=True) + self.n_features_out_ = len(self.estimators) return super().fit(X, y, sample_weight) def predict(self, X): From 9ccf0acd5895106f9ac851db904bdc445dec8c10 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 31 Jul 2019 14:18:46 -0400 Subject: [PATCH 17/27] add n_features_out_ to estimator in testing --- sklearn/utils/tests/test_estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 
b4bd2daac00d7..6c024d46323e6 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -265,6 +265,7 @@ def fit(self, X, y): class SparseTransformer(BaseEstimator): def fit(self, X, y=None): self.X_shape_ = check_array(X).shape + self.n_features_out_ = self.X_shape_[1] return self def fit_transform(self, X, y=None): From 6043a5ceaa8b29ab6d3972728b4d16006eb05c32 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 1 Aug 2019 12:38:06 -0400 Subject: [PATCH 18/27] check that n_components is integer --- sklearn/base.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 1c445f1a96f98..2dbbc44e02a6d 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -6,6 +6,7 @@ import copy import warnings from collections import defaultdict +import numbers import platform import inspect import re @@ -590,10 +591,14 @@ def n_features_out_(self): # n_components could be auto or None # this is more likely to be an int n_features = self.n_components_ - elif hasattr(self, 'n_components') and self.n_components is not None: - n_features = self.n_components elif hasattr(self, 'components_'): n_features = self.components_.shape[0] + elif (hasattr(self, 'n_components') + and isinstance(self.n_components, numbers.Integral)): + n_features = self.n_components + else: + raise AttributeError("{} has no attribute 'n_features_out_'".format( + type(self).__name__)) return n_features From a39ab075e983cdca358079c958e04085d2e4491f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 1 Aug 2019 14:43:42 -0400 Subject: [PATCH 19/27] pep8 --- sklearn/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 2dbbc44e02a6d..0c9ff58daa4c4 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -597,8 +597,9 @@ def n_features_out_(self): and isinstance(self.n_components, numbers.Integral)): n_features = self.n_components else: 
- raise AttributeError("{} has no attribute 'n_features_out_'".format( - type(self).__name__)) + raise AttributeError( + "{} has no attribute 'n_features_out_'".format( + type(self).__name__)) return n_features From 344d01e65ceb02c1e712b9d4f10a2a862427b0f7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 9 Sep 2019 11:54:14 -0400 Subject: [PATCH 20/27] explicitly set n_features_out_ in clustering --- sklearn/base.py | 8 -------- sklearn/cluster/birch.py | 2 ++ sklearn/cluster/hierarchical.py | 1 + sklearn/cluster/k_means_.py | 3 +++ 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 37ed1234703f2..4efbcddc6cc7c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -448,14 +448,6 @@ def fit_predict(self, X, y=None): self.fit(X) return self.labels_ - @property - def n_features_out_(self): - if not hasattr(self, 'transform'): - raise AttributeError("{} doesn't have n_features_out_" - " as it doesn't define transform.".format( - self.__class__.__name__)) - return self.n_clusters - class BiclusterMixin: """Mixin class for all bicluster estimators in scikit-learn""" diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 2593d2cfcc3a5..78ec84aaf6eb7 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -493,6 +493,8 @@ def _fit(self, X): self.subcluster_centers_ = centroids self._global_clustering(X) + self.n_features_out_ = self.n_clusters + return self def _get_leaves(self): diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 36ccf95253e96..2cbddb49c2fcf 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -1036,6 +1036,7 @@ def fit(self, X, y=None, **params): """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], ensure_min_features=2, estimator=self) + self.n_features_out_ = self.n_clusters return AgglomerativeClustering.fit(self, X.T, **params) @property diff --git a/sklearn/cluster/k_means_.py 
b/sklearn/cluster/k_means_.py index a83df9c836b86..21cdefdd8daa8 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -962,6 +962,7 @@ def fit(self, X, y=None, sample_weight=None): tol=self.tol, random_state=random_state, copy_x=self.copy_x, n_jobs=self.n_jobs, algorithm=self.algorithm, return_n_iter=True) + self.n_features_out_ = self.n_clusters return self def fit_predict(self, X, y=None, sample_weight=None): @@ -1611,6 +1612,7 @@ def fit(self, X, y=None, sample_weight=None): if self.compute_labels: self.labels_, self.inertia_ = \ self._labels_inertia_minibatch(X, sample_weight) + self.n_features_out_ = self.n_clusters return self @@ -1710,6 +1712,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if self.compute_labels: self.labels_, self.inertia_ = _labels_inertia( X, sample_weight, x_squared_norms, self.cluster_centers_) + self.n_features_out_ = self.n_clusters return self From df82f6487f199bb709efabb46bc6d5dd5c5dc552 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 9 Sep 2019 11:54:29 -0400 Subject: [PATCH 21/27] add n_features_out_ to knnimputer --- sklearn/impute/_knn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 0837cc9750e0a..ac19cdb3cf8d4 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -163,7 +163,7 @@ def fit(self, X, y=None): _check_weights(self.weights) self._fit_X = X self._mask_fit_X = _get_mask(self._fit_X, self.missing_values) - + self.n_features_out_ = np.sum(~np.all(self._mask_fit_X, axis=0)) return self def transform(self, X): From dcba7601cd5194b20408454ffd52f36b1f4a5c08 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 24 Sep 2019 17:11:14 -0400 Subject: [PATCH 22/27] add n_features_out_ to neighbors transformers --- sklearn/neighbors/graph.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/neighbors/graph.py b/sklearn/neighbors/graph.py index da3954ff909c7..8bea2cd122afd 100644 
--- a/sklearn/neighbors/graph.py +++ b/sklearn/neighbors/graph.py @@ -330,6 +330,10 @@ def fit_transform(self, X, y=None): """ return self.fit(X).transform(X) + @property + def n_features_out_(self): + return self.n_samples_fit_ + class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, UnsupervisedMixin, TransformerMixin): @@ -467,3 +471,7 @@ def fit_transform(self, X, y=None): The diagonal is always explicit. """ return self.fit(X).transform(X) + + @property + def n_features_out_(self): + return self.n_samples_fit_ From d3b02e4d08bed2c64f408219ffd71d2f4448cf00 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 24 Sep 2019 17:25:14 -0400 Subject: [PATCH 23/27] add n_features_out_ to feature union, add test --- sklearn/pipeline.py | 4 ++++ sklearn/tests/test_pipeline.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index a58979142ae7c..ab5665041622d 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -931,6 +931,10 @@ def fit_transform(self, X, y=None, **fit_params): Xs = np.hstack(Xs) return Xs + @property + def n_features_out_(self): + return sum(trans.n_features_out_ for _, trans, _ in self._iter()) + def _log_message(self, name, idx, total): if not self.verbose: return None diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index e02b5ef96b7b0..bbbcfb8ff1c9d 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -20,6 +20,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_no_warnings +from sklearn.utils.validation import check_array from sklearn.base import clone, BaseEstimator from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union @@ -93,6 +94,7 @@ def __init__(self, mult=1): self.mult = mult def fit(self, X, y): + self.n_features_out_ = check_array(X).shape[1] return self def transform(self, 
X): @@ -466,6 +468,7 @@ def test_feature_union(): fs.fit(X, y) X_transformed = fs.transform(X) assert X_transformed.shape == (X.shape[0], 3) + assert fs.n_features_out_ == X_transformed.shape[1] # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) @@ -919,6 +922,7 @@ def test_set_feature_union_step_drop(drop): assert_array_equal([[2, 3]], ft.fit(X).transform(X)) assert_array_equal([[2, 3]], ft.fit_transform(X)) assert ['m2__x2', 'm3__x3'] == ft.get_feature_names() + assert ft.n_features_out_ == len(ft.get_feature_names()) ft.set_params(m2=drop) assert_array_equal([[3]], ft.fit(X).transform(X)) @@ -929,6 +933,7 @@ def test_set_feature_union_step_drop(drop): assert_array_equal([[]], ft.fit(X).transform(X)) assert_array_equal([[]], ft.fit_transform(X)) assert [] == ft.get_feature_names() + assert ft.n_features_out_ == len(ft.get_feature_names()) # check we can change back ft.set_params(m3=mult3) @@ -939,6 +944,7 @@ def test_set_feature_union_step_drop(drop): assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) assert ['m3__x3'] == ft.get_feature_names() + assert ft.n_features_out_ == len(ft.get_feature_names()) def test_step_name_validation(): From e02d1187206cf918c5b6789cfb071b7ced18549b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 24 Sep 2019 17:35:58 -0400 Subject: [PATCH 24/27] add n_features_out_ to ColumnTransformer and DictVectorizer --- sklearn/compose/_column_transformer.py | 12 ++++++++++++ sklearn/compose/tests/test_column_transformer.py | 3 +++ sklearn/feature_extraction/dict_vectorizer.py | 2 ++ 3 files changed, 17 insertions(+) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6335fd7a4b20d..c3e78c89df599 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -360,6 +360,18 @@ def get_feature_names(self): trans.get_feature_names()]) return 
feature_names + @property + def n_features_out_(self): + n_features_out = 0 + for name, trans, column, _ in self._iter(fitted=True): + if trans == 'drop': + continue + elif trans == 'passthrough': + n_features_out += len(column) + else: + n_features_out += trans.n_features_out_ + return n_features_out + def _update_fitted_transformers(self, transformers): # transformers are fitted; excludes 'drop' cases fitted_transformers = iter(transformers) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 094b2769de369..46471c179c7cc 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -662,10 +662,12 @@ def test_column_transformer_get_feature_names(): [('col' + str(i), DictVectorizer(), i) for i in range(2)]) ct.fit(X) assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1__c'] + assert ct.n_features_out_ == len(ct.get_feature_names()) # passthrough transformers not supported ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) ct.fit(X) + assert ct.n_features_out_ == 2 assert_raise_message( NotImplementedError, 'get_feature_names is not yet supported', ct.get_feature_names) @@ -682,6 +684,7 @@ def test_column_transformer_get_feature_names(): [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) ct.fit(X) assert ct.get_feature_names() == ['col0__a', 'col0__b'] + assert ct.n_features_out_ == len(ct.get_feature_names()) def test_column_transformer_special_strings(): diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py index 857806c892806..a3b89cf1dbfa9 100644 --- a/sklearn/feature_extraction/dict_vectorizer.py +++ b/sklearn/feature_extraction/dict_vectorizer.py @@ -128,6 +128,7 @@ def fit(self, X, y=None): vocab = {f: i for i, f in enumerate(feature_names)} self.feature_names_ = feature_names + self.n_features_out_ = len(self.feature_names_) self.vocabulary_ = vocab return self 
@@ -205,6 +206,7 @@ def _transform(self, X, fitting): if fitting: self.feature_names_ = feature_names self.vocabulary_ = vocab + self.n_features_out_ = len(self.feature_names_) return result_matrix From c8db1027fa9abfc23835656ee11a713c606a301f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 25 Sep 2019 17:03:46 -0400 Subject: [PATCH 25/27] add n_features_out_ to stacking regressor and stacking classifier --- sklearn/ensemble/_stacking.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 97f66aa077772..7ca5366e1603f 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -458,7 +458,12 @@ def fit(self, X, y, sample_weight=None): check_classification_targets(y) self._le = LabelEncoder().fit(y) self.classes_ = self._le.classes_ - return super().fit(X, self._le.transform(y), sample_weight) + super().fit(X, self._le.transform(y), sample_weight) + if len(self.classes_) == 2: + self.n_features_out_ = len(self.estimators_) + else: + self.n_features_out_ = len(self.estimators_) * len(self.classes_) + return self @if_delegate_has_method(delegate='final_estimator_') def predict(self, X, **predict_params): @@ -691,7 +696,9 @@ def fit(self, X, y, sample_weight=None): self : object """ y = column_or_1d(y, warn=True) - return super().fit(X, y, sample_weight) + super().fit(X, y, sample_weight) + self.n_features_out_ = len(self.estimators_) + return self def transform(self, X): """Return the predictions for X for each estimator. 
From 4b539af874f24c59199a9fbd3c8228db6d38de35 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 25 Sep 2019 17:18:21 -0400
Subject: [PATCH 26/27] add _final_non_passthrough_estimator

---
 sklearn/pipeline.py            | 9 +++++++++
 sklearn/tests/test_pipeline.py | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index ab5665041622d..ad9dc88fd4f98 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -244,6 +244,15 @@ def _final_estimator(self):
         estimator = self.steps[-1][1]
         return 'passthrough' if estimator is None else estimator
 
+    @property
+    def _final_non_passthrough_estimator(self):
+        final_estimator = None
+        for name, est in reversed(self.steps):
+            if est not in [None, 'passthrough']:
+                final_estimator = est
+                break
+        return final_estimator
+
     def _log_message(self, step_idx):
         if not self.verbose:
             return None
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index bbbcfb8ff1c9d..80a5b9b10c4af 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -702,6 +702,7 @@ def make():
     assert_array_equal([[exp]], pipeline.fit_transform(X, y))
     assert_array_equal([exp], pipeline.fit(X).predict(X))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
+    assert pipeline._final_non_passthrough_estimator is mult5
 
     pipeline = make()
     pipeline.set_params(last=passthrough)
@@ -710,6 +711,7 @@ def make():
     assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
     assert_array_equal([[exp]], pipeline.fit_transform(X, y))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
+    assert pipeline._final_non_passthrough_estimator is mult3
     assert_raise_message(AttributeError,
                          "'str' object has no attribute 'predict'",
                          getattr, pipeline, 'predict')

From 43e9c966e53d2b64a3caa667a18835cffdb68f7e Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 25 Sep 2019 17:27:59 -0400
Subject: [PATCH 27/27] add n_features_out_ to pipeline

---
 sklearn/pipeline.py            |  8 ++++++++
sklearn/tests/test_pipeline.py | 11 +++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index ad9dc88fd4f98..23a0d69867e50 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -632,6 +632,14 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + @property + def n_features_out_(self): + final_est = self._final_non_passthrough_estimator + if final_est is not None: + return final_est.n_features_out_ + else: + return None + def _name_estimators(estimators): """Generate names for estimators.""" diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 80a5b9b10c4af..03bcc52289d76 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -530,13 +530,19 @@ def test_make_union_kwargs(): ) -def test_pipeline_transform(): +@pytest.mark.parametrize('add_passthrough', [True, False]) +def test_pipeline_transform(add_passthrough): # Test whether pipeline works with a transformer at the end. # Also test pipeline.transform and pipeline.inverse_transform iris = load_iris() X = iris.data pca = PCA(n_components=2, svd_solver='full') - pipeline = Pipeline([('pca', pca)]) + if add_passthrough: + pipeline = Pipeline([('first', 'passthrough'), + ('pca', pca), + ('last', 'passthrough')]) + else: + pipeline = Pipeline([('pca', pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) @@ -548,6 +554,7 @@ def test_pipeline_transform(): X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2) + assert pipeline.n_features_out_ == X_trans.shape[1] def test_pipeline_fit_transform():